Transformers 源码解析(九十八)
.\models\roformer\modeling_tf_roformer.py
import math
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFCausalLMOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_roformer import RoFormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "junnyu/roformer_chinese_base"
_CONFIG_FOR_DOC = "RoFormerConfig"
TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"junnyu/roformer_chinese_small",
"junnyu/roformer_chinese_base",
"junnyu/roformer_chinese_char_small",
"junnyu/roformer_chinese_char_base",
"junnyu/roformer_small_discriminator",
"junnyu/roformer_small_generator",
]
class TFRoFormerSinusoidalPositionalEmbedding(keras.layers.Layer):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, **kwargs):
super().__init__(**kwargs)
if embedding_dim % 2 != 0:
raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
self.embedding_dim = embedding_dim
self.num_positions = num_positions
def build(self, input_shape: tf.TensorShape):
"""
Build shared token embedding layer Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
weight = self._init_weight(self.num_positions, self.embedding_dim)
self.weight = self.add_weight(
name="embeddings",
shape=[self.num_positions, self.embedding_dim],
)
weight = tf.cast(weight, dtype=self.weight.dtype)
self.weight.assign(weight)
super().build(input_shape)
@staticmethod
def _init_weight(n_pos: int, dim: int):
"""
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector. [dim // 2:]
"""
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
table = np.zeros_like(position_enc)
table[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2])
table[:, dim // 2 :] = np.cos(position_enc[:, 1::2])
table = tf.convert_to_tensor(table)
tf.stop_gradient(table)
return table
def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
"""Input is expected to be of size [bsz x seqlen]."""
bsz, seq_len = input_shape[:2]
positions = tf.range(past_key_values_length, seq_len + past_key_values_length, delta=1, name="range")
return tf.gather(self.weight, positions)
class TFRoFormerEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = config.embedding_size
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def call(
self,
input_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Args:
input_ids (tf.Tensor): 输入的词汇 ID 张量
token_type_ids (tf.Tensor): 输入的类型 ID 张量
inputs_embeds (tf.Tensor): 输入的嵌入张量
training (bool): 是否在训练模式中使用 Dropout
Returns:
final_embeddings (`tf.Tensor`): 输出的嵌入张量.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.rotary_value = config.rotary_value
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
sinusoidal_pos: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(inputs=hidden_states)
mixed_key_layer = self.key(inputs=hidden_states)
mixed_value_layer = self.value(inputs=hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
if sinusoidal_pos is not None:
if self.rotary_value:
query_layer, key_layer, value_layer = self.apply_rotary_position_embeddings(
sinusoidal_pos, query_layer, key_layer, value_layer
)
else:
query_layer, key_layer = self.apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
if attention_mask is not None:
attention_scores = tf.add(attention_scores, attention_mask)
attention_probs = stable_softmax(logits=attention_scores, axis=-1)
attention_probs = self.dropout(inputs=attention_probs, training=training)
if head_mask is not None:
attention_probs = tf.multiply(attention_probs, head_mask)
attention_output = tf.matmul(attention_probs, value_layer)
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, value_layer=None):
sin, cos = tf.split(sinusoidal_pos, num_or_size_splits=2, axis=-1)
sin_pos = tf.repeat(sin, 2, axis=-1)
cos_pos = tf.repeat(cos, 2, axis=-1)
rotate_half_query_layer = tf.stack([-query_layer[..., 1::2], query_layer[..., ::2]], axis=-1)
rotate_half_query_layer = tf.reshape(rotate_half_query_layer, shape_list(query_layer))
query_layer = query_layer * cos_pos + rotate_half_query_layer * sin_pos
rotate_half_key_layer = tf.stack([-key_layer[..., 1::2], key_layer[..., ::2]], axis=-1)
rotate_half_key_layer = tf.reshape(rotate_half_key_layer, shape_list(key_layer))
key_layer = key_layer * cos_pos + rotate_half_key_layer * sin_pos
if value_layer is not None:
rotate_half_value_layer = tf.stack([-value_layer[..., 1::2], value_layer[..., ::2]], axis=-1)
rotate_half_value_layer = tf.reshape(rotate_half_value_layer, shape_list(value_layer))
value_layer = value_layer * cos_pos + rotate_half_value_layer * sin_pos
return query_layer, key_layer, value_layer
return query_layer, key_layer
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRoFormerSelfOutput(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerAttention(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFRoFormerSelfAttention(config, name="self")
self.dense_output = TFRoFormerSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
sinusoidal_pos: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
sinusoidal_pos=sinusoidal_pos,
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFRoFormerIntermediate(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRoFormerOutput(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerLayer(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFRoFormerAttention(config, name="attention")
self.intermediate = TFRoFormerIntermediate(config, name="intermediate")
self.roformer_output = TFRoFormerOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
sinusoidal_pos: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
sinusoidal_pos=sinusoidal_pos,
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.roformer_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + attention_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "roformer_output", None) is not None:
with tf.name_scope(self.roformer_output.name):
self.roformer_output.build(None)
class TFRoFormerEncoder(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.embed_positions = TFRoFormerSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.hidden_size // config.num_attention_heads,
name="embed_positions",
)
self.layer = [TFRoFormerLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
sinusoidal_pos = self.embed_positions(shape_list(hidden_states)[:-1])[None, None, :, :]
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
sinusoidal_pos=sinusoidal_pos,
head_mask=head_mask[i],
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def __init__(self, config: RoFormerConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.embedding_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps,
name="LayerNorm"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
class TFRoFormerLMPredictionHead(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = config.embedding_size
self.transform = TFRoFormerPredictionHeadTransform(config, name="transform")
self.input_embeddings = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias}
def set_bias(self, value: tf.Variable):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
class TFRoFormerMLMHead(keras.layers.Layer):
def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFRoFormerLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRoFormerMainLayer(keras.layers.Layer):
config_class = RoFormerConfig
def __init__(self, config: RoFormerConfig, add_pooling_layer: bool = True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embeddings = TFRoFormerEmbeddings(config, name="embeddings")
if config.embedding_size != config.hidden_size:
self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")
self.encoder = TFRoFormerEncoder(config, name="encoder")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
"""
RoFormer 模型的前向传播方法,接收多个输入参数,并返回相应的输出。
这里的装饰器 @unpack_inputs 用于解包输入参数,详见其定义。
"""
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings_project", None) is not None:
with tf.name_scope(self.embeddings_project.name):
self.embeddings_project.build([None, None, self.config.embedding_size])
class TFRoFormerPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RoFormerConfig
base_model_prefix = "roformer"
ROFORMER_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`RoFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROFORMER_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerModel(TFRoFormerPreTrainedModel):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roformer = TFRoFormerMainLayer(config, name="roformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if config.is_decoder:
logger.warning(
"If you want to use `TFRoFormerForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
)
class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFRoFormerForCausalLM` as a standalone, add `is_decoder=True.`")
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.mlm(sequence_output=sequence_output, training=training)
loss = None
if labels is not None:
shifted_logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
class TFRoFormerClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.out_proj = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
if isinstance(config.hidden_act, str):
self.classifier_act_fn = get_tf_activation(config.hidden_act)
else:
self.classifier_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = hidden_states[:, 0, :]
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.classifier_act_fn(hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.out_proj(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
RoFormer Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks.
""",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.classifier = TFRoFormerClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
logits = self.classifier(hidden_states=outputs[0], training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
"""
RoFormer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
ROFORMER_START_DOCSTRING,
)
class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.sequence_summary = TFSequenceSummary(config, config.initializer_range, name="sequence_summary")
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
flat_attention_mask = (
tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
)
flat_token_type_ids = (
tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
)
flat_inputs_embeds = (
tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.roformer(
input_ids=flat_input_ids,
attention_mask=flat_attention_mask,
token_type_ids=flat_token_type_ids,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
logits = self.sequence_summary(inputs=outputs[0], training=training)
logits = self.classifier(inputs=logits)
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
定义方法的返回类型,可以是 TFTokenClassifierOutput 或者包含 tf.Tensor 的元组。
如果方法没有返回值,应该返回 None。
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
用于计算标记分类损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
"""
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(inputs=sequence_output, training=training)
logits = self.classifier(inputs=sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roformer = TFRoFormerMainLayer(config, name="roformer")
self.qa_outputs = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
...
...
):
...
由于代码块过长,无法在此显示完整的 `call` 方法主体部分。
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.roformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(inputs=sequence_output)
start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions, "end_position": end_positions}
loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
.\models\roformer\tokenization_roformer.py
"""RoFormer 的标记化类。"""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_small": (
"https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
),
"junnyu/roformer_chinese_char_base": (
"https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
),
"junnyu/roformer_small_discriminator": (
"https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
),
"junnyu/roformer_small_generator": (
"https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"junnyu/roformer_chinese_small": 1536,
"junnyu/roformer_chinese_base": 1536,
"junnyu/roformer_chinese_char_small": 512,
"junnyu/roformer_chinese_char_base": 512,
"junnyu/roformer_small_discriminator": 128,
"junnyu/roformer_small_generator": 128,
}
PRETRAINED_INIT_CONFIGURATION = {
"junnyu/roformer_chinese_small": {"do_lower_case": True},
"junnyu/roformer_chinese_base": {"do_lower_case": True},
"junnyu/roformer_chinese_char_small": {"do_lower_case": True},
"junnyu/roformer_chinese_char_base": {"do_lower_case": True},
"junnyu/roformer_small_discriminator": {"do_lower_case": True},
"junnyu/roformer_small_generator": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""加载一个词汇文件到一个有序字典中。"""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""对文本进行基本的空格清理和分割。"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BasicTokenizer(object):
"""
构造一个 BasicTokenizer 对象,执行基本的分词(分割标点符号、转换为小写等)。
Args:
do_lower_case (`bool`, *可选*, 默认为 `True`):
是否在分词时转换为小写。
never_split (`Iterable`, *可选*):
在分词过程中永远不会分割的词汇集合。仅在 `do_basic_tokenize=True` 时有效。
tokenize_chinese_chars (`bool`, *可选*, 默认为 `True`):
是否分割中文字符。
对于日语,这可能需要禁用(参见此处
[issue](https://github.com/huggingface/transformers/issues/328))。
strip_accents (`bool`, *可选*):
是否去除所有重音符号。如果未指定,则由 `lowercase` 的值决定(与原始 BERT 一致)。
do_split_on_punc (`bool`, *可选*, 默认为 `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便后续的分词可以捕获单词的完整上下文,例如缩写。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
try:
import rjieba
except ImportError:
raise ImportError(
"You need to install rjieba to use RoFormerTokenizer. "
"See https://pypi.org/project/rjieba/ for installation."
)
self.jieba = rjieba
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def __getstate__(self):
state = self.__dict__.copy()
state["jieba"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
import rjieba
self.jieba = rjieba
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, use_jieba=True):
split_tokens = []
if use_jieba:
for wholword in self.jieba.cut(text, False):
if wholword in self.vocab:
split_tokens.append(wholword)
else:
char_list = self._tokenize(wholword, use_jieba=False)
split_tokens.extend(char_list)
else:
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
.\models\roformer\tokenization_roformer_fast.py
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_roformer import RoFormerTokenizer
from .tokenization_utils import JiebaPreTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_small": (
"https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
),
"junnyu/roformer_chinese_char_base": (
"https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
),
"junnyu/roformer_small_discriminator": (
"https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
),
"junnyu/roformer_small_generator": (
"https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"junnyu/roformer_chinese_small": 1536,
"junnyu/roformer_chinese_base": 1536,
"junnyu/roformer_chinese_char_small": 512,
"junnyu/roformer_chinese_char_base": 512,
"junnyu/roformer_small_discriminator": 128,
"junnyu/roformer_small_generator": 128,
}
PRETRAINED_INIT_CONFIGURATION = {
"junnyu/roformer_chinese_small": {"do_lower_case": True},
"junnyu/roformer_chinese_base": {"do_lower_case": True},
"junnyu/roformer_chinese_char_small": {"do_lower_case": True},
"junnyu/roformer_chinese_char_base": {"do_lower_case": True},
"junnyu/roformer_small_discriminator": {"do_lower_case": True},
"junnyu/roformer_small_generator": {"do_lower_case": True},
}
class RoFormerTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).
# `RoFormerTokenizerFast`几乎与`BertTokenizerFast`相同,实现端到端的分词:
# 标点符号分割和WordPiece。它们在处理中文时有些差异。
# 此分词器继承自`PreTrainedTokenizerFast`,其中包含大部分主要方法。用户应该
# 参考这个超类以获取有关这些方法的更多信息。
# 示例:
#
# ```
# >>> from transformers import RoFormerTokenizerFast
#
# >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
# >>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
# ```
vocab_files_names = VOCAB_FILES_NAMES # 获取词汇文件的名称列表
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP # 获取预训练词汇文件的映射
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES # 获取预训练位置嵌入的最大模型输入尺寸
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION # 获取预训练初始化配置
slow_tokenizer_class = RoFormerTokenizer # 慢速分词器类为RoFormerTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# 调用父类的初始化方法,设置基本的分词器参数
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# 从后端分词器的normalizer状态中加载JSON数据
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# 如果normalizer的lowercase属性与当前设置不符,则更新
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
# 更新后端分词器的normalizer
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# 确保正确设置自定义的PreTokenizer
vocab = self.backend_tokenizer.get_vocab()
self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
self.do_lower_case = do_lower_case
def __getstate__(self):
state = self.__dict__.copy()
# 将分词器的pre_tokenizer设置为BertPreTokenizer()
state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
return state
def __setstate__(self, d):
self.__dict__ = d
# 获取当前分词器的词汇表
vocab = self.__dict__["_tokenizer"].get_vocab()
# 将分词器的pre_tokenizer设置为自定义的JiebaPreTokenizer
self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
# Initialize output with CLS token ID, token_ids_0, and SEP token ID
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# If token_ids_1 is provided, concatenate token_ids_1 and SEP token ID
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary
"""
# Define SEP and CLS tokens as lists
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If token_ids_1 is None, return a list of zeros corresponding to token_ids_0 + CLS + SEP
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return a concatenated list of zeros for token_ids_0 + CLS + SEP and ones for token_ids_1 + SEP
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to a directory.
Args:
save_directory (str):
Directory to save the vocabulary files.
filename_prefix (str, *optional*):
Prefix for the vocabulary files.
Returns:
`Tuple[str]`: Tuple of file paths where the vocabulary was saved.
"""
# Save the model vocabulary using the tokenizer's save method
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def save_pretrained(
self,
save_directory,
legacy_format=None,
filename_prefix=None,
push_to_hub=False,
**kwargs,
):
"""
Save the pretrained model and its tokenizer.
Args:
save_directory (str):
Directory to save the pretrained model.
legacy_format (str, *optional*):
Legacy format compatibility.
filename_prefix (str, *optional*):
Prefix for the saved files.
push_to_hub (bool):
Whether to push the saved model to the Hugging Face model hub.
**kwargs:
Additional arguments passed to the superclass method.
Returns:
`Any`: Output of the superclass's `save_pretrained` method.
"""
self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
.\models\roformer\tokenization_utils.py
"""Tokenization utils for RoFormer."""
from typing import List
from tokenizers import NormalizedString, PreTokenizedString, normalizers
class JiebaPreTokenizer:
def __init__(self, vocab) -> None:
self.vocab = vocab
self.normalizers = normalizers.BertNormalizer(
clean_text=False,
handle_chinese_chars=True,
strip_accents=False,
lowercase=False,
)
try:
import rjieba
except ImportError:
raise ImportError(
"You need to install rjieba to use RoFormerTokenizer. "
"See https://pypi.org/project/rjieba/ for installation."
)
self.jieba = rjieba
def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
splits = []
for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False):
if token in self.vocab:
splits.append(normalized_string[start:end])
else:
token_list = self.normalizers.normalize_str(token).split()
for token in token_list:
if token:
end = start + len(token)
splits.append(normalized_string[start:end])
start = end
return splits
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(self.jieba_split)
.\models\roformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerOnnxConfig"],
"tokenization_roformer": ["RoFormerTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_roformer_fast"] = ["RoFormerTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_roformer"] = [
"ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"RoFormerForCausalLM",
"RoFormerForMaskedLM",
"RoFormerForMultipleChoice",
"RoFormerForQuestionAnswering",
"RoFormerForSequenceClassification",
"RoFormerForTokenClassification",
"RoFormerLayer",
"RoFormerModel",
"RoFormerPreTrainedModel",
"load_tf_weights_in_roformer",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_roformer"] = [
"TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFRoFormerForCausalLM",
"TFRoFormerForMaskedLM",
"TFRoFormerForMultipleChoice",
"TFRoFormerForQuestionAnswering",
"TFRoFormerForSequenceClassification",
"TFRoFormerForTokenClassification",
"TFRoFormerLayer",
"TFRoFormerModel",
"TFRoFormerPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_roformer"] = [
"FLAX_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"FlaxRoFormerForMaskedLM",
"FlaxRoFormerForMultipleChoice",
"FlaxRoFormerForQuestionAnswering",
"FlaxRoFormerForSequenceClassification",
"FlaxRoFormerForTokenClassification",
"FlaxRoFormerModel",
"FlaxRoFormerPreTrainedModel",
]
if TYPE_CHECKING:
pass
from .configuration_roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerOnnxConfig
from .tokenization_roformer import RoFormerTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_roformer_fast import RoFormerTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_roformer import (
ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
RoFormerForCausalLM,
RoFormerForMaskedLM,
RoFormerForMultipleChoice,
RoFormerForQuestionAnswering,
RoFormerForSequenceClassification,
RoFormerForTokenClassification,
RoFormerLayer,
RoFormerModel,
RoFormerPreTrainedModel,
load_tf_weights_in_roformer,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_roformer import (
TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRoFormerForCausalLM,
TFRoFormerForMaskedLM,
TFRoFormerForMultipleChoice,
TFRoFormerForQuestionAnswering,
TFRoFormerForSequenceClassification,
TFRoFormerForTokenClassification,
TFRoFormerLayer,
TFRoFormerModel,
TFRoFormerPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_roformer import (
FLAX_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
FlaxRoFormerForMaskedLM,
FlaxRoFormerForMultipleChoice,
FlaxRoFormerForQuestionAnswering,
FlaxRoFormerForSequenceClassification,
FlaxRoFormerForTokenClassification,
FlaxRoFormerModel,
FlaxRoFormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\rwkv\configuration_rwkv.py
""" RWKV configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"RWKV/rwkv-4-169m-pile": "https://huggingface.co/RWKV/rwkv-4-169m-pile/resolve/main/config.json",
"RWKV/rwkv-4-430m-pile": "https://huggingface.co/RWKV/rwkv-4-430m-pile/resolve/main/config.json",
"RWKV/rwkv-4-1b5-pile": "https://huggingface.co/RWKV/rwkv-4-1b5-pile/resolve/main/config.json",
"RWKV/rwkv-4-3b-pile": "https://huggingface.co/RWKV/rwkv-4-3b-pile/resolve/main/config.json",
"RWKV/rwkv-4-7b-pile": "https://huggingface.co/RWKV/rwkv-4-7b-pile/resolve/main/config.json",
"RWKV/rwkv-4-14b-pile": "https://huggingface.co/RWKV/rwkv-4-14b-pile/resolve/main/config.json",
"RWKV/rwkv-raven-1b5": "https://huggingface.co/RWKV/rwkv-raven-1b5/resolve/main/config.json",
"RWKV/rwkv-raven-3b": "https://huggingface.co/RWKV/rwkv-raven-3b/resolve/main/config.json",
"RWKV/rwkv-raven-7b": "https://huggingface.co/RWKV/rwkv-raven-7b/resolve/main/config.json",
"RWKV/rwkv-raven-14b": "https://huggingface.co/RWKV/rwkv-raven-14b/resolve/main/config.json",
}
class RwkvConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the RWVK-4
[RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "rwkv"
attribute_map = {"max_position_embeddings": "context_length"}
def __init__(
self,
vocab_size=50277,
context_length=1024,
hidden_size=4096,
num_hidden_layers=32,
attention_hidden_size=None,
intermediate_size=None,
layer_norm_epsilon=1e-5,
bos_token_id=0,
eos_token_id=0,
rescale_every=6,
tie_word_embeddings=False,
use_cache=True,
**kwargs,
):
):
self.vocab_size = vocab_size
self.context_length = context_length
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
self.layer_norm_epsilon = layer_norm_epsilon
self.rescale_every = rescale_every
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
super().__init__(
tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
)
.\models\rwkv\convert_rwkv_checkpoint_to_hf.py
"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format."""
import argparse
import gc
import json
import os
import re
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint
NUM_HIDDEN_LAYERS_MAPPING = {
"169M": 12,
"430M": 24,
"1B5": 24,
"3B": 32,
"7B": 32,
"14B": 40,
}
HIDEN_SIZE_MAPPING = {
"169M": 768,
"430M": 1024,
"1B5": 2048,
"3B": 2560,
"7B": 4096,
"14B": 5120,
}
def convert_state_dict(state_dict):
state_dict_keys = list(state_dict.keys())
for name in state_dict_keys:
weight = state_dict.pop(name)
if name.startswith("emb."):
name = name.replace("emb.", "embeddings.")
if name.startswith("blocks.0.ln0"):
name = name.replace("blocks.0.ln0", "blocks.0.pre_ln")
name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", name)
name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", name)
if name.endswith(".time_mix_k"):
name = name.replace(".time_mix_k", ".time_mix_key")
if name.endswith(".time_mix_v"):
name = name.replace(".time_mix_v", ".time_mix_value")
if name.endswith(".time_mix_r"):
name = name.replace(".time_mix_r", ".time_mix_receptance")
if name != "head.weight":
name = "rwkv." + name
state_dict[name] = weight
return state_dict
def convert_rmkv_checkpoint_to_hf_format(
repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None
):
if tokenizer_file is None:
print("No `--tokenizer_file` provided, we will use the default tokenizer.")
vocab_size = 50277
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
else:
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
vocab_size = len(tokenizer)
tokenizer.save_pretrained(output_dir)
possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys())
if size is None:
for candidate in possible_sizes:
if candidate in checkpoint_file:
size = candidate
break
if size is None:
raise ValueError("Could not infer the size, please provide it with the `--size` argument.")
if size not in possible_sizes:
raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.")
config = RwkvConfig(
vocab_size=vocab_size,
num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
hidden_size=HIDEN_SIZE_MAPPING[size],
)
config.save_pretrained(output_dir)
model_file = hf_hub_download(repo_id, checkpoint_file)
state_dict = torch.load(model_file, map_location="cpu")
state_dict = convert_state_dict(state_dict)
shards, index = shard_checkpoint(state_dict)
for shard_file, shard in shards.items():
torch.save(shard, os.path.join(output_dir, shard_file))
if index is not None:
save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME)
with open(save_index_file, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
print(
"Cleaning up shards. This may error with an OOM error, it this is the case don't worry you still have converted the model."
)
shard_files = list(shards.keys())
del state_dict
del shards
gc.collect()
for shard_file in shard_files:
state_dict = torch.load(os.path.join(output_dir, shard_file))
torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file))
del state_dict
gc.collect()
if push_to_hub:
if model_name is None:
raise ValueError("Please provide a `model_name` to push the model to the Hub.")
model = AutoModelForCausalLM.from_pretrained(output_dir)
model.push_to_hub(model_name, max_shard_size="2GB")
tokenizer.push_to_hub(model_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint."
)
parser.add_argument(
"--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo."
)
parser.add_argument(
"--output_dir", default=None, type=str, required=True, help="Where to save the converted model."
)
parser.add_argument(
"--tokenizer_file",
default=None,
type=str,
help="Path to the tokenizer file to use (if not provided, only the model is converted).",
)
parser.add_argument(
"--size",
default=None,
type=str,
help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Push to the Hub the converted model.",
)
parser.add_argument(
"--model_name",
default=None,
type=str,
help="Name of the pushed model on the Hub, including the username / organization.",
)
args = parser.parse_args()
convert_rmkv_checkpoint_to_hf_format(
args.repo_id,
args.checkpoint_file,
args.output_dir,
size=args.size,
tokenizer_file=args.tokenizer_file,
push_to_hub=args.push_to_hub,
model_name=args.model_name,
)
.\models\rwkv\modeling_rwkv.py
"""PyTorch RWKV 模型."""
import math
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_bitsandbytes_available,
is_ninja_available,
is_torch_cuda_available,
logging,
)
from .configuration_rwkv import RwkvConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile"
_CONFIG_FOR_DOC = "RwkvConfig"
RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [
"RWKV/rwkv-4-169m-pile",
"RWKV/rwkv-4-430m-pile",
"RWKV/rwkv-4-1b5-pile",
"RWKV/rwkv-4-3b-pile",
"RWKV/rwkv-4-7b-pile",
"RWKV/rwkv-4-14b-pile",
"RWKV/rwkv-raven-1b5",
"RWKV/rwkv-raven-3b",
"RWKV/rwkv-raven-7b",
"RWKV/rwkv-raven-14b",
]
rwkv_cuda_kernel = None
def load_wkv_cuda_kernel(context_length):
from torch.utils.cpp_extension import load as load_kernel
global rwkv_cuda_kernel
kernel_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv"
cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]]
if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length:
return
logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")
flags = [
"-res-usage",
"--maxrregcount 60",
"--use_fast_math",
"-O3",
"-Xptxas -O3",
"--extra-device-vectorization",
f"-DTmax={context_length}",
]
rwkv_cuda_kernel = load_kernel(
name=f"wkv_{context_length}",
sources=cuda_kernel_files,
verbose=(logging.get_verbosity() == logging.DEBUG),
extra_cuda_cflags=flags,
)
rwkv_cuda_kernel.max_seq_length = context_length
class RwkvLinearAttention(torch.autograd.Function):
@staticmethod
def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False):
batch_size, seq_len, hidden_size = key.size()
if seq_len > rwkv_cuda_kernel.max_seq_length:
raise ValueError(
f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
f"{rwkv_cuda_kernel.max_seq_length} with this model."
)
if batch_size * hidden_size % min(hidden_size, 32) != 0:
raise ValueError(
f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
f"multiple of {min(hidden_size, 32)}."
)
ctx.input_dtype = key.dtype
if (
time_decay.device.type != "cuda"
or time_first.device.type != "cuda"
or key.device.type != "cuda"
or value.device.type != "cuda"
):
raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.")
time_decay = -torch.exp(time_decay.float().contiguous())
if key.dtype == torch.float16:
time_first = time_first.float()
key = key.float()
value = value.float()
time_first = time_first.contiguous()
key = key.contiguous()
value = value.contiguous()
output = torch.empty_like(key, memory_format=torch.contiguous_format)
if return_state or state is not None:
if state is None:
state = torch.zeros(
batch_size,
hidden_size,
3,
dtype=torch.float32,
device=key.device,
memory_format=torch.contiguous_format,
)
state[:, :, 2] -= 1e38
else:
state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous()
if key.dtype == torch.bfloat16:
forward_func = rwkv_cuda_kernel.forward_with_state_bf16
else:
forward_func = rwkv_cuda_kernel.forward_with_state
forward_func(time_decay, time_first, key, value, output, state)
else:
forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward
forward_func(time_decay, time_first, key, value, output)
ctx.save_for_backward(time_decay, time_first, key, value, output)
if state is not None:
state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)]
return output.to(ctx.input_dtype), state
@staticmethod
def backward(ctx, g_output, g_state=None):
input_dtype = ctx.input_dtype
time_decay, time_first, key, value, output = ctx.saved_tensors
g_time_decay = torch.empty_like(
time_decay,
memory_format=torch.contiguous_format,
dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32,
)
g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format)
g_key = torch.empty_like(key, memory_format=torch.contiguous_format)
g_value = torch.empty_like(value, memory_format=torch.contiguous_format)
if input_dtype == torch.float16:
g_output = g_output.float()
backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward
backward_func(
time_decay,
time_first,
key,
value,
output,
g_output.contiguous(),
g_time_decay,
g_time_first,
g_key,
g_value,
)
return (
g_time_decay.to(input_dtype),
g_time_first.to(input_dtype),
g_key.to(input_dtype),
g_value.to(input_dtype),
None,
None,
)
def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False):
_, seq_length, _ = key.size()
output = torch.zeros_like(key)
if state is None:
num_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
den_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38
else:
num_state, den_state, max_state = state
time_decay = -torch.exp(time_decay)
for current_index in range(seq_length):
current_key = key[:, current_index].float()
current_value = value[:, current_index]
max_for_output = torch.maximum(max_state, current_key + time_first)
e1 = torch.exp(max_state - max_for_output)
e2 = torch.exp(current_key + time_first - max_for_output)
numerator = e1 * num_state + e2 * current_value
denominator = e1 * den_state + e2
output[:, current_index] = (numerator / denominator).to(output.dtype)
max_for_state = torch.maximum(max_state + time_decay, current_key)
e1 = torch.exp(max_state + time_decay - max_for_state)
e2 = torch.exp(current_key - max_for_state)
num_state = e1 * num_state + e2 * current_value
den_state = e1 * den_state + e2
max_state = max_for_state
if return_state or state is not None:
state = [num_state, den_state, max_state]
return output, state
def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False):
no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value])
one_token = key.size(1) == 1
if rwkv_cuda_kernel is None or no_cuda or one_token:
return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state)
else:
return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state)
def __init__(self, config, layer_id=0):
super().__init__()
self.config = config
kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length
if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded:
try:
load_wkv_cuda_kernel(config.context_length)
except Exception:
logger.info("Could not load the custom CUDA kernel for RWKV attention.")
self.layer_id = layer_id
hidden_size = config.hidden_size
attention_hidden_size = (
config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size
)
self.attention_hidden_size = attention_hidden_size
self.time_decay = nn.Parameter(torch.empty(attention_hidden_size))
self.time_first = nn.Parameter(torch.empty(attention_hidden_size))
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)
def extract_key_value(self, hidden, state=None):
if hidden.size(1) == 1 and state is not None:
shifted = state[1][:, :, self.layer_id]
else:
shifted = self.time_shift(hidden)
if state is not None:
shifted[:, 0] = state[1][:, :, self.layer_id]
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
key = self.key(key)
value = self.value(value)
receptance = torch.sigmoid(self.receptance(receptance))
if state is not None:
state[1][:, :, self.layer_id] = hidden[:, -1]
return receptance, key, value, state
def forward(self, hidden, state=None, use_cache=False):
receptance, key, value, state = self.extract_key_value(hidden, state=state)
layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None
rwkv, layer_state = rwkv_linear_attention(
self.time_decay,
self.time_first,
key,
value,
state=layer_state,
return_state=use_cache,
)
if layer_state is not None:
state[2][:, :, self.layer_id] = layer_state[0]
state[3][:, :, self.layer_id] = layer_state[1]
state[4][:, :, self.layer_id] = layer_state[2]
return self.output(receptance * rwkv), state
class RwkvFeedForward(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.config = config
self.layer_id = layer_id
hidden_size = config.hidden_size
intermediate_size = (
config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size
)
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
self.value = nn.Linear(intermediate_size, hidden_size, bias=False)
def forward(self, hidden, state=None):
if hidden.size(1) == 1 and state is not None:
shifted = state[0][:, :, self.layer_id]
else:
shifted = self.time_shift(hidden)
if state is not None:
shifted[:, 0] = state[0][:, :, self.layer_id]
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
key = torch.square(torch.relu(self.key(key)))
value = self.value(key)
receptance = torch.sigmoid(self.receptance(receptance))
if state is not None:
state[0][:, :, self.layer_id] = hidden[:, -1]
return receptance * value, state
class RwkvBlock(nn.Module):
def __init__(self, config, layer_id):
super().__init__()
self.config = config
self.layer_id = layer_id
if layer_id == 0:
self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.attention = RwkvSelfAttention(config, layer_id)
self.feed_forward = RwkvFeedForward(config, layer_id)
def forward(self, hidden, state=None, use_cache=False, output_attentions=False):
if self.layer_id == 0:
hidden = self.pre_ln(hidden)
attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache)
hidden = hidden + attention
feed_forward, state = self.feed_forward(self.ln2(hidden), state=state)
hidden = hidden + feed_forward
outputs = (hidden, state)
if output_attentions:
outputs += (attention,)
else:
outputs += (None,)
return outputs
class RwkvPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化和简单的预训练模型下载与加载接口。
"""
config_class = RwkvConfig
base_model_prefix = "rwkv"
_no_split_modules = ["RwkvBlock"]
_keep_in_fp32_modules = ["time_decay", "time_first"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, RwkvSelfAttention):
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
attention_hidden_size = module.attention_hidden_size
ratio_0_to_1 = layer_id / (num_hidden_layers - 1)
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
decay_speed = [
-5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
for h in range(attention_hidden_size)
]
decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
zigzag = (
torch.tensor(
[(i + 1) % 3 - 1 for i in range(attention_hidden_size)],
dtype=module.time_first.dtype,
device=module.time_first.device,
)
* 0.5
)
with torch.no_grad():
module.time_decay.data = decay_speed
module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag)
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
elif isinstance(module, RwkvFeedForward):
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
with torch.no_grad():
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
@dataclass
class RwkvOutput(ModelOutput):
"""
Class for the RWKV model outputs.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor = None
state: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
state: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
RWKV_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
# `input_ids` 是输入序列的 token 索引,在词汇表中进行查找得到。
`input_ids_length` = `sequence_length` if `past_key_values` is `None` else
`past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
sequence tokens in the vocabulary.
If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
`input_ids`.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
# 用来避免对填充 token 索引执行注意力操作的掩码。掩码值选择在 `[0, 1]` 范围内:
- 1 表示**未被掩码**的 token,
- 0 表示**被掩码**的 token。
This is currently not used by `RwkvModel`, but will be supported in the future.
[What are attention masks?](../glossary#attention-mask)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
# 可选参数,代替 `input_ids` 直接传递嵌入表示。如果希望更好地控制如何将 `input_ids` 索引转换为关联向量,
这是非常有用的,比如使用自定义的嵌入查找矩阵。
This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
# 如果提供,模型将在所有块中使用先前状态(这将给出模型对提供的 `input_ids` 和 `state_input_ids` 作为上下文的输出)。
If passed along, the model uses the previous state in all the blocks (which will give the output for the
`input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
# 如果设置为 `True`,则返回最后的状态,并且可以用于快速生成下一个 logits。
If set to `True`, the last state is returned and can be used to quickly generate the next logits.
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。详细信息请参见返回的张量中的 `attentions`。
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。详细信息请参见返回的张量中的 `hidden_states`。
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
定义一个 RwkvModel 类,继承自 RwkvPreTrainedModel 类。
@add_start_docstrings(
"The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
RWKV_START_DOCSTRING,
)
添加文档字符串,描述该模型是一个裸的 RWKV 模型,输出未经特定顶层处理的原始隐藏状态。
class RwkvModel(RwkvPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
self.ln_out = nn.LayerNorm(config.hidden_size)
self.layers_are_rescaled = False
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvOutput,
config_class=_CONFIG_FOR_DOC,
)
定义 forward 方法,接收多个输入参数,执行模型的前向传播过程。
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
state: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def _rescale_layers(self):
if self.layers_are_rescaled == (not self.training):
return
if self.config.rescale_every > 0:
with torch.no_grad():
for block_id, block in enumerate(self.blocks):
if self.training:
block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
else:
if hasattr(block.attention.output.weight, "SCB"):
block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
elif hasattr(block.attention.output.weight, "quant_state"):
self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
else:
block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))
self.layers_are_rescaled = not self.training
def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
r"""
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
"""
if not is_bitsandbytes_available():
raise ImportError("Please install bitsandbytes to use this method.")
import bitsandbytes as bnb
dequant_weights = bnb.functional.dequantize_4bit(target_layer.weight.data, target_layer.weight.quant_state)
dequant_weights.div_(2 ** int(block_id // self.config.rescale_every))
quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)
setattr(target_layer, "weight", quant_weight)
@add_start_docstrings(
"""
The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
RWKV_START_DOCSTRING,
)
class RwkvForCausalLM(RwkvPreTrainedModel):
_tied_weights_keys = ["head.weight"]
def __init__(self, config):
super().__init__(config)
self.rwkv = RwkvModel(config)
self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.head
def set_output_embeddings(self, new_embeddings):
self.head = new_embeddings
def generate(self, *args, **kwargs):
try:
gen_output = super().generate(*args, **kwargs)
except AttributeError as exc:
if "past_key_values" in str(exc):
raise AttributeError(
"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`. RWKV "
"doesn't have that attribute, try another generation strategy instead. For the available "
"generation strategies, check this doc: https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
)
else:
raise exc
return gen_output
def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs):
if state is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
if inputs_embeds is not None and state is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs["state"] = state
return model_inputs
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
state: Optional[List[torch.FloatTensor]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, RwkvCausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
rwkv_outputs = self.rwkv(
input_ids,
inputs_embeds=inputs_embeds,
state=state,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = rwkv_outputs[0]
logits = self.head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (logits,) + rwkv_outputs[1:]
return ((loss,) + output) if loss is not None else output
return RwkvCausalLMOutput(
loss=loss,
logits=logits,
state=rwkv_outputs.state,
hidden_states=rwkv_outputs.hidden_states,
attentions=rwkv_outputs.attentions,
)
.\models\rwkv\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig", "RwkvOnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_rwkv"] = [
"RWKV_PRETRAINED_MODEL_ARCHIVE_LIST",
"RwkvForCausalLM",
"RwkvModel",
"RwkvPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig, RwkvOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_rwkv import (
RWKV_PRETRAINED_MODEL_ARCHIVE_LIST,
RwkvForCausalLM,
RwkvModel,
RwkvPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\sam\configuration_sam.py
""" SAM 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SAM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/sam-vit-huge": "https://huggingface.co/facebook/sam-vit-huge/resolve/main/config.json",
"facebook/sam-vit-large": "https://huggingface.co/facebook/sam-vit-large/resolve/main/config.json",
"facebook/sam-vit-base": "https://huggingface.co/facebook/sam-vit-base/resolve/main/config.json",
}
class SamPromptEncoderConfig(PretrainedConfig):
r"""
这是用于存储 [`SamPromptEncoder`] 配置的配置类。[`SamPromptEncoder`] 模块用于编码输入的 2D 点和边界框。
实例化配置默认将生成与 SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) 架构类似的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。有关更多信息,请阅读 [`PretrainedConfig`] 的文档。
Args:
hidden_size (`int`, *optional*, 默认为 256):
隐藏状态的维度。
image_size (`int`, *optional*, 默认为 1024):
图像的预期输出分辨率。
patch_size (`int`, *optional*, 默认为 16):
每个补丁的大小(分辨率)。
mask_input_channels (`int`, *optional*, 默认为 16):
要馈送到 `MaskDecoder` 模块的通道数。
num_point_embeddings (`int`, *optional*, 默认为 4):
要使用的点嵌入数量。
hidden_act (`str`, *optional*, 默认为 `"gelu"`):
编码器和池化器中的非线性激活函数。
"""
def __init__(
self,
hidden_size=256,
image_size=1024,
patch_size=16,
mask_input_channels=16,
num_point_embeddings=4,
hidden_act="gelu",
layer_norm_eps=1e-6,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.image_size = image_size
self.patch_size = patch_size
self.image_embedding_size = image_size // patch_size
self.mask_input_channels = mask_input_channels
self.num_point_embeddings = num_point_embeddings
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
class SamMaskDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
"""
def __init__(
self,
hidden_size=256,
hidden_act="relu",
mlp_dim=2048,
num_hidden_layers=2,
num_attention_heads=8,
attention_downsample_rate=2,
num_multimask_outputs=3,
iou_head_depth=3,
iou_head_hidden_dim=256,
layer_norm_eps=1e-6,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_dim = mlp_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.attention_downsample_rate = attention_downsample_rate
self.num_multimask_outputs = num_multimask_outputs
self.iou_head_depth = iou_head_depth
self.iou_head_hidden_dim = iou_head_hidden_dim
self.layer_norm_eps = layer_norm_eps
class SamVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
defaults will yield a similar configuration to that of the SAM ViT-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
# 设置编码器层和池化层的维度大小,默认为768
hidden_size (`int`, *optional*, defaults to 768):
# Patch Encoder 中输出通道的维度大小,默认为256
output_channels (`int`, *optional*, defaults to 256):
# Transformer 编码器中隐藏层的数量,默认为12
num_hidden_layers (`int`, *optional*, defaults to 12):
# Transformer 编码器中每个注意力层的注意力头数,默认为12
num_attention_heads (`int`, *optional*, defaults to 12):
# 输入图像的通道数,默认为3
num_channels (`int`, *optional*, defaults to 3):
# 期望的输入图像分辨率,默认为1024
image_size (`int`, *optional*, defaults to 1024):
# 从输入图像中提取的补丁大小,默认为16
patch_size (`int`, *optional*, defaults to 16):
# 非线性激活函数的类型,默认为 "gelu"
hidden_act (`str`, *optional*, defaults to `"gelu"`):
# 层归一化层中使用的 epsilon 值,默认为 1e-06
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
# 注意力概率的 dropout 比率,默认为0.0(不使用 dropout)
attention_dropout (`float`, *optional*, defaults to 0.0):
# 初始化所有权重矩阵的截断正态分布的标准差,默认为1e-10
initializer_range (`float`, *optional*, defaults to 1e-10):
# 是否向查询、键、值的投影中添加偏置,默认为 True
qkv_bias (`bool`, *optional*, defaults to `True`):
# MLP 隐藏层维度与嵌入维度之比,默认为4.0
mlp_ratio (`float`, *optional*, defaults to 4.0):
# 是否使用绝对位置编码,默认为 True
use_abs_pos (`bool`, *optional*, defaults to `True`):
# 是否使用相对位置编码,默认为 True
use_rel_pos (`bool`, *optional*, defaults to `True`):
# 相对位置的窗口大小,默认为14
window_size (`int`, *optional*, defaults to 14):
# 全局注意力层的索引列表,默认为 `[2, 5, 8, 11]`
global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
# 位置嵌入的维度大小,默认为128
num_pos_feats (`int`, *optional*, defaults to 128):
# Transformer 编码器中 MLP 层的维度大小。如果为 `None`,则默认为 `mlp_ratio * hidden_size`
mlp_dim (`int`, *optional*):
# 初始化函数,设置Transformer模型的各项参数
def __init__(
self,
hidden_size=768, # 隐藏层大小,默认为768
output_channels=256, # 输出通道数,默认为256
num_hidden_layers=12, # 隐藏层的数量,默认为12
num_attention_heads=12, # 注意力头的数量,默认为12
num_channels=3, # 输入图像的通道数,默认为3(RGB)
image_size=1024, # 输入图像的大小,默认为1024x1024像素
patch_size=16, # 图像分块的大小,默认为16x16像素
hidden_act="gelu", # 隐藏层激活函数,默认为GELU
layer_norm_eps=1e-06, # Layer Normalization的epsilon,默认为1e-06
attention_dropout=0.0, # 注意力机制的dropout率,默认为0.0(不使用dropout)
initializer_range=1e-10, # 参数初始化的范围,默认为1e-10
qkv_bias=True, # 是否在QKV矩阵中使用偏置,默认为True
mlp_ratio=4.0, # MLP的维度扩展比例,默认为4.0
use_abs_pos=True, # 是否使用绝对位置编码,默认为True
use_rel_pos=True, # 是否使用相对位置编码,默认为True
window_size=14, # 局部注意力窗口大小,默认为14
global_attn_indexes=[2, 5, 8, 11], # 全局注意力层的索引,默认为[2, 5, 8, 11]
num_pos_feats=128, # 位置特征的数量,默认为128
mlp_dim=None, # MLP的维度,默认为hidden_size * mlp_ratio,若给定mlp_dim则使用给定值
**kwargs, # 其他未指定的参数
):
super().__init__(**kwargs) # 调用父类的初始化方法
self.hidden_size = hidden_size # 设置隐藏层大小属性
self.output_channels = output_channels # 设置输出通道数属性
self.num_hidden_layers = num_hidden_layers # 设置隐藏层数量属性
self.num_attention_heads = num_attention_heads # 设置注意力头数量属性
self.num_channels = num_channels # 设置输入图像通道数属性
self.image_size = image_size # 设置输入图像大小属性
self.patch_size = patch_size # 设置图像分块大小属性
self.hidden_act = hidden_act # 设置隐藏层激活函数属性
self.layer_norm_eps = layer_norm_eps # 设置Layer Normalization的epsilon属性
self.attention_dropout = attention_dropout # 设置注意力dropout率属性
self.initializer_range = initializer_range # 设置参数初始化范围属性
self.qkv_bias = qkv_bias # 设置是否使用QKV偏置属性
self.mlp_ratio = mlp_ratio # 设置MLP维度扩展比例属性
self.use_abs_pos = use_abs_pos # 设置是否使用绝对位置编码属性
self.use_rel_pos = use_rel_pos # 设置是否使用相对位置编码属性
self.window_size = window_size # 设置局部注意力窗口大小属性
self.global_attn_indexes = global_attn_indexes # 设置全局注意力层的索引属性
self.num_pos_feats = num_pos_feats # 设置位置特征数量属性
self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim # 设置MLP的维度属性,如果mlp_dim未指定则计算为hidden_size * mlp_ratio
# 定义 `SamConfig` 类,用于存储 `SamModel` 的配置信息,继承自 `PretrainedConfig`。
class SamConfig(PretrainedConfig):
# 文档字符串,描述了 `SamConfig` 的作用和用法,以及如何实例化 SAM 模型的相关参数。
r"""
[`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamVisionConfig`].
prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import (
... SamVisionConfig,
... SamPromptEncoderConfig,
... SamMaskDecoderConfig,
... SamModel,
... )
>>>
>>> configuration = SamConfig()
>>>
>>> model = SamModel(configuration)
>>>
>>> configuration = model.config
>>>
>>>
>>> vision_config = SamVisionConfig()
>>> prompt_encoder_config = SamPromptEncoderConfig()
>>> mask_decoder_config = SamMaskDecoderConfig()
>>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
```"""
# 类属性 `model_type`,指定模型类型为 "sam"。
model_type = "sam"
# 构造函数 `__init__`,用于初始化 `SamConfig` 类的实例。
def __init__(
self,
vision_config=None,
prompt_encoder_config=None,
mask_decoder_config=None,
initializer_range=0.02,
**kwargs,
):
# 调用父类的构造方法,传递所有的关键字参数
super().__init__(**kwargs)
# 如果 vision_config 不为 None,则使用其值;否则使用空字典
vision_config = vision_config if vision_config is not None else {}
# 如果 prompt_encoder_config 不为 None,则使用其值;否则使用空字典
prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
# 如果 mask_decoder_config 不为 None,则使用其值;否则使用空字典
mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}
# 如果 vision_config 是 SamVisionConfig 类的实例,则转换为字典
if isinstance(vision_config, SamVisionConfig):
vision_config = vision_config.to_dict()
# 如果 prompt_encoder_config 是 SamPromptEncoderConfig 类的实例,则转换为字典
if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
prompt_encoder_config = prompt_encoder_config.to_dict()
# 如果 mask_decoder_config 是 SamMaskDecoderConfig 类的实例,则转换为字典
if isinstance(mask_decoder_config, SamMaskDecoderConfig):
mask_decoder_config = mask_decoder_config.to_dict()
# 使用 vision_config 字典创建 SamVisionConfig 对象
self.vision_config = SamVisionConfig(**vision_config)
# 使用 prompt_encoder_config 字典创建 SamPromptEncoderConfig 对象
self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
# 使用 mask_decoder_config 字典创建 SamMaskDecoderConfig 对象
self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
# 设置 initializer_range 实例变量
self.initializer_range = initializer_range
.\transformers\models\sam\convert_sam_original_to_hf_format.py
"""
从原始存储库转换 SAM 检查点。
"""
import argparse
import re
import numpy as np
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
SamConfig,
SamImageProcessor,
SamModel,
SamProcessor,
SamVisionConfig,
)
KEYS_TO_MODIFY_MAPPING = {
"iou_prediction_head.layers.0": "iou_prediction_head.proj_in",
"iou_prediction_head.layers.1": "iou_prediction_head.layers.0",
"iou_prediction_head.layers.2": "iou_prediction_head.proj_out",
"mask_decoder.output_upscaling.0": "mask_decoder.upscale_conv1",
"mask_decoder.output_upscaling.1": "mask_decoder.upscale_layer_norm",
"mask_decoder.output_upscaling.3": "mask_decoder.upscale_conv2",
"mask_downscaling.0": "mask_embed.conv1",
"mask_downscaling.1": "mask_embed.layer_norm1",
"mask_downscaling.3": "mask_embed.conv2",
"mask_downscaling.4": "mask_embed.layer_norm2",
"mask_downscaling.6": "mask_embed.conv3",
"point_embeddings": "point_embed",
"pe_layer.positional_encoding_gaussian_matrix": "shared_embedding.positional_embedding",
"image_encoder": "vision_encoder",
"neck.0": "neck.conv1",
"neck.1": "neck.layer_norm1",
"neck.2": "neck.conv2",
"neck.3": "neck.layer_norm2",
"patch_embed.proj": "patch_embed.projection",
".norm": ".layer_norm",
"blocks": "layers",
}
def replace_keys(state_dict):
model_state_dict = {}
state_dict.pop("pixel_mean", None)
state_dict.pop("pixel_std", None)
output_hypernetworks_mlps_pattern = r".*.output_hypernetworks_mlps.(\d+).layers.(\d+).*"
for key, value in state_dict.items():
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
if re.match(output_hypernetworks_mlps_pattern, key):
layer_nb = int(re.match(output_hypernetworks_mlps_pattern, key).group(2))
if layer_nb == 0:
key = key.replace("layers.0", "proj_in")
elif layer_nb == 1:
key = key.replace("layers.1", "layers.0")
elif layer_nb == 2:
key = key.replace("layers.2", "proj_out")
model_state_dict[key] = value
model_state_dict["shared_image_embedding.positional_embedding"] = model_state_dict[
"prompt_encoder.shared_embedding.positional_embedding"
]
return model_state_dict
def convert_sam_checkpoint(model_name, pytorch_dump_folder, push_to_hub, model_hub_id="ybelkada/segment-anything"):
checkpoint_path = hf_hub_download(model_hub_id, f"checkpoints/{model_name}.pth")
if "sam_vit_b" in model_name:
config = SamConfig()
elif "sam_vit_l" in model_name:
vision_config = SamVisionConfig(
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
global_attn_indexes=[5, 11, 17, 23],
)
config = SamConfig(
vision_config=vision_config,
)
elif "sam_vit_h" in model_name:
vision_config = SamVisionConfig(
hidden_size=1280,
num_hidden_layers=32,
num_attention_heads=16,
global_attn_indexes=[7, 15, 23, 31],
)
config = SamConfig(
vision_config=vision_config,
)
state_dict = torch.load(checkpoint_path, map_location="cpu")
state_dict = replace_keys(state_dict)
image_processor = SamImageProcessor()
processor = SamProcessor(image_processor=image_processor)
hf_model = SamModel(config)
hf_model.load_state_dict(state_dict)
hf_model = hf_model.to("cuda")
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
input_points = [[[400, 650]]]
input_labels = [[1]]
inputs = processor(images=np.array(raw_image), return_tensors="pt").to("cuda")
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
if model_name == "sam_vit_h_4b8939":
assert scores[-1].item() == 0.579890251159668
inputs = processor(
images=np.array(raw_image), input_points=input_points, input_labels=input_labels, return_tensors="pt"
).to("cuda")
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.9712603092193604
input_boxes = ((75, 275, 1725, 850),)
inputs = processor(images=np.array(raw_image), input_boxes=input_boxes, return_tensors="pt").to("cuda")
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.8686015605926514
input_points = [[[400, 650], [800, 650]]]
input_labels = [[1, 1]]
inputs = processor(
images=np.array(raw_image), input_points=input_points, input_labels=input_labels, return_tensors="pt"
).to("cuda")
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.9936047792434692
if __name__ == "__main__":
parser = argparse.ArgumentParser()
choices = ["sam_vit_b_01ec64", "sam_vit_h_4b8939", "sam_vit_l_0b3195"]
parser.add_argument(
"--model_name",
default="sam_vit_h_4b8939",
choices=choices,
type=str,
help="Path to hf config.json of model to convert",
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub after converting",
)
parser.add_argument(
"--model_hub_id",
default="ybelkada/segment-anything",
choices=choices,
type=str,
help="Path to hf config.json of model to convert",
)
args = parser.parse_args()
convert_sam_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.model_hub_id)
.\models\sam\convert_sam_to_hf.py
"""
从原始仓库中转换 SAM 模型的检查点。
URL: https://github.com/facebookresearch/segment-anything.
同时支持从 https://github.com/czg1225/SlimSAM/tree/master 转换 SlimSAM 检查点。
"""
import argparse
import re
import numpy as np
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
SamConfig,
SamImageProcessor,
SamModel,
SamProcessor,
SamVisionConfig,
)
def get_config(model_name):
if "slimsam-50" in model_name:
vision_config = SamVisionConfig(
hidden_size=384,
mlp_dim=1536,
num_hidden_layers=12,
num_attention_heads=12,
global_attn_indexes=[2, 5, 8, 11],
)
elif "slimsam-77" in model_name:
vision_config = SamVisionConfig(
hidden_size=168,
mlp_dim=696,
num_hidden_layers=12,
num_attention_heads=12,
global_attn_indexes=[2, 5, 8, 11],
)
elif "sam_vit_b" in model_name:
vision_config = SamVisionConfig()
elif "sam_vit_l" in model_name:
vision_config = SamVisionConfig(
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
global_attn_indexes=[5, 11, 17, 23],
)
elif "sam_vit_h" in model_name:
vision_config = SamVisionConfig(
hidden_size=1280,
num_hidden_layers=32,
num_attention_heads=16,
global_attn_indexes=[7, 15, 23, 31],
)
config = SamConfig(
vision_config=vision_config,
)
return config
KEYS_TO_MODIFY_MAPPING = {
"iou_prediction_head.layers.0": "iou_prediction_head.proj_in",
"iou_prediction_head.layers.1": "iou_prediction_head.layers.0",
"iou_prediction_head.layers.2": "iou_prediction_head.proj_out",
"mask_decoder.output_upscaling.0": "mask_decoder.upscale_conv1",
"mask_decoder.output_upscaling.1": "mask_decoder.upscale_layer_norm",
"mask_decoder.output_upscaling.3": "mask_decoder.upscale_conv2",
"mask_downscaling.0": "mask_embed.conv1",
"mask_downscaling.1": "mask_embed.layer_norm1",
"mask_downscaling.3": "mask_embed.conv2",
"mask_downscaling.4": "mask_embed.layer_norm2",
"mask_downscaling.6": "mask_embed.conv3",
}
"point_embeddings": "point_embed",
"pe_layer.positional_encoding_gaussian_matrix": "shared_embedding.positional_embedding",
"image_encoder": "vision_encoder",
"neck.0": "neck.conv1",
"neck.1": "neck.layer_norm1",
"neck.2": "neck.conv2",
"neck.3": "neck.layer_norm2",
"patch_embed.proj": "patch_embed.projection",
".norm": ".layer_norm",
"blocks": "layers",
}
def replace_keys(state_dict):
model_state_dict = {}
state_dict.pop("pixel_mean", None)
state_dict.pop("pixel_std", None)
output_hypernetworks_mlps_pattern = r".*.output_hypernetworks_mlps.(\d+).layers.(\d+).*"
for key, value in state_dict.items():
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
if re.match(output_hypernetworks_mlps_pattern, key):
layer_nb = int(re.match(output_hypernetworks_mlps_pattern, key).group(2))
if layer_nb == 0:
key = key.replace("layers.0", "proj_in")
elif layer_nb == 1:
key = key.replace("layers.1", "layers.0")
elif layer_nb == 2:
key = key.replace("layers.2", "proj_out")
model_state_dict[key] = value
model_state_dict["shared_image_embedding.positional_embedding"] = model_state_dict[
"prompt_encoder.shared_embedding.positional_embedding"
]
return model_state_dict
def convert_sam_checkpoint(model_name, checkpoint_path, pytorch_dump_folder, push_to_hub):
config = get_config(model_name)
state_dict = torch.load(checkpoint_path, map_location="cpu")
state_dict = replace_keys(state_dict)
image_processor = SamImageProcessor()
processor = SamProcessor(image_processor=image_processor)
hf_model = SamModel(config)
hf_model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
hf_model.load_state_dict(state_dict)
hf_model = hf_model.to(device)
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
input_points = [[[500, 375]]]
input_labels = [[1]]
inputs = processor(images=np.array(raw_image), return_tensors="pt").to(device)
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
if model_name == "sam_vit_b_01ec64":
inputs = processor(
images=np.array(raw_image), input_points=input_points, input_labels=input_labels, return_tensors="pt"
).to(device)
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
elif model_name == "sam_vit_h_4b8939":
inputs = processor(
images=np.array(raw_image), input_points=input_points, input_labels=input_labels, return_tensors="pt"
).to(device)
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.9712603092193604
input_boxes = ((75, 275, 1725, 850),)
inputs = processor(images=np.array(raw_image), input_boxes=input_boxes, return_tensors="pt").to(device)
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.8686015605926514
input_points = [[[400, 650], [800, 650]]]
input_labels = [[1, 1]]
inputs = processor(
images=np.array(raw_image), input_points=input_points, input_labels=input_labels, return_tensors="pt"
).to(device)
with torch.no_grad():
output = hf_model(**inputs)
scores = output.iou_scores.squeeze()
assert scores[-1].item() == 0.9936047792434692
if pytorch_dump_folder is not None:
processor.save_pretrained(pytorch_dump_folder)
hf_model.save_pretrained(pytorch_dump_folder)
if push_to_hub:
repo_id = f"nielsr/{model_name}" if "slimsam" in model_name else f"meta/{model_name}"
processor.push_to_hub(repo_id)
hf_model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
choices = ["sam_vit_b_01ec64", "sam_vit_h_4b8939", "sam_vit_l_0b3195", "slimsam-50-uniform", "slimsam-77-uniform"]
parser.add_argument(
"--model_name",
default="sam_vit_h_4b8939",
choices=choices,
type=str,
help="Name of the original model to convert",
)
parser.add_argument(
"--checkpoint_path",
type=str,
required=False,
help="Path to the original checkpoint",
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub after converting",
)
args = parser.parse_args()
if "slimsam" in args.model_name:
checkpoint_path = args.checkpoint_path
if checkpoint_path is None:
raise ValueError("You need to provide a checkpoint path for SlimSAM models.")
else:
checkpoint_path = hf_hub_download("ybelkada/segment-anything", f"checkpoints/{args.model_name}.pth")
convert_sam_checkpoint(args.model_name, checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub)