Transformers 源码解析(三十三)
.\models\deberta\modeling_tf_deberta.py
class TFDebertaContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout")
self.config = config
def call(self, hidden_states, training: bool = False):
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token, training=training)
pooled_output = self.dense(context_token)
pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output)
return pooled_output
@property
def output_dim(self) -> int:
return self.config.hidden_size
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.pooler_hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaXSoftmax(keras.layers.Layer):
"""
Masked Softmax which is optimized for saving memory
Args:
input (`tf.Tensor`): The input tensor that will apply softmax.
mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
"""
def __init__(self, axis=-1, **kwargs):
super().__init__(**kwargs)
self.axis = axis
def call(self, inputs: tf.Tensor, mask: tf.Tensor):
rmask = tf.logical_not(tf.cast(mask, tf.bool))
output = tf.where(rmask, float("-inf"), inputs)
output = stable_softmax(output, self.axis)
output = tf.where(rmask, 0.0, output)
return output
class TFDebertaStableDropout(keras.layers.Layer):
"""
Optimized dropout module for stabilizing the training
Args:
drop_prob (float): the dropout probabilities
"""
def __init__(self, drop_prob, **kwargs):
super().__init__(**kwargs)
self.drop_prob = drop_prob
@tf.custom_gradient
def xdropout(self, inputs):
"""
Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
"""
mask = tf.cast(
1
- tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)),
tf.bool,
)
scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
if self.drop_prob > 0:
inputs = tf.where(mask, 0.0, inputs) * scale
def grad(upstream):
if self.drop_prob > 0:
return tf.where(mask, 0.0, upstream) * scale
else:
return upstream
return inputs, grad
def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
if training:
return self.xdropout(inputs)
return inputs
class TFDebertaLayerNorm(keras.layers.Layer):
"""LayerNorm module in the TF style (epsilon inside the square root)."""
def __init__(self, size, eps=1e-12, **kwargs):
super().__init__(**kwargs)
self.size = size
self.eps = eps
def build(self, input_shape):
self.gamma = self.add_weight(shape=[self.size], initializer=tf.ones_initializer(), name="weight")
self.beta = self.add_weight(shape=[self.size], initializer=tf.zeros_initializer(), name="bias")
return super().build(input_shape)
def call(self, x: tf.Tensor) -> tf.Tensor:
mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
std = tf.math.sqrt(variance + self.eps)
return self.gamma * (x - mean) / std + self.beta
class TFDebertaSelfOutput(keras.layers.Layer):
pass
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def call(self, hidden_states, input_tensor, training: bool = False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaAttention(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.self = TFDebertaDisentangledSelfAttention(config, name="self")
self.dense_output = TFDebertaSelfOutput(config, name="output")
self.config = config
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self(
hidden_states=input_tensor,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
if query_states is None:
query_states = input_tensor
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=query_states, training=training
)
output = (attention_output,) + self_outputs[1:]
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFDebertaIntermediate(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFDebertaOutput(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaLayer(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFDebertaAttention(config, name="attention")
self.intermediate = TFDebertaIntermediate(config, name="intermediate")
self.bert_output = TFDebertaOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + attention_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFDebertaEncoder(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.layer = [TFDebertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
self.relative_attention = getattr(config, "relative_attention", False)
self.config = config
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.relative_attention:
self.rel_embeddings = self.add_weight(
name="rel_embeddings.weight",
shape=[self.max_relative_positions * 2, self.config.hidden_size],
initializer=get_initializer(self.config.initializer_range),
)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings if self.relative_attention else None
return rel_embeddings
def get_attention_mask(self, attention_mask):
if len(shape_list(attention_mask)) <= 2:
extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1)
attention_mask = tf.cast(attention_mask, tf.uint8)
elif len(shape_list(attention_mask)) == 3:
attention_mask = tf.expand_dims(attention_mask, 1)
return attention_mask
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2]
relative_pos = build_relative_position(q, shape_list(hidden_states)[-2])
return relative_pos
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
attention_mask = self.get_attention_mask(attention_mask)
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[0]
else:
next_kv = hidden_states
rel_embeddings = self.get_rel_embedding()
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states=next_kv,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if query_states is not None:
query_states = hidden_states
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
else:
next_kv = hidden_states
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build_relative_position(query_size, key_size):
"""
根据查询和键构建相对位置关系
假设查询的绝对位置 \\(P_q\\) 范围在 (0, query_size),键的绝对位置 \\(P_k\\) 范围在 (0, key_size),
查询到键的相对位置为 \\(R_{q \\rightarrow k} = P_q - P_k\\)
Args:
query_size (int): 查询的长度
key_size (int): 键的长度
Return:
`tf.Tensor`: 形状为 [1, query_size, key_size] 的张量,表示相对位置索引
"""
q_ids = tf.range(query_size, dtype=tf.int32)
k_ids = tf.range(key_size, dtype=tf.int32)
rel_pos_ids = q_ids[:, None] - tf.tile(tf.reshape(k_ids, [1, -1]), [query_size, 1])
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0)
return tf.cast(rel_pos_ids, tf.int64)
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
shapes = [
shape_list(query_layer)[0],
shape_list(query_layer)[1],
shape_list(query_layer)[2],
shape_list(relative_pos)[-1],
]
return tf.broadcast_to(c2p_pos, shapes)
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
shapes = [
shape_list(query_layer)[0],
shape_list(query_layer)[1],
shape_list(key_layer)[-2],
shape_list(key_layer)[-2],
]
return tf.broadcast_to(c2p_pos, shapes)
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
return tf.broadcast_to(pos_index, shapes)
def torch_gather(x, indices, gather_axis):
if gather_axis < 0:
gather_axis = tf.rank(x) + gather_axis
if gather_axis != tf.rank(x) - 1:
pre_roll = tf.rank(x) - 1 - gather_axis
permutation = tf.roll(tf.range(tf.rank(x)), pre_roll, axis=0)
x = tf.transpose(x, perm=permutation)
indices = tf.transpose(indices, perm=permutation)
else:
pre_roll = 0
flat_x = tf.reshape(x, (-1, tf.shape(x)[-1]))
flat_indices = tf.reshape(indices, (-1, tf.shape(indices)[-1]))
gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
gathered = tf.reshape(gathered, tf.shape(indices))
if pre_roll != 0:
permutation = tf.roll(tf.range(tf.rank(x)), -pre_roll, axis=0)
gathered = tf.transpose(gathered, perm=permutation)
return gathered
class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
"""
Disentangled self-attention module
Parameters:
config (`str`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaConfig`]
"""
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.in_proj = keras.layers.Dense(
self.all_head_size * 3,
kernel_initializer=get_initializer(config.initializer_range),
name="in_proj",
use_bias=False,
)
self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
self.relative_attention = getattr(config, "relative_attention", False)
self.talking_head = getattr(config, "talking_head", False)
if self.talking_head:
self.head_logits_proj = keras.layers.Dense(
self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
name="head_logits_proj",
use_bias=False,
)
self.head_weights_proj = keras.layers.Dense(
self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
name="head_weights_proj",
use_bias=False,
)
self.softmax = TFDebertaXSoftmax(axis=-1)
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout")
if "c2p" in self.pos_att_type:
self.pos_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="pos_proj",
use_bias=False,
)
if "p2c" in self.pos_att_type:
self.pos_q_proj = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj"
)
self.dropout = TFDebertaStableDropout(config.attention_probs_dropout_prob, name="dropout")
self.config = config
def build(self, input_shape=None):
if self.built:
return
self.built = True
self.q_bias = self.add_weight(
name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
)
self.v_bias = self.add_weight(
name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
)
if getattr(self, "in_proj", None) is not None:
with tf.name_scope(self.in_proj.name):
self.in_proj.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "head_logits_proj", None) is not None:
with tf.name_scope(self.head_logits_proj.name):
self.head_logits_proj.build(None)
if getattr(self, "head_weights_proj", None) is not None:
with tf.name_scope(self.head_weights_proj.name):
self.head_weights_proj.build(None)
if getattr(self, "pos_dropout", None) is not None:
with tf.name_scope(self.pos_dropout.name):
self.pos_dropout.build(None)
if getattr(self, "pos_proj", None) is not None:
with tf.name_scope(self.pos_proj.name):
self.pos_proj.build([self.config.hidden_size])
if getattr(self, "pos_q_proj", None) is not None:
with tf.name_scope(self.pos_q_proj.name):
self.pos_q_proj.build([self.config.hidden_size])
def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
shape = shape_list(tensor)[:-1] + [self.num_attention_heads, -1]
tensor = tf.reshape(tensor=tensor, shape=shape)
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
if relative_pos is None:
q = shape_list(query_layer)[-2]
relative_pos = build_relative_position(q, shape_list(key_layer)[-2])
shape_list_pos = shape_list(relative_pos)
if len(shape_list_pos) == 2:
relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0)
elif len(shape_list_pos) == 3:
relative_pos = tf.expand_dims(relative_pos, 1)
elif len(shape_list_pos) != 4:
raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {len(shape_list_pos)}")
att_span = tf.cast(
tf.minimum(
tf.maximum(shape_list(query_layer)[-2], shape_list(key_layer)[-2]), self.max_relative_positions
),
tf.int64,
)
rel_embeddings = tf.expand_dims(
rel_embeddings[self.max_relative_positions - att_span : self.max_relative_positions + att_span, :], 0
)
score = 0
if "c2p" in self.pos_att_type:
pos_key_layer = self.pos_proj(rel_embeddings)
pos_key_layer = self.transpose_for_scores(pos_key_layer)
c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 1, 3, 2]))
c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1)
c2p_att = torch_gather(c2p_att, c2p_dynamic_expand(c2p_pos, query_layer, relative_pos), -1)
score += c2p_att
if "p2c" in self.pos_att_type:
pos_query_layer = self.pos_q_proj(rel_embeddings)
pos_query_layer = self.transpose_for_scores(pos_query_layer)
pos_query_layer /= tf.math.sqrt(tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=tf.float32))
if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
r_pos = build_relative_position(shape_list(key_layer)[-2], shape_list(key_layer)[-2])
else:
r_pos = relative_pos
p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1)
p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 1, 3, 2]))
p2c_att = tf.transpose(
torch_gather(p2c_att, p2c_dynamic_expand(p2c_pos, query_layer, key_layer), -1), [0, 1, 3, 2]
)
if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
pos_index = tf.expand_dims(relative_pos[:, :, :, 0], -1)
p2c_att = torch_gather(p2c_att, pos_dynamic_expand(pos_index, p2c_att, key_layer), -2)
score += p2c_att
return score
class TFDebertaEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size:
self.embed_proj = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj",
use_bias=False,
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
if self.config.type_vocab_size > 0:
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
else:
self.token_type_embeddings = None
with tf.name_scope("position_embeddings"):
if self.position_biased_input:
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
else:
self.position_embeddings = None
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "embed_proj", None) is not None:
with tf.name_scope(self.embed_proj.name):
self.embed_proj.build([None, None, self.embedding_size])
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
mask: tf.Tensor = None,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
final_embeddings = inputs_embeds
if self.position_biased_input:
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings += position_embeds
if self.config.type_vocab_size > 0:
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings += token_type_embeds
if self.embedding_size != self.hidden_size:
final_embeddings = self.embed_proj(final_embeddings)
final_embeddings = self.LayerNorm(final_embeddings)
if mask is not None:
if len(shape_list(mask)) != len(shape_list(final_embeddings)):
if len(shape_list(mask)) == 4:
mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
mask = tf.cast(tf.expand_dims(mask, axis=2), tf.float32)
final_embeddings = final_embeddings * mask
final_embeddings = self.dropout(final_embeddings, training=training)
return final_embeddings
class TFDebertaPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = keras.layers.Dense(
units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.embedding_size])
class TFDebertaLMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.transform = TFDebertaPredictionHeadTransform(config, name="transform")
self.input_embeddings = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias}
def set_bias(self, value: tf.Variable):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
class TFDebertaOnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
class TFDebertaMainLayer(keras.layers.Layer):
config_class = DebertaConfig
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embeddings = TFDebertaEmbeddings(config, name="embeddings")
self.encoder = TFDebertaEncoder(config, name="encoder")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
mask=attention_mask,
training=training,
)
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
"""
The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` or `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
[What are input IDs?](../glossary
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
- 1 表示 **未被掩码的** token,
- 0 表示 **被掩码的** token。
[What are attention masks?](../glossary
token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
- 0 对应 *句子 A* 的 token,
- 1 对应 *句子 B* 的 token。
[What are token type IDs?](../glossary
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
[What are position IDs?](../glossary
inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
DEBERTA_START_DOCSTRING,
)
"""
class TFDebertaModel(TFDebertaPreTrainedModel):
def __init__(self, config: DebertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.deberta = TFDebertaMainLayer(config, name="deberta")
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
"""
@add_start_docstrings("DeBERTa Model with a `language modeling` head on top.", DEBERTA_START_DOCSTRING)
"""
class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config: DebertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if config.is_decoder:
logger.warning(
"If you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
计算掩码语言建模损失的标签。索引应在 `[-100, 0, ..., config.vocab_size]` 范围内。索引为 `-100` 的标记被忽略(掩码),
损失仅计算标签在 `[0, ..., config.vocab_size]` 范围内的标记。
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""
DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: DebertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.pooler = TFDebertaContextPooler(config, name="pooler")
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.output_dim = self.pooler.output_dim
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
pooled_output = self.pooler(sequence_output, training=training)
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.output_dim])
@add_start_docstrings(
"""
DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config: DebertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
):
"""
DeBERTa 模型的前向传播方法,处理输入并返回输出结果。
Args:
input_ids (TFModelInputType | None): 输入的 token IDs
attention_mask (np.ndarray | tf.Tensor | None): 注意力遮罩
token_type_ids (np.ndarray | tf.Tensor | None): token 类型 IDs
position_ids (np.ndarray | tf.Tensor | None): 位置 IDs
inputs_embeds (np.ndarray | tf.Tensor | None): 嵌入式输入
output_attentions (Optional[bool]): 是否输出注意力权重
output_hidden_states (Optional[bool]): 是否输出隐藏状态
return_dict (Optional[bool]): 是否以字典形式返回结果
labels (np.ndarray | tf.Tensor | None): 标签数据
training (Optional[bool]): 是否在训练模式下
Returns:
TFTokenClassifierOutput: DeBERTa 模型的输出结果对象
"""
return super().call(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
labels=labels,
training=training,
)
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(inputs=sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
"""
DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
添加文档字符串注释到模型的前向传播函数,描述输入的详细信息。
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config: DebertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(inputs=sequence_output)
start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
.\models\deberta\tokenization_deberta.py
""" Tokenization class for model DeBERTa."""
import json
import os
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/vocab.json",
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/vocab.json",
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/vocab.json",
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/vocab.json",
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/vocab.json",
"microsoft/deberta-xlarge-mnli": (
"https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/vocab.json"
),
},
"merges_file": {
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/merges.txt",
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt",
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/merges.txt",
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/merges.txt",
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/merges.txt",
"microsoft/deberta-xlarge-mnli": (
"https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/merges.txt"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/deberta-base": 512,
"microsoft/deberta-large": 512,
"microsoft/deberta-xlarge": 512,
"microsoft/deberta-base-mnli": 512,
"microsoft/deberta-large-mnli": 512,
"microsoft/deberta-xlarge-mnli": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/deberta-base": {"do_lower_case": False},
"microsoft/deberta-large": {"do_lower_case": False},
}
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class DebertaTokenizer(PreTrainedTokenizer):
"""
Construct a DeBERTa tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import DebertaTokenizer
>>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
>>> tokenizer("Hello world")["input_ids"]
[1, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[1, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="[CLS]",
eos_token="[SEP]",
sep_token="[SEP]",
cls_token="[CLS]",
unk_token="[UNK]",
pad_token="[PAD]",
mask_token="[MASK]",
add_prefix_space=False,
add_bos_token=False,
**kwargs,
):
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
通过连接和添加特殊 token 构建用于序列分类任务的模型输入。DeBERTa 的序列格式如下:
- 单个序列: [CLS] X [SEP]
- 序列对: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (`List[int]`):
要添加特殊 token 的 ID 列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的 ID 列表(用于序列对)。
Returns:
`List[int]`: 带有适当特殊 token 的输入 ID 列表。
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
):
"""
返回一个 mask 列表,指示哪些 token 是特殊 token。
Args:
token_ids_0 (`List[int]`):
第一个序列的 token ID 列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的 token ID 列表(用于序列对)。
already_has_special_tokens (`bool`):
是否已经包含了特殊 token。
Returns:
`List[int]`: mask 列表,每个元素为 1(特殊 token)或 0(普通 token)。
"""
special_tokens_mask = [0] * len(token_ids_0)
if already_has_special_tokens:
return special_tokens_mask
special_tokens_mask[0] = 1
special_tokens_mask[-1] = 1
if token_ids_1 is not None:
special_tokens_mask += [1] * len(token_ids_1)
return special_tokens_mask
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
else:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
.\models\deberta\tokenization_deberta_fast.py
""" Fast Tokenization class for model DeBERTa."""
import json
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_deberta import DebertaTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/vocab.json",
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/vocab.json",
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/vocab.json",
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/vocab.json",
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/vocab.json",
"microsoft/deberta-xlarge-mnli": (
"https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/vocab.json"
),
},
"merges_file": {
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/merges.txt",
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt",
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/merges.txt",
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/merges.txt",
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/merges.txt",
"microsoft/deberta-xlarge-mnli": (
"https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/merges.txt"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/deberta-base": 512,
"microsoft/deberta-large": 512,
"microsoft/deberta-xlarge": 512,
"microsoft/deberta-base-mnli": 512,
"microsoft/deberta-large-mnli": 512,
"microsoft/deberta-xlarge-mnli": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/deberta-base": {"do_lower_case": False},
}
"microsoft/deberta-large": {"do_lower_case": False},
}
class DebertaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import DebertaTokenizerFast
>>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
>>> tokenizer("Hello world")["input_ids"]
[1, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[1, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
slow_tokenizer_class = DebertaTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
errors="replace",
bos_token="[CLS]",
eos_token="[SEP]",
sep_token="[SEP]",
cls_token="[CLS]",
unk_token="[UNK]",
pad_token="[PAD]",
mask_token="[MASK]",
add_prefix_space=False,
**kwargs,
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.add_bos_token = kwargs.pop("add_bos_token", False)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
comprise the space before the *[MASK]*.
"""
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
"""
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def build_inputs_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A DeBERTa sequence has the following format:
- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
"""
Encodes a batch of inputs into token IDs, attention masks, and other relevant information.
Args:
*args:
Positional arguments for encoding.
**kwargs:
Keyword arguments for encoding, including `is_split_into_words`.
Returns:
`BatchEncoding`: Encoded batch containing token IDs, attention masks, and other metadata.
"""
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\deberta\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaOnnxConfig"],
"tokenization_deberta": ["DebertaTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_deberta_fast"] = ["DebertaTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deberta"] = [
"DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"DebertaForMaskedLM",
"DebertaForQuestionAnswering",
"DebertaForSequenceClassification",
"DebertaForTokenClassification",
"DebertaModel",
"DebertaPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_deberta"] = [
"TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFDebertaForMaskedLM",
"TFDebertaForQuestionAnswering",
"TFDebertaForSequenceClassification",
"TFDebertaForTokenClassification",
"TFDebertaModel",
"TFDebertaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaOnnxConfig
from .tokenization_deberta import DebertaTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_deberta_fast import DebertaTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
from .modeling_deberta import (
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
DebertaForMaskedLM,
DebertaForQuestionAnswering,
DebertaForSequenceClassification,
DebertaForTokenClassification,
DebertaModel,
DebertaPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_deberta import (
TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFDebertaForMaskedLM,
TFDebertaForQuestionAnswering,
TFDebertaForSequenceClassification,
TFDebertaForTokenClassification,
TFDebertaModel,
TFDebertaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deberta_v2\configuration_deberta_v2.py
"""
DeBERTa-v2 model configuration
"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
if TYPE_CHECKING:
from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType
logger = logging.get_logger(__name__)
DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/config.json",
"microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/config.json",
"microsoft/deberta-v2-xlarge-mnli": (
"https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/config.json"
),
"microsoft/deberta-v2-xxlarge-mnli": (
"https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/config.json"
),
}
class DebertaV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeBERTa
[microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import DebertaV2Config, DebertaV2Model
>>> # Initializing a DeBERTa-v2 microsoft/deberta-v2-xlarge style configuration
>>> configuration = DebertaV2Config()
>>> # Initializing a model (with random weights) from the microsoft/deberta-v2-xlarge style configuration
>>> model = DebertaV2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "deberta-v2"
def __init__(
self,
vocab_size=128100,
hidden_size=1536,
num_hidden_layers=24,
num_attention_heads=24,
intermediate_size=6144,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=0,
initializer_range=0.02,
layer_norm_eps=1e-7,
relative_attention=False,
max_relative_positions=-1,
pad_token_id=0,
position_biased_input=True,
pos_att_type=None,
pooler_dropout=0,
pooler_hidden_act="gelu",
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.relative_attention = relative_attention
self.max_relative_positions = max_relative_positions
self.pad_token_id = pad_token_id
self.position_biased_input = position_biased_input
if isinstance(pos_att_type, str):
pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]
self.pos_att_type = pos_att_type
self.vocab_size = vocab_size
self.layer_norm_eps = layer_norm_eps
self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
self.pooler_dropout = pooler_dropout
self.pooler_hidden_act = pooler_hidden_act
class DebertaV2OnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
if self._config.type_vocab_size > 0:
return OrderedDict(
[("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis)]
)
else:
return OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)])
@property
def default_onnx_opset(self) -> int:
return 12
def generate_dummy_inputs(
self,
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
batch_size: int = -1,
seq_length: int = -1,
num_choices: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
num_channels: int = 3,
image_width: int = 40,
image_height: int = 40,
tokenizer: "PreTrainedTokenizerBase" = None,
) -> Mapping[str, Any]:
dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)
if self._config.type_vocab_size == 0 and "token_type_ids" in dummy_inputs:
del dummy_inputs["token_type_ids"]
return dummy_inputs
.\models\deberta_v2\modeling_deberta_v2.py
""" PyTorch DeBERTa-v2 model."""
from collections.abc import Sequence
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta_v2 import DebertaV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DebertaV2Config"
_CHECKPOINT_FOR_DOC = "microsoft/deberta-v2-xlarge"
_QA_TARGET_START_INDEX = 2
_QA_TARGET_END_INDEX = 9
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/deberta-v2-xlarge",
"microsoft/deberta-v2-xxlarge",
"microsoft/deberta-v2-xlarge-mnli",
"microsoft/deberta-v2-xxlarge-mnli",
]
class ContextPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
self.dropout = StableDropout(config.pooler_dropout)
self.config = config
def forward(self, hidden_states):
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token)
pooled_output = self.dense(context_token)
pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
return pooled_output
@property
def output_dim(self):
return self.config.hidden_size
class XSoftmax(torch.autograd.Function):
"""
Masked Softmax which is optimized for saving memory
"""
Args:
input (`torch.tensor`): The input tensor that will apply softmax.
mask (`torch.IntTensor`):
The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension along which softmax will be applied.
Example:
```
>>> import torch
>>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
>>>
>>> x = torch.randn([4, 20, 100])
>>>
>>> mask = (x > 0).int()
>>>
>>> dim = -1
>>> y = XSoftmax.apply(x, mask, dim)
```
@staticmethod
def forward(self, input, mask, dim):
self.dim = dim
rmask = ~(mask.to(torch.bool))
output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
output = torch.softmax(output, self.dim)
output.masked_fill_(rmask, 0)
self.save_for_backward(output)
return output
@staticmethod
def backward(self, grad_output):
(output,) = self.saved_tensors
inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
return inputGrad, None, None
@staticmethod
def symbolic(g, self, mask, dim):
import torch.onnx.symbolic_helper as sym_help
from torch.onnx.symbolic_opset9 import masked_fill, softmax
mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
r_mask = g.op(
"Cast",
g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
to_i=sym_help.cast_pytorch_to_onnx["Bool"],
)
output = masked_fill(
g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
)
output = softmax(g, output, dim)
return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
class DropoutContext(object):
def __init__(self):
self.dropout = 0
self.mask = None
self.scale = 1
self.reuse_mask = True
def get_mask(input, local_context):
if not isinstance(local_context, DropoutContext):
dropout = local_context
mask = None
else:
dropout = local_context.dropout
dropout *= local_context.scale
mask = local_context.mask if local_context.reuse_mask else None
if dropout > 0 and mask is None:
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)
if isinstance(local_context, DropoutContext):
if local_context.mask is None:
local_context.mask = mask
return mask, dropout
class XDropout(torch.autograd.Function):
"""Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@staticmethod
def forward(ctx, input, local_ctx):
mask, dropout = get_mask(input, local_ctx)
ctx.scale = 1.0 / (1 - dropout)
if dropout > 0:
ctx.save_for_backward(mask)
return input.masked_fill(mask, 0) * ctx.scale
else:
return input
@staticmethod
def backward(ctx, grad_output):
if ctx.scale > 1:
(mask,) = ctx.saved_tensors
return grad_output.masked_fill(mask, 0) * ctx.scale, None
else:
return grad_output, None
@staticmethod
def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
from torch.onnx import symbolic_opset12
dropout_p = local_ctx
if isinstance(local_ctx, DropoutContext):
dropout_p = local_ctx.dropout
train = True
return symbolic_opset12.dropout(g, input, dropout_p, train)
class StableDropout(nn.Module):
"""
Optimized dropout module for stabilizing the training
Args:
drop_prob (float): the dropout probabilities
"""
def __init__(self, drop_prob):
super().__init__()
self.drop_prob = drop_prob
self.count = 0
self.context_stack = None
def forward(self, x):
"""
Call the module
Args:
x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context())
return x
def clear_context(self):
"""
Clear the context stack and reset count to zero.
"""
self.count = 0
self.context_stack = None
def init_context(self, reuse_mask=True, scale=1):
"""
Initialize the context stack with optional parameters.
Args:
reuse_mask (bool, optional): Whether to reuse mask for dropout. Defaults to True.
scale (int, optional): Scaling factor for dropout. Defaults to 1.
"""
if self.context_stack is None:
self.context_stack = []
self.count = 0
for c in self.context_stack:
c.reuse_mask = reuse_mask
c.scale = scale
def get_context(self):
"""
Get the current dropout context from the context stack or create a new one.
Returns:
DropoutContext: Current or newly created dropout context.
"""
if self.context_stack is not None:
if self.count >= len(self.context_stack):
self.context_stack.append(DropoutContext())
ctx = self.context_stack[self.count]
ctx.dropout = self.drop_prob
self.count += 1
return ctx
else:
return self.drop_prob
class DebertaV2SelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class DebertaV2Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = DisentangledSelfAttention(config)
self.output = DebertaV2SelfOutput(config)
self.config = config
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
):
self_output = self.self(
hidden_states,
attention_mask,
output_attentions,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
)
if output_attentions:
self_output, att_matrix = self_output
if query_states is None:
query_states = hidden_states
attention_output = self.output(self_output, query_states)
if output_attentions:
return (attention_output, att_matrix)
else:
return attention_output
class DebertaV2Intermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class DebertaV2Output(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class DebertaV2Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = DebertaV2Attention(config)
self.intermediate = DebertaV2Intermediate(config)
self.output = DebertaV2Output(config)
def forward(
self,
hidden_states,
attention_mask,
query_states=None,
relative_pos=None,
rel_embeddings=None,
output_attentions=False,
):
attention_output = self.attention(
hidden_states,
attention_mask,
output_attentions=output_attentions,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
)
if output_attentions:
attention_output, att_matrix = attention_output
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
if output_attentions:
return (layer_output, att_matrix)
else:
return layer_output
class ConvLayer(nn.Module):
def __init__(self, config):
super().__init__()
kernel_size = getattr(config, "conv_kernel_size", 3)
groups = getattr(config, "conv_groups", 1)
self.conv_act = getattr(config, "conv_act", "tanh")
self.conv = nn.Conv1d(
config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
def forward(self, hidden_states, residual_states, input_mask):
out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
rmask = (1 - input_mask).bool()
out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
out = ACT2FN[self.conv_act](self.dropout(out))
layer_norm_input = residual_states + out
output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
if input_mask is None:
output_states = output
else:
if input_mask.dim() != layer_norm_input.dim():
if input_mask.dim() == 4:
input_mask = input_mask.squeeze(1).squeeze(1)
input_mask = input_mask.unsqueeze(2)
input_mask = input_mask.to(output.dtype)
output_states = output * input_mask
return output_states
class DebertaV2Encoder(nn.Module):
"""Modified BertEncoder with relative position bias support"""
def __init__(self, config):
super().__init__()
self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)])
self.relative_attention = getattr(config, "relative_attention", False)
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.position_buckets = getattr(config, "position_buckets", -1)
pos_ebd_size = self.max_relative_positions * 2
if self.position_buckets > 0:
pos_ebd_size = self.position_buckets * 2
self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None
self.gradient_checkpointing = False
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
rel_embeddings = self.LayerNorm(rel_embeddings)
return rel_embeddings
def get_attention_mask(self, attention_mask):
if attention_mask.dim() <= 2:
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
elif attention_mask.dim() == 3:
attention_mask = attention_mask.unsqueeze(1)
return attention_mask
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
relative_pos = build_relative_position(
q,
hidden_states.size(-2),
bucket_size=self.position_buckets,
max_position=self.max_relative_positions,
device=hidden_states.device,
)
return relative_pos
def forward(
self,
hidden_states,
attention_mask,
output_hidden_states=True,
output_attentions=False,
query_states=None,
relative_pos=None,
return_dict=True,
):
if attention_mask.dim() <= 2:
input_mask = attention_mask
else:
input_mask = attention_mask.sum(-2) > 0
attention_mask = self.get_attention_mask(attention_mask)
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[0]
else:
next_kv = hidden_states
rel_embeddings = self.get_rel_embedding()
output_states = next_kv
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
if self.gradient_checkpointing and self.training:
output_states = self._gradient_checkpointing_func(
layer_module.__call__,
next_kv,
attention_mask,
query_states,
relative_pos,
rel_embeddings,
output_attentions,
)
else:
output_states = layer_module(
next_kv,
attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
)
if output_attentions:
output_states, att_m = output_states
if i == 0 and self.conv is not None:
output_states = self.conv(hidden_states, output_states, input_mask)
if query_states is not None:
query_states = output_states
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
else:
next_kv = output_states
if output_attentions:
all_attentions = all_attentions + (att_m,)
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
if not return_dict:
return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
)
@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
class DisentangledSelfAttention(nn.Module):
"""
Disentangled self-attention module
Parameters:
config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaV2Config`]
"""
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
_attention_head_size = config.hidden_size // config.num_attention_heads
self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.share_att_key = getattr(config, "share_att_key", False)
self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
self.relative_attention = getattr(config, "relative_attention", False)
if self.relative_attention:
self.position_buckets = getattr(config, "position_buckets", -1)
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.pos_ebd_size = self.max_relative_positions
if self.position_buckets > 0:
self.pos_ebd_size = self.position_buckets
self.pos_dropout = StableDropout(config.hidden_dropout_prob)
if not self.share_att_key:
if "c2p" in self.pos_att_type:
self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
if "p2c" in self.pos_att_type:
self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = StableDropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, attention_heads):
new_x_shape = x.size()[:-1] + (attention_heads, -1)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
class DebertaV2Embeddings(nn.Module):
"""从单词、位置和令牌类型嵌入构造嵌入。"""
def __init__(self, config):
super().__init__()
pad_token_id = getattr(config, "pad_token_id", 0)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
self.position_biased_input = getattr(config, "position_biased_input", True)
if not self.position_biased_input:
self.position_embeddings = None
else:
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
if config.type_vocab_size > 0:
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
if self.embedding_size != config.hidden_size:
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if self.position_embeddings is not None:
position_embeddings = self.position_embeddings(position_ids.long())
else:
position_embeddings = torch.zeros_like(inputs_embeds)
embeddings = inputs_embeds
if self.position_biased_input:
embeddings += position_embeddings
if self.config.type_vocab_size > 0:
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings += token_type_embeddings
if self.embedding_size != self.config.hidden_size:
embeddings = self.embed_proj(embeddings)
embeddings = self.LayerNorm(embeddings)
if mask is not None:
if mask.dim() != embeddings.dim():
if mask.dim() == 4:
mask = mask.squeeze(1).squeeze(1)
mask = mask.unsqueeze(2)
mask = mask.to(embeddings.dtype)
embeddings = embeddings * mask
embeddings = self.dropout(embeddings)
return embeddings
class DebertaV2PreTrainedModel(PreTrainedModel):
"""
用于处理权重初始化、预训练模型下载和加载的抽象类。
"""
config_class = DebertaV2Config
base_model_prefix = "deberta"
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重。"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
DEBERTA_START_DOCSTRING = r"""
DeBERTa模型由何鹏程、刘晓东、高建峰、陈伟柱在论文《DeBERTa: Decoding-enhanced BERT with Disentangled
Attention》中提出。它在BERT/RoBERTa的基础上进行了两项改进,即解耦注意力和增强的掩码解码器。通过这两项改进,
在使用80GB预训练数据的大多数任务上超越了BERT/RoBERTa。
这个模型也是PyTorch的torch.nn.Module子类。
使用时可以像普通的PyTorch Module一样使用,并参考PyTorch文档处理一切一般使用和行为相关的事项。
参数:
config ([`DebertaV2Config`]): 包含模型所有参数的配置类。
使用配置文件初始化模型时不会加载模型的权重,只会加载配置信息。
查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型权重。
"""
DEBERTA_INPUTS_DOCSTRING = r"""
# 输入
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列的token索引,在词汇表中
Indices of input sequence tokens in the vocabulary.
# 可以使用`AutoTokenizer`获取这些索引。详见`PreTrainedTokenizer.encode`和`PreTrainedTokenizer.__call__`
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 注意力掩码,避免在填充的token索引上进行注意力计算
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 分段token索引,指示输入的第一部分和第二部分。索引在`[0, 1]`中选择:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 每个输入序列token在位置嵌入中的位置索引。在范围`[0, config.max_position_embeddings - 1]`中选择。
[What are position IDs?](../glossary#position-ids)
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,直接传递嵌入表示,而不是传递`input_ids`。在想要更多控制如何将`input_ids`索引转换为关联向量时有用。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回张量下的`attentions`。
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关更多详细信息,请参见返回张量下的`hidden_states`。
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
# 是否返回`~utils.ModelOutput`而不是普通元组。
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
DEBERTA_START_DOCSTRING,
)
class DebertaV2Model(DebertaV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embeddings = DebertaV2Embeddings(config)
self.encoder = DebertaV2Encoder(config)
self.z_steps = 0
self.config = config
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
剪枝模型中的注意力头。
heads_to_prune: 要剪枝的头部字典 {层号: 要在此层中剪枝的头部列表},参见基类PreTrainedModel
"""
raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pass
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
embedding_output = self.embeddings(
input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
mask=attention_mask,
inputs_embeds=inputs_embeds,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask,
output_hidden_states=True,
output_attentions=output_attentions,
return_dict=return_dict,
)
encoded_layers = encoder_outputs[1]
if self.z_steps > 1:
hidden_states = encoded_layers[-2]
layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
query_states = encoded_layers[-1]
rel_embeddings = self.encoder.get_rel_embedding()
attention_mask = self.encoder.get_attention_mask(attention_mask)
rel_pos = self.encoder.get_rel_pos(embedding_output)
for layer in layers[1:]:
Those .g There Med J Give Read Simple Here Engage in Perhaps they had been
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.deberta = DebertaV2Model(config)
self.cls = DebertaV2OnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="[MASK]",
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class DebertaV2PredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = nn.Linear(config.hidden_size, self.embedding_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class DebertaV2LMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = DebertaV2PredictionHeadTransform(config)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class DebertaV2OnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = DebertaV2LMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
@add_start_docstrings(
"""
在顶部有一个序列分类/回归头部的DeBERTa模型变换器(池化输出之上的线性层),例如用于GLUE任务。
""",
DEBERTA_START_DOCSTRING,
)
class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
num_labels = getattr(config, "num_labels", 2)
self.num_labels = num_labels
self.deberta = DebertaV2Model(config)
self.pooler = ContextPooler(config)
output_dim = self.pooler.output_dim
self.classifier = nn.Linear(output_dim, num_labels)
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
def set_input_embeddings(self, new_embeddings):
self.deberta.set_input_embeddings(new_embeddings)
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.deberta = DebertaV2Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
@add_start_docstrings(
"""
"""
DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DEBERTA_START_DOCSTRING,
# 描述 DeBERTa 模型,该模型用于提取式问答任务(如 SQuAD),在隐藏状态输出的基础上添加一个用于计算起始位置和结束位置 logit 的线性层作为分类头。
# DEBERTA_START_DOCSTRING 用于引用关于 DeBERTa 模型的文档字符串的常量或变量,可能包含了模型的详细描述和用法说明。
)
# 关闭括号,用于结束类定义中的一些参数和装饰器的定义
class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
# 定义一个新的类,继承自DebertaV2PreTrainedModel
def __init__(self, config):
# 初始化函数,接受一个配置参数config
super().__init__(config)
# 调用父类的初始化方法
self.num_labels = config.num_labels
# 设置类属性num_labels为config中的num_labels字段值
self.deberta = DebertaV2Model(config)
# 创建一个DebertaV2Model实例,传入config作为配置参数,并将其赋值给self.deberta
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# 创建一个线性层,将隐藏大小为config.hidden_size映射到标签数为config.num_labels的输出空间
# Initialize weights and apply final processing
self.post_init()
# 调用类中的post_init方法,用于初始化权重和应用最终处理
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=_QA_TARGET_START_INDEX,
qa_target_end_index=_QA_TARGET_END_INDEX,
)
# 添加文档字符串和代码示例,用于模型的前向传播,根据特定的格式化字符串和样例
# 从transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward中复制并将Deberta改为DebertaV2
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# 初始化 return_dict 变量,如果未提供则使用模型配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 DeBERTa 模型进行前向传播
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的序列输出
sequence_output = outputs[0]
# 将序列输出传递给 QA 输出层得到 logits
logits = self.qa_outputs(sequence_output)
# 将 logits 拆分为开始位置和结束位置的 logits
start_logits, end_logits = logits.split(1, dim=-1)
# 去除不必要的维度并保持连续性
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
# 初始化总损失为 None
total_loss = None
# 如果提供了起始位置和结束位置的标签,则计算损失
if start_positions is not None and end_positions is not None:
# 如果在多 GPU 下运行,添加一个维度以匹配 logits 的维度
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# 忽略超出模型输入的位置
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# 使用交叉熵损失函数,忽略指定的索引
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
# 如果 return_dict 为 False,则返回一个包含损失和 logits 的元组
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
# 如果 return_dict 为 True,则返回一个 QuestionAnsweringModelOutput 对象
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
DEBERTA_START_DOCSTRING,
)
class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
num_labels = getattr(config, "num_labels", 2) # 获取配置中的标签数量,默认为2
self.num_labels = num_labels
self.deberta = DebertaV2Model(config) # 初始化DeBERTa模型
self.pooler = ContextPooler(config) # 初始化上下文池化器
output_dim = self.pooler.output_dim # 获取池化器的输出维度
self.classifier = nn.Linear(output_dim, 1) # 创建线性层,用于多选分类任务的分类
drop_out = getattr(config, "cls_dropout", None) # 获取配置中的dropout值
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out # 如果未指定,则使用默认的隐藏层dropout概率
self.dropout = StableDropout(drop_out) # 创建稳定的dropout层
self.init_weights() # 初始化模型权重
def get_input_embeddings(self):
return self.deberta.get_input_embeddings() # 获取输入的嵌入层
def set_input_embeddings(self, new_embeddings):
self.deberta.set_input_embeddings(new_embeddings) # 设置新的输入嵌入层
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,接受多个输入参数并返回模型输出。
Args:
input_ids (Optional[torch.Tensor], optional): 输入的token IDs张量. Defaults to None.
attention_mask (Optional[torch.Tensor], optional): 注意力掩码张量. Defaults to None.
token_type_ids (Optional[torch.Tensor], optional): token类型IDs张量. Defaults to None.
position_ids (Optional[torch.Tensor], optional): 位置IDs张量. Defaults to None.
inputs_embeds (Optional[torch.Tensor], optional): 输入的嵌入张量. Defaults to None.
labels (Optional[torch.Tensor], optional): 标签张量. Defaults to None.
output_attentions (Optional[bool], optional): 是否输出注意力权重. Defaults to None.
output_hidden_states (Optional[bool], optional): 是否输出隐藏状态. Defaults to None.
return_dict (Optional[bool], optional): 是否返回字典形式的输出. Defaults to None.
Returns:
MultipleChoiceModelOutput: 包含模型输出的命名元组。
"""
# TODO: Implement forward pass logic here
pass
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# 确保返回字典不为None时使用配置中的返回字典设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 计算选择题个数,如果input_ids不为None,则取其第二维度的大小作为选择题数目
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# 将输入张量展平,以便用于模型输入
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# 调用DeBERTa模型进行推断
outputs = self.deberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从模型输出中获取编码层的结果
encoder_layer = outputs[0]
# 使用池化器对编码层结果进行池化
pooled_output = self.pooler(encoder_layer)
# 对池化结果应用dropout
pooled_output = self.dropout(pooled_output)
# 将池化后的结果送入分类器得到logits
logits = self.classifier(pooled_output)
# 将logits重塑为(batch_size, num_choices)形状
reshaped_logits = logits.view(-1, num_choices)
loss = None
# 如果有标签,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# 如果不使用返回字典,则返回输出元组
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 使用MultipleChoiceModelOutput对象包装结果并返回
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\deberta_v2\modeling_tf_deberta_v2.py
"""
TF 2.0 DeBERTa-v2 model.
"""
from __future__ import annotations
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta_v2 import DebertaV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DebertaV2Config"
_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-v2-xlarge"
TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"kamalkraj/deberta-v2-xlarge",
]
class TFDebertaV2ContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout")
self.config = config
def call(self, hidden_states, training: bool = False):
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token, training=training)
pooled_output = self.dense(context_token)
pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output)
return pooled_output
@property
def output_dim(self) -> int:
return self.config.hidden_size
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.pooler_hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaV2XSoftmax(keras.layers.Layer):
"""
优化内存的掩码 Softmax 层
Args:
input (`tf.Tensor`): 需要应用 softmax 的输入张量。
mask (`tf.Tensor`): 掩码矩阵,其中 0 表示在 softmax 计算中忽略该元素。
dim (int): 应用 softmax 的维度
"""
def __init__(self, axis=-1, **kwargs):
super().__init__(**kwargs)
self.axis = axis
def call(self, inputs: tf.Tensor, mask: tf.Tensor):
rmask = tf.logical_not(tf.cast(mask, tf.bool))
output = tf.where(rmask, float("-inf"), inputs)
output = stable_softmax(output, self.axis)
output = tf.where(rmask, 0.0, output)
return output
class TFDebertaV2StableDropout(keras.layers.Layer):
"""
优化训练稳定性的 Dropout 模块
Args:
drop_prob (float): dropout 概率
"""
def __init__(self, drop_prob, **kwargs):
super().__init__(**kwargs)
self.drop_prob = drop_prob
@tf.custom_gradient
def xdropout(self, inputs):
"""
对输入应用 dropout,类似于普通的 dropout,但同时将剩余元素缩放为 1/drop_prob 倍。
"""
mask = tf.cast(
1
- tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)),
tf.bool,
)
scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
if self.drop_prob > 0:
inputs = tf.where(mask, 0.0, inputs) * scale
def grad(upstream):
if self.drop_prob > 0:
return tf.where(mask, 0.0, upstream) * scale
else:
return upstream
return inputs, grad
def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
if training:
return self.xdropout(inputs)
return inputs
class TFDebertaV2SelfOutput(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaV2Attention(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.self = TFDebertaV2DisentangledSelfAttention(config, name="self")
self.dense_output = TFDebertaV2SelfOutput(config, name="output")
self.config = config
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self(
hidden_states=input_tensor,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
if query_states is None:
query_states = input_tensor
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=query_states, training=training
)
output = (attention_output,) + self_outputs[1:]
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFDebertaV2Intermediate(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFDebertaV2Output(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFDebertaV2Layer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.attention = TFDebertaV2Attention(config, name="attention")
self.intermediate = TFDebertaV2Intermediate(config, name="intermediate")
self.bert_output = TFDebertaV2Output(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output,
input_tensor=attention_output,
training=training
)
outputs = (layer_output,) + attention_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFDebertaV2ConvLayer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.kernel_size = getattr(config, "conv_kernel_size", 3)
self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh"))
self.padding = (self.kernel_size - 1) // 2
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("conv"):
self.conv_kernel = self.add_weight(
name="kernel",
shape=[self.kernel_size, self.config.hidden_size, self.config.hidden_size],
initializer=get_initializer(self.config.initializer_range),
)
self.conv_bias = self.add_weight(
name="bias", shape=[self.config.hidden_size], initializer=tf.zeros_initializer()
)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
def call(
self, hidden_states: tf.Tensor, residual_states: tf.Tensor, input_mask: tf.Tensor, training: bool = False
) -> tf.Tensor:
out = tf.nn.conv2d(
tf.expand_dims(hidden_states, 1),
tf.expand_dims(self.conv_kernel, 0),
strides=1,
padding=[[0, 0], [0, 0], [self.padding, self.padding], [0, 0]],
)
out = tf.squeeze(tf.nn.bias_add(out, self.conv_bias), 1)
rmask = tf.cast(1 - input_mask, tf.bool)
out = tf.where(tf.broadcast_to(tf.expand_dims(rmask, -1), shape_list(out)), 0.0, out)
out = self.dropout(out, training=training)
out = self.conv_act(out)
layer_norm_input = residual_states + out
output = self.LayerNorm(layer_norm_input)
if input_mask is None:
output_states = output
else:
if len(shape_list(input_mask)) != len(shape_list(layer_norm_input)):
if len(shape_list(input_mask)) == 4:
input_mask = tf.squeeze(tf.squeeze(input_mask, axis=1), axis=1)
input_mask = tf.cast(tf.expand_dims(input_mask, axis=2), tf.float32)
output_states = output * input_mask
return output_states
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.layer = [TFDebertaV2Layer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
self.relative_attention = getattr(config, "relative_attention", False)
self.config = config
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.position_buckets = getattr(config, "position_buckets", -1)
self.pos_ebd_size = self.max_relative_positions * 2
if self.position_buckets > 0:
self.pos_ebd_size = self.position_buckets * 2
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.relative_attention:
self.rel_embeddings = self.add_weight(
name="rel_embeddings.weight",
shape=[self.pos_ebd_size, self.config.hidden_size],
initializer=get_initializer(self.config.initializer_range),
)
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings if self.relative_attention else None
if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
rel_embeddings = self.LayerNorm(rel_embeddings)
return rel_embeddings
def get_attention_mask(self, attention_mask):
if len(shape_list(attention_mask)) <= 2:
extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1)
attention_mask = tf.cast(attention_mask, tf.uint8)
elif len(shape_list(attention_mask)) == 3:
attention_mask = tf.expand_dims(attention_mask, 1)
return attention_mask
if self.relative_attention and relative_pos is None:
q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2]
relative_pos = build_relative_position(
q,
shape_list(hidden_states)[-2],
bucket_size=self.position_buckets,
max_position=self.max_relative_positions,
)
return relative_pos
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
if len(shape_list(attention_mask)) <= 2:
input_mask = attention_mask
else:
input_mask = tf.cast(tf.math.reduce_sum(attention_mask, axis=-2) > 0, dtype=tf.uint8)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
attention_mask = self.get_attention_mask(attention_mask)
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
next_kv = hidden_states
rel_embeddings = self.get_rel_embedding()
output_states = next_kv
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
layer_outputs = layer_module(
hidden_states=next_kv,
attention_mask=attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
training=training,
)
output_states = layer_outputs[0]
if i == 0 and self.conv is not None:
output_states = self.conv(hidden_states, output_states, input_mask)
next_kv = output_states
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
if not return_dict:
return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def make_log_bucket_position(relative_pos, bucket_size, max_position):
sign = tf.math.sign(relative_pos)
mid = bucket_size // 2
abs_pos = tf.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, tf.math.abs(relative_pos))
log_pos = (
tf.math.ceil(
tf.cast(tf.math.log(abs_pos / mid), tf.float32) / tf.math.log((max_position - 1) / mid) * (mid - 1)
)
+ mid
)
bucket_pos = tf.cast(
tf.where(abs_pos <= mid, tf.cast(relative_pos, tf.float32), log_pos * tf.cast(sign, tf.float32)), tf.int32
)
return bucket_pos
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
"""
Build relative position according to the query and key
We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
\\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
P_k\\)
Args:
query_size (int): the length of query
key_size (int): the length of key
bucket_size (int): the size of position bucket
max_position (int): the maximum allowed absolute position
Return:
`tf.Tensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = tf.range(query_size, dtype=tf.int32)
k_ids = tf.range(key_size, dtype=tf.int32)
rel_pos_ids = q_ids[:, None] - tf.tile(tf.expand_dims(k_ids, axis=0), [shape_list(q_ids)[0], 1])
if bucket_size > 0 and max_position > 0:
rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0)
return tf.cast(rel_pos_ids, tf.int64)
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
shapes = [
shape_list(query_layer)[0],
shape_list(query_layer)[1],
shape_list(query_layer)[2],
shape_list(relative_pos)[-1],
]
return tf.broadcast_to(c2p_pos, shapes)
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
shapes = [
shape_list(query_layer)[0],
shape_list(query_layer)[1],
shape_list(key_layer)[-2],
shape_list(key_layer)[-2],
]
return tf.broadcast_to(c2p_pos, shapes)
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
return tf.broadcast_to(pos_index, shapes)
def take_along_axis(x, indices):
pass
if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):
one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
else:
gathered = tf.gather(x, indices, batch_dims=2)
return gathered
class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
"""
Disentangled self-attention module
Parameters:
config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaV2Config`]
"""
def transpose_for_scores(self, tensor: tf.Tensor, attention_heads: int) -> tf.Tensor:
tensor_shape = shape_list(tensor)
shape = tensor_shape[:-1] + [attention_heads, tensor_shape[-1] // attention_heads]
tensor = tf.reshape(tensor=tensor, shape=shape)
tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
x_shape = shape_list(tensor)
tensor = tf.reshape(tensor, shape=[-1, x_shape[-2], x_shape[-1]])
return tensor
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
query_states: tf.Tensor = None,
relative_pos: tf.Tensor = None,
rel_embeddings: tf.Tensor = None,
output_attentions: bool = False,
training: bool = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query_proj", None) is not None:
with tf.name_scope(self.query_proj.name):
self.query_proj.build([None, None, self.config.hidden_size])
if getattr(self, "key_proj", None) is not None:
with tf.name_scope(self.key_proj.name):
self.key_proj.build([None, None, self.config.hidden_size])
if getattr(self, "value_proj", None) is not None:
with tf.name_scope(self.value_proj.name):
self.value_proj.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "pos_dropout", None) is not None:
with tf.name_scope(self.pos_dropout.name):
self.pos_dropout.build(None)
if getattr(self, "pos_key_proj", None) is not None:
with tf.name_scope(self.pos_key_proj.name):
self.pos_key_proj.build([None, None, self.config.hidden_size])
if getattr(self, "pos_query_proj", None) is not None:
with tf.name_scope(self.pos_query_proj.name):
self.pos_query_proj.build([None, None, self.config.hidden_size])
class TFDebertaV2Embeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size:
self.embed_proj = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj",
use_bias=False,
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
if self.config.type_vocab_size > 0:
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
else:
self.token_type_embeddings = None
with tf.name_scope("position_embeddings"):
if self.position_biased_input:
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
else:
self.position_embeddings = None
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "embed_proj", None) is not None:
with tf.name_scope(self.embed_proj.name):
self.embed_proj.build([None, None, self.embedding_size])
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
mask: tf.Tensor = None,
training: bool = False,
def apply_embeddings(
self,
input_ids: Optional[tf.Tensor] = None,
inputs_embeds: Optional[tf.Tensor] = None,
token_type_ids: Optional[tf.Tensor] = None,
position_ids: Optional[tf.Tensor] = None,
mask: Optional[tf.Tensor] = None,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `inputs_embeds`.")
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
final_embeddings = inputs_embeds
if self.position_biased_input:
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings += position_embeds
if self.config.type_vocab_size > 0:
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings += token_type_embeds
if self.embedding_size != self.hidden_size:
final_embeddings = self.embed_proj(final_embeddings)
final_embeddings = self.LayerNorm(final_embeddings)
if mask is not None:
if len(shape_list(mask)) != len(shape_list(final_embeddings)):
if len(shape_list(mask)) == 4:
mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
mask = tf.cast(tf.expand_dims(mask, axis=2), tf.float32)
final_embeddings = final_embeddings * mask
final_embeddings = self.dropout(final_embeddings, training=training)
return final_embeddings
class TFDebertaV2PredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = keras.layers.Dense(
units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.embedding_size])
class TFDebertaV2LMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.transform = TFDebertaV2PredictionHeadTransform(config, name="transform")
self.input_embeddings = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias}
def set_bias(self, value: tf.Variable):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
class TFDebertaV2OnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
class TFDebertaV2MainLayer(keras.layers.Layer):
config_class = DebertaV2Config
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")
self.encoder = TFDebertaV2Encoder(config, name="encoder")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
mask=attention_mask,
training=training,
)
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFDebertaV2PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DebertaV2Config
base_model_prefix = "deberta"
DEBERTA_START_DOCSTRING = r"""
The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
"""
Parameters:
config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
This block defines the docstring for inputs expected by the DeBERTaV2Model class.
It specifies the arguments and their types that can be passed to the model.
@add_start_docstrings is a decorator that adds a specific docstring template to the class.
It describes the bare DeBERTa Model transformer outputting raw hidden-states without specific head.
The class TFDebertaV2Model inherits from TFDebertaV2PreTrainedModel and represents the DeBERTa V2 model.
It initializes with a configuration object and optional inputs, and initializes a TFDebertaV2MainLayer named "deberta".
unpack_inputs is a decorator that likely unpacks and prepares inputs before feeding them into the model.
"""
class TFDebertaV2Model(TFDebertaV2PreTrainedModel):
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if config.is_decoder:
logger.warning(
"If you want to use `TFDebertaV2ForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
"""
DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
"""
class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.output_dim = self.pooler.output_dim
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs
):
"""
接收输入参数,执行DeBERTa V2模型的前向传播,返回分类任务的输出结果。
"""
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
pooled_output = self.pooler(sequence_output, training=training)
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.output_dim])
@add_start_docstrings(
"""
DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(inputs=sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs,
):
"""
执行 DeBERTaV2ForQuestionAnswering 的前向传播。
Args:
input_ids: 输入的 token IDs
attention_mask: 输入的注意力掩码
token_type_ids: 输入的 token 类型 IDs
position_ids: 输入的位置 IDs
inputs_embeds: 替代输入的嵌入表示
output_attentions: 是否输出注意力权重
output_hidden_states: 是否输出隐藏状态
return_dict: 是否返回字典格式的输出
start_positions: 起始位置的标签
end_positions: 结束位置的标签
training: 是否为训练模式
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
if training:
start_logits, end_logits = self.qa_outputs(outputs.last_hidden_state)
return start_logits, end_logits
return TFQuestionAnsweringModelOutput(
start_logits=outputs.start_logits,
end_logits=outputs.end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.deberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(inputs=sequence_output)
start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
DEBERTA_START_DOCSTRING,
)
class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceLoss):
"""
DeBERTa V2 model for multiple choice tasks. Extends TFDebertaV2PreTrainedModel and TFMultipleChoiceLoss.
This class defines a model architecture with a DeBERTa V2 main layer, dropout, context pooler, and a dense
classifier layer for multiple choice classification tasks.
"""
def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
"""
Initializes TFDebertaV2ForMultipleChoice.
Args:
config (DebertaV2Config): The model configuration class specifying the model architecture and hyperparameters.
*inputs: Variable length argument list for passing inputs to parent classes.
**kwargs: Additional keyword arguments passed to parent classes.
"""
super().__init__(config, *inputs, **kwargs)
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.output_dim = self.pooler.output_dim
@unpack_inputs
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs
):
"""
Perform the forward pass of the TFDebertaV2ForMultipleChoice model.
Args:
input_ids (TFModelInputType, optional): Input ids of shape (batch_size, num_choices, sequence_length).
attention_mask (np.ndarray or tf.Tensor, optional): Attention mask of shape (batch_size, num_choices, sequence_length).
token_type_ids (np.ndarray or tf.Tensor, optional): Token type ids of shape (batch_size, num_choices, sequence_length).
position_ids (np.ndarray or tf.Tensor, optional): Position ids of shape (batch_size, num_choices, sequence_length).
inputs_embeds (np.ndarray or tf.Tensor, optional): Embedded inputs of shape (batch_size, num_choices, sequence_length, hidden_size).
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary.
labels (np.ndarray or tf.Tensor, optional): Labels for multiple choice task.
training (bool, optional): Whether the model is in training mode.
**kwargs: Additional keyword arguments for future extension.
Returns:
TFMultipleChoiceModelOutput: Output class with scores and optionally other relevant outputs.
"""
pass
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
flat_attention_mask = (
tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
)
flat_token_type_ids = (
tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
)
flat_position_ids = (
tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
)
flat_inputs_embeds = (
tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.deberta(
input_ids=flat_input_ids,
attention_mask=flat_attention_mask,
token_type_ids=flat_token_type_ids,
position_ids=flat_position_ids,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
pooled_output = self.pooler(sequence_output, training=training)
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "deberta", None) is not None:
with tf.name_scope(self.deberta.name):
self.deberta.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "classifier", None) is not None:
self.classifier.build([None, None, self.output_dim])