Transformers 源码解析(五十一)
.\models\funnel\modeling_tf_funnel.py
""" TF 2.0 Funnel model."""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_funnel import FunnelConfig
# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)
# Config class name handed to the code-sample docstring decorator used on the model classes.
_CONFIG_FOR_DOC = "FunnelConfig"
# Checkpoint identifiers listed for this architecture (presumably hub model ids; verify against the hub).
TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"funnel-transformer/small",
"funnel-transformer/small-base",
"funnel-transformer/medium",
"funnel-transformer/medium-base",
"funnel-transformer/intermediate",
"funnel-transformer/intermediate-base",
"funnel-transformer/large",
"funnel-transformer/large-base",
"funnel-transformer/xlarge-base",
"funnel-transformer/xlarge",
]
# Large finite constant subtracted from the attention scores of masked positions
# (see `TFFunnelRelMultiheadAttention.call`).
INF = 1e6
class TFFunnelEmbeddings(keras.layers.Layer):
    """Word-embedding lookup followed by layer normalization and dropout.

    The embedding matrix is stored in `self.weight` with shape
    `[config.vocab_size, config.hidden_size]`.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.hidden_size = config.hidden_size
        # Fall back to a standard deviation of 1.0 when the config leaves it unset.
        self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout)

    def build(self, input_shape=None):
        """Create the embedding matrix and build the layer norm, exactly once.

        Args:
            input_shape: Unused; accepted for Keras API compatibility.
        """
        # The guard has to run before `add_weight`: otherwise a repeated call
        # to `build` would register the embedding matrix a second time.
        if self.built:
            return
        self.built = True
        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_std),
            )
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.d_model])

    def call(self, input_ids=None, inputs_embeds=None, training=False):
        """
        Applies embedding based on inputs tensor.

        Exactly one of `input_ids` and `inputs_embeds` must be provided. When
        `input_ids` is given, the ids are bounds-checked against
        `config.vocab_size` and looked up in `self.weight`; otherwise
        `inputs_embeds` is used as is.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)
        assert not (input_ids is not None and inputs_embeds is not None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(self.weight, input_ids)

        final_embeddings = self.LayerNorm(inputs=inputs_embeds)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
        return final_embeddings
"""
Contains helpers for `TFFunnelRelMultiheadAttention`.
"""
class TFFunnelAttentionStructure:
    """
    Contains helpers for `TFFunnelRelMultiheadAttention`.

    Holds the pooling configuration and builds / pools the tuple
    `(position_embeds, token_type_mat, attention_mask, cls_mask)` that is
    passed to every attention layer.

    NOTE(review): `init_attention_inputs` calls `self.get_position_embeds`,
    whose definition is not part of this excerpt; it must be provided for the
    class to work.
    """

    # Token type id that marks the cls token (see `token_type_ids_to_mat`).
    cls_token_type_id: int = 2

    def __init__(self, config):
        self.d_model = config.d_model
        self.attention_type = config.attention_type
        self.num_blocks = config.num_blocks
        self.separate_cls = config.separate_cls
        self.truncate_seq = config.truncate_seq
        self.pool_q_only = config.pool_q_only
        self.pooling_type = config.pooling_type

        self.sin_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.cos_dropout = keras.layers.Dropout(config.hidden_dropout)
        # Running pooling factor; reset to 1 by `init_attention_inputs` and
        # doubled each time the pooling helpers pool the sequence.
        self.pooling_mult = None

    def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
        """Returns the attention inputs associated to the inputs of the model."""
        self.pooling_mult = 1
        # The sequence length is read from axis 1 of `inputs_embeds`.
        self.seq_len = seq_len = shape_list(inputs_embeds)[1]
        position_embeds = self.get_position_embeds(seq_len, training=training)
        token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None
        # A [seq_len, seq_len] matrix of ones whose first row and first column
        # are zero; only built when the cls token is handled separately.
        cls_mask = (
            tf.pad(tf.ones([seq_len - 1, seq_len - 1], dtype=inputs_embeds.dtype), [[1, 0], [1, 0]])
            if self.separate_cls
            else None
        )
        return (position_embeds, token_type_mat, attention_mask, cls_mask)

    def token_type_ids_to_mat(self, token_type_ids):
        """Convert `token_type_ids` to `token_type_mat`.

        The result is a boolean matrix that is True where two positions share
        the same token type, or where either position has the cls token type.
        """
        token_type_mat = tf.equal(tf.expand_dims(token_type_ids, -1), tf.expand_dims(token_type_ids, -2))
        cls_ids = tf.equal(token_type_ids, tf.constant([self.cls_token_type_id], dtype=token_type_ids.dtype))
        cls_mat = tf.logical_or(tf.expand_dims(cls_ids, -1), tf.expand_dims(cls_ids, -2))
        return tf.logical_or(cls_mat, token_type_mat)

    def stride_pool_pos(self, pos_id, block_index):
        """
        Pool `pos_id` while keeping the cls token separate (if `self.separate_cls=True`).
        """
        if self.separate_cls:
            # The cls token receives position `1 - 2**block_index`; the
            # remaining positions are taken with a stride of 2.
            cls_pos = tf.constant([-(2**block_index) + 1], dtype=pos_id.dtype)
            pooled_pos_id = pos_id[1:-1] if self.truncate_seq else pos_id[1:]
            return tf.concat([cls_pos, pooled_pos_id[::2]], 0)
        else:
            return pos_id[::2]

    def relative_pos(self, pos, stride, pooled_pos=None, shift=1):
        """
        Build the relative positional vector between `pos` and `pooled_pos`.

        Returns a descending `tf.range` from `max_dist` to `min_dist`
        (inclusive) with step `stride`.
        """
        if pooled_pos is None:
            pooled_pos = pos

        ref_point = pooled_pos[0] - pos[0]
        num_remove = shift * shape_list(pooled_pos)[0]
        max_dist = ref_point + num_remove * stride
        min_dist = pooled_pos[0] - pos[-1]

        return tf.range(max_dist, min_dist - 1, -stride)

    def stride_pool(self, tensor, axis):
        """
        Perform pooling by stride slicing the tensor along the given axis.

        `axis` may be a single int or a list/tuple of ints (pooled one after
        the other); `tensor` may be a tensor, a list/tuple of tensors (pooled
        element-wise, container type preserved) or `None` (returned as is).
        """
        if tensor is None:
            return None

        # Pool recursively over every requested axis.
        if isinstance(axis, (list, tuple)):
            for ax in axis:
                tensor = self.stride_pool(tensor, ax)
            return tensor

        # Pool recursively over every tensor of a container.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.stride_pool(x, axis) for x in tensor)

        # Normalize a negative axis.
        axis %= len(shape_list(tensor))

        axis_slice = slice(None, -1, 2) if self.separate_cls and self.truncate_seq else slice(None, None, 2)
        enc_slice = [slice(None)] * axis + [axis_slice]
        if self.separate_cls:
            # Duplicate the first entry along `axis` so that the strided slice
            # below still keeps it.
            cls_slice = [slice(None)] * axis + [slice(None, 1)]
            tensor = tf.concat([tensor[cls_slice], tensor], axis)
        return tensor[enc_slice]

    def pool_tensor(self, tensor, mode="mean", stride=2):
        """Apply 1D pooling to a tensor of size [B x T (x H)].

        Raises:
            NotImplementedError: if `mode` is not 'mean', 'max' or 'min'.
        """
        if tensor is None:
            return None

        # Pool recursively over every tensor of a container.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.pool_tensor(x, mode=mode, stride=stride) for x in tensor)

        if self.separate_cls:
            # Prepend a copy of the first position before pooling.
            suffix = tensor[:, :-1] if self.truncate_seq else tensor
            tensor = tf.concat([tensor[:, :1], suffix], axis=1)

        ndim = len(shape_list(tensor))
        if ndim == 2:
            # The pooling ops below are called with the "NWC" layout, so add a channel axis.
            tensor = tensor[:, :, None]

        if mode == "mean":
            tensor = tf.nn.avg_pool1d(tensor, stride, strides=stride, data_format="NWC", padding="SAME")
        elif mode == "max":
            tensor = tf.nn.max_pool1d(tensor, stride, strides=stride, data_format="NWC", padding="SAME")
        elif mode == "min":
            # Min pooling expressed as a negated max pooling.
            tensor = -tf.nn.max_pool1d(-tensor, stride, strides=stride, data_format="NWC", padding="SAME")
        else:
            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")

        return tf.squeeze(tensor, 2) if ndim == 2 else tensor

    def pre_attention_pooling(self, output, attention_inputs):
        """Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        if self.pool_q_only:
            # Only the query side is pooled here; the key side is pooled by
            # `post_attention_pooling`.
            if self.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
            token_type_mat = self.stride_pool(token_type_mat, 1)
            cls_mask = self.stride_pool(cls_mask, 0)
            output = self.pool_tensor(output, mode=self.pooling_type)
        else:
            self.pooling_mult *= 2
            if self.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds, 0)
            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
            cls_mask = self.stride_pool(cls_mask, [1, 2])
            attention_mask = self.pool_tensor(attention_mask, mode="min")
            output = self.pool_tensor(output, mode=self.pooling_type)
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return output, attention_inputs

    def post_attention_pooling(self, attention_inputs):
        """Pool the proper parts of `attention_inputs` after the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        if self.pool_q_only:
            self.pooling_mult *= 2
            if self.attention_type == "factorized":
                position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
            token_type_mat = self.stride_pool(token_type_mat, 2)
            cls_mask = self.stride_pool(cls_mask, 1)
            attention_mask = self.pool_tensor(attention_mask, mode="min")
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return attention_inputs
def _relative_shift_gather(positional_attn, context_len, shift):
    """Shift the relative-position scores and keep the first `context_len` columns.

    The 4-D input is reshaped with its last two axes swapped in size, the
    first `shift` rows are dropped, the result is reshaped back and finally
    truncated to `context_len` entries along the last axis.
    """
    batch, heads, q_len, rel_len = shape_list(positional_attn)

    swapped = tf.reshape(positional_attn, [batch, heads, rel_len, q_len])
    trimmed = swapped[:, :, shift:, :]
    restored = tf.reshape(trimmed, [batch, heads, q_len, rel_len - shift])
    return restored[..., :context_len]
class TFFunnelRelMultiheadAttention(keras.layers.Layer):
    """Multi-head attention combining content, relative-position and token-type scores."""

    def __init__(self, config, block_index, **kwargs):
        super().__init__(**kwargs)
        self.attention_type = config.attention_type
        self.n_head = n_head = config.n_head
        self.d_head = d_head = config.d_head
        self.d_model = d_model = config.d_model
        self.initializer_range = config.initializer_range
        # Index used to pick this block's entry of `position_embeds` in the
        # non-factorized attention path.
        self.block_index = block_index

        self.hidden_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.attention_dropout = keras.layers.Dropout(config.attention_dropout)

        initializer = get_initializer(config.initializer_range)

        self.q_head = keras.layers.Dense(
            n_head * d_head, use_bias=False, kernel_initializer=initializer, name="q_head"
        )
        self.k_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="k_head")
        self.v_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="v_head")

        self.post_proj = keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.scale = 1.0 / (d_head**0.5)

    def build(self, input_shape=None):
        """Create the bias/kernel weights and build the sub-layers, exactly once.

        Args:
            input_shape: Unused; accepted for Keras API compatibility.
        """
        # The guard has to run before any `add_weight`: otherwise a repeated
        # call to `build` would register all five weights a second time.
        if self.built:
            return
        self.built = True

        n_head, d_head, d_model = self.n_head, self.d_head, self.d_model
        initializer = get_initializer(self.initializer_range)

        self.r_w_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_w_bias"
        )
        self.r_r_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_r_bias"
        )
        self.r_kernel = self.add_weight(
            shape=(d_model, n_head, d_head), initializer=initializer, trainable=True, name="r_kernel"
        )
        self.r_s_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_s_bias"
        )
        self.seg_embed = self.add_weight(
            shape=(2, n_head, d_head), initializer=initializer, trainable=True, name="seg_embed"
        )

        if getattr(self, "q_head", None) is not None:
            with tf.name_scope(self.q_head.name):
                self.q_head.build([None, None, d_model])
        if getattr(self, "k_head", None) is not None:
            with tf.name_scope(self.k_head.name):
                self.k_head.build([None, None, d_model])
        if getattr(self, "v_head", None) is not None:
            with tf.name_scope(self.v_head.name):
                self.v_head.build([None, None, d_model])
        if getattr(self, "post_proj", None) is not None:
            with tf.name_scope(self.post_proj.name):
                self.post_proj.build([None, None, n_head * d_head])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, d_model])

    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
        """Relative attention score for the positional encodings"""
        if self.attention_type == "factorized":
            # Factorized variant: `position_embeds` unpacks into four tensors.
            phi, pi, psi, omega = position_embeds
            u = self.r_r_bias * self.scale
            w_r = self.r_kernel

            q_r_attention = tf.einsum("binh,dnh->bind", q_head + u, w_r)
            q_r_attention_1 = q_r_attention * phi[:, None]
            q_r_attention_2 = q_r_attention * pi[:, None]

            positional_attn = tf.einsum("bind,jd->bnij", q_r_attention_1, psi) + tf.einsum(
                "bind,jd->bnij", q_r_attention_2, omega
            )
        else:
            # Relative-shift variant: a query length different from the
            # context length selects the second embedding of this block and a
            # shift of 2, otherwise the first embedding and a shift of 1.
            if shape_list(q_head)[1] != context_len:
                shift = 2
                r = position_embeds[self.block_index][1]
            else:
                shift = 1
                r = position_embeds[self.block_index][0]

            v = self.r_r_bias * self.scale
            w_r = self.r_kernel

            r_head = tf.einsum("td,dnh->tnh", r, w_r)
            positional_attn = tf.einsum("binh,tnh->bnit", q_head + v, r_head)
            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)

        if cls_mask is not None:
            positional_attn *= cls_mask
        return positional_attn

    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
        """Relative attention score for the token_type_ids"""
        if token_type_mat is None:
            return 0

        # `token_type_mat` must be rank 3; only its last dimension is needed.
        _, _, context_len = shape_list(token_type_mat)

        r_s_bias = self.r_s_bias * self.scale

        token_type_bias = tf.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
        # Repeat the matrix once per head.
        token_type_mat = tf.tile(token_type_mat[:, None], [1, shape_list(q_head)[2], 1, 1])
        diff_token_type, same_token_type = tf.split(token_type_bias, 2, axis=-1)
        # Pick the "same" bias where the matrix is True and the "diff" bias elsewhere.
        token_type_attn = tf.where(
            token_type_mat,
            tf.tile(same_token_type, [1, 1, 1, context_len]),
            tf.tile(diff_token_type, [1, 1, 1, context_len]),
        )

        if cls_mask is not None:
            token_type_attn *= cls_mask
        return token_type_attn

    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
        """Run the attention block.

        Args:
            query, key, value: input tensors; `query` provides batch size and
                query length, `key` provides the context length.
            attention_inputs: tuple `(position_embeds, token_type_mat,
                attention_mask, cls_mask)`.
            output_attentions: when True, the attention probabilities are
                returned as well.
            training: forwarded to the dropout layers.

        Returns:
            `(output,)` or `(output, attn_prob)` when `output_attentions` is True.
        """
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs

        batch_size, seq_len, _ = shape_list(query)
        context_len = shape_list(key)[1]
        n_head, d_head = self.n_head, self.d_head

        # Project and split into heads.
        q_head = tf.reshape(self.q_head(query), [batch_size, seq_len, n_head, d_head])
        k_head = tf.reshape(self.k_head(key), [batch_size, context_len, n_head, d_head])
        v_head = tf.reshape(self.v_head(value), [batch_size, context_len, n_head, d_head])

        q_head = q_head * self.scale
        r_w_bias = self.r_w_bias * self.scale
        content_score = tf.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)

        attn_score = content_score + positional_attn + token_type_attn

        if attention_mask is not None:
            # Push the scores of masked positions (mask value 0) down by INF.
            attention_mask = tf.cast(attention_mask, dtype=attn_score.dtype)
            attn_score = attn_score - (INF * (1 - attention_mask[:, None, None]))

        attn_prob = stable_softmax(attn_score, axis=-1)
        attn_prob = self.attention_dropout(attn_prob, training=training)

        attn_vec = tf.einsum("bnij,bjnd->bind", attn_prob, v_head)

        # Merge the heads, project back and apply the residual connection.
        attn_out = self.post_proj(tf.reshape(attn_vec, [batch_size, seq_len, n_head * d_head]))
        attn_out = self.hidden_dropout(attn_out, training=training)

        output = self.layer_norm(query + attn_out)
        return (output, attn_prob) if output_attentions else (output,)
class TFFunnelPositionwiseFFN(keras.layers.Layer):
    """Two-layer feed-forward block with a residual connection and layer norm."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        kernel_init = get_initializer(config.initializer_range)
        self.linear_1 = keras.layers.Dense(config.d_inner, kernel_initializer=kernel_init, name="linear_1")
        self.activation_function = get_tf_activation(config.hidden_act)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.linear_2 = keras.layers.Dense(config.d_model, kernel_initializer=kernel_init, name="linear_2")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.config = config

    def call(self, hidden, training=False):
        """Return `layer_norm(hidden + ffn(hidden))`."""
        expanded = self.activation_function(self.linear_1(hidden))
        expanded = self.activation_dropout(expanded, training=training)
        projected = self.dropout(self.linear_2(expanded), training=training)
        return self.layer_norm(hidden + projected)

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # (attribute name, size of the last input dimension)
        sublayer_dims = (
            ("linear_1", self.config.d_model),
            ("linear_2", self.config.d_inner),
            ("layer_norm", self.config.d_model),
        )
        for attr_name, feature_dim in sublayer_dims:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, feature_dim])
class TFFunnelLayer(keras.layers.Layer):
    """One transformer layer: relative multi-head attention followed by the feed-forward block."""

    def __init__(self, config, block_index, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFFunnelRelMultiheadAttention(config, block_index, name="attention")
        self.ffn = TFFunnelPositionwiseFFN(config, name="ffn")

    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
        """Return `(output,)`, plus the attention probabilities when `output_attentions` is True."""
        attention_result = self.attention(
            query, key, value, attention_inputs, output_attentions=output_attentions, training=training
        )
        ffn_output = self.ffn(attention_result[0], training=training)
        if output_attentions:
            return (ffn_output, attention_result[1])
        return (ffn_output,)

    def build(self, input_shape=None):
        """Build the attention and feed-forward sub-layers once."""
        if self.built:
            return
        self.built = True

        for attr_name in ("attention", "ffn"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
class TFFunnelEncoder(keras.layers.Layer):
    """Stack of blocks of `TFFunnelLayer`s; every block after the first may pool its input."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.separate_cls = config.separate_cls
        self.pool_q_only = config.pool_q_only
        self.block_repeats = config.block_repeats
        self.attention_structure = TFFunnelAttentionStructure(config)
        # One list of layers per block; layer names encode (block index, layer index).
        self.blocks = [
            [TFFunnelLayer(config, block_index, name=f"blocks_._{block_index}_._{i}") for i in range(block_size)]
            for block_index, block_size in enumerate(config.block_sizes)
        ]

    def call(
        self,
        inputs_embeds,
        attention_mask=None,
        token_type_ids=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        training=False,
    ):
        """Run every block on `inputs_embeds`.

        Returns a `TFBaseModelOutput` when `return_dict` is truthy, otherwise a
        tuple made of the entries that are not `None`.
        """
        structure = self.attention_structure
        attn_inputs = structure.init_attention_inputs(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )
        hidden = inputs_embeds

        hidden_trace = (inputs_embeds,) if output_hidden_states else None
        attention_trace = () if output_attentions else None

        for block_index, block in enumerate(self.blocks):
            # Pool only when the sequence is still long enough and this is not the first block.
            pooling_flag = shape_list(hidden)[1] > (2 if self.separate_cls else 1)
            pooling_flag = pooling_flag and block_index > 0
            # Placeholder; replaced by the pooled tensor when pooling is active.
            pooled_hidden = tf.zeros(shape_list(hidden))

            if pooling_flag:
                pooled_hidden, attn_inputs = structure.pre_attention_pooling(hidden, attn_inputs)

            for layer_index, layer in enumerate(block):
                for repeat_index in range(self.block_repeats[block_index]):
                    # Pooled inputs are only used on the very first pass through the block.
                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag

                    query = key = value = hidden
                    if do_pooling:
                        query = pooled_hidden
                        if not self.pool_q_only:
                            key = value = pooled_hidden

                    layer_output = layer(
                        query, key, value, attn_inputs, output_attentions=output_attentions, training=training
                    )
                    hidden = layer_output[0]

                    if do_pooling:
                        attn_inputs = structure.post_attention_pooling(attn_inputs)

                    if output_attentions:
                        attention_trace = attention_trace + layer_output[1:]
                    if output_hidden_states:
                        hidden_trace = hidden_trace + (hidden,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden, hidden_states=hidden_trace, attentions=attention_trace
            )
        return tuple(v for v in [hidden, hidden_trace, attention_trace] if v is not None)

    def build(self, input_shape=None):
        """Build every layer of every block once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        all_layers = (layer for block in self.blocks for layer in block)
        for layer in all_layers:
            with tf.name_scope(layer.name):
                layer.build(None)
def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False):
    """
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.

    With `separate_cls`, the first position is kept as is and only the
    remaining positions are repeated; with `truncate_seq` the repeated part is
    additionally right-padded by `stride - 1` zeros before being cut to
    `target_len - 1` positions.
    """
    if stride == 1:
        return x

    if not separate_cls:
        return tf.repeat(x, repeats=stride, axis=1)[:, :target_len]

    cls_part, rest = x[:, :1], x[:, 1:]
    repeated = tf.repeat(rest, repeats=stride, axis=1)
    if truncate_seq:
        repeated = tf.pad(repeated, [[0, 0], [0, stride - 1], [0, 0]])
    repeated = repeated[:, : target_len - 1]
    return tf.concat([cls_part, repeated], axis=1)
class TFFunnelDecoder(keras.layers.Layer):
    """Upsamples the final encoder state, adds the first-block state and runs the decoder layers."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.separate_cls = config.separate_cls
        self.truncate_seq = config.truncate_seq
        # Upsampling factor: 2 for every block after the first one.
        self.stride = 2 ** (len(config.block_sizes) - 1)
        self.attention_structure = TFFunnelAttentionStructure(config)
        self.layers = [TFFunnelLayer(config, 0, name=f"layers_._{i}") for i in range(config.num_decoder_layers)]

    def call(
        self,
        final_hidden,
        first_block_hidden,
        attention_mask=None,
        token_type_ids=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        training=False,
    ):
        """Return a `TFBaseModelOutput`, or a tuple of its non-`None` entries when `return_dict` is falsy."""
        target_len = shape_list(first_block_hidden)[1]
        hidden = first_block_hidden.__radd__(
            upsample(
                final_hidden,
                stride=self.stride,
                target_len=target_len,
                separate_cls=self.separate_cls,
                truncate_seq=self.truncate_seq,
            )
        ) if False else upsample(
            final_hidden,
            stride=self.stride,
            target_len=target_len,
            separate_cls=self.separate_cls,
            truncate_seq=self.truncate_seq,
        ) + first_block_hidden

        hidden_trace = (hidden,) if output_hidden_states else None
        attention_trace = () if output_attentions else None

        attn_inputs = self.attention_structure.init_attention_inputs(
            hidden,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )

        for layer in self.layers:
            # Self-attention: query, key and value are all the current hidden state.
            layer_output = layer(
                hidden, hidden, hidden, attn_inputs, output_attentions=output_attentions, training=training
            )
            hidden = layer_output[0]

            if output_attentions:
                attention_trace = attention_trace + layer_output[1:]
            if output_hidden_states:
                hidden_trace = hidden_trace + (hidden,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden, hidden_states=hidden_trace, attentions=attention_trace
            )
        return tuple(v for v in [hidden, hidden_trace, attention_trace] if v is not None)

    def build(self, input_shape=None):
        """Build every decoder layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        decoder_layers = getattr(self, "layers", None)
        if decoder_layers is None:
            return
        for layer in decoder_layers:
            with tf.name_scope(layer.name):
                layer.build(None)
@keras_serializable
class TFFunnelBaseLayer(keras.layers.Layer):
    """Base model without decoder"""

    config_class = FunnelConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
        self.encoder = TFFunnelEncoder(config, name="encoder")

    def get_input_embeddings(self):
        """Return the embedding layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Replace the embedding matrix and record its first dimension as `vocab_size`."""
        embedding_layer = self.embeddings
        embedding_layer.weight = value
        embedding_layer.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """Embed the inputs (unless `inputs_embeds` is given) and return the encoder outputs.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are provided.
        """
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None

        if has_ids and has_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if has_ids:
            input_shape = shape_list(input_ids)
        elif has_embeds:
            # Drop the trailing feature dimension.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Default to "attend everywhere" and a single token type.
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if not has_embeds:
            inputs_embeds = self.embeddings(input_ids, training=training)

        return self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the embeddings and the encoder once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("embeddings", "encoder"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@keras_serializable
class TFFunnelMainLayer(keras.layers.Layer):
    """Base model with decoder"""

    config_class = FunnelConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.block_sizes = config.block_sizes
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
        self.encoder = TFFunnelEncoder(config, name="encoder")
        self.decoder = TFFunnelDecoder(config, name="decoder")

    def get_input_embeddings(self):
        """Return the embedding layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Replace the embedding matrix and record its first dimension as `vocab_size`."""
        embedding_layer = self.embeddings
        embedding_layer.weight = value
        embedding_layer.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """Run embeddings, encoder and decoder.

        The encoder is always asked for its hidden states because the decoder
        needs the hidden state found at index `block_sizes[0]`.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are provided.
        """
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None

        if has_ids and has_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if has_ids:
            input_shape = shape_list(input_ids)
        elif has_embeds:
            # Drop the trailing feature dimension.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if not has_embeds:
            inputs_embeds = self.embeddings(input_ids, training=training)

        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
            training=training,
        )

        decoder_outputs = self.decoder(
            final_hidden=encoder_outputs[0],
            first_block_hidden=encoder_outputs[1][self.block_sizes[0]],
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            # The decoder tuple only holds the requested extras, so track the
            # position of the next one while walking through them.
            cursor = 0
            merged = (decoder_outputs[0],)
            if output_hidden_states:
                cursor += 1
                merged = merged + (encoder_outputs[1] + decoder_outputs[cursor],)
            if output_attentions:
                cursor += 1
                merged = merged + (encoder_outputs[2] + decoder_outputs[cursor],)
            return merged

        all_hidden_states = None
        if output_hidden_states:
            all_hidden_states = encoder_outputs.hidden_states + decoder_outputs.hidden_states

        all_attentions = None
        if output_attentions:
            all_attentions = encoder_outputs.attentions + decoder_outputs.attentions

        return TFBaseModelOutput(
            last_hidden_state=decoder_outputs[0],
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )

    def build(self, input_shape=None):
        """Build embeddings, encoder and decoder once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("embeddings", "encoder", "decoder"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
class TFFunnelDiscriminatorPredictions(keras.layers.Layer):
    """Prediction module for the discriminator, made up of two dense layers."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        initializer = get_initializer(config.initializer_range)
        self.dense = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense")
        self.activation_function = get_tf_activation(config.hidden_act)
        self.dense_prediction = keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction")
        self.config = config

    def call(self, discriminator_hidden_states):
        """Return one logit per position.

        Args:
            discriminator_hidden_states: hidden states fed to the first dense layer.

        Returns:
            The output of `dense_prediction` with its trailing unit dimension removed.
        """
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = self.activation_function(hidden_states)
        # Squeeze only the trailing axis produced by `Dense(1)`. Without an
        # explicit axis, `tf.squeeze` would also drop any other dimension of
        # size 1 (for example a batch of one sample).
        logits = tf.squeeze(self.dense_prediction(hidden_states), axis=-1)
        return logits

    def build(self, input_shape=None):
        """Build both dense layers once; each receives inputs whose last dimension is `d_model`."""
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.d_model])
        if getattr(self, "dense_prediction", None) is not None:
            with tf.name_scope(self.dense_prediction.name):
                self.dense_prediction.build([None, None, self.config.d_model])
class TFFunnelMaskedLMHead(keras.layers.Layer):
    """Masked language-modeling head that reuses the input embedding matrix as output projection."""

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.hidden_size = config.hidden_size
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        """Create the per-token output bias, then defer to the parent `build`."""
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        super().build(input_shape)

    def get_output_embeddings(self):
        """Return the layer whose weight is used as output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value):
        """Replace the shared embedding matrix and record its first dimension as `vocab_size`."""
        shared = self.input_embeddings
        shared.weight = value
        shared.vocab_size = shape_list(value)[0]

    def get_bias(self):
        """Return the output bias wrapped in a dict under the key "bias"."""
        return {"bias": self.bias}

    def set_bias(self, value):
        """Install `value["bias"]` and update `config.vocab_size` to its length."""
        new_bias = value["bias"]
        self.bias = new_bias
        self.config.vocab_size = shape_list(new_bias)[0]

    def call(self, hidden_states, training=False):
        """Project `hidden_states` onto the vocabulary and add the bias."""
        seq_length = shape_list(hidden_states)[1]
        # Flatten to 2-D so a single matmul against the embedding matrix suffices.
        flat_states = tf.reshape(hidden_states, [-1, self.hidden_size])
        flat_logits = tf.matmul(flat_states, self.input_embeddings.weight, transpose_b=True)
        logits = tf.reshape(flat_logits, [-1, seq_length, self.config.vocab_size])
        return tf.nn.bias_add(logits, self.bias)
class TFFunnelClassificationHead(keras.layers.Layer):
    """Classification head: dense layer, tanh, dropout, then the output projection."""

    def __init__(self, config, n_labels, **kwargs):
        super().__init__(**kwargs)
        kernel_init = get_initializer(config.initializer_range)
        self.linear_hidden = keras.layers.Dense(config.d_model, kernel_initializer=kernel_init, name="linear_hidden")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.linear_out = keras.layers.Dense(n_labels, kernel_initializer=kernel_init, name="linear_out")
        self.config = config

    def call(self, hidden, training=False):
        """Return the `n_labels` logits computed from `hidden`."""
        activated = keras.activations.tanh(self.linear_hidden(hidden))
        regularized = self.dropout(activated, training=training)
        return self.linear_out(regularized)

    def build(self, input_shape=None):
        """Build both dense layers once; each receives inputs whose last dimension is `d_model`."""
        if self.built:
            return
        self.built = True

        for attr_name in ("linear_hidden", "linear_out"):
            dense_layer = getattr(self, attr_name, None)
            if dense_layer is None:
                continue
            with tf.name_scope(dense_layer.name):
                dense_layer.build([None, None, self.config.d_model])
@staticmethod
def convert_attention_mask(attention_mask: tf.Tensor, dtype: tf.DType = tf.float32) -> tf.Tensor:
"""
Converts a 2D Tensor to a boolean mask with shape [batch_size, 1, 1, sequence_length].
Args:
attention_mask (:obj:`tf.Tensor`): The attention mask.
dtype (:obj:`tf.DType`, `optional`, defaults to :obj:`tf.float32`):
The datatype of the resulting mask tensor.
Returns:
:obj:`tf.Tensor`: The boolean mask tensor.
"""
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
这是一个长字符串,用于文档化函数参数说明。
详细说明了模型输入的各个参数及其形状和含义。
"""
@add_start_docstrings(
    """
    基础的Funnel Transformer模型,输出原始隐藏状态,没有上采样头(也称为解码器)或任何特定任务的头部。
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelBaseModel(TFFunnelPreTrainedModel):
    """
    Model wrapper around a single `TFFunnelBaseLayer`.

    The layer is stored as `self.funnel`; `call` hands every argument to it and returns whatever it produces.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        # `build` below opens a name scope from this layer's name before building it.
        self.funnel = TFFunnelBaseLayer(config, name="funnel")

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]:
        # Pure delegation: no argument is transformed before reaching the wrapped layer.
        base_layer_outputs = self.funnel(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return base_layer_outputs

    def serving_output(self, output):
        """Repackage `output` as a `TFBaseModelOutput` for serving."""
        # hidden_states and attentions are passed through untouched instead of going through
        # tf.convert_to_tensor, since their entries do not share the same dimensions.
        serving_fields = {
            "last_hidden_state": output.last_hidden_state,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFBaseModelOutput(**serving_fields)

    def build(self, input_shape=None):
        """Build the wrapped layer once; subsequent calls return immediately."""
        if self.built:
            return
        self.built = True

        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is not None:
            with tf.name_scope(funnel_layer.name):
                funnel_layer.build(None)
@add_start_docstrings(
    """
    The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelModel(TFFunnelPreTrainedModel):
    """
    Model wrapper around a single `TFFunnelMainLayer`, with no task head.

    Args:
        config (FunnelConfig): Configuration used to construct the wrapped layer.

    Attributes:
        funnel (TFFunnelMainLayer): The layer every `call` is delegated to.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        # `build` below opens a name scope from this layer's name before building it.
        self.funnel = TFFunnelMainLayer(config, name="funnel")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]:
        """
        Forward every argument, unchanged, to the wrapped `TFFunnelMainLayer` and return its result.
        """
        main_layer_outputs = self.funnel(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return main_layer_outputs

    def serving_output(self, output):
        """
        Repackage `output` as a `TFBaseModelOutput` for serving.

        Args:
            output: Object exposing `last_hidden_state`, `hidden_states` and `attentions`.

        Returns:
            TFBaseModelOutput: A new output object carrying those three fields as received.
        """
        serving_fields = {
            "last_hidden_state": output.last_hidden_state,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFBaseModelOutput(**serving_fields)

    def build(self, input_shape=None):
        """
        Build the wrapped layer once; subsequent calls return immediately.

        Args:
            input_shape: Accepted for interface compatibility; not read here.
        """
        if self.built:
            return
        self.built = True

        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is not None:
            with tf.name_scope(funnel_layer.name):
                funnel_layer.build(None)
@add_start_docstrings(
    """
    Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
    """
    Funnel main layer followed by a discriminator prediction head.

    Args:
        config (FunnelConfig): Configuration used to construct both sub-layers.

    Attributes:
        funnel (TFFunnelMainLayer): Produces the hidden states.
        discriminator_predictions (TFFunnelDiscriminatorPredictions): Maps the first output of `funnel` to logits.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        # `*inputs` is accepted and forwarded like in the sibling model classes, so positional
        # model arguments no longer raise a TypeError for this class only.
        super().__init__(config, *inputs, **kwargs)

        self.funnel = TFFunnelMainLayer(config, name="funnel")
        self.discriminator_predictions = TFFunnelDiscriminatorPredictions(config, name="discriminator_predictions")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFFunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFFunnelForPreTrainingOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFFunnelForPreTraining
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = TFFunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> logits = model(inputs).logits
        ```"""
        # NOTE: the bare `Returns:` line above is the placeholder that `replace_return_docstrings`
        # substitutes with the generated description; keep it on a line of its own.
        discriminator_hidden_states = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Index 0 of the main-layer output is fed to the prediction head.
        discriminator_sequence_output = discriminator_hidden_states[0]
        logits = self.discriminator_predictions(discriminator_sequence_output)

        if not return_dict:
            # Tuple form: logits first, then everything the main layer returned after index 0.
            return (logits,) + discriminator_hidden_states[1:]

        return TFFunnelForPreTrainingOutput(
            logits=logits,
            hidden_states=discriminator_hidden_states.hidden_states,
            attentions=discriminator_hidden_states.attentions,
        )

    def serving_output(self, output):
        """Repackage `output` as a `TFFunnelForPreTrainingOutput` for serving."""
        # hidden_states and attentions are passed through untouched instead of going through
        # tf.convert_to_tensor, since their entries do not share the same dimensions.
        return TFFunnelForPreTrainingOutput(
            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
        )

    def build(self, input_shape=None):
        """Build both sub-layers once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "funnel", None) is not None:
            with tf.name_scope(self.funnel.name):
                self.funnel.build(None)
        if getattr(self, "discriminator_predictions", None) is not None:
            with tf.name_scope(self.discriminator_predictions.name):
                self.discriminator_predictions.build(None)
@add_start_docstrings("""Funnel Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)

        self.funnel = TFFunnelMainLayer(config, name="funnel")
        # The head is constructed with the main layer's embeddings object.
        self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head")

    def get_lm_head(self) -> TFFunnelMaskedLMHead:
        """Return the masked-LM head layer."""
        return self.lm_head

    def get_prefix_bias_name(self) -> str:
        """Return `<model name>/<lm_head name>`; deprecated, emits a `FutureWarning` on every call."""
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return self.name + "/" + self.lm_head.name

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFMaskedLMOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Index 0 of the main-layer output is what the LM head consumes.
        prediction_scores = self.lm_head(outputs[0], training=training)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, prediction_scores)

        if return_dict:
            return TFMaskedLMOutput(
                loss=loss,
                logits=prediction_scores,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: scores, then the remaining main-layer outputs, with the loss in front when present.
        tuple_output = (prediction_scores,) + outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
        """Return a new `TFMaskedLMOutput` carrying the logits, hidden states and attentions of `output`."""
        # hidden_states and attentions are passed through untouched instead of being converted to
        # tensors, since their entries do not share the same dimensions.
        serving_fields = {
            "logits": output.logits,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFMaskedLMOutput(**serving_fields)

    def build(self, input_shape=None):
        """Build the main layer and the LM head once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True

        # Same order as before: main layer first, then the head.
        for attribute in ("funnel", "lm_head"):
            sublayer = getattr(self, attribute, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@add_start_docstrings(
    """
    Funnel Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Base layer plus a classification head sized by `config.num_labels`.
        self.funnel = TFFunnelBaseLayer(config, name="funnel")
        self.classifier = TFFunnelClassificationHead(config, config.num_labels, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFSequenceClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The pooled representation is the slice at index 0 along the second axis of the first output.
        pooled_output = outputs[0][:, 0]
        logits = self.classifier(pooled_output, training=training)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits, then the remaining base-layer outputs, with the loss in front when present.
        tuple_output = (logits,) + outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
        """Return a new `TFSequenceClassifierOutput` carrying the logits, hidden states and attentions of `output`."""
        # hidden_states and attentions are passed through untouched instead of going through
        # tf.convert_to_tensor, since their entries do not share the same dimensions.
        serving_fields = {
            "logits": output.logits,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFSequenceClassifierOutput(**serving_fields)

    def build(self, input_shape=None):
        """Build the base layer and the classifier once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True

        # Same order as before: base layer first, then the classification head.
        for attribute in ("funnel", "classifier"):
            sublayer = getattr(self, attribute, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@add_start_docstrings(
    """
    Funnel Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss):
    """
    Funnel base layer with a single-output classification head, scored per choice.

    Inputs carry an extra `num_choices` axis; `call` flattens it away before running the base layer and restores it
    on the logits.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        """
        Create the base layer and the classification head.

        Args:
            config (FunnelConfig): Configuration used to construct both sub-layers.
            *inputs: Positional arguments forwarded to the parent constructor.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)

        self.funnel = TFFunnelBaseLayer(config, name="funnel")
        # One output unit: the head produces one score per (example, choice) row.
        self.classifier = TFFunnelClassificationHead(config, 1, name="classifier")

    @property
    def dummy_inputs(self):
        """
        Placeholder inputs for this model.

        Returns:
            dict: `{"input_ids": ...}` holding an int32 tensor of ones with shape `(3, 3, 4)`.
        """
        return {"input_ids": tf.ones((3, 3, 4), dtype=tf.int32)}

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFMultipleChoiceModelOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # Axis 1 is the number of choices and axis 2 the sequence length, read from whichever of
        # `input_ids` / `inputs_embeds` was provided.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Merge the leading two axes so each choice becomes its own row for the base layer.
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        outputs = self.funnel(
            flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The pooled representation is the slice at index 0 along the second axis of the first output.
        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]
        logits = self.classifier(pooled_output, training=training)
        # Regroup the per-row scores so that each row holds the scores of one example's choices.
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        # The loss is only computed when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

        if not return_dict:
            # Tuple form: logits, then the remaining base-layer outputs, with the loss in front when present.
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
        """Return a new `TFMultipleChoiceModelOutput` carrying the logits, hidden states and attentions of `output`."""
        return TFMultipleChoiceModelOutput(
            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
        )

    def build(self, input_shape=None):
        """Build the base layer and the classifier once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "funnel", None) is not None:
            with tf.name_scope(self.funnel.name):
                self.funnel.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)
@add_start_docstrings(
    """
    Funnel Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.funnel = TFFunnelMainLayer(config, name="funnel")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # Dense projection with `config.num_labels` output units.
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Kept so that `build` can read `hidden_size` later.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFTokenClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Index 0 of the main-layer output goes through dropout and then the dense classifier.
        dropped_sequence_output = self.dropout(outputs[0], training=training)
        logits = self.classifier(dropped_sequence_output)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFTokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits, then the remaining main-layer outputs, with the loss in front when present.
        tuple_output = (logits,) + outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
        """Return a new `TFTokenClassifierOutput` carrying the logits, hidden states and attentions of `output`."""
        # hidden_states and attentions are passed through untouched instead of going through
        # tf.convert_to_tensor, since their entries do not share the same dimensions.
        serving_fields = {
            "logits": output.logits,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFTokenClassifierOutput(**serving_fields)

    def build(self, input_shape=None):
        """Build the main layer and the dense classifier once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True

        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is not None:
            with tf.name_scope(funnel_layer.name):
                funnel_layer.build(None)

        classifier_layer = getattr(self, "classifier", None)
        if classifier_layer is not None:
            with tf.name_scope(classifier_layer.name):
                # The dense layer is built for inputs whose last dimension is `config.hidden_size`.
                classifier_layer.build([None, None, self.config.hidden_size])
# The decorator prepends the head description and the shared start docstring to the class docstring.
@add_start_docstrings(
    """
    Funnel Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.funnel = TFFunnelMainLayer(config, name="funnel")
        # Dense projection with `config.num_labels` output units; `call` splits its output in two.
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # Kept so that `build` can read `hidden_size` later.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFQuestionAnsweringModelOutput]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Project index 0 of the main-layer output, split the last axis into two equal parts and
        # drop that axis from each part: first part is the start logits, second the end logits.
        span_logits = self.qa_outputs(outputs[0])
        start_logits, end_logits = (tf.squeeze(part, axis=-1) for part in tf.split(span_logits, 2, axis=-1))

        # A loss is only computed when both position tensors were supplied.
        if start_positions is None or end_positions is None:
            loss = None
        else:
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        if return_dict:
            return TFQuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: both logits, then the remaining main-layer outputs, with the loss in front when present.
        tuple_output = (start_logits, end_logits) + outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
        """Return a new `TFQuestionAnsweringModelOutput` carrying both logits, hidden states and attentions."""
        # hidden_states and attentions are passed through untouched instead of being converted to
        # tensors, since their entries do not share the same dimensions.
        serving_fields = {
            "start_logits": output.start_logits,
            "end_logits": output.end_logits,
            "hidden_states": output.hidden_states,
            "attentions": output.attentions,
        }
        return TFQuestionAnsweringModelOutput(**serving_fields)

    def build(self, input_shape=None):
        """Build the main layer and the dense QA projection once, each under a name scope taken from its own name."""
        if self.built:
            return
        self.built = True

        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is not None:
            with tf.name_scope(funnel_layer.name):
                funnel_layer.build(None)

        qa_layer = getattr(self, "qa_outputs", None)
        if qa_layer is not None:
            with tf.name_scope(qa_layer.name):
                # The dense layer is built for inputs whose last dimension is `config.hidden_size`.
                qa_layer.build([None, None, self.config.hidden_size])
.\models\funnel\tokenization_funnel.py
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
# Name of the vocabulary file, keyed by the tokenizer argument it is passed as.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Checkpoint size suffixes; every table below has one entry per suffix, in this order.
_model_names = [
    "small",
    "small-base",
    "medium",
    "medium-base",
    "intermediate",
    "intermediate-base",
    "large",
    "large-base",
    "xlarge",
    "xlarge-base",
]

# Checkpoint id -> URL of its vocab.txt. Every URL follows the same pattern, so the table is
# derived from `_model_names` instead of being spelled out entry by entry.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        f"funnel-transformer/{name}": f"https://huggingface.co/funnel-transformer/{name}/resolve/main/vocab.txt"
        for name in _model_names
    }
}

# Checkpoint id -> positional embedding size (512 for all of them).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    f"funnel-transformer/{name}": 512 for name in _model_names
}

# Checkpoint id -> default init kwargs; each checkpoint gets its own dict object.
PRETRAINED_INIT_CONFIGURATION = {
    f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names
}
def load_vocab(vocab_file):
    """
    Read a one-token-per-line vocabulary file into an ordered token -> line-index mapping.

    Only the trailing newline of each line is removed; any other whitespace stays part of the token. When a token
    occurs more than once, the index of its last occurrence is stored.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        return collections.OrderedDict((line.rstrip("\n"), index) for index, line in enumerate(reader))
def whitespace_tokenize(text):
    """Strip surrounding whitespace from `text` and split it on whitespace; blank input yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class FunnelTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Funnel Transformer tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"<sep>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"<cls>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sentence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sentence token.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    # Module-level lookup tables exposed on the class (presumably read by the base class when loading
    # pretrained checkpoints -- confirm against `PreTrainedTokenizer`).
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Token type id given to the leading classifier position by `create_token_type_ids_from_sequences`.
    cls_token_type_id: int = 2
def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    unk_token="<unk>",
    sep_token="<sep>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
    bos_token="<s>",
    eos_token="</s>",
    tokenize_chinese_chars=True,
    strip_accents=None,
    **kwargs,
):
    """
    Load the vocabulary and set up the basic and WordPiece tokenizers before initialising the base class.

    Raises:
        `ValueError`: If `vocab_file` is not an existing file.
    """
    vocab_found = os.path.isfile(vocab_file)
    if not vocab_found:
        raise ValueError(
            f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
            " model use `tokenizer = FunnelTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )

    # Forward (token -> id) table and its inverse (id -> token).
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict((index, token) for token, index in self.vocab.items())

    # The basic tokenizer is only created when requested; `self.basic_tokenizer` does not exist otherwise.
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

    # The base class is initialised last, after the vocabulary attributes above are in place.
    super().__init__(
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        never_split=never_split,
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        bos_token=bos_token,
        eos_token=eos_token,
        tokenize_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        **kwargs,
    )
@property
def do_lower_case(self):
    """Whether the basic tokenizer lowercases its input (read from `self.basic_tokenizer`)."""
    basic = self.basic_tokenizer
    return basic.do_lower_case
@property
def vocab_size(self):
    """Number of entries in `self.vocab`."""
    vocabulary = self.vocab
    return len(vocabulary)
def get_vocab(self):
    """Return a new dict combining `self.vocab` with `self.added_tokens_encoder` (added tokens win on clashes)."""
    extra_tokens = self.added_tokens_encoder
    return dict(self.vocab, **extra_tokens)
def _tokenize(self, text, split_special_tokens=False):
    """
    Split `text` into WordPiece tokens.

    When basic tokenization is disabled the text goes straight to the WordPiece tokenizer. Otherwise it is first
    split by the basic tokenizer; words listed in the basic tokenizer's `never_split` are kept whole and every
    other word is broken into word pieces.

    Args:
        text (`str`):
            Text to tokenize.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            When `False`, `self.all_special_tokens` is passed to the basic tokenizer as `never_split`; when `True`,
            `None` is passed instead.

    Returns:
        `List[str]`: The resulting tokens.
    """
    if not self.do_basic_tokenize:
        return self.wordpiece_tokenizer.tokenize(text)

    protected = None if split_special_tokens else self.all_special_tokens
    pieces = []
    for word in self.basic_tokenizer.tokenize(text, never_split=protected):
        if word in self.basic_tokenizer.never_split:
            pieces.append(word)
        else:
            pieces += self.wordpiece_tokenizer.tokenize(word)
    return pieces
def _convert_token_to_id(self, token):
    """Look up the id of `token` in the vocab, falling back to the id of the unknown token."""
    unknown_id = self.vocab.get(self.unk_token)
    return self.vocab.get(token, unknown_id)
def _convert_id_to_token(self, index):
    """Look up the token stored for `index`, falling back to the unknown token."""
    fallback = self.unk_token
    return self.ids_to_tokens.get(index, fallback)
def convert_tokens_to_string(self, tokens):
    """Join `tokens` with spaces, then glue `##` continuation pieces back onto the preceding token."""
    spaced = " ".join(tokens)
    merged = spaced.replace(" ##", "")
    return merged.strip()
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequences by adding the classifier and separator ids:

    - single sequence: `[CLS] X [SEP]`
    - pair of sequences: `[CLS] A [SEP] B [SEP]`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    if token_ids_1 is None:
        return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

    leading = [self.cls_token_id]
    separator = [self.sep_token_id]
    first_part = leading + token_ids_0 + separator
    return first_part + token_ids_1 + separator
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    The mask mirrors the layout produced by `build_inputs_with_special_tokens`: one special position in front,
    one after each sequence.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        # The ids already contain special tokens, so the base class works out which positions they occupy.
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    if token_ids_1 is not None:
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
    return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
    Transformer sequence pair mask has the following format:

    ```
    2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    The leading position carries `self.cls_token_type_id`. If `token_ids_1` is `None`, only that leading id and
    the 0s of the first sequence (including its separator) are returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    separator = [self.sep_token_id]
    classifier = [self.cls_token_id]
    type_ids = len(classifier) * [self.cls_token_type_id] + len(token_ids_0 + separator) * [0]
    if token_ids_1 is not None:
        type_ids = type_ids + len(token_ids_1 + separator) * [1]
    return type_ids
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary to disk, one token per line, ordered by token index.

    If `save_directory` is an existing directory, the file is created inside it under the standard vocabulary file
    name; otherwise `save_directory` itself is used as the file path. In both cases `filename_prefix` followed by
    `-` is prepended when a prefix is given. A warning is logged whenever the indices are not consecutive.

    Args:
        save_directory (`str`):
            Target directory, or target file path when it is not an existing directory.
        filename_prefix (`str`, *optional*):
            Prefix for the written file name.

    Returns:
        `Tuple[str]`: One-element tuple holding the path of the written file.
    """
    prefix = filename_prefix + "-" if filename_prefix else ""
    if os.path.isdir(save_directory):
        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
    else:
        vocab_file = prefix + save_directory

    expected_index = 0
    with open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(), key=lambda entry: entry[1]):
            if token_index != expected_index:
                logger.warning(
                    f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!"
                )
            writer.write(token + "\n")
            # Resynchronise on the index just written so that one gap yields one warning.
            expected_index = token_index + 1
    return (vocab_file,)
def tokenize(self, text, never_split=None):
    """
    Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

    The text is cleaned, optionally padded around CJK characters, NFC-normalised and split on whitespace. Every
    word outside the never-split set is then optionally lowercased and/or stripped of accents, and each word is
    passed to `_run_split_on_punc`.

    Args:
        text (`str`):
            Text to tokenize.
        never_split (`List[str]`, *optional*)
            Kept for backward compatibility purposes. Now implemented directly at the base class level (see
            [`PreTrainedTokenizer.tokenize`]) List of token not to split.

    Returns:
        `List[str]`: The resulting tokens.
    """
    # Union of the instance-level protected tokens and the ones supplied for this call.
    protected = self.never_split.union(set(never_split)) if never_split else self.never_split

    cleaned = self._clean_text(text)
    if self.tokenize_chinese_chars:
        cleaned = self._tokenize_chinese_chars(cleaned)
    words = whitespace_tokenize(unicodedata.normalize("NFC", cleaned))

    pieces = []
    for word in words:
        if word not in protected:
            if self.do_lower_case:
                word = word.lower()
                # When lowercasing, accents are stripped unless `strip_accents` is explicitly `False`.
                strip = self.strip_accents is not False
            else:
                strip = bool(self.strip_accents)
            if strip:
                word = self._run_strip_accents(word)
        pieces.extend(self._run_split_on_punc(word, protected))

    return whitespace_tokenize(" ".join(pieces))
def _run_strip_accents(self, text):
    """Strip accents from `text`: decompose it (NFD) and drop every character of Unicode category `Mn`."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(symbol for symbol in decomposed if unicodedata.category(symbol) != "Mn")
def _run_split_on_punc(self, text, never_split=None):
    """
    Split `text` so that every punctuation character becomes its own token.

    The text is returned unchanged (as a one-element list) when `self.do_split_on_punc` is falsy or when the text
    is listed in `never_split`.
    """
    if not self.do_split_on_punc or (never_split is not None and text in never_split):
        return [text]

    groups = []
    open_new_group = True
    for symbol in text:
        if _is_punctuation(symbol):
            # Punctuation stands alone and forces the following character into a fresh group.
            groups.append([symbol])
            open_new_group = True
            continue
        if open_new_group:
            groups.append([])
        open_new_group = False
        groups[-1].append(symbol)
    return ["".join(group) for group in groups]
def _tokenize_chinese_chars(self, text):
    """Surround every character accepted by `_is_chinese_char` with a space on each side."""
    return "".join(f" {symbol} " if self._is_chinese_char(ord(symbol)) else symbol for symbol in text)
def _is_chinese_char(self, cp):
    """Check whether the codepoint `cp` falls inside one of the CJK ranges listed below (bounds inclusive)."""
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    return any(low <= cp <= high for low, high in cjk_ranges)
def _clean_text(self, text):
    """
    Perform invalid character removal and whitespace cleanup on `text`.

    NUL (U+0000), the replacement character (U+FFFD) and anything flagged by `_is_control` are dropped; anything
    flagged by `_is_whitespace` becomes a single space; every other character is kept.
    """
    kept = []
    for symbol in text:
        if ord(symbol) in (0, 0xFFFD) or _is_control(symbol):
            continue
        kept.append(" " if _is_whitespace(symbol) else symbol)
    return "".join(kept)
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Words longer than `max_input_chars_per_word`, and words that cannot be fully covered by vocabulary entries,
        are replaced by the unknown token.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            if len(word) > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue
            word_pieces = self._greedy_split(word)
            if word_pieces is None:
                pieces.append(self.unk_token)
            else:
                pieces.extend(word_pieces)
        return pieces

    def _greedy_split(self, word):
        """Split one word into vocabulary pieces, longest match first; return `None` if some part has no match."""
        found = []
        length = len(word)
        start = 0
        while start < length:
            match_end = None
            # Try the longest remaining substring first and shrink it from the right.
            for end in range(length, start, -1):
                candidate = word[start:end]
                if start > 0:
                    # Pieces that do not start the word carry the continuation prefix.
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    found.append(candidate)
                    match_end = end
                    break
            if match_end is None:
                return None
            start = match_end
        return found
.\models\funnel\tokenization_funnel_fast.py
""" Tokenization class for Funnel Transformer."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_funnel import FunnelTokenizer
logger = logging.get_logger(__name__)
# File names used for the vocabulary and the serialized fast tokenizer inside a saved tokenizer directory.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Checkpoint suffixes published under the `funnel-transformer/` namespace.
_model_names = [
    "small",
    "small-base",
    "medium",
    "medium-base",
    "intermediate",
    "intermediate-base",
    "large",
    "large-base",
    "xlarge",
    "xlarge-base",
]

# Both files live at the same relative location for every checkpoint, so the URL tables are derived from the
# checkpoint names. The outer dictionary is now closed properly.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        f"funnel-transformer/{name}": f"https://huggingface.co/funnel-transformer/{name}/resolve/main/vocab.txt"
        for name in _model_names
    },
    "tokenizer_file": {
        f"funnel-transformer/{name}": f"https://huggingface.co/funnel-transformer/{name}/resolve/main/tokenizer.json"
        for name in _model_names
    },
}

# Per-checkpoint value exposed by the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {f"funnel-transformer/{name}": 512 for name in _model_names}

# Per-checkpoint default constructor arguments.
PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names}
class FunnelTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    slow_tokenizer_class = FunnelTokenizer
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Token type id given to the leading classifier position by `create_token_type_ids_from_sequences`.
    cls_token_type_id: int = 2

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        clean_text=True,
        tokenize_chinese_chars=True,
        strip_accents=None,
        wordpieces_prefix="##",
        **kwargs,
    ):
        """Initialise the base class, then make the backend normalizer agree with the requested options."""
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            bos_token=bos_token,
            eos_token=eos_token,
            clean_text=clean_text,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            wordpieces_prefix=wordpieces_prefix,
            **kwargs,
        )

        # The loaded backend normalizer may carry settings that differ from the constructor arguments; if any of
        # the three options disagrees, rebuild the normalizer with the requested values.
        state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        requested = {
            "lowercase": do_lower_case,
            "strip_accents": strip_accents,
            "handle_chinese_chars": tokenize_chinese_chars,
        }
        out_of_sync = any(state.get(option, wanted) != wanted for option, wanted in requested.items())
        if out_of_sync:
            normalizer_cls = getattr(normalizers, state.pop("type"))
            state.update(requested)
            self.backend_tokenizer.normalizer = normalizer_cls(**state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Funnel sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        single = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        if token_ids_1 is None:
            return single
        single += token_ids_1 + [self.sep_token_id]
        return single

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
        Transformer sequence pair mask has the following format:

        ```
        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        The leading position carries `self.cls_token_type_id`. If `token_ids_1` is `None`, only that leading id and
        the 0s of the first sequence (including its separator) are returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        separator = [self.sep_token_id]
        classifier = [self.cls_token_id]
        type_ids = len(classifier) * [self.cls_token_type_id] + len(token_ids_0 + separator) * [0]
        if token_ids_1 is not None:
            type_ids = type_ids + len(token_ids_1 + separator) * [1]
        return type_ids

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Delegate saving to the backend tokenizer model and return the written file paths as a tuple."""
        return tuple(self._tokenizer.model.save(save_directory, name=filename_prefix))
.\models\funnel\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Maps each submodule to the public names it provides. Entries for optional backends are only added when the
# backend is available. The try/raise/except/else shape is kept for every optional section so all sections read alike.
_import_structure = {
    "configuration_funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig"],
    "convert_funnel_original_tf_checkpoint_to_pytorch": [],
    "tokenization_funnel": ["FunnelTokenizer"],
}

try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_funnel_fast"] = ["FunnelTokenizerFast"]

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_funnel"] = [
        "FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FunnelBaseModel",
        "FunnelForMaskedLM",
        "FunnelForMultipleChoice",
        "FunnelForPreTraining",
        "FunnelForQuestionAnswering",
        "FunnelForSequenceClassification",
        "FunnelForTokenClassification",
        "FunnelModel",
        "FunnelPreTrainedModel",
        "load_tf_weights_in_funnel",
    ]

try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_funnel"] = [
        "TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFFunnelBaseModel",
        "TFFunnelForMaskedLM",
        "TFFunnelForMultipleChoice",
        "TFFunnelForPreTraining",
        "TFFunnelForQuestionAnswering",
        "TFFunnelForSequenceClassification",
        "TFFunnelForTokenClassification",
        "TFFunnelModel",
        "TFFunnelPreTrainedModel",
    ]


if TYPE_CHECKING:
    # Static type checkers get real imports, mirroring `_import_structure` above.
    from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
    from .tokenization_funnel import FunnelTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_funnel_fast import FunnelTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # The `else:` keeps the torch imports conditional on torch being available, like the other sections.
        from .modeling_funnel import (
            FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
            FunnelBaseModel,
            FunnelForMaskedLM,
            FunnelForMultipleChoice,
            FunnelForPreTraining,
            FunnelForQuestionAnswering,
            FunnelForSequenceClassification,
            FunnelForTokenClassification,
            FunnelModel,
            FunnelPreTrainedModel,
            load_tf_weights_in_funnel,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_funnel import (
            TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFFunnelBaseModel,
            TFFunnelForMaskedLM,
            TFFunnelForMultipleChoice,
            TFFunnelForPreTraining,
            TFFunnelForQuestionAnswering,
            TFFunnelForSequenceClassification,
            TFFunnelForTokenClassification,
            TFFunnelModel,
            TFFunnelPreTrainedModel,
        )

else:
    import sys

    # At runtime the module object is replaced by a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\fuyu\configuration_fuyu.py
""" Fuyu model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
# Module-level logger obtained from the library's logging utilities.
logger = logging.get_logger(__name__)
# Maps each listed Fuyu checkpoint name to the URL of its hosted `config.json`.
FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"adept/fuyu-8b": "https://huggingface.co/adept/fuyu-8b/resolve/main/config.json",
}
class FuyuConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an
    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    All constructor arguments except `text_config`, `pad_token_id`, `bos_token_id`, `eos_token_id`,
    `tie_word_embeddings` and `**kwargs` are stored as attributes of the same name. The four named token/embedding
    arguments and `**kwargs` are forwarded to [`PretrainedConfig`].

    Args:
        text_config (`dict`, *optional*):
            Keyword arguments for the text-model configuration. Its class is looked up in `CONFIG_MAPPING` under
            `text_config["model_type"]`, defaulting to `"persimmon"`. When `None`, the dictionary is assembled from
            the language-model arguments of this constructor (the image arguments `image_size`, `patch_size` and
            `num_channels` are not part of it).
        rope_scaling (`dict`, *optional*):
            When given, must be a dictionary with exactly two fields: `type` (`"linear"` or `"dynamic"`) and
            `factor` (a float greater than 1). Validated by `_rope_scaling_validation`.

    ```python
    >>> from transformers import FuyuConfig

    >>> # Initializing a Fuyu fuyu-8b style configuration
    >>> configuration = FuyuConfig()
    ```
    """

    model_type = "fuyu"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=262144,
        hidden_size=4096,
        intermediate_size=16384,
        num_hidden_layers=36,
        num_attention_heads=64,
        hidden_act="relu2",
        max_position_embeddings=16384,
        image_size=300,
        patch_size=30,
        num_channels=3,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=25000.0,
        rope_scaling=None,
        qk_layernorm=True,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        partial_rotary_factor=0.5,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        text_config=None,
        **kwargs,
    ):
        if text_config is None:
            # No explicit text configuration: derive one from the language-model arguments.
            text_config = {
                "vocab_size": vocab_size,
                "max_position_embeddings": max_position_embeddings,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_hidden_layers": num_hidden_layers,
                "num_attention_heads": num_attention_heads,
                "hidden_act": hidden_act,
                "initializer_range": initializer_range,
                "layer_norm_eps": layer_norm_eps,
                "use_cache": use_cache,
                "rope_theta": rope_theta,
                "rope_scaling": rope_scaling,
                "qk_layernorm": qk_layernorm,
                "hidden_dropout": hidden_dropout,
                "attention_dropout": attention_dropout,
                "partial_rotary_factor": partial_rotary_factor,
                "pad_token_id": pad_token_id,
                "bos_token_id": bos_token_id,
                "eos_token_id": eos_token_id,
                "tie_word_embeddings": tie_word_embeddings,
            }
            logger.info("text_config is None. initializing the text model with default values.")

        # The text configuration class is chosen by `model_type`, falling back to "persimmon".
        text_model_type = text_config["model_type"] if "model_type" in text_config else "persimmon"
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.qk_layernorm = qk_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.partial_rotary_factor = partial_rotary_factor
        # Reject malformed `rope_scaling` before handing over to the base class.
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            `ValueError`: If `rope_scaling` is set but is not a two-field dictionary, if its `type` is not
            `"linear"` or `"dynamic"`, or if its `factor` is not a float greater than 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        # NOTE: integer factors are rejected as well, because the check requires a `float` instance.
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\fuyu\convert_fuyu_model_weights_to_hf.py
import argparse
import os
import sys
import warnings
import flatdict
import torch
from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer
# Prefer the fast Llama tokenizer; fall back to the slow one when it cannot be imported.
try:
    from transformers import LlamaTokenizerFast

    tokenizer_class = LlamaTokenizerFast
except ImportError as e:
    # `warnings.warn` expects a string or a `Warning` instance, so the exception is converted to text
    # rather than passed as an `ImportError` object.
    warnings.warn(str(e))
    warnings.warn(
        "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
    )
    tokenizer_class = LlamaTokenizer
# Substring replacements applied to every checkpoint key, in this order.
KEYS_TO_MODIFY_MAPPING = {
    "self_attention": "self_attn",
    "language_model.encoder": "language_model.model",
    "word_embeddings_for_head": "language_model.lm_head",
    "language_model.embedding.word_embeddings": "language_model.model.embed_tokens",
    "vit_encoder.linear_encoder": "vision_embed_tokens",
}

# Keys dropped from the converted state dict (compared against the full renamed key).
KEYS_TO_REMOVE = {
    "rotary_emb.inv_freq",
    "image_patch_projection",
    "image_patch_projection.weight",
    "image_patch_projection.bias",
}


def rename_state_dict(state_dict):
    """
    Return a new dict whose keys are rewritten according to `KEYS_TO_MODIFY_MAPPING`.

    Each key goes through all substring replacements in mapping order; entries whose renamed key is exactly one of
    `KEYS_TO_REMOVE` are skipped. The input mapping is not modified.

    NOTE(review): removal is an exact-match test, so a longer key that merely ends with e.g. `rotary_emb.inv_freq`
    is kept -- confirm this is intended.
    """
    renamed = {}
    for original_key, tensor in state_dict.items():
        target_key = original_key
        for pattern, replacement in KEYS_TO_MODIFY_MAPPING.items():
            if pattern in target_key:
                target_key = target_key.replace(pattern, replacement)
        if target_key not in KEYS_TO_REMOVE:
            renamed[target_key] = tensor
    return renamed
def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False):
    """
    Convert an original Fuyu checkpoint and save the model and its configuration.

    Args:
        pytorch_dump_folder_path: Destination passed to `save_pretrained` for both the model and the config.
        ada_lib_path: Path inserted at the front of `sys.path` before the checkpoint is loaded.
        pt_model_path: Path of the checkpoint file given to `torch.load`.
        safe_serialization: Forwarded to the model's `save_pretrained`.
    """
    sys.path.insert(0, ada_lib_path)

    # NOTE(security): `torch.load` deserializes the checkpoint file; only run this on checkpoints you trust.
    checkpoint = torch.load(pt_model_path, map_location="cpu")
    # Flatten the nested "model" entry into dot-separated keys, then rename them.
    flat_weights = flatdict.FlatDict(checkpoint["model"], ".")
    converted_weights = rename_state_dict(flat_weights)

    config = FuyuConfig()
    hf_model = FuyuForCausalLM(config).to(torch.bfloat16)
    hf_model.load_state_dict(converted_weights)

    hf_model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
    config.save_pretrained(pytorch_dump_folder_path)
def _parse_bool_flag(value):
    """Interpret a command-line string as a boolean; raise `argparse.ArgumentTypeError` for anything else."""
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y"):
        return True
    if normalized in ("0", "false", "f", "no", "n", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main():
    """
    Command-line entry point: convert the Fuyu checkpoint, then build and save the tokenizer.

    The tokenizer is built from `adept_vocab.model` inside `--input_dir` and saved to `--output_dir`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        required=True,
        help="Location of Fuyu weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--pt_model_path",
        required=True,
        help="Location of Fuyu `model_optim_rng.pt`",
    )
    parser.add_argument(
        "--output_dir",
        required=True,
        help="Location to write HF model and tokenizer",
    )
    parser.add_argument(
        "--ada_lib_path",
        help="Location of original source code from adept to deserialize .pt checkpoint",
    )
    # `type=bool` would turn any non-empty string -- including "False" -- into True. The value is parsed
    # explicitly instead; a bare `--safe_serialization` enables the option.
    parser.add_argument(
        "--safe_serialization",
        type=_parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Whether or not to save using `safetensors`.",
    )
    args = parser.parse_args()
    spm_path = os.path.join(args.input_dir, "adept_vocab.model")

    convert_fuyu_checkpoint(
        pytorch_dump_folder_path=args.output_dir,
        pt_model_path=args.pt_model_path,
        safe_serialization=args.safe_serialization,
        ada_lib_path=args.ada_lib_path,
    )
    tokenizer = tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|")
    tokenizer.save_pretrained(args.output_dir)


if __name__ == "__main__":
    main()
.\models\fuyu\image_processing_fuyu.py
"""Image processor class for Fuyu."""
import math
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import (
pad,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_list_of_images,
to_numpy_array,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_torch_available,
is_torch_device,
is_torch_dtype,
logging,
requires_backends,
)
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def make_list_of_list_of_images(
    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput],
) -> List[List[ImageInput]]:
    """
    Normalize `images` into a list of lists of images.

    A single valid image becomes `[[image]]`; a list whose items are all lists is returned unchanged; any other
    list has each of its items passed through `make_list_of_images`.

    Raises:
        ValueError: If `images` is neither a valid image nor a list.
    """
    if is_valid_image(images):
        return [[images]]

    if not isinstance(images, list):
        raise ValueError("images must be a list of list of images or a list of images or an image.")

    already_nested = all(isinstance(item, list) for item in images)
    if already_nested:
        return images
    return [make_list_of_images(item) for item in images]
class FuyuBatchFeature(BatchFeature):
    """
    BatchFeature class for Fuyu image processor and processor.

    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
    """

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """
        Convert the inner content to tensors.

        Values may be a single element, a list of elements, or a list of lists of elements; each element is
        converted individually and the nesting is preserved.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.

        Returns:
            The same instance after modification.

        Raises:
            ValueError: If an element cannot be converted to a tensor.
        """
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type=tensor_type)

        def _convert_tensor(elem):
            if is_tensor(elem):
                return elem
            return as_tensor(elem)

        def _safe_convert_tensor(elem, key):
            try:
                return _convert_tensor(elem)
            except Exception as err:
                # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate,
                # and `from err` keeps the underlying conversion error visible in the traceback.
                if key == "overflowing_values":
                    raise ValueError(
                        "Unable to create tensor returning overflowing values of different lengths. "
                    ) from err
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                ) from err

        # Only existing keys are reassigned, so the mapping does not change size while it is iterated.
        for key, value in self.items():
            # `value and ...` guards the `value[0]` lookup against an empty list.
            if isinstance(value, list) and value and isinstance(value[0], list):
                self[key] = [[_safe_convert_tensor(elem, key) for elem in elems] for elems in value]
            elif isinstance(value, list):
                self[key] = [_safe_convert_tensor(elem, key) for elem in value]
            else:
                self[key] = _safe_convert_tensor(value, key)
        return self

    def to(self, *args, **kwargs) -> "BatchFeature":
        """
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Floating-point elements receive the full `to(*args, **kwargs)` call; other elements are only moved to the
        requested device (if any), so they keep their dtype.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.

        Raises:
            ValueError: If the first positional argument is neither a torch dtype nor a device specifier.
        """
        requires_backends(self, ["torch"])
        import torch

        new_data = {}
        device = kwargs.get("device")
        # The device may also be given positionally; a positional dtype carries no device information.
        if device is None and len(args) > 0:
            arg = args[0]
            if is_torch_dtype(arg):
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")

        def _to(elem):
            if torch.is_floating_point(elem):
                return elem.to(*args, **kwargs)
            if device is not None:
                return elem.to(device=device)
            return elem

        for k, v in self.items():
            # `v and ...` guards the `v[0]` lookup against an empty list.
            if isinstance(v, list) and v and isinstance(v[0], list):
                new_data[k] = [[_to(elem) for elem in elems] for elems in v]
            elif isinstance(v, list):
                new_data[k] = [_to(elem) for elem in v]
            else:
                new_data[k] = _to(v)
        self.data = new_data
        return self
class FuyuImageProcessor(BaseImageProcessor):
"""
This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
handle:
- Processing Images:
Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
dimensions. The image output is always img_h, img_w of (1080, 1920)
Then, it patches up these images using the patchify_image function.
- Creating Image Input IDs:
For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
variable-sized images, each line of patches is terminated with a newline ID.
- Image Patch Indices:
For each image patch, the code maintains an index where these patches should be inserted in a token stream.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image to `size`.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image to `size`.
padding_value (`float`, *optional*, defaults to 1.0):
The value to pad the image with.
padding_mode (`str`, *optional*, defaults to `"constant"`):
The padding mode to use when padding the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to 0.5):
The mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to 0.5):
The standard deviation to use when normalizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `1 / 255`):
The factor to use when rescaling the image.
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""
# Keys this processor declares as model inputs.
# NOTE(review): presumably consumed by the base-class machinery; confirm where `model_input_names` is read.
model_input_names = [
"images",
"image_input_ids",
"image_patches",
"image_patch_indices_per_batch",
"image_patch_indices_per_subsequence",
]
def __init__(
    self,
    do_resize: bool = True,
    size: Optional[Dict[str, int]] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    do_pad: bool = True,
    padding_value: float = 1.0,
    padding_mode: str = "constant",
    do_normalize: bool = True,
    image_mean: Union[float, List[float]] = 0.5,
    image_std: Union[float, List[float]] = 0.5,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    patch_size: Optional[Dict[str, int]] = None,
    **kwargs,
):
    """
    Store the preprocessing defaults on the instance.

    `size` falls back to `{"height": 1080, "width": 1920}` and `patch_size` to `{"height": 30, "width": 30}`
    when not given; every other argument is stored as passed. Extra `kwargs` go to the parent constructor.
    """
    super().__init__(**kwargs)

    # Resolve the dictionary defaults here so no mutable default is shared between instances.
    if size is None:
        size = {"height": 1080, "width": 1920}
    if patch_size is None:
        patch_size = {"height": 30, "width": 30}

    # Resizing and padding.
    self.do_resize = do_resize
    self.size = size
    self.resample = resample
    self.do_pad = do_pad
    self.padding_value = padding_value
    self.padding_mode = padding_mode

    # Normalization and rescaling.
    self.do_normalize = do_normalize
    self.image_mean = image_mean
    self.image_std = image_std
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor

    # Patch extraction.
    self.patch_size = patch_size

    self._valid_processor_keys = [
        "images",
        "do_resize",
        "size",
        "resample",
        "do_pad",
        "padding_value",
        "padding_mode",
        "do_normalize",
        "image_mean",
        "image_std",
        "do_rescale",
        "rescale_factor",
        "patch_size",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]
def resize_image(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Shrink an image so that it fits inside `(size["height"], size["width"])`, keeping its aspect ratio.

    An image that already fits within the target in both dimensions is returned unchanged (it is never
    enlarged). Otherwise both sides are multiplied by the smaller of the two target/source ratios and
    truncated to integers.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Dictionary in the format `{"height": int, "width": int}` giving the maximum output size.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            `PILImageResampling` filter to use when resizing the image.
        data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format of the output image, forwarded to `resize`.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format of the input image, forwarded to `get_image_size` and `resize`.

    Returns:
        `np.ndarray`: The resized image, or the input itself when no resizing is needed.
    """
    source_height, source_width = get_image_size(image, input_data_format)
    max_height = size["height"]
    max_width = size["width"]

    already_fits = source_height <= max_height and source_width <= max_width
    if already_fits:
        return image

    # The tighter of the two constraints decides the scale so that both sides fit.
    scale = min(max_height / source_height, max_width / source_width)
    resized_shape = (int(source_height * scale), int(source_width * scale))

    return resize(
        image=image,
        size=resized_shape,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )
) -> np.ndarray:
"""
Pad an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to pad.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
data_format (`ChannelDimension` or `str`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
image_height, image_width = get_image_size(image, input_data_format)
target_height, target_width = size["height"], size["width"]
padding_top = 0
padding_left = 0
padding_bottom = target_height - image_height
padding_right = target_width - image_width
padded_image = pad(
image,
padding=((padding_top, padding_bottom), (padding_left, padding_right)),
mode=mode,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
def preprocess(
self,
images,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
do_pad: Optional[bool] = None,
padding_value: Optional[float] = None,
padding_mode: Optional[str] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[float] = None,
image_std: Optional[float] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
patch_size: Optional[Dict[str, int]] = None,
data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
return_tensors: Optional[TensorType] = None,
def get_num_patches(self, image_height: int, image_width: int, patch_size: Optional[Dict[str, int]] = None) -> int:
    """
    Calculate number of patches required to encode an image.

    Args:
        image_height (`int`):
            Height of the image.
        image_width (`int`):
            Width of the image.
        patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.

    Returns:
        `int`: Number of patch rows multiplied by the number of patch columns.

    Raises:
        ValueError: If a dimension of the image is not divisible by the matching patch dimension.
    """
    patch_size = patch_size if patch_size is not None else self.patch_size
    # Read the resolved `patch_size`, not `self.patch_size`, so an explicitly passed value is honoured.
    patch_height, patch_width = patch_size["height"], patch_size["width"]

    if image_height % patch_height != 0:
        raise ValueError(f"{image_height=} must be divisible by {patch_height}")
    if image_width % patch_width != 0:
        raise ValueError(f"{image_width=} must be divisible by {patch_width}")

    num_patches_per_dim_h = image_height // patch_height
    num_patches_per_dim_w = image_width // patch_width
    return num_patches_per_dim_h * num_patches_per_dim_w
def patchify_image(self, image: "torch.Tensor", patch_size: Optional[Dict[str, int]] = None) -> "torch.Tensor":
    """
    Cut an image batch into non-overlapping patches and flatten each patch.

    Args:
        image (`torch.Tensor`):
            Image to convert. Shape: [batch, channels, height, width]
        patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.

    Returns:
        `torch.Tensor`: Tensor of shape `[batch, num_patches, channels * patch_height * patch_width]`.
    """
    requires_backends(self, ["torch"])
    if patch_size is None:
        patch_size = self.patch_size
    height_step = patch_size["height"]
    width_step = patch_size["width"]

    num_images, num_channels, _height, _width = image.shape

    # Window size equals step size along both spatial axes, so the windows do not overlap.
    tiles = image.unfold(2, height_step, height_step).unfold(3, width_step, width_step).contiguous()
    # Merge the two patch-grid axes, then move channels last inside each patch before flattening.
    tiles = tiles.view(num_images, num_channels, -1, height_step, width_step).permute(0, 2, 3, 4, 1)
    flattened_patch_dim = num_channels * height_step * width_step
    return tiles.reshape(num_images, -1, flattened_patch_dim)
def preprocess_with_tokenizer_info(
self,
image_input: "torch.Tensor",
image_present: "torch.Tensor",
image_unpadded_h: "torch.Tensor",
image_unpadded_w: "torch.Tensor",
image_placeholder_id: int,
image_newline_id: int,
variable_sized: bool,
patch_size: Optional[Dict[str, int]] = None,
):
"""
Preprocess image tensors along with tokenizer information.
Note: in this listing the body is only `pass`, so the method returns `None` and uses none of its arguments.
Args:
image_input (`torch.Tensor`):
Input image tensor.
image_present (`torch.Tensor`):
Present image tensor.
image_unpadded_h (`torch.Tensor`):
Unpadded image height tensor.
image_unpadded_w (`torch.Tensor`):
Unpadded image width tensor.
image_placeholder_id (`int`):
Placeholder ID for the image.
image_newline_id (`int`):
Newline ID for the image.
variable_sized (`bool`):
Whether the image size varies.
patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""
# NOTE(review): the implementation appears to have been elided here. Code elsewhere in this file subscripts
# the result of a `preprocess_with_tokenizer_info` call with "image_input_ids" and
# "image_patch_indices_per_batch", which would fail on `None` - restore the body from upstream.
pass
.\models\fuyu\modeling_fuyu.py
"""
The bare Fuyu Model outputting raw hidden-states without any specific head on top.
This model inherits from `PreTrainedModel`. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch `torch.nn.Module` subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
"Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.",
FUYU_START_DOCSTRING,
)
class FuyuForCausalLM(FuyuPreTrainedModel):
def __init__(self, config: FuyuConfig):
    """
    Build the wrapped causal language model and the linear projection for flattened image patches.

    The projection maps `patch_size * patch_size * num_channels` input features to `hidden_size`.
    """
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    # The text backbone is instantiated from the nested text config.
    self.language_model = AutoModelForCausalLM.from_config(config.text_config)

    flattened_patch_dim = config.patch_size * config.patch_size * config.num_channels
    self.vision_embed_tokens = nn.Linear(flattened_patch_dim, config.hidden_size)

    self.gradient_checkpointing = False
    self.post_init()
def get_input_embeddings(self):
    """Return whatever the wrapped language model reports as its input embeddings."""
    language_model = self.language_model
    return language_model.get_input_embeddings()
def set_input_embeddings(self, value):
    """Forward `value` to the wrapped language model's `set_input_embeddings`."""
    language_model = self.language_model
    language_model.set_input_embeddings(value)
def gather_continuous_embeddings(
    self,
    word_embeddings: torch.Tensor,
    continuous_embeddings: List[torch.Tensor],
    image_patch_input_indices: torch.Tensor,
) -> torch.Tensor:
    """
    Write image-patch embeddings into a copy of the word embeddings.

    For every batch element, each sequence position whose entry in `image_patch_input_indices` is
    non-negative is overwritten with the row of `continuous_embeddings[batch]` selected by that entry.
    Positions with a negative index keep their word embedding. The input tensor is not modified.

    Args:
        word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Tensor of word embeddings.
        continuous_embeddings (`List[torch.Tensor]`):
            One entry per batch element, each of shape `[num_image_embeddings, hidden]`. It must hold at
            least as many rows as there are non-negative indices for that batch element.
        image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Tensor of indices of the image patches in the input_ids tensor.

    Returns:
        `torch.Tensor`: A clone of `word_embeddings` with the patch positions replaced.

    Raises:
        ValueError: If the batch sizes differ, or a batch element has more non-negative indices than
            continuous embeddings.
    """
    if word_embeddings.shape[0] != len(continuous_embeddings):
        raise ValueError(
            f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
        )

    merged = word_embeddings.clone()
    for batch_idx in range(word_embeddings.shape[0]):
        patch_slots = image_patch_input_indices[batch_idx]
        # Sequence positions to overwrite, and the embedding row each of them reads from.
        target_positions = torch.nonzero(patch_slots >= 0, as_tuple=True)[0]
        src_indices = patch_slots[target_positions]
        if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
            raise ValueError(
                f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
            )
        merged[batch_idx, target_positions] = continuous_embeddings[batch_idx][src_indices]
    return merged
def forward(
self,
input_ids: torch.LongTensor = None,
image_patches: torch.Tensor = None,
image_patches_indices: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# NOTE(review): the statements below only assemble and return a `model_inputs` dict; nothing here calls
# `self.language_model` or `self.vision_embed_tokens`. They also read `kwargs`, which is not a parameter of
# this signature, so as written this raises NameError. It looks like a `def` line (and the real forward
# body) went missing between the signature and this code - TODO confirm against the upstream file.
# With a cache present, only the last token of `input_ids` is kept.
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# Derive positions from the mask: running count of attended tokens minus one; masked slots are set to 1.
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
# `inputs_embeds` is used only when there is no cache (`past_key_values is None`); otherwise `input_ids` is.
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
if image_patches_indices is not None:
model_inputs["image_patches_indices"] = image_patches_indices
# The image entries are passed only when there is no cache; the update below overwrites the
# "image_patches_indices" entry set just above.
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"image_patches_indices": image_patches_indices if past_key_values is None else None,
"image_patches": image_patches if past_key_values is None else None,
}
)
return model_inputs
.\models\fuyu\processing_fuyu.py
"""
Image/text processor class for Fuyu.
"""
import re
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, TruncationStrategy
from ...utils import TensorType, is_torch_available, logging, requires_backends
if is_torch_available():
from .image_processing_fuyu import FuyuBatchFeature
logger = logging.get_logger(__name__)
if is_torch_available():
import torch
# Tags as they are written in a text prompt to delimit bounding boxes and points.
TEXT_REPR_BBOX_OPEN = "<box>"
TEXT_REPR_BBOX_CLOSE = "</box>"
TEXT_REPR_POINT_OPEN = "<point>"
TEXT_REPR_POINT_CLOSE = "</point>"
# Strings the tags above are replaced with before tokenization; they are also used as tokenizer vocab keys.
TOKEN_BBOX_OPEN_STRING = "<0x00>"
TOKEN_BBOX_CLOSE_STRING = "<0x01>"
TOKEN_POINT_OPEN_STRING = "<0x02>"
TOKEN_POINT_CLOSE_STRING = "<0x03>"
# Vocab key of the token appended to the last subsequence when a beginning-of-answer token is requested.
BEGINNING_OF_ANSWER_STRING = "<0x04>"
def full_unpacked_stream_to_tensor(
    all_bi_tokens_to_place: List[int],
    full_unpacked_stream: List["torch.Tensor"],
    fill_value: int,
    batch_size: int,
    new_seq_len: int,
    offset: int,
) -> "torch.Tensor":
    """
    Pad an unpacked token stream (one tensor per batch item) into a single tensor of shape
    `batch_size x new_seq_len`.

    For item `bi`, `all_bi_tokens_to_place[bi]` tokens are copied from its stream starting at `offset` into
    the left part of row `bi`; the remainder of the row keeps `fill_value`. Dtype and device are taken from
    the first stream.
    """
    assert len(all_bi_tokens_to_place) == batch_size
    assert len(full_unpacked_stream) == batch_size

    reference = full_unpacked_stream[0]
    padded = torch.full(
        [batch_size, new_seq_len],
        fill_value=fill_value,
        dtype=reference.dtype,
        device=reference.device,
    )

    for row in range(batch_size):
        num_tokens = all_bi_tokens_to_place[row]
        source = full_unpacked_stream[row]
        padded[row, :num_tokens] = source[offset : num_tokens + offset]

    return padded
def construct_full_unpacked_stream(
    num_real_text_tokens: Union[List[List[int]], "torch.Tensor"],
    input_stream: "torch.Tensor",
    image_tokens: List[List["torch.Tensor"]],
    batch_size: int,
    num_sub_sequences: int,
) -> List["torch.Tensor"]:
    """
    Build, for each batch item, the image tokens followed by the real (unpadded) text tokens.

    Only the first subsequence (index 0) of every batch item is read from `input_stream`, `image_tokens`
    and `num_real_text_tokens`; `num_sub_sequences` is accepted but not used. The concatenated stream is
    truncated to the image-token count plus the number of real text tokens, which drops the text padding.

    Returns:
        `List[torch.Tensor]`: One 1-D-concatenated stream per batch item.
    """
    streams = []
    for batch_index in range(batch_size):
        image_prefix = image_tokens[batch_index][0]
        text_tokens = input_stream[batch_index, 0]
        combined = torch.cat([image_prefix, text_tokens], dim=0)
        num_real_tokens = image_prefix.shape[0] + num_real_text_tokens[batch_index][0]
        per_item = [combined[:num_real_tokens]]
        streams.append(torch.cat(per_item, dim=0))
    return streams
def _replace_string_repr_with_token_tags(prompt: str) -> str:
    """Replace every textual point/box tag in `prompt` with its token-string counterpart."""
    replacements = (
        (TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING),
        (TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING),
        (TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING),
        (TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING),
    )
    for text_tag, token_tag in replacements:
        prompt = prompt.replace(text_tag, token_tag)
    return prompt
def _segment_prompt_into_text_token_conversions(prompt: str) -> List:
    """
    Split a prompt on the box/point token strings and label each text piece.

    Returns a list of `(text, follows_open_tag)` tuples. Empty pieces and the tag tokens themselves are
    dropped; `follows_open_tag` is True when the piece comes right after a box-open or point-open token.
    """
    tag_tokens = [
        TOKEN_BBOX_OPEN_STRING,
        TOKEN_BBOX_CLOSE_STRING,
        TOKEN_POINT_OPEN_STRING,
        TOKEN_POINT_CLOSE_STRING,
    ]
    opening_tokens = [TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING]

    # The capture group makes `split` keep the tag tokens in its output.
    splitter = re.compile(
        f"({TOKEN_BBOX_OPEN_STRING}|{TOKEN_BBOX_CLOSE_STRING}|{TOKEN_POINT_OPEN_STRING}|{TOKEN_POINT_CLOSE_STRING})"
    )
    pieces = splitter.split(prompt)

    segments: List = []
    for position, piece in enumerate(pieces):
        if not piece or piece in tag_tokens:
            continue
        follows_open_tag = position > 1 and pieces[position - 1] in opening_tokens
        segments.append((piece, follows_open_tag))
    return segments
def _transform_coordinates_and_tokenize(prompt: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Tokenize a prompt, rescaling any coordinates found inside box/point tags.

    The textual tags are first replaced by their token strings and the prompt is split into segments.
    Segments that follow an opening tag are handled by `_transform_within_tags` (coordinates scaled by
    `scale_factor`); all other segments are tokenized as plain text without special tokens.

    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> <point>x, y</point> The
    spaces and punctuation added above are NOT optional.
    """
    tagged_prompt = _replace_string_repr_with_token_tags(prompt)
    segments = _segment_prompt_into_text_token_conversions(tagged_prompt)

    token_ids: List[int] = []
    for text, inside_tag in segments:
        if inside_tag:
            piece_ids = _transform_within_tags(text, scale_factor, tokenizer)
        else:
            piece_ids = tokenizer(text, add_special_tokens=False).input_ids
        token_ids.extend(piece_ids)
    return token_ids
def _transform_within_tags(text: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Convert the comma-separated coordinates found inside a tag into token ids.

    Two values are treated as a point and four as a bounding box; the values are scaled, each scaled
    number is looked up in `tokenizer.vocab` by its string form, and the result is wrapped in the
    matching open/close token ids.

    Raises:
        ValueError: If the text holds neither two nor four values.
    """
    coordinate_strs = text.split(",")
    is_point = len(coordinate_strs) == 2

    # Anything that is not a two-value point uses the bounding-box delimiters.
    open_key = TOKEN_POINT_OPEN_STRING if is_point else TOKEN_BBOX_OPEN_STRING
    close_key = TOKEN_POINT_CLOSE_STRING if is_point else TOKEN_BBOX_CLOSE_STRING
    open_token_id = tokenizer.vocab[open_key]
    close_token_id = tokenizer.vocab[close_key]

    num_ints = [float(item.strip()) for item in coordinate_strs]

    if len(num_ints) == 4:
        top, left, bottom, right = num_ints
        scaled = scale_bbox_to_transformed_image(
            top=top, left=left, bottom=bottom, right=right, scale_factor=scale_factor
        )
    elif len(num_ints) == 2:
        scaled = scale_point_to_transformed_image(x=num_ints[0], y=num_ints[1], scale_factor=scale_factor)
    else:
        raise ValueError(f"Invalid number of ints: {len(num_ints)}")

    coordinate_ids = [tokenizer.vocab[str(value)] for value in scaled]
    return [open_token_id, *coordinate_ids, close_token_id]
def _tokenize_prompts_with_image_and_batch(
    tokenizer,
    prompts: List[List[str]],
    scale_factors: Optional[List[List["torch.Tensor"]]],
    max_tokens_to_generate: int,
    max_position_embeddings: int,
    add_BOS: bool,
    add_beginning_of_answer_token: bool,
) -> Tuple["torch.Tensor", "torch.Tensor"]:
    """
    Given a set of prompts and number of tokens to generate:
    - tokenize prompts
    - set the sequence length to be the max of length of prompts plus the number of tokens we would like to generate
    - pad all the sequences to this length so we can convert them into a 3D tensor.

    Args:
        tokenizer: Object exposing `vocab` (a mapping from token string to id) and `tokenize`.
        prompts (`List[List[str]]`): Prompts, indexed by batch item then subsequence.
        scale_factors (`List[List[torch.Tensor]]`, *optional*): Per-prompt scale factors. When given, prompts
            go through `_transform_coordinates_and_tokenize`; otherwise through `tokenizer.tokenize`.
        max_tokens_to_generate (`int`): Room to reserve after the longest prompt.
        max_position_embeddings (`int`): Upper bound on the padded length.
        add_BOS (`bool`): Prepend the `"<s>"` id when True, the `"|ENDOFTEXT|"` id otherwise.
        add_beginning_of_answer_token (`bool`): Append the beginning-of-answer id to the last subsequence of
            every batch item.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor]`: The padded token ids and the unpadded prompt lengths, both int64.

    Raises:
        ValueError: If a prompt is longer than the padded sequence length.
    """
    if scale_factors is not None:
        transformed_prompt_tokens = []
        for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
            transformed_prompt_tokens.append(
                [
                    _transform_coordinates_and_tokenize(prompt, scale_factor.item(), tokenizer)
                    for prompt, scale_factor in zip(prompt_seq, scale_factor_seq)
                ]
            )
    else:
        transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]

    prompts_tokens = transformed_prompt_tokens

    if add_BOS:
        bos_token = tokenizer.vocab["<s>"]
    else:
        bos_token = tokenizer.vocab["|ENDOFTEXT|"]
    prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]

    if add_beginning_of_answer_token:
        boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
        # Only the last subsequence of each batch item gets the beginning-of-answer token.
        for token_seq in prompts_tokens:
            token_seq[-1].append(boa)

    # Lengths are recorded before padding so callers know how many tokens are real.
    prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]
    max_prompt_len: int = np.max(prompts_length)
    samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)
    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
        # Build ONE message string: a second positional argument would be treated as a %-format
        # argument by the logging module, which fails to format and loses the warning.
        logger.warning(
            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate} "
            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible."
        )

    # Right-pad every prompt in place up to the common sample length.
    for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
        for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
            if len(prompt_tokens) > samples_length:
                raise ValueError("Length of subsequence prompt exceeds sequence length.")
            padding_size = samples_length - prompt_length
            prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)

    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.int64)
    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.int64)

    return prompts_tokens_tensor, prompts_length_tensor
def original_to_transformed_h_coords(original_coords, scale_h):
    """Scale coordinates by `scale_h`, round to the nearest integer and return them as int32."""
    scaled = original_coords * scale_h
    return np.round(scaled).astype(np.int32)
def original_to_transformed_w_coords(original_coords, scale_w):
    """Scale coordinates by `scale_w`, round to the nearest integer and return them as int32."""
    scaled = original_coords * scale_w
    return np.round(scaled).astype(np.int32)
def scale_point_to_transformed_image(x: float, y: float, scale_factor: float) -> List[int]:
    """
    Map a point to the transformed image space.

    Each coordinate is halved, multiplied by `scale_factor` and rounded; the result is `[x_scaled, y_scaled]`.
    """
    half_x = np.array([x / 2])
    half_y = np.array([y / 2])
    scaled_x = original_to_transformed_w_coords(half_x, scale_factor)[0]
    scaled_y = original_to_transformed_h_coords(half_y, scale_factor)[0]
    return [scaled_x, scaled_y]
def scale_bbox_to_transformed_image(
    top: float, left: float, bottom: float, right: float, scale_factor: float
) -> List[int]:
    """
    Map a bounding box to the transformed image space.

    Each coordinate is halved, multiplied by `scale_factor` and rounded.

    Returns:
        `List[int]`: `[top_scaled, left_scaled, bottom_scaled, right_scaled]`.
    """
    # `top`/`bottom` are vertical coordinates and `left`/`right` horizontal ones, so they go through the
    # height and width helpers respectively. The same `scale_factor` is passed to both helpers.
    top_scaled = original_to_transformed_h_coords(np.array([top / 2]), scale_factor)[0]
    left_scaled = original_to_transformed_w_coords(np.array([left / 2]), scale_factor)[0]
    bottom_scaled = original_to_transformed_h_coords(np.array([bottom / 2]), scale_factor)[0]
    right_scaled = original_to_transformed_w_coords(np.array([right / 2]), scale_factor)[0]
    return [top_scaled, left_scaled, bottom_scaled, right_scaled]
class FuyuProcessor(ProcessorMixin):
r"""
Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.
[`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.
Args:
image_processor ([`FuyuImageProcessor`]):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "FuyuImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer):
    """Store the image processor and tokenizer and set the fixed generation/padding defaults."""
    super().__init__(image_processor=image_processor, tokenizer=tokenizer)
    self.image_processor, self.tokenizer = image_processor, tokenizer

    # Hard-coded limits used when sizing prompts.
    self.max_tokens_to_generate = 10
    self.max_position_embeddings = 16384

    # Fill values used when left-padding token ids and image patch indices.
    self.pad_token_id = 0
    self.dummy_image_index = -1
def _left_pad_inputs_with_attention_mask(self, model_inputs: List[Dict], return_attention_mask: bool):
    """
    Left-pad per-sample inputs to a common length and batch them.

    `input_ids` are padded on the left with `self.pad_token_id` and get a matching attention mask (0 over
    padding, 1 elsewhere). `image_patches` are collected unchanged into a list. Every other key is padded on
    the left with `self.dummy_image_index` up to the longest `image_patches_indices`.

    `input_ids` and `image_patches_indices` are concatenated along dim 0; `attention_mask` is concatenated
    only when `return_attention_mask` is True, otherwise it stays a list of per-sample tensors.
    """
    longest_input_ids = max(item["input_ids"].shape[1] for item in model_inputs)
    longest_patch_indices = max(item["image_patches_indices"].shape[1] for item in model_inputs)

    collated = {"input_ids": [], "image_patches": [], "image_patches_indices": [], "attention_mask": []}

    for item in model_inputs:
        for name, values in item.items():
            if name == "image_patches":
                collated[name].append(values)
                continue

            if name == "input_ids":
                pad_width = longest_input_ids - values.shape[1]
                left_pad = torch.full((values.shape[0], pad_width), self.pad_token_id, dtype=torch.long)
                collated[name].append(torch.cat([left_pad, values], dim=1))

                # The mask mirrors the padding: zeros over the pad columns, ones over the real tokens.
                mask_pad = torch.zeros(values.shape[0], pad_width, dtype=torch.long)
                collated["attention_mask"].append(torch.cat([mask_pad, torch.ones_like(values)], dim=1))
            else:
                pad_width = longest_patch_indices - values.shape[1]
                left_pad = torch.full((values.shape[0], pad_width), self.dummy_image_index, dtype=torch.long)
                collated[name].append(torch.cat([left_pad, values], dim=1))

    keys_to_concatenate = ["input_ids", "image_patches_indices"]
    if return_attention_mask:
        keys_to_concatenate.append("attention_mask")
    for name in keys_to_concatenate:
        collated[name] = torch.cat(collated[name], dim=0)

    return collated
):
image_present = torch.ones(1, 1, 1)
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
image_input=tensor_batch_images,
image_present=image_present,
image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
)
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
tokenizer=self.tokenizer,
prompts=prompts,
scale_factors=scale_factors,
max_tokens_to_generate=self.max_tokens_to_generate,
max_position_embeddings=self.max_position_embeddings,
add_BOS=True,
add_beginning_of_answer_token=True,
)
image_padded_unpacked_tokens = construct_full_unpacked_stream(
num_real_text_tokens=prompts_length,
input_stream=prompt_tokens,
image_tokens=model_image_input["image_input_ids"],
batch_size=1,
num_sub_sequences=self.subsequence_length,
)
unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
num_real_text_tokens=prompts_length,
input_stream=torch.full_like(prompt_tokens, -1),
image_tokens=model_image_input["image_patch_indices_per_batch"],
batch_size=1,
num_sub_sequences=self.subsequence_length,
)
max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
tokens
def __call__(
    self,
    text=None,
    images=None,
    add_special_tokens: bool = True,
    return_attention_mask: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_token_type_ids: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
):
    """
    Forward every argument, by keyword, to the tokenizer's `__call__` and return its result.

    Please refer to the docstring of `PreTrainedTokenizer.__call__` for more information on the arguments.
    """
    forwarded = {
        "text": text,
        "images": images,
        "add_special_tokens": add_special_tokens,
        "return_attention_mask": return_attention_mask,
        "padding": padding,
        "truncation": truncation,
        "max_length": max_length,
        "stride": stride,
        "pad_to_multiple_of": pad_to_multiple_of,
        "return_overflowing_tokens": return_overflowing_tokens,
        "return_special_tokens_mask": return_special_tokens_mask,
        "return_offsets_mapping": return_offsets_mapping,
        "return_token_type_ids": return_token_type_ids,
        "return_length": return_length,
        "verbose": verbose,
        "return_tensors": return_tensors,
        # Extra keyword arguments cannot collide with the names above: those are captured by the signature.
        **kwargs,
    }
    return self.tokenizer.__call__(**forwarded)
def batch_decode(self, *args, **kwargs):
    """
    Forward all positional and keyword arguments to the wrapped tokenizer's `batch_decode` method and
    return its result unchanged. According to the original documentation the tokenizer is a
    LlamaTokenizerFast; refer to `PreTrainedTokenizer.batch_decode` for more information.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
    """
    Forward all positional and keyword arguments to the wrapped tokenizer's `decode` method and return
    its result unchanged. According to the original documentation the tokenizer is a LlamaTokenizerFast;
    refer to `PreTrainedTokenizer.decode` for more information.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
.\models\fuyu\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
# Maps each submodule name to the public names it exports. The configuration entry is always present;
# further entries are added below depending on which optional dependencies are available.
_import_structure = {
"configuration_fuyu": ["FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP", "FuyuConfig"],
}
# Register the image processor and the processor only when `is_vision_available()` is true.
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
# Vision support is missing: leave these entries out of the import structure.
pass
else:
_import_structure["image_processing_fuyu"] = ["FuyuImageProcessor"]
_import_structure["processing_fuyu"] = ["FuyuProcessor"]
# Register the modeling classes only when `is_torch_available()` is true.
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
# Torch is missing: leave the modeling entry out of the import structure.
pass
else:
_import_structure["modeling_fuyu"] = [
"FuyuForCausalLM",
"FuyuPreTrainedModel",
]
# For static type checkers, import the same names eagerly, guarded by the same availability checks
# that were used to build `_import_structure` above.
if TYPE_CHECKING:
from .configuration_fuyu import FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP, FuyuConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_fuyu import FuyuImageProcessor
from .processing_fuyu import FuyuProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fuyu import (
FuyuForCausalLM,
FuyuPreTrainedModel,
)
else:
import sys
# At runtime, replace this module's entry in `sys.modules` with a `_LazyModule` built from
# `_import_structure`.
# NOTE(review): presumably `_LazyModule` defers importing each submodule until one of its names is
# first accessed - confirm against its implementation.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gemma\configuration_gemma.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# Archive map for pretrained Gemma configurations; defined empty here.
GEMMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.
    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The model hyperparameters accepted by `__init__` (`vocab_size`, `hidden_size`, `intermediate_size`,
    `num_hidden_layers`, `num_attention_heads`, `num_key_value_heads`, `head_dim`, `hidden_act`, `hidden_activation`,
    `max_position_embeddings`, `initializer_range`, `rms_norm_eps`, `use_cache`, `rope_theta`, `attention_bias`,
    `attention_dropout`) are stored unchanged as attributes of the same name. `pad_token_id`, `bos_token_id`,
    `eos_token_id`, `tie_word_embeddings` and any additional `**kwargs` are forwarded to
    [`PretrainedConfig.__init__`].

    ```python
    >>> from transformers import GemmaModel, GemmaConfig

    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()

    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "gemma"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        hidden_activation=None,
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Store every model hyperparameter exactly once, as an attribute of the same name.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.hidden_activation = hidden_activation
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Invoke the base initializer a single time. Only the token ids, the weight-tying flag and
        # caller-supplied extras are forwarded; the hyperparameters above are not passed as kwargs.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )