Transformers Source Code Analysis (58)
.\models\hubert\modeling_tf_hubert.py
""" TensorFlow Hubert model."""
from __future__ import annotations
import warnings
from typing import Any, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
from ...modeling_tf_utils import (
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_hubert import HubertConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "HubertConfig"
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/hubert-base-ls960",
]
LARGE_NEGATIVE = -1e8
def _sample_without_replacement(distribution, num_samples):
"""
    Categorical sampling without replacement is currently not implemented. The Gumbel-max trick will do for now - see
    https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
_, indices = tf.nn.top_k(distribution + z, num_samples)
return indices
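# Illustrative usage sketch (not part of the original file): with equal logits, the
# Gumbel-max trick above behaves like uniform sampling without replacement.
def _demo_sample_without_replacement():
    uniform_logits = tf.ones((2, 10))                                 # 2 rows, 10 candidate positions each
    indices = _sample_without_replacement(uniform_logits, num_samples=3)
    return indices                                                    # shape (2, 3), distinct indices per row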
def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
"""
Scatter function as in PyTorch, using indices in the format (batch_dim, indices)
"""
indices_shape = shape_list(batch_indices)
broad_casted_batch_dims = tf.reshape(
tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
)
pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)
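# Illustrative usage sketch (not part of the original file): writing ones into a
# (2, 6) zero tensor at per-batch column indices, mimicking torch.Tensor.scatter.
def _demo_scatter_values_on_batch_indices():
    values = tf.ones((2, 3), dtype=tf.int32)                  # values to write
    batch_indices = tf.constant([[0, 2, 4], [1, 3, 5]])       # target column per value
    output_shape = tf.constant([2, 6])                        # (batch_size, sequence_length)
    return _scatter_values_on_batch_indices(values, batch_indices, output_shape)
    # -> [[1, 0, 1, 0, 1, 0],
    #     [0, 1, 0, 1, 0, 1]]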
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    min_masks: int = 0,
) -> tf.Tensor:
"""
Computes random mask spans for a given shape
Args:
shape: the shape for which to compute masks.
should be of size 2 where first element is batch size and 2nd is timesteps
attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
mask_prob:
probability for each token to be chosen as start of the span to be masked. this will be multiplied by
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
mask_length: size of the mask
min_masks: minimum number of masked spans
Adapted from fairseq's data_utils.py.
"""
batch_size, sequence_length = shape
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
tf.debugging.assert_less(
mask_length,
sequence_length,
message=(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and"
f" `sequence_length`: {sequence_length}`"
),
)
num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,))
num_masked_spans = tf.maximum(num_masked_spans, min_masks)
num_masked_spans = tf.cast(num_masked_spans, tf.int32)
num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans)
num_masked_spans = tf.squeeze(num_masked_spans)
spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32)
uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1)))
spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans)
spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1)
spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length))
spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length))
offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :]
offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1))
offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length))
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
spec_aug_mask = _scatter_values_on_batch_indices(
tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask)
)
return spec_aug_mask
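# Illustrative usage sketch (not part of the original file): a SpecAugment-style time
# mask for 2 sequences of 100 frames, using spans of 10 frames and a 15% start
# probability (these values mirror typical mask_time_prob / mask_time_length settings
# and are assumptions here).
def _demo_compute_mask_indices():
    spec_aug_mask = _compute_mask_indices((2, 100), mask_prob=0.15, mask_length=10)
    return spec_aug_mask                                      # (2, 100) int32 tensor, 1 on masked frames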
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
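# Illustrative usage sketch (not part of the original file): a padding mask of
# [1, 1, 0] becomes an additive bias that is 0 on real tokens and LARGE_NEGATIVE on
# the padded position, so softmax attention effectively ignores it.
def _demo_expand_mask():
    attention_mask = tf.constant([[1.0, 1.0, 0.0]])           # (bsz, seq_len)
    additive_bias = _expand_mask(attention_mask)              # (bsz, 1, tgt_len, src_len) == (1, 1, 3, 3)
    return additive_bias                                      # last column is LARGE_NEGATIVE for every query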
class TFHubertGroupNorm(keras.layers.Layer):
"""
From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization
"""
def __init__(
self,
groups: int = 32,
axis: int = -1,
epsilon: float = 1e-3,
center: bool = True,
scale: bool = True,
beta_initializer: keras.initializers.Initializer = "zeros",
gamma_initializer: keras.initializers.Initializer = "ones",
beta_regularizer: keras.regularizers.Regularizer = None,
gamma_regularizer: keras.regularizers.Regularizer = None,
beta_constraint: keras.constraints.Constraint = None,
gamma_constraint: keras.constraints.Constraint = None,
**kwargs,
):
super().__init__(**kwargs)
self.supports_masking = True
self.groups = groups
self.axis = axis
self.epsilon = epsilon
self.center = center
self.scale = scale
self.beta_initializer = keras.initializers.get(beta_initializer)
self.gamma_initializer = keras.initializers.get(gamma_initializer)
self.beta_regularizer = keras.regularizers.get(beta_regularizer)
self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
self.beta_constraint = keras.constraints.get(beta_constraint)
self.gamma_constraint = keras.constraints.get(gamma_constraint)
self._check_axis()
def build(self, input_shape):
self._check_if_input_shape_is_none(input_shape)
self._set_number_of_groups_for_instance_norm(input_shape)
self._check_size_of_dimensions(input_shape)
self._create_input_spec(input_shape)
self._add_gamma_weight(input_shape)
self._add_beta_weight(input_shape)
self.built = True
super().build(input_shape)
def call(self, inputs):
input_shape = keras.backend.int_shape(inputs)
tensor_input_shape = tf.shape(inputs)
reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)
normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)
is_instance_norm = (input_shape[self.axis] // self.groups) == 1
if not is_instance_norm:
outputs = tf.reshape(normalized_inputs, tensor_input_shape)
else:
outputs = normalized_inputs
return outputs
def get_config(self):
config = {
"groups": self.groups,
"axis": self.axis,
"epsilon": self.epsilon,
"center": self.center,
"scale": self.scale,
"beta_initializer": keras.initializers.serialize(self.beta_initializer),
"gamma_initializer": keras.initializers.serialize(self.gamma_initializer),
"beta_regularizer": keras.regularizers.serialize(self.beta_regularizer),
"gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer),
"beta_constraint": keras.constraints.serialize(self.beta_constraint),
"gamma_constraint": keras.constraints.serialize(self.gamma_constraint),
}
base_config = super().get_config()
return {**base_config, **config}
def compute_output_shape(self, input_shape):
return input_shape
def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
is_instance_norm = (input_shape[self.axis] // self.groups) == 1
if not is_instance_norm:
group_shape[self.axis] = input_shape[self.axis] // self.groups
group_shape.insert(self.axis, self.groups)
group_shape = tf.stack(group_shape)
reshaped_inputs = tf.reshape(inputs, group_shape)
return reshaped_inputs, group_shape
else:
return inputs, group_shape
def _apply_normalization(self, reshaped_inputs, input_shape):
group_shape = keras.backend.int_shape(reshaped_inputs)
group_reduction_axes = list(range(1, len(group_shape)))
is_instance_norm = (input_shape[self.axis] // self.groups) == 1
if not is_instance_norm:
axis = -2 if self.axis == -1 else self.axis - 1
else:
axis = -1 if self.axis == -1 else self.axis - 1
group_reduction_axes.pop(axis)
mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)
gamma, beta = self._get_reshaped_weights(input_shape)
normalized_inputs = tf.nn.batch_normalization(
reshaped_inputs,
mean=mean,
variance=variance,
scale=gamma,
offset=beta,
variance_epsilon=self.epsilon,
)
return normalized_inputs
def _get_reshaped_weights(self, input_shape):
broadcast_shape = self._create_broadcast_shape(input_shape)
gamma = None
beta = None
if self.scale:
gamma = tf.reshape(self.gamma, broadcast_shape)
if self.center:
beta = tf.reshape(self.beta, broadcast_shape)
return gamma, beta
def _check_if_input_shape_is_none(self, input_shape):
dim = input_shape[self.axis]
if dim is None:
raise ValueError(
"Axis "
+ str(self.axis)
+ " of input tensor should have a defined dimension but the layer received an input with shape "
+ str(input_shape)
+ "."
)
def _set_number_of_groups_for_instance_norm(self, input_shape):
dim = input_shape[self.axis]
if self.groups == -1:
self.groups = dim
def _check_size_of_dimensions(self, input_shape):
dim = input_shape[self.axis]
if dim < self.groups:
raise ValueError(
"Number of groups ("
+ str(self.groups)
+ ") cannot be more than the number of channels ("
+ str(dim)
+ ")."
)
if dim % self.groups != 0:
raise ValueError(
"Number of groups ("
+ str(self.groups)
+ ") must be a multiple of the number of channels ("
+ str(dim)
+ ")."
)
def _check_axis(self):
if self.axis == 0:
raise ValueError(
"You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
)
def _create_input_spec(self, input_shape):
dim = input_shape[self.axis]
self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})
def _add_gamma_weight(self, input_shape):
dim = input_shape[self.axis]
shape = (dim,)
if self.scale:
self.gamma = self.add_weight(
shape=shape,
name="gamma",
initializer=self.gamma_initializer,
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint,
)
else:
self.gamma = None
def _add_beta_weight(self, input_shape):
dim = input_shape[self.axis]
shape = (dim,)
if self.center:
self.beta = self.add_weight(
shape=shape,
name="beta",
initializer=self.beta_initializer,
regularizer=self.beta_regularizer,
constraint=self.beta_constraint,
)
else:
self.beta = None
def _create_broadcast_shape(self, input_shape):
broadcast_shape = [1] * len(input_shape)
is_instance_norm = (input_shape[self.axis] // self.groups) == 1
if not is_instance_norm:
broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
broadcast_shape.insert(self.axis, self.groups)
else:
broadcast_shape[self.axis] = self.groups
return broadcast_shape
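# Illustrative usage sketch (not part of the original file): group-normalizing the
# channel axis of a (batch, time, channels) feature map, the same role this layer
# plays in the first convolutional block of the feature encoder.
def _demo_group_norm():
    layer = TFHubertGroupNorm(groups=4, epsilon=1e-5)         # 16 channels below -> 4 groups of 4
    features = tf.random.normal((2, 50, 16))
    return layer(features)                                    # same shape, normalized per group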
class TFHubertWeightNormConv1D(keras.layers.Conv1D):
    """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm"""
def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
super().__init__(
filters=filters,
kernel_size=kernel_size,
groups=groups,
padding="valid",
use_bias=True,
bias_initializer="he_normal",
**kwargs,
)
self.explicit_padding = explicit_padding
self.filter_axis = 2
self.kernel_norm_axes = tf.constant([0, 1])
    def _init_norm(self):
        """Set the norm of the weight vector."""
kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes))
self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis])
    def _normalize_kernel(self):
        """Generate the normalized weights."""
kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g)
self.kernel = tf.transpose(kernel)
def build(self, input_shape):
if not self.built:
super().build(input_shape)
self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
self.weight_v = self.kernel
self.weight_g = self.add_weight(
name="weight_g",
shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1),
initializer="ones",
dtype=self.weight_v.dtype,
trainable=True,
)
self._init_norm()
self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)
def call(self, inputs):
self._normalize_kernel()
padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
output = super().call(padded_inputs)
return output
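# Illustrative usage sketch (not part of the original file): weight normalization
# re-parameterizes the convolution kernel as g * v / ||v||, learning the norm (weight_g)
# and the direction (weight_v) separately; the explicit padding preserves the time axis.
def _demo_weight_norm_conv():
    layer = TFHubertWeightNormConv1D(filters=8, kernel_size=3, groups=1, explicit_padding=1)
    hidden_states = tf.random.normal((2, 20, 8))              # (batch, time, channels)
    return layer(hidden_states)                               # (2, 20, 8)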
class TFHubertNoLayerNormConvLayer(keras.layers.Layer):
    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = keras.layers.Conv1D(
filters=self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
strides=config.conv_stride[layer_id],
use_bias=config.conv_bias,
name="conv",
)
self.activation = get_tf_activation(config.feat_extract_activation)
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
class TFHubertLayerNormConvLayer(keras.layers.Layer):
def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = keras.layers.Conv1D(
filters=self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
strides=config.conv_stride[layer_id],
use_bias=config.conv_bias,
name="conv",
)
self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps)
self.activation = get_tf_activation(config.feat_extract_activation)
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFHubertGroupNormConvLayer(keras.layers.Layer):
def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = keras.layers.Conv1D(
filters=self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
strides=config.conv_stride[layer_id],
use_bias=config.conv_bias,
name="conv",
)
self.activation = get_tf_activation(config.feat_extract_activation)
self.layer_norm = TFHubertGroupNorm(groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm")
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFHubertPositionalConvEmbedding(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.conv = TFHubertWeightNormConv1D(
filters=config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
groups=config.num_conv_pos_embedding_groups,
explicit_padding=config.num_conv_pos_embeddings // 2,
name="conv",
)
self.padding = TFHubertSamePadLayer(config.num_conv_pos_embeddings)
self.activation = get_tf_activation(config.feat_extract_activation)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.config.hidden_size])
class TFHubertFeatureEncoder(keras.layers.Layer):
    # (__init__ and call, which assemble `self.conv_layers` from the conv layer classes above, are not shown here.)
    def build(self, input_shape=None):
if self.built:
return
self.built = True
for conv_layer in self.conv_layers:
with tf.name_scope(conv_layer.name):
conv_layer.build(None)
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class TFHubertFeatureProjection(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.projection = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
name="projection",
)
self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.conv_dim[-1]])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.config.conv_dim[-1]])
class TFHubertAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        # The full multi-head attention computation (projections, scaled dot-product,
        # masking, softmax and output projection) is not shown in this excerpt.
        pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFHubertFeedForward(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout)
self.intermediate_dense = keras.layers.Dense(
units=config.intermediate_size,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
name="intermediate_dense",
)
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
self.output_dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
name="output_dense",
)
self.output_dropout = keras.layers.Dropout(config.hidden_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states, training=training)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "intermediate_dense", None) is not None:
with tf.name_scope(self.intermediate_dense.name):
self.intermediate_dense.build([None, None, self.config.hidden_size])
if getattr(self, "output_dense", None) is not None:
with tf.name_scope(self.output_dense.name):
self.output_dense.build([None, None, self.config.intermediate_size])
class TFHubertEncoderLayer(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFHubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
name="attention",
)
self.dropout = keras.layers.Dropout(config.hidden_dropout)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
attn_residual = hidden_states
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, training=training
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = attn_residual + hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFHubertEncoderLayerStableLayerNorm(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFHubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
name="attention",
)
self.dropout = keras.layers.Dropout(config.hidden_dropout)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
attn_residual = hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, training=training
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = attn_residual + hidden_states
hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFHubertEncoder(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = keras.layers.Dropout(config.hidden_dropout)
self.layer = [TFHubertEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
attention_mask = _expand_mask(attention_mask)
else:
attention_mask = None
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
dropout_probability = np.random.uniform(0, 1)
if training and (dropout_probability < self.config.layerdrop):
continue
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFHubertEncoderStableLayerNorm(keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = keras.layers.Dropout(config.hidden_dropout)
self.layer = [
TFHubertEncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
attention_mask = _expand_mask(attention_mask)
else:
attention_mask = None
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.dropout(hidden_states, training=training)
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
dropout_probability = np.random.uniform(0, 1)
if training and (dropout_probability < self.config.layerdrop):
continue
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFHubertMainLayer(keras.layers.Layer):
config_class = HubertConfig
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
if config.do_stable_layer_norm:
self.encoder = TFHubertEncoderStableLayerNorm(config, name="encoder")
else:
self.encoder = TFHubertEncoder(config, name="encoder")
def build(self, input_shape=None):
self.masked_spec_embed = self.add_weight(
shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
)
if self.built:
return
self.built = True
if getattr(self, "feature_extractor", None) is not None:
with tf.name_scope(self.feature_extractor.name):
self.feature_extractor.build(None)
if getattr(self, "feature_projection", None) is not None:
with tf.name_scope(self.feature_projection.name):
self.feature_projection.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
Computes the output length of the convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride):
return (input_length - kernel_size) // stride + 1
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
return input_lengths
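        # Illustrative worked example (not part of the original file): with Wav2Vec2/Hubert-style
        # defaults (kernels (10, 3, 3, 3, 3, 2, 2) and strides (5, 2, 2, 2, 2, 2, 2) are assumed
        # here), one second of 16 kHz audio shrinks from 16000 samples to 49 frames, i.e. roughly
        # one frame every 20 ms:
        #   16000 -> 3199 -> 1599 -> 799 -> 399 -> 199 -> 99 -> 49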
def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
batch_size, sequence_length, hidden_size = shape_list(hidden_states)
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
if mask_time_indices is not None:
hidden_states = tf.where(
tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
hidden_states,
)
elif self.config.mask_time_prob > 0:
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
min_masks=2,
)
hidden_states = tf.where(
tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
hidden_states,
)
if self.config.mask_feature_prob > 0:
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
)
hidden_states = tf.where(mask_feature_indices[:, tf.newaxis, :], hidden_states, 0)
return hidden_states
@unpack_inputs
def call(
self,
input_values: tf.Tensor,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: tf.Tensor | None = None,
output_hidden_states: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs: Any,
):
hidden_states = self.feature_extractor(tf.cast(input_values, tf.float32), training=training)
if attention_mask is not None:
output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1))
attention_mask = tf.sequence_mask(
output_lengths, maxlen=shape_list(hidden_states)[1], dtype=hidden_states.dtype
)
hidden_states = self.feature_projection(hidden_states, training=training)
mask_time_indices = kwargs.get("mask_time_indices", None)
if training:
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class TFHubertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = HubertConfig
base_model_prefix = "hubert"
main_input_name = "input_values"
@property
def input_signature(self):
return {
"input_values": tf.TensorSpec((None, 16000), tf.float32, name="input_values"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
"token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
logger.warning(
f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
"to train/fine-tune this model, you need a GPU or a TPU"
)
HUBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_values` only and nothing else: `model(input_values)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_values": input_values, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HUBERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
    "The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
HUBERT_START_DOCSTRING,
)
class TFHubertModel(TFHubertPreTrainedModel):
def __init__(self, config: HubertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
self.hubert = TFHubertMainLayer(config, name="hubert")
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
@unpack_inputs
def call(
self,
input_values: tf.Tensor,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
"""
        Runs the model's forward pass on the given inputs and returns the model outputs.
        Args:
            input_values (tf.Tensor): Input tensor containing the raw input features.
            attention_mask (tf.Tensor, optional): Attention mask controlling which positions are attended to. Defaults to None.
            token_type_ids (tf.Tensor, optional): Token type ID tensor for multi-sequence inputs. Defaults to None.
            position_ids (tf.Tensor, optional): Position ID tensor indicating the position of each input element. Defaults to None.
            head_mask (tf.Tensor, optional): Mask controlling the contribution of each attention head. Defaults to None.
            inputs_embeds (tf.Tensor, optional): Pre-computed input embeddings, provided instead of `input_values`. Defaults to None.
            output_attentions (bool, optional): Whether to return the attention weights. Defaults to None.
            output_hidden_states (bool, optional): Whether to return the hidden states. Defaults to None.
            return_dict (bool, optional): Whether to return the outputs as a dict. Defaults to None.
            training (bool, optional): Whether the model is in training mode. Defaults to False.
        Returns:
            Union[TFBaseModelOutput, Tuple[tf.Tensor]]: The model outputs, containing hidden states and/or attention
            weights depending on the arguments.
Example:
```
>>> from transformers import AutoProcessor, TFHubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state
```
"""
output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
output_attentions = output_attentions if output_attentions else self.config.output_attentions
return_dict = return_dict if return_dict else self.config.return_dict
outputs = self.hubert(
input_values=input_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "hubert", None) is not None:
with tf.name_scope(self.hubert.name):
self.hubert.build(None)
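# Illustrative sketch (not part of the original file) of the three equivalent input
# formats described in HUBERT_START_DOCSTRING above, for an already-instantiated model
# and pre-computed `input_values` / `attention_mask` tensors (names assumed here):
def _demo_input_formats(model, input_values, attention_mask):
    out_kwargs = model(input_values, attention_mask=attention_mask)                         # keyword arguments
    out_list = model([input_values, attention_mask])                                        # positional list, docstring order
    out_dict = model({"input_values": input_values, "attention_mask": attention_mask})      # dict keyed by input names
    return out_kwargs, out_list, out_dict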
@add_start_docstrings(
"""TFHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
HUBERT_START_DOCSTRING,
)
class TFHubertForCTC(TFHubertPreTrainedModel):
def __init__(self, config: HubertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.hubert = TFHubertMainLayer(config, name="hubert")
self.dropout = keras.layers.Dropout(config.final_dropout)
self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head")
self.output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.hubert.feature_extractor.trainable = False
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFCausalLMOutput, config_class=_CONFIG_FOR_DOC)
@unpack_inputs
def call(
self,
input_values: tf.Tensor,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
labels: tf.Tensor | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
"""
        Forward pass for CTC: projects the Hubert hidden states to vocabulary logits through `lm_head` and, when
        `labels` are provided, computes the CTC loss.
"""
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "hubert", None) is not None:
with tf.name_scope(self.hubert.name):
self.hubert.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.output_hidden_size])
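# Illustrative usage sketch (not part of the original file), relying on the full
# Transformers implementation of `TFHubertForCTC.call`: greedy CTC decoding of a
# one-second dummy waveform with a fine-tuned checkpoint.
def _demo_hubert_for_ctc():
    from transformers import AutoProcessor, TFHubertForCTC

    processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
    model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

    speech = tf.random.normal((16000,)).numpy()                        # dummy 16 kHz audio
    inputs = processor(speech, sampling_rate=16000, return_tensors="tf")
    logits = model(**inputs).logits                                    # (batch, frames, vocab_size)
    predicted_ids = tf.argmax(logits, axis=-1)
    return processor.batch_decode(predicted_ids.numpy())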
.\models\hubert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {"configuration_hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_hubert"] = [
"HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"HubertForCTC",
"HubertForSequenceClassification",
"HubertModel",
"HubertPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_hubert"] = [
"TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFHubertForCTC",
"TFHubertModel",
"TFHubertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_hubert import (
HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
HubertForCTC,
HubertForSequenceClassification,
HubertModel,
HubertPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_hubert import (
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFHubertForCTC,
TFHubertModel,
TFHubertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
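# Illustrative note (not part of the original file): because of the _LazyModule
# registration above, the heavy modeling modules are only imported on first
# attribute access, e.g.:
def _demo_lazy_hubert_import():
    from transformers.models.hubert import HubertConfig       # triggers the lazy import machinery
    return HubertConfig()                                      # default Hubert configuration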
.\models\ibert\configuration_ibert.py
""" I-BERT configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"kssteven/ibert-roberta-base": "https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/config.json",
"kssteven/ibert-roberta-large": "https://huggingface.co/kssteven/ibert-roberta-large/resolve/main/config.json",
"kssteven/ibert-roberta-large-mnli": (
"https://huggingface.co/kssteven/ibert-roberta-large-mnli/resolve/main/config.json"
),
}
class IBertConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`IBertModel`]. It is used to instantiate a I-BERT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the IBERT
[kssteven/ibert-roberta-base](https://huggingface.co/kssteven/ibert-roberta-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "ibert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
quant_mode=False,
force_dequant="none",
**kwargs,
    ):
        """
        Initializes an IBertConfig object with default values for its parameters.
        """
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.quant_mode = quant_mode
self.force_dequant = force_dequant
class IBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
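# Illustrative usage sketch (not part of the original file): instantiate a default
# I-BERT configuration and build a randomly initialized model from it.
def _demo_ibert_config():
    from transformers import IBertConfig, IBertModel

    configuration = IBertConfig()              # defaults mirror kssteven/ibert-roberta-base
    model = IBertModel(configuration)          # random weights, architecture defined by the config
    return model.config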
.\models\ibert\modeling_ibert.py
"""PyTorch I-BERT model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "kssteven/ibert-roberta-base"
_CONFIG_FOR_DOC = "IBertConfig"
IBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"kssteven/ibert-roberta-base",
"kssteven/ibert-roberta-large",
"kssteven/ibert-roberta-large-mnli",
]
class IBertEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.embedding_bit = 8
self.embedding_act_bit = 16
self.act_bit = 8
self.ln_input_bit = 22
self.ln_output_bit = 32
self.word_embeddings = QuantEmbedding(
config.vocab_size,
config.hidden_size,
padding_idx=config.pad_token_id,
weight_bit=self.embedding_bit,
quant_mode=self.quant_mode,
)
self.token_type_embeddings = QuantEmbedding(
config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.padding_idx = config.pad_token_id
self.position_embeddings = QuantEmbedding(
config.max_position_embeddings,
config.hidden_size,
padding_idx=self.padding_idx,
weight_bit=self.embedding_bit,
quant_mode=self.quant_mode,
)
self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
self.LayerNorm = IntLayerNorm(
config.hidden_size,
eps=config.layer_norm_eps,
output_bit=self.ln_output_bit,
quant_mode=self.quant_mode,
force_dequant=config.force_dequant,
)
self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(
input_ids, self.padding_idx, past_key_values_length
).to(input_ids.device)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
else:
inputs_embeds_scaling_factor = None
token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)
embeddings, embeddings_scaling_factor = self.embeddings_act1(
inputs_embeds,
inputs_embeds_scaling_factor,
identity=token_type_embeddings,
identity_scaling_factor=token_type_embeddings_scaling_factor,
)
if self.position_embedding_type == "absolute":
position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
embeddings, embeddings_scaling_factor = self.embeddings_act1(
embeddings,
embeddings_scaling_factor,
identity=position_embeddings,
identity_scaling_factor=position_embeddings_scaling_factor,
)
embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
embeddings = self.dropout(embeddings)
embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
return embeddings, embeddings_scaling_factor
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
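# Illustrative worked example (not part of the original file): with the RoBERTa/I-BERT
# convention padding_idx = 1, a 4-step sequence of embeddings gets position ids
# [2, 3, 4, 5], keeping id 1 reserved for padded positions.
def _demo_position_ids_from_inputs_embeds():
    padding_idx = 1
    inputs_embeds = torch.zeros(1, 4, 8)                       # (batch, seq_len, hidden)
    seq_len = inputs_embeds.size(1)
    position_ids = torch.arange(padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long)
    return position_ids.unsqueeze(0).expand(inputs_embeds.size()[:-1])   # tensor([[2, 3, 4, 5]])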
class IBertSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.quant_mode = config.quant_mode
self.weight_bit = 8
self.bias_bit = 32
self.act_bit = 8
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = QuantLinear(
config.hidden_size,
self.all_head_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
self.key = QuantLinear(
config.hidden_size,
self.all_head_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
self.value = QuantLinear(
config.hidden_size,
self.all_head_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type != "absolute":
raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")
self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
hidden_states_scaling_factor,
attention_mask=None,
head_mask=None,
        output_attentions=False,
    ):
        # The quantized self-attention computation is not shown in this excerpt.
        pass
class IBertSelfOutput(nn.Module):
    def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.act_bit = 8
self.weight_bit = 8
self.bias_bit = 32
self.ln_input_bit = 22
self.ln_output_bit = 32
self.dense = QuantLinear(
config.hidden_size,
config.hidden_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
self.LayerNorm = IntLayerNorm(
config.hidden_size,
eps=config.layer_norm_eps,
output_bit=self.ln_output_bit,
quant_mode=self.quant_mode,
force_dequant=config.force_dequant,
)
self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
hidden_states = self.dropout(hidden_states)
hidden_states, hidden_states_scaling_factor = self.ln_input_act(
hidden_states,
hidden_states_scaling_factor,
identity=input_tensor,
identity_scaling_factor=input_tensor_scaling_factor,
)
hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
hidden_states, hidden_states_scaling_factor = self.output_activation(
hidden_states, hidden_states_scaling_factor
)
return hidden_states, hidden_states_scaling_factor
class IBertAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.self = IBertSelfAttention(config)
self.output = IBertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
hidden_states_scaling_factor,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_outputs, self_outputs_scaling_factor = self.self(
hidden_states,
hidden_states_scaling_factor,
attention_mask,
head_mask,
output_attentions,
)
attention_output, attention_output_scaling_factor = self.output(
self_outputs[0], self_outputs_scaling_factor[0], hidden_states, hidden_states_scaling_factor
)
outputs = (attention_output,) + self_outputs[1:]
outputs_scaling_factor = (attention_output_scaling_factor,) + self_outputs_scaling_factor[1:]
return outputs, outputs_scaling_factor
class IBertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.act_bit = 8
self.weight_bit = 8
self.bias_bit = 32
self.dense = QuantLinear(
config.hidden_size,
config.intermediate_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
if config.hidden_act != "gelu":
raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
def forward(self, hidden_states, hidden_states_scaling_factor):
hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
hidden_states, hidden_states_scaling_factor = self.intermediate_act_fn(
hidden_states, hidden_states_scaling_factor
)
hidden_states, hidden_states_scaling_factor = self.output_activation(
hidden_states, hidden_states_scaling_factor
)
return hidden_states, hidden_states_scaling_factor
class IBertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.act_bit = 8
self.weight_bit = 8
self.bias_bit = 32
self.ln_input_bit = 22
self.ln_output_bit = 32
self.dense = QuantLinear(
config.intermediate_size,
config.hidden_size,
bias=True,
weight_bit=self.weight_bit,
bias_bit=self.bias_bit,
quant_mode=self.quant_mode,
per_channel=True,
)
self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
self.LayerNorm = IntLayerNorm(
config.hidden_size,
eps=config.layer_norm_eps,
output_bit=self.ln_output_bit,
quant_mode=self.quant_mode,
force_dequant=config.force_dequant,
)
self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
hidden_states = self.dropout(hidden_states)
hidden_states, hidden_states_scaling_factor = self.ln_input_act(
hidden_states,
hidden_states_scaling_factor,
identity=input_tensor,
identity_scaling_factor=input_tensor_scaling_factor,
)
hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
hidden_states, hidden_states_scaling_factor = self.output_activation(
hidden_states, hidden_states_scaling_factor
)
return hidden_states, hidden_states_scaling_factor
class IBertLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.act_bit = 8
self.seq_len_dim = 1
self.attention = IBertAttention(config)
self.intermediate = IBertIntermediate(config)
self.output = IBertOutput(config)
self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
def forward(
self,
hidden_states,
hidden_states_scaling_factor,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_attention_outputs, self_attention_outputs_scaling_factor = self.attention(
hidden_states,
hidden_states_scaling_factor,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
attention_output_scaling_factor = self_attention_outputs_scaling_factor[0]
outputs = self_attention_outputs[1:]
layer_output, layer_output_scaling_factor = self.feed_forward_chunk(
attention_output, attention_output_scaling_factor
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
attention_output, attention_output_scaling_factor
)
intermediate_output, intermediate_output_scaling_factor = self.intermediate(
attention_output, attention_output_scaling_factor
)
intermediate_output, intermediate_output_scaling_factor = self.pre_output_act(
intermediate_output, intermediate_output_scaling_factor
)
layer_output, layer_output_scaling_factor = self.output(
intermediate_output, intermediate_output_scaling_factor, attention_output, attention_output_scaling_factor
)
return layer_output, layer_output_scaling_factor
class IBertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.quant_mode = config.quant_mode
self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])
def forward(
self,
hidden_states,
hidden_states_scaling_factor,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = None
next_decoder_cache = None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
layer_outputs = layer_module(
hidden_states,
hidden_states_scaling_factor,
attention_mask,
layer_head_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class IBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.quant_mode = config.quant_mode
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class IBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = IBertConfig
base_model_prefix = "ibert"
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, (QuantLinear, nn.Linear)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (QuantEmbedding, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def resize_token_embeddings(self, new_num_tokens=None):
raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
IBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`IBertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
IBERT_INPUTS_DOCSTRING = r"""
This string contains the docstring for explaining the inputs accepted by the IBERT model.
This docstring should describe the expected inputs for the model, such as input tensors or data structures,
their types, shapes, and any preprocessing requirements.
It provides guidance on how to format and prepare data for the model's forward pass, ensuring compatibility
with the model's architecture and requirements.
This documentation helps users understand how to correctly interface with the model, ensuring inputs are
correctly formatted to achieve expected results.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare I-BERT Model transformer outputting raw hidden-states without any specific head on top.",
IBERT_START_DOCSTRING,
)
class IBertModel(IBertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.quant_mode = config.quant_mode
# Initialize the embeddings layer for the IBERT model
self.embeddings = IBertEmbeddings(config)
# Initialize the encoder layer for the IBERT model
self.encoder = IBertEncoder(config)
# Initialize the pooling layer if specified
self.pooler = IBertPooler(config) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the word embeddings from the embeddings layer
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Set new word embeddings to the embeddings layer
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over layers and prune specific attention heads in each layer
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Forward pass through the IBERT model
# Detailed arguments are passed to handle different configurations
pass
@add_start_docstrings(
"I-BERT Model with a `language modeling` head on top.",
IBERT_START_DOCSTRING
)
class IBertForMaskedLM(IBertPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]
def __init__(self, config):
super().__init__(config)
# Initialize the IBERT model without a pooling layer
self.ibert = IBertModel(config, add_pooling_layer=False)
# Initialize the language modeling head for IBERT
self.lm_head = IBertLMHead(config)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Return the decoder weights from the language modeling head
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
# Replace the decoder layer of the language modeling head with the new embeddings
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Decide whether to return a dict-style output
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the forward pass of the I-BERT backbone
outputs = self.ibert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output
sequence_output = outputs[0]
# Feed the sequence output to the LM head to obtain prediction scores
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
# If labels are provided, compute the masked language modeling loss
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# If dict output is not requested, assemble a plain tuple
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Otherwise, return a MaskedLMOutput object
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
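# A minimal usage sketch for the class above (not part of the original file). It assumes the public
# `kssteven/ibert-roberta-base` checkpoint is available and that `transformers` exposes IBertForMaskedLM.
import torch
from transformers import AutoTokenizer, IBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("kssteven/ibert-roberta-base")
model = IBertForMaskedLM.from_pretrained("kssteven/ibert-roberta-base")
inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
# Pick the highest-scoring token at the <mask> position
mask_pos = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))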
class IBertLMHead(nn.Module):
"""I-BERT Head for masked language modeling."""
def __init__(self, config):
super().__init__()
# A dense layer mapping hidden_size -> hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# LayerNorm over the hidden states; eps is the numerical-stability constant
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# A dense layer projecting hidden_size -> vocab_size
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
# A bias parameter of size vocab_size
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Attach the bias to the decoder layer
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
# Project the input features through the dense layer
x = self.dense(features)
# Apply the GELU activation
x = gelu(x)
# Normalize with LayerNorm
x = self.layer_norm(x)
# Project back to vocabulary size with the decoder (bias included)
x = self.decoder(x)
return x
def _tie_weights(self):
# If the two weights get disconnected (on TPU or when the bias is resized), tie the bias to the decoder's bias
self.bias = self.decoder.bias
@add_start_docstrings(
"""
I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
IBERT_START_DOCSTRING,
)
class IBertForSequenceClassification(IBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels for the classification task
self.num_labels = config.num_labels
# I-BERT backbone without a pooling layer
self.ibert = IBertModel(config, add_pooling_layer=False)
# Classification head
self.classifier = IBertClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Use the provided return_dict if given, otherwise fall back to self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the pretrained `ibert` backbone and collect its outputs
outputs = self.ibert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output (the last hidden states fed to the classifier)
sequence_output = outputs[0]
# Pass the sequence output through the classifier to get the logits (classification/regression predictions)
logits = self.classifier(sequence_output)
# Initialize the loss to None
loss = None
# If labels are provided
if labels is not None:
# If the problem type is not set yet
if self.config.problem_type is None:
# Infer the problem type from num_labels and the label dtype
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Compute the loss according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()  # mean squared error loss
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()  # cross-entropy loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()  # binary cross-entropy with logits
loss = loss_fct(logits, labels)
# If return_dict is False, return a tuple with the logits and the remaining outputs
if not return_dict:
output = (logits,) + outputs[2:]  # combine the logits with the extra outputs
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return a SequenceClassifierOutput object
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
I-BERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
IBERT_START_DOCSTRING,
)
class IBertForMultipleChoice(IBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# I-BERT backbone (with pooling layer)
self.ibert = IBertModel(config)
# Dropout layer
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Classifier: a linear layer mapping the pooled hidden state to a single score
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Use the provided return_dict if given, otherwise fall back to config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# The number of choices is the second dimension of the inputs
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten input_ids, position_ids, token_type_ids, attention_mask and inputs_embeds over the choices dimension
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the `ibert` backbone on the flattened inputs
outputs = self.ibert(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled output
pooled_output = outputs[1]
# Apply dropout to the pooled output
pooled_output = self.dropout(pooled_output)
# Compute the per-choice logits with the classifier
logits = self.classifier(pooled_output)
# Reshape the logits to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
# If labels are provided, compute the cross-entropy loss
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# If return_dict is False, return the reshaped logits together with the extra hidden states
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise, return a MultipleChoiceModelOutput with the loss, reshaped logits, hidden states and attentions
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
I-BERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
IBERT_START_DOCSTRING,
)
class IBertForTokenClassification(IBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels  # number of labels from the config
self.ibert = IBertModel(config, add_pooling_layer=False)  # I-BERT backbone without a pooling layer
self.dropout = nn.Dropout(config.hidden_dropout_prob)  # dropout with the configured probability
self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # linear classifier: hidden_size -> num_labels
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the forward pass of the I-BERT backbone
outputs = self.ibert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]  # sequence output of the I-BERT backbone
sequence_output = self.dropout(sequence_output)  # apply dropout to the sequence output
logits = self.classifier(sequence_output)  # apply the linear classifier to get per-token logits
loss = None
if labels is not None:
# If labels are provided, compute the cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
# If dict output is not requested, return the plain tuple format
output = (logits,) + outputs[2:]  # combine the logits with the remaining outputs
return ((loss,) + output) if loss is not None else output
# Otherwise, build and return a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class IBertClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# A dense layer mapping hidden_size -> hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Dropout for regularization, with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Output projection mapping hidden_size -> num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
# Take the hidden state of the first token of each sample (the [CLS]-equivalent token)
hidden_states = features[:, 0, :]
# Apply dropout
hidden_states = self.dropout(hidden_states)
# Linear transformation through the dense layer
hidden_states = self.dense(hidden_states)
# tanh activation
hidden_states = torch.tanh(hidden_states)
# Dropout again
hidden_states = self.dropout(hidden_states)
# Final projection to the label space
hidden_states = self.out_proj(hidden_states)
return hidden_states
@add_start_docstrings(
"""
I-BERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
IBERT_START_DOCSTRING,
)
class IBertForQuestionAnswering(IBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# I-BERT backbone without a pooling layer
self.ibert = IBertModel(config, add_pooling_layer=False)
# Linear layer producing the start/end span logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[QuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Use the provided return_dict if given, otherwise fall back to the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the forward pass of the backbone and collect its outputs
outputs = self.ibert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the model outputs
sequence_output = outputs[0]
# Feed the sequence output to the QA head and split it into start and end logits
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If start_positions or end_positions have an extra dimension, squeeze it out
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp start/end positions that fall outside of the model input
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores the clamped (out-of-range) index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Average the start and end losses
total_loss = (start_loss + end_loss) / 2
if not return_dict:
# If dict output is not requested, return the loss and logits as a tuple
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Return a QuestionAnsweringModelOutput with the loss, start/end logits, hidden states and attentions
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Create position ids from `input_ids`. Non-padding symbols are replaced with their position numbers, which
# start counting at `padding_idx + 1`; padding symbols are ignored. Adapted from fairseq's *utils.make_positions*.
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's *utils.make_positions*.
Args:
input_ids (`torch.LongTensor`):
Indices of input sequence tokens in the vocabulary.
Returns: torch.Tensor
"""
# Build a mask with input_ids.ne(padding_idx): 1 for non-padding tokens, 0 for padding tokens
mask = input_ids.ne(padding_idx).int()
# Cumulative count of non-padding tokens per row, cast to the mask's dtype, offset by past_key_values_length
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# Cast to torch.long and add padding_idx to obtain the final position ids
return incremental_indices.long() + padding_idx
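# A small worked example (a sketch, not part of the original file); padding_idx=1 follows the
# RoBERTa-style convention that I-BERT assumes.
import torch

input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # the last two tokens are padding
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 5, 1, 1]]) -- positions start at padding_idx + 1, padding keeps padding_idx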
.\models\ibert\quant_modules.py
import decimal
import numpy as np
import torch
from torch import nn
from torch.autograd import Function
from ...utils import logging
logger = logging.get_logger(__name__)
class QuantEmbedding(nn.Module):
"""
Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of `torch.nn.Embedding`.
Args:
weight_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the quantized weight.
momentum (`float`, *optional*, defaults to `0.95`):
Momentum for updating the activation quantization range.
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
max_norm=None,
norm_type=2.0,
scale_grad_by_freq=False,
sparse=False,
_weight=None,
weight_bit=8,
momentum=0.95,
quant_mode=False,
):
super().__init__()
self.num_ = num_embeddings
self.dim = embedding_dim
self.padding_idx = padding_idx
self.max_norm = max_norm
self.norm_type = norm_type
self.scale_grad_by_freq = scale_grad_by_freq
self.sparse = sparse
self.weight = nn.Parameter(torch.zeros([num_embeddings, embedding_dim]))
self.register_buffer("weight_scaling_factor", torch.zeros(1))
self.register_buffer("weight_integer", torch.zeros_like(self.weight))
self.weight_bit = weight_bit
self.momentum = momentum
self.quant_mode = quant_mode
self.percentile_mode = False
self.weight_function = SymmetricQuantFunction.apply
def forward(self, x, positions=None, incremental_state=None):
if not self.quant_mode:
return (
nn.functional.embedding(
x,
self.weight,
self.padding_idx,
self.max_norm,
self.norm_type,
self.scale_grad_by_freq,
self.sparse,
),
None,
)
w = self.weight
w_transform = w.data.detach()
w_min = w_transform.min().expand(1)
w_max = w_transform.max().expand(1)
self.weight_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, False)
self.weight_integer = self.weight_function(
self.weight, self.weight_bit, self.percentile_mode, self.weight_scaling_factor
)
emb_int = nn.functional.embedding(
x,
self.weight_integer,
self.padding_idx,
self.max_norm,
self.norm_type,
self.scale_grad_by_freq,
self.sparse,
)
return emb_int * self.weight_scaling_factor, self.weight_scaling_factor
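# A minimal sketch of how this class is used (assumes the classes and helpers defined in this file):
# QuantEmbedding returns a (tensor, scaling_factor) pair; the scaling factor is None when quant_mode=False.
import torch

emb = QuantEmbedding(num_embeddings=10, embedding_dim=4, weight_bit=8, quant_mode=True)
with torch.no_grad():
    emb.weight.uniform_(-1, 1)  # the weight is zero-initialized above, so give it some values
out, scale = emb(torch.tensor([[1, 2, 3]]))
print(out.shape, scale.shape)  # torch.Size([1, 3, 4]) torch.Size([1])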
class QuantAct(nn.Module):
"""
Quantizes the given activation.
Args:
activation_bit (`int`):
Bitwidth for the quantized activation.
act_range_momentum (`float`, *optional*, defaults to `0.95`):
Momentum for updating the activation quantization range.
per_channel (`bool`, *optional*, defaults to `False`):
Whether or not to use channel-wise quantization.
channel_len (`int`, *optional*):
Specify the channel length when *per_channel* is set to True.
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
def __init__(self, activation_bit, act_range_momentum=0.95, per_channel=False, channel_len=None, quant_mode=False):
super().__init__()
self.activation_bit = activation_bit
self.act_range_momentum = act_range_momentum
self.quant_mode = quant_mode
self.per_channel = per_channel
self.percentile = False
self.act_function = SymmetricQuantFunction.apply
if not self.per_channel:
self.register_buffer("x_min", torch.zeros(1))
self.register_buffer("x_max", torch.zeros(1))
self.register_buffer("act_scaling_factor", torch.zeros(1))
self.x_min -= 1e-5
self.x_max += 1e-5
else:
raise NotImplementedError("per-channel mode is not currently supported for activation.")
def __repr__(self):
return (
f"{self.__class__.__name__}(activation_bit={self.activation_bit}, "
f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, "
f"Act_max: {self.x_max.item():.2f})"
)
def forward(
self,
x,
pre_act_scaling_factor=None,
identity=None,
identity_scaling_factor=None,
specified_min=None,
specified_max=None,
):
x_act = x if identity is None else identity + x
if self.training:
assert not self.percentile, "percentile mode is not currently supported for activation."
assert not self.per_channel, "per-channel mode is not currently supported for activation."
x_min = x_act.data.min()
x_max = x_act.data.max()
assert (
x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0
), "NaN detected when computing min/max of the activation"
if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
self.x_min = self.x_min + x_min
self.x_max = self.x_max + x_max
elif self.act_range_momentum == -1:
self.x_min = torch.min(self.x_min, x_min)
self.x_max = torch.max(self.x_max, x_max)
else:
self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum)
self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum)
if not self.quant_mode:
return x_act, None
x_min = self.x_min if specified_min is None else specified_min
x_max = self.x_max if specified_max is None else specified_max
self.act_scaling_factor = symmetric_linear_quantization_params(
self.activation_bit, x_min, x_max, per_channel=self.per_channel
)
if pre_act_scaling_factor is None:
quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor)
else:
quant_act_int = FixedPointMul.apply(
x,
pre_act_scaling_factor,
self.activation_bit,
self.act_scaling_factor,
identity,
identity_scaling_factor,
)
correct_output_scale = self.act_scaling_factor.view(-1)
return quant_act_int * correct_output_scale, self.act_scaling_factor
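# A minimal sketch (assumes the classes in this file): in training mode QuantAct first updates the
# activation range buffers (running min/max or EMA), then fake-quantizes to `activation_bit` bits.
import torch

act = QuantAct(activation_bit=8, quant_mode=True)
act.train()
x_q, scale = act(torch.randn(2, 5))   # range buffers x_min/x_max are updated, x is quantized
print(scale.shape)                    # torch.Size([1]) -- a single global scaling factor
act.eval()
x_q_eval, _ = act(torch.randn(2, 5))  # evaluation reuses the collected range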
class QuantLinear(nn.Module):
"""
Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.
Args:
weight_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the quantized weight.
bias_bit (`int`, *optional*, defaults to `32`):
Bitwidth for the quantized bias.
per_channel (`bool`, *optional*, defaults to `False`):
Whether or not to use channel-wise quantization.
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
def __init__(
self, in_features, out_features, bias=True, weight_bit=8, bias_bit=32, per_channel=False, quant_mode=False
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = nn.Parameter(torch.zeros([out_features, in_features]))
self.register_buffer("weight_integer", torch.zeros_like(self.weight))
self.register_buffer("fc_scaling_factor", torch.zeros(self.out_features))
if bias:
self.bias = nn.Parameter(torch.zeros(out_features))
self.register_buffer("bias_integer", torch.zeros_like(self.bias))
self.weight_bit = weight_bit
self.quant_mode = quant_mode
self.per_channel = per_channel
self.bias_bit = bias_bit
self.quant_mode = quant_mode
self.percentile_mode = False
self.weight_function = SymmetricQuantFunction.apply
def __repr__(self):
s = super().__repr__()
s = f"({s} weight_bit={self.weight_bit}, quant_mode={self.quant_mode})"
return s
def forward(self, x, prev_act_scaling_factor=None):
if not self.quant_mode:
return nn.functional.linear(x, weight=self.weight, bias=self.bias), None
assert prev_act_scaling_factor is not None and prev_act_scaling_factor.shape == (1,), (
"Input activation to the QuantLinear layer should be globally (non-channel-wise) quantized. "
"Please add a QuantAct layer with `per_channel = True` before this QuantAct layer"
)
w = self.weight
w_transform = w.data.detach()
if self.per_channel:
w_min, _ = torch.min(w_transform, dim=1, out=None)
w_max, _ = torch.max(w_transform, dim=1, out=None)
else:
w_min = w_transform.min().expand(1)
w_max = w_transform.max().expand(1)
self.fc_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, self.per_channel)
self.weight_integer = self.weight_function(
self.weight, self.weight_bit, self.percentile_mode, self.fc_scaling_factor
)
bias_scaling_factor = self.fc_scaling_factor * prev_act_scaling_factor
if self.bias is not None:
self.bias_integer = self.weight_function(self.bias, self.bias_bit, False, bias_scaling_factor)
prev_act_scaling_factor = prev_act_scaling_factor.view(1, -1)
x_int = x / prev_act_scaling_factor
return (
nn.functional.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor,
bias_scaling_factor,
)
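# A minimal chain sketch (assumes the classes in this file): QuantLinear requires the global (shape (1,))
# scaling factor produced by a preceding QuantAct, and returns the rescaled output plus its scaling factor.
import torch

act = QuantAct(activation_bit=8, quant_mode=True)
lin = QuantLinear(in_features=4, out_features=2, weight_bit=8, bias_bit=32, quant_mode=True)
with torch.no_grad():
    lin.weight.uniform_(-0.5, 0.5)  # the weight is zero-initialized above

act.train()
x_q, act_scale = act(torch.randn(3, 4))  # act_scale has shape (1,)
y, out_scale = lin(x_q, act_scale)       # integer matmul rescaled by the weight and activation scales
print(y.shape, out_scale.shape)          # torch.Size([3, 2]) torch.Size([1])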
class IntGELU(nn.Module):
"""
Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.
Args:
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize the layer if either "gelu" or "nonlinear" is given.
"""
def __init__(self, quant_mode=True, force_dequant="none"):
super().__init__()
self.quant_mode = quant_mode
if force_dequant in ["nonlinear", "gelu"]:
logger.info("Force dequantize gelu")
self.quant_mode = False
if not self.quant_mode:
self.activation_fn = nn.GELU()
self.k = 1.4142
self.const = 14
self.coeff = [-0.2888, -1.769, 1]
self.coeff[2] /= self.coeff[0]
def int_erf(self, x_int, scaling_factor):
b_int = torch.floor(self.coeff[1] / scaling_factor)
c_int = torch.floor(self.coeff[2] / scaling_factor**2)
sign = torch.sign(x_int)
abs_int = torch.min(torch.abs(x_int), -b_int)
y_int = sign * ((abs_int + b_int) ** 2 + c_int)
scaling_factor = scaling_factor**2 * self.coeff[0]
y_int = floor_ste.apply(y_int / 2**self.const)
scaling_factor = scaling_factor * 2**self.const
return y_int, scaling_factor
def forward(self, x, scaling_factor=None):
if not self.quant_mode:
return self.activation_fn(x), None
x_int = x / scaling_factor
sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k)
shift_int = 1.0 // sigmoid_scaling_factor
x_int = x_int * (sigmoid_int + shift_int)
scaling_factor = scaling_factor * sigmoid_scaling_factor / 2
return x_int * scaling_factor, scaling_factor
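# An informal numeric check (not part of the original file): the i-GELU coefficients a=-0.2888,
# b=-1.769 approximate erf(x / sqrt(2)), so x * 0.5 * (1 + erf_approx) tracks the exact GELU.
import torch

a, b, k = -0.2888, -1.769, 1.4142
x = torch.linspace(-4, 4, 161)
u = torch.clamp(x.abs() / k, max=-b)
erf_approx = torch.sign(x) * (a * (u + b) ** 2 + 1)
gelu_approx = x * 0.5 * (1 + erf_approx)
print((gelu_approx - torch.nn.functional.gelu(x)).abs().max())  # error on the order of 1e-2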
class IntSoftmax(nn.Module):
"""
Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of `torch.nn.Softmax`.
Args:
output_bit (`int`):
Bitwidth for the layer output activation.
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize the layer if either "softmax" or "nonlinear" is given.
"""
def __init__(self, output_bit, quant_mode=False, force_dequant="none"):
super().__init__()
self.output_bit = output_bit
self.max_bit = 32
self.quant_mode = quant_mode
if force_dequant in ["nonlinear", "softmax"]:
logger.info("Force dequantize softmax")
self.quant_mode = False
self.act = QuantAct(16, quant_mode=self.quant_mode)
self.x0 = -0.6931
self.const = 30
self.coef = [0.35815147, 0.96963238, 1.0]
self.coef[1] /= self.coef[0]
self.coef[2] /= self.coef[0]
def int_polynomial(self, x_int, scaling_factor):
with torch.no_grad():
b_int = torch.floor(self.coef[1] / scaling_factor)
c_int = torch.floor(self.coef[2] / scaling_factor**2)
z = (x_int + b_int) * x_int + c_int
scaling_factor = self.coef[0] * scaling_factor**2
return z, scaling_factor
def int_exp(self, x_int, scaling_factor):
with torch.no_grad():
x0_int = torch.floor(self.x0 / scaling_factor)
x_int = torch.max(x_int, self.const * x0_int)
q = floor_ste.apply(x_int / x0_int)
r = x_int - x0_int * q
exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor)
exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0)
scaling_factor = exp_scaling_factor / 2**self.const
return exp_int, scaling_factor
def forward(self, x, scaling_factor):
if not self.quant_mode:
return nn.functional.softmax(x, dim=-1), None
x_int = x / scaling_factor
x_int_max, _ = x_int.max(dim=-1, keepdim=True)
x_int = x_int - x_int_max
exp_int, exp_scaling_factor = self.int_exp(x_int, scaling_factor)
exp, exp_scaling_factor = self.act(exp_int, exp_scaling_factor)
exp_int = exp / exp_scaling_factor
exp_int_sum = exp_int.sum(dim=-1, keepdim=True)
factor = floor_ste.apply(2**self.max_bit / exp_int_sum)
exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit))
scaling_factor = 1 / 2**self.output_bit
return exp_int * scaling_factor, scaling_factor
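# An informal check of the i-exp decomposition used by int_exp above (a sketch, not part of the file):
# exp(x) = 2**(-q) * exp(r), with x0 = -ln(2) ~= -0.6931, q = floor(x / x0) and r = x - q * x0.
import math

x, x0 = -2.7, -0.6931
q = math.floor(x / x0)
r = x - q * x0
print(math.exp(x), 2.0 ** (-q) * math.exp(r))  # the two values agree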
"""
Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of `torch.nn.LayerNorm`.
Args:
normalized_shape (`int` or `list` or `torch.Size`):
Shape of the input tensor over which normalization is applied.
eps (`float`):
Small value added to the denominator for numerical stability.
output_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the layer output activation.
quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
force_dequant (`str`, *optional*, defaults to `"none"`):
If set to `"layernorm"` or `"nonlinear"`, forces dequantization of the layer.
Attributes:
weight (`torch.nn.Parameter`):
Learnable parameter representing the scaling factor.
bias (`torch.nn.Parameter`):
Learnable parameter representing the bias.
shift (`torch.Tensor`):
Buffer holding the shift value for dynamic adjustment.
output_bit (`int`):
Bitwidth for the layer output activation.
max_bit (`int`):
Maximum allowable bitwidth for quantization.
dim_sqrt (`None`):
Placeholder for the square root of the dimension, initially `None`.
activation (`QuantAct`):
Instance of `QuantAct` for quantization-aware activation.
Methods:
set_shift(self, y_int):
Adjusts `self.shift` based on the input tensor `y_int`.
overflow_fallback(self, y_int):
Handles overflow during training and adjusts `self.shift` accordingly.
Notes:
- This class extends `torch.nn.Module` and integrates quantization-specific features.
- It manages parameters for scaling and bias, quantization mode, and dynamic shift adjustments.
- The `QuantAct` instance `activation` handles activation quantization within the layer.
"""
def __init__(self, normalized_shape, eps, output_bit=8, quant_mode=False, force_dequant="none"):
super().__init__()
self.normalized_shape = normalized_shape
self.eps = eps
self.weight = nn.Parameter(torch.zeros(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.quant_mode = quant_mode
if force_dequant in ["nonlinear", "layernorm"]:
logger.info("Force dequantize layernorm")
self.quant_mode = False
self.register_buffer("shift", torch.zeros(1))
self.output_bit = output_bit
self.max_bit = 32
self.dim_sqrt = None
self.activation = QuantAct(self.output_bit, quant_mode=self.quant_mode)
def set_shift(self, y_int):
"""
Adjusts `self.shift` based on the input tensor `y_int`.
Args:
y_int (`torch.Tensor`):
Integer tensor representing the quantized activation values.
"""
with torch.no_grad():
y_sq_int = y_int**2
var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max()
shift_old = self.shift
self.shift = torch.max(self.shift, shift)
logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}")
def overflow_fallback(self, y_int):
"""
Handles overflow during training and adjusts `self.shift` accordingly.
Args:
y_int (`torch.Tensor`):
Integer tensor representing the quantized activation values.
Returns:
`torch.Tensor`: Tensor representing the adjusted variance after shift.
"""
self.set_shift(y_int)
y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
y_sq_int = y_int_shifted**2
var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
return var_int
def forward(self, x, scaling_factor=None):
if not self.quant_mode:
mean = x.mean(axis=2, keepdim=True)
y = x - mean
var = torch.mean(y**2, axis=2, keepdim=True)
x = y / torch.sqrt(self.eps + var)
x = x * self.weight + self.bias
return x, None
if self.dim_sqrt is None:
n = torch.tensor(x.shape[2], dtype=torch.float)
self.dim_sqrt = torch.sqrt(n).to(x.device)
x_int = x / scaling_factor
mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True))
y_int = x_int - mean_int
y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
y_sq_int = y_int_shifted**2
var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
if self.training:
if var_int.max() >= 2**self.max_bit:
var_int = self.overflow_fallback(y_int)
assert var_int.max() < 2**self.max_bit + 0.1, (
"Error detected in overflow handling: "
"`var_int` exceeds `self.max_bit` (the maximum possible bit width)"
)
std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift
factor = floor_ste.apply(2**31 / std_int)
y_int = floor_ste.apply(y_int * factor / 2)
scaling_factor = self.dim_sqrt / 2**30
bias = self.bias.data.detach() / (self.weight.data.detach())
bias_int = floor_ste.apply(bias / scaling_factor)
y_int = y_int + bias_int
scaling_factor = scaling_factor * self.weight
x = y_int * scaling_factor
return x, scaling_factor
def get_percentile_min_max(input, lower_percentile, upper_percentile, output_tensor=False):
"""
Calculate the percentile max and min values in a given tensor
Args:
input (`torch.Tensor`):
The target tensor to calculate percentile max and min.
lower_percentile (`float`):
If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
upper_percentile (`float`):
If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
output_tensor (`bool`, *optional*, defaults to `False`):
If True, this function returns tensors, otherwise it returns values.
Returns:
`Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
"""
input_length = input.shape[0]
lower_index = round(input_length * (1 - lower_percentile * 0.01))
upper_index = round(input_length * upper_percentile * 0.01)
upper_bound = torch.kthvalue(input, k=upper_index).values
if lower_percentile == 0:
lower_bound = upper_bound * 0
else:
lower_bound = -torch.kthvalue(-input, k=lower_index).values
if not output_tensor:
lower_bound = lower_bound.item()
upper_bound = upper_bound.item()
return lower_bound, upper_bound
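# A small sketch (not part of the original file): with the 1000 values 0..999, asking for the
# 1% / 99% percentile bounds returns (10.0, 989.0).
import torch

print(get_percentile_min_max(torch.arange(1000.0), lower_percentile=1.0, upper_percentile=99.0))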
def linear_quantize(input, scale, zero_point, inplace=False):
"""
Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.
Args:
input (`torch.Tensor`):
Single-precision input tensor to be quantized.
scale (`torch.Tensor`):
Scaling factor for quantization.
zero_point (`torch.Tensor`):
Shift for quantization.
inplace (`bool`, *optional*, defaults to `False`):
Whether to compute inplace or not.
Returns:
`torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
"""
if len(input.shape) == 4:
scale = scale.view(-1, 1, 1, 1)
zero_point = zero_point.view(-1, 1, 1, 1)
elif len(input.shape) == 2:
scale = scale.view(-1, 1)
zero_point = zero_point.view(-1, 1)
else:
scale = scale.view(-1)
zero_point = zero_point.view(-1)
if inplace:
input.mul_(1.0 / scale).add_(zero_point).round_()
return input
return torch.round(1.0 / scale * input + zero_point)
def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False):
"""
Compute the scaling factor with the given quantization range for symmetric quantization.
"""
with torch.no_grad():
n = 2 ** (num_bits - 1) - 1
if per_channel:
scale, _ = torch.max(torch.stack([saturation_min.abs(), saturation_max.abs()], dim=1), dim=1)
scale = torch.clamp(scale, min=1e-8) / n
else:
scale = max(saturation_min.abs(), saturation_max.abs())
scale = torch.clamp(scale, min=1e-8) / n
return scale
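# A small worked example using the two helpers above (a sketch, not part of the original file): with
# 8 bits and the range [-0.5, 1.0], the symmetric scale is max(|-0.5|, |1.0|) / 127 = 1 / 127.
import torch

scale = symmetric_linear_quantization_params(8, torch.tensor([-0.5]), torch.tensor([1.0]))
x = torch.tensor([[0.25, -0.5, 1.0]])
x_int = linear_quantize(x, scale, torch.tensor(0.0))
print(scale)          # tensor([0.0079])
print(x_int)          # tensor([[ 32., -64., 127.]])
print(x_int * scale)  # dequantized values close to the original inputs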
class SymmetricQuantFunction(Function):
"""
Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth.
"""
@staticmethod
def forward(ctx, x, k, percentile_mode, scale):
"""
Args:
x (`torch.Tensor`):
Floating point tensor to be quantized.
k (`int`):
Quantization bitwidth.
percentile_mode (`bool`):
Whether or not to use percentile calibration.
scale (`torch.Tensor`):
Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
requires pre-calculated scaling factor.
Returns:
`torch.Tensor`: Symmetric-quantized value of *input*.
"""
zero_point = torch.tensor(0.0).to(scale.device)
n = 2 ** (k - 1) - 1
new_quant_x = linear_quantize(x, scale, zero_point, inplace=False)
new_quant_x = torch.clamp(new_quant_x, -n, n - 1)
ctx.scale = scale
return new_quant_x
@staticmethod
def backward(ctx, grad_output):
scale = ctx.scale
if len(grad_output.shape) == 4:
scale = scale.view(-1, 1, 1, 1)
elif len(grad_output.shape) == 2:
scale = scale.view(-1, 1)
else:
scale = scale.view(-1)
return grad_output.clone() / scale, None, None, None, None
class floor_ste(Function):
"""
Straight-through estimator (STE) for torch.floor()
"""
@staticmethod
def forward(ctx, x):
return torch.floor(x)
@staticmethod
def backward(ctx, grad_output):
return grad_output.clone()
class round_ste(Function):
"""
Straight-through estimator (STE) for torch.round()
"""
@staticmethod
def forward(ctx, x):
return torch.round(x)
@staticmethod
def backward(ctx, grad_output):
return grad_output.clone()
def batch_frexp(inputs, max_bit=31):
"""
Decompose the scaling factor into mantissa and twos exponent.
Args:
inputs (`torch.Tensor`):
Target scaling factor to decompose.
Returns:
`Tuple(torch.Tensor, torch.Tensor)`: mantissa and exponent
"""
shape_of_input = inputs.size()
inputs = inputs.view(-1)
output_m, output_e = np.frexp(inputs.cpu().numpy())
tmp_m = []
for m in output_m:
int_m_shifted = int(
decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP)
)
tmp_m.append(int_m_shifted)
output_m = np.array(tmp_m)
output_e = float(max_bit) - output_e
return (
torch.from_numpy(output_m).to(inputs.device).view(shape_of_input),
torch.from_numpy(output_e).to(inputs.device).view(shape_of_input),
)
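# A small sketch (not part of the original file): batch_frexp(s) returns (m, e) such that
# s ~= m * 2**(-e), where m is the mantissa rescaled to `max_bit` bits.
import torch

m, e = batch_frexp(torch.tensor([0.75, 0.003]))
print(m * 2.0 ** (-e))  # ~ tensor([0.7500, 0.0030])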
class FixedPointMul(Function):
"""
Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.
Args:
pre_act (`torch.Tensor`):
Input tensor.
pre_act_scaling_factor (`torch.Tensor`):
Scaling factor of the input tensor *pre_act*.
bit_num (`int`):
Quantization bitwidth.
z_scaling_factor (`torch.Tensor`):
Scaling factor of the output tensor.
identity (`torch.Tensor`, *optional*):
Identity tensor, if exists.
identity_scaling_factor (`torch.Tensor`, *optional*):
Scaling factor of the identity tensor *identity*, if exists.
Returns:
`torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and
*identity*), whose scale is rescaled to *z_scaling_factor*.
"""
@staticmethod
def forward(
ctx,
pre_act,
pre_act_scaling_factor,
bit_num,
z_scaling_factor,
identity=None,
identity_scaling_factor=None,
):
if len(pre_act_scaling_factor.shape) == 3:
reshape = lambda x: x
else:
reshape = lambda x: x.view(1, 1, -1)
ctx.identity = identity
n = 2 ** (bit_num - 1) - 1
with torch.no_grad():
pre_act_scaling_factor = reshape(pre_act_scaling_factor)
if identity is not None:
identity_scaling_factor = reshape(identity_scaling_factor)
ctx.z_scaling_factor = z_scaling_factor
z_int = torch.round(pre_act / pre_act_scaling_factor)
_A = pre_act_scaling_factor.type(torch.double)
_B = (z_scaling_factor.type(torch.float)).type(torch.double)
new_scale = _A / _B
new_scale = reshape(new_scale)
m, e = batch_frexp(new_scale)
output = z_int.type(torch.double) * m.type(torch.double)
output = torch.round(output / (2.0**e))
if identity is not None:
wx_int = torch.round(identity / identity_scaling_factor)
_A = identity_scaling_factor.type(torch.double)
_B = (z_scaling_factor.type(torch.float)).type(torch.double)
new_scale = _A / _B
new_scale = reshape(new_scale)
m1, e1 = batch_frexp(new_scale)
output1 = wx_int.type(torch.double) * m1.type(torch.double)
output1 = torch.round(output1 / (2.0**e1))
output = output1 + output
return torch.clamp(output.type(torch.float), -n - 1, n)
@staticmethod
def backward(ctx, grad_output):
identity_grad = None
if ctx.identity is not None:
identity_grad = grad_output.clone() / ctx.z_scaling_factor
return grad_output.clone() / ctx.z_scaling_factor, None, None, None, None, identity_grad, None
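To make the fixed-point path concrete, here is a small illustrative sketch (toy values, not from the original file; it assumes `FixedPointMul` above is in scope): an activation stored as `int * scale_in` is rescaled to `scale_out` using only the integer mantissa and bit shift produced by `batch_frexp`, then clamped to the signed 8-bit range.
```
import torch

pre_act   = torch.tensor([[[0.5, -1.0, 1.5]]])   # equals an int tensor times scale_in
scale_in  = torch.tensor([0.25])
scale_out = torch.tensor(0.5)
q = FixedPointMul.apply(pre_act, scale_in, 8, scale_out)
print(q)   # tensor([[[ 1., -2.,  3.]]]) == round(pre_act / scale_out)
```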
.\models\ibert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig", "IBertOnnxConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_ibert"] = [
"IBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"IBertForMaskedLM",
"IBertForMultipleChoice",
"IBertForQuestionAnswering",
"IBertForSequenceClassification",
"IBertForTokenClassification",
"IBertModel",
"IBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig, IBertOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_ibert import (
IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
IBertForMaskedLM,
IBertForMultipleChoice,
IBertForQuestionAnswering,
IBertForSequenceClassification,
IBertForTokenClassification,
IBertModel,
IBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
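Thanks to the `_LazyModule` indirection above, the torch-dependent modules are only imported on first attribute access, while user code keeps the usual import style. A minimal hedged example (the small config values are arbitrary; it assumes `transformers` and `torch` are installed):
```
from transformers import IBertConfig, IBertForSequenceClassification  # resolved lazily on first access

config = IBertConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
model = IBertForSequenceClassification(config)
```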
.\models\idefics\configuration_idefics.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json",
"HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json",
}
class IdeficsVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Idefics-9B.
e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "idefics"
attribute_map = {
"hidden_size": "embed_dim",
}
def __init__(
self,
embed_dim=768,
image_size=224,
intermediate_size=5120,
patch_size=14,
num_hidden_layers=32,
num_attention_heads=16,
num_channels=3,
hidden_act="gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs,
):
self.embed_dim = embed_dim
self.image_size = image_size
self.intermediate_size = intermediate_size
self.patch_size = patch_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.layer_norm_eps = layer_norm_eps
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.hidden_act = hidden_act
super().__init__(**kwargs)
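As a small illustration (the override value is only an assumption for the example): the defaults mirror the Idefics-9B vision tower, any argument can be overridden, and `attribute_map` lets `hidden_size` act as an alias for `embed_dim`.
```
vision_config = IdeficsVisionConfig(image_size=336)
print(vision_config.embed_dim, vision_config.hidden_size)   # 768 768 (alias via attribute_map)
```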
class IdeficsConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Idefics-9B.
e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import IdeficsModel, IdeficsConfig
>>> # Initializing a Idefics idefics-9b style configuration
>>> configuration = IdeficsConfig()
>>> # Initializing a model from the idefics-9b style configuration
>>> model = IdeficsModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
):
# initializer: store the model's parameters
self.vocab_size = vocab_size
# additional vocabulary size
self.additional_vocab_size = additional_vocab_size
# hidden size
self.hidden_size = hidden_size
# intermediate size
self.intermediate_size = intermediate_size
# number of hidden layers
self.num_hidden_layers = num_hidden_layers
# number of attention heads
self.num_attention_heads = num_attention_heads
# dropout probability
self.dropout = dropout
# hidden-layer activation function
self.hidden_act = hidden_act
# initializer range
self.initializer_range = initializer_range
# alpha initializer
self.alpha_initializer = alpha_initializer
# alpha initializer range
self.alphas_initializer_range = alphas_initializer_range
# alpha type
self.alpha_type = alpha_type
# epsilon for RMS normalization
self.rms_norm_eps = rms_norm_eps
# whether to use the cache
self.use_cache = use_cache
# interval between cross-attention layers
self.cross_layer_interval = cross_layer_interval
# query/key layer norms
self.qk_layer_norms = qk_layer_norms
# freeze the vision layers
self.freeze_vision_layers = freeze_vision_layers
# freeze the text layers
self.freeze_text_layers = freeze_text_layers
# exceptions to freezing the text modules
self.freeze_text_module_exceptions = freeze_text_module_exceptions
# exceptions to freezing the vision modules
self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
# freeze the LM head
self.freeze_lm_head = freeze_lm_head
# whether to use the resampler
self.use_resampler = use_resampler
# if perceiver_config is None, use the default configuration
if perceiver_config is None:
self.perceiver_config = IdeficsPerceiverConfig()
# if perceiver_config is a dict, build an IdeficsPerceiverConfig from it
elif isinstance(perceiver_config, dict):
self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
# if perceiver_config is already an IdeficsPerceiverConfig, use it directly
elif isinstance(perceiver_config, IdeficsPerceiverConfig):
self.perceiver_config = perceiver_config
# if vision_config is None, use the default configuration
if vision_config is None:
self.vision_config = IdeficsVisionConfig()
# if vision_config is a dict, build an IdeficsVisionConfig from it
elif isinstance(vision_config, dict):
self.vision_config = IdeficsVisionConfig(**vision_config)
# if vision_config is already an IdeficsVisionConfig, use it directly
elif isinstance(vision_config, IdeficsVisionConfig):
self.vision_config = vision_config
# call the parent initializer to set the special token IDs and the remaining arguments
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
# IMPORTANT: do not perform any __init__-argument-based checks in the constructor, since
# PretrainedConfig.from_dict first instantiates the class from the config dict and only then
# updates the object with the kwargs of from_pretrained, so at instantiation time many
# attributes still carry default values that have not been overridden yet.
# Perform any necessary checks in from_pretrained, after the parent's from_pretrained has run.
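A minimal sketch of how the nested sub-configs above are normalized (illustrative values only): a plain dict is promoted to its config class, so the JSON of a checkpoint round-trips into proper objects.
```
config = IdeficsConfig(vision_config={"embed_dim": 512, "image_size": 224})
print(type(config.vision_config).__name__)   # IdeficsVisionConfig
print(config.vision_config.embed_dim)        # 512
```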
.\models\idefics\image_processing_idefics.py
"""Image processor class for Idefics."""
from typing import Callable, Dict, List, Optional, Union
from PIL import Image
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available
IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
def convert_to_rgb(image):
if image.mode == "RGB":
return image
image_rgba = image.convert("RGBA")
background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
alpha_composite = Image.alpha_composite(background, image_rgba)
alpha_composite = alpha_composite.convert("RGB")
return alpha_composite
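For example (a hedged sketch; the resulting pixel values are approximate), a half-transparent red RGBA image is composited onto a white background before the usual normalization:
```
from PIL import Image

rgba = Image.new("RGBA", (8, 8), (255, 0, 0, 128))   # half-transparent red
rgb = convert_to_rgb(rgba)
print(rgb.mode, rgb.getpixel((0, 0)))                # "RGB", roughly (255, 128, 128)
```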
class IdeficsImageProcessor(BaseImageProcessor):
r"""
Constructs an Idefics image processor.
Args:
image_size (`int`, *optional*, defaults to 224):
Resize to image size
image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
image_num_channels (`int`, *optional*, defaults to 3):
Number of image channels.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
image_size: int = 224,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
image_num_channels: Optional[int] = 3,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.image_size = image_size
self.image_num_channels = image_num_channels
self.image_mean = image_mean
self.image_std = image_std
def preprocess(
self,
images: ImageInput,
image_num_channels: Optional[int] = 3,
image_size: Optional[Dict[str, int]] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
transform: Callable = None,
**kwargs,
.\models\idefics\modeling_idefics.py
""" PyTorch Idefics model. """
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PretrainedConfig
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler
from .vision import IdeficsVisionTransformer
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "IdeficsConfig"
IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST = [
"HuggingFaceM4/idefics-9b",
"HuggingFaceM4/idefics-80b",
]
@dataclass
class IdeficsBaseModelOutputWithPast(ModelOutput):
"""
Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的输出隐藏状态序列。
如果使用了 `past_key_values`,则只输出形状为 `(batch_size, 1, hidden_size)` 的每个序列的最后一个隐藏状态。
past_key_values (`tuple(tuple(torch.FloatTensor))`, *可选*, 当传入 `use_cache=True` 或 `config.use_cache=True` 时返回):
长度为 `config.n_layers` 的元组,每个元组包含两个张量,形状为 `(batch_size, num_heads, sequence_length, embed_size_per_head)`。
包含预计算的隐藏状态(自注意力块中的键和值,以及如果 `config.is_encoder_decoder=True` 在交叉注意力块中也包含),
可用于加速序列解码。
hidden_states (`tuple(torch.FloatTensor)`, *可选*, 当传入 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
元组的 `torch.FloatTensor`(如果模型具有嵌入层,则为嵌入输出的张量 + 每层的输出张量),形状为 `(batch_size, sequence_length, hidden_size)`。
模型每一层的隐藏状态,加上可选的初始嵌入层输出。
attentions (`tuple(torch.FloatTensor)`, *可选*, 当传入 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
元组的 `torch.FloatTensor`(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
自注意力头中注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
image_hidden_states (`tuple(torch.FloatTensor)`, *可选*):
元组的 `torch.FloatTensor`(图像嵌入输出的一个,形状为 `(batch_size, num_images, sequence_length, hidden_size)`)。
模型通过视觉编码器生成的图像隐藏状态,以及通过感知者生成的图像隐藏状态。
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class IdeficsCausalLMOutputWithPast(ModelOutput):
"""
Base class for Idefics causal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`).
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
def expand_inputs_for_generation(
input_ids,
expand_size=1,
is_encoder_decoder=False,
attention_mask=None,
encoder_outputs=None,
**model_kwargs,
):
"""
扩展输入以用于生成
Args:
input_ids: 输入的 token IDs
expand_size: 扩展的大小,用于生成的副本数
is_encoder_decoder: 是否是编码器-解码器结构
attention_mask: 注意力掩码
encoder_outputs: 编码器的输出,用于解码器的输入
**model_kwargs: 其他模型的关键字参数
"""
expanded_return_idx = (
torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
)
input_ids = input_ids.index_select(0, expanded_return_idx)
model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None)
model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None)
model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)
if attention_mask is not None:
model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
if model_kwargs["image_attention_mask"] is not None:
model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
0, expanded_return_idx
)
if model_kwargs["pixel_values"] is not None:
model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
elif model_kwargs["image_encoder_embeddings"] is not None:
model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select(
0, expanded_return_idx
)
elif model_kwargs["perceiver_embeddings"] is not None:
model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select(
0, expanded_return_idx
)
return input_ids, model_kwargs
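A tiny sketch with toy tensors (illustrative only; it assumes the function above is in scope): with `expand_size=2` every sample, including the image inputs, is duplicated along the batch dimension, which is what beam search and multi-sample generation need.
```
import torch

ids = torch.tensor([[1, 2], [3, 4]])
pixels = torch.randn(2, 1, 3, 224, 224)
new_ids, kwargs = expand_inputs_for_generation(ids, expand_size=2, pixel_values=pixels)
print(new_ids.shape, kwargs["pixel_values"].shape)   # torch.Size([4, 2]) torch.Size([4, 1, 3, 224, 224])
```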
def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
input_ids = input_ids[:, -1].unsqueeze(-1)
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
pixel_values = kwargs.get("pixel_values", None)
image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None)
perceiver_embeddings = kwargs.get("perceiver_embeddings", None)
image_attention_mask = kwargs.get("image_attention_mask", None)
interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False)
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
"pixel_values": pixel_values,
"image_encoder_embeddings": image_encoder_embeddings,
"perceiver_embeddings": perceiver_embeddings,
"image_attention_mask": image_attention_mask,
"interpolate_pos_encoding": interpolate_pos_encoding,
}
def freeze_model(model, module_exceptions=[]):
mapping = {
"LayerNorm": nn.LayerNorm,
"Linear": nn.Linear,
"Embedding": nn.Embedding,
}
module_exceptions_mapped = [mapping[m] for m in module_exceptions]
for module in model.modules():
if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
module.requires_grad_(True)
else:
module.requires_grad_(False)
return model
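Illustrative usage of the helper (a sketch on a toy module, not from the original file): everything is frozen except the module types listed in `module_exceptions`, which is exactly how `freeze_text_layers`/`freeze_vision_layers` use it below.
```
import torch.nn as nn

toy = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
freeze_model(toy, module_exceptions=["LayerNorm"])
print([p.requires_grad for p in toy[0].parameters()])   # [False, False]
print([p.requires_grad for p in toy[1].parameters()])   # [True, True]
```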
class IdeficsDecoupledEmbedding(nn.Embedding):
"""
实现参数解耦以允许冻结(或不冻结)嵌入的子集。在实践中,regular `weight` 可以训练或冻结
(即 `partially_freeze=True`),如果 `num_additional_embeddings > 0`,则会创建
`num_additional_embeddings` 个额外的始终训练的参数。如果 `num_additional_embeddings=0`,
则模块默认为 `nn.Embedding` 的常规行为。
"""
def __init__(
self,
num_embeddings,
num_additional_embeddings,
embedding_dim,
partially_freeze: Optional[bool] = False,
device=None,
dtype=None,
padding_idx=None,
**kwargs,
) -> None:
"""
Args:
num_embeddings (`int`):
Size of the dictionary of embeddings
num_additional_embeddings (`int`):
Number of additional embeddings. Only useful when `partially_freeze=True`.
embedding_dim (`int`):
The size of each embedding vector
partially_freeze: (`bool`, *optional*, defaults to `False`):
If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
padding_idx (`int`, *optional*):
The padding index (needs to be less than num_embeddings)
Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
`max_norm` or `norm_type`. We are not supporting these.
"""
if padding_idx is not None and padding_idx > num_embeddings:
raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
super().__init__(
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
device=device,
dtype=dtype,
padding_idx=padding_idx,
**kwargs,
)
self.num_embeddings = num_embeddings
self.padding_idx = padding_idx
self.num_additional_embeddings = num_additional_embeddings
self.partially_freeze = partially_freeze
if partially_freeze:
self.weight.requires_grad_(False)
if self.num_additional_embeddings > 0:
self.additional_embedding = nn.Embedding(
num_embeddings=self.num_additional_embeddings,
embedding_dim=embedding_dim,
device=device,
dtype=dtype,
)
def forward(self, input_ids):
"""
Forward pass of the embedding.
we have 2 embeddings, with different indices - one pretrained self.weight and another
self.additional_embedding.weight that is being trained.
in order to make a lookup of the input ids, we:
1. find out the indices of the entries belonging to the 2nd embedding
2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
embedding starts from 0 and not num_embeddings
3. perform the 2nd embedding lookup
4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
5. perform the 1st embedding lookup
6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
measure.
"""
if self.num_additional_embeddings == 0:
return F.embedding(input_ids, self.weight)
input_ids = input_ids.clone()
additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
input_ids_additional_vocab = input_ids[additional_vocab_indices]
additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)
input_ids[additional_vocab_indices] = 0
full_vector = F.embedding(input_ids, self.weight)
full_vector[additional_vocab_indices] = additional_embeddings
return full_vector
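A minimal sketch of the lookup logic above (toy sizes, assuming the class is in scope): ids below `num_embeddings` hit the possibly frozen pretrained table, ids at or above it hit the always-trainable additional table, and the results are merged into one output tensor.
```
import torch

emb = IdeficsDecoupledEmbedding(num_embeddings=10, num_additional_embeddings=2, embedding_dim=4, partially_freeze=True)
ids = torch.tensor([[1, 9, 10, 11]])   # 10 and 11 live in the additional table
print(emb(ids).shape)                  # torch.Size([1, 4, 4])
print(emb.weight.requires_grad, emb.additional_embedding.weight.requires_grad)   # False True
```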
def extra_repr(self) -> str:
"""
返回模型的额外信息,用于描述模型的属性。
Returns:
返回包含模型属性的字符串
"""
return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
self.num_embeddings,
self.num_additional_embeddings,
self.embedding_dim,
self.partially_freeze,
)
class IdeficsDecoupledLinear(nn.Linear):
"""
Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practice, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
then it will create `out_additional_features * in_features` additional parameters that are always trained. If
`out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
"""
def __init__(
self,
in_features: int,
out_features: int,
out_additional_features: int = 0,
bias: bool = True,
partially_freeze: bool = True,
device=None,
dtype=None,
) -> None:
"""
out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
`partially_freeze=True`.
partially_freeze: bool. If True, the regular `weight` will be frozen and extra parameters (if any) will be
trainable. If False, default to the regular behavior of nn.Linear.
"""
super().__init__(in_features, out_features, bias, device, dtype)
self.out_additional_features = out_additional_features
self.partially_freeze = partially_freeze
self.in_features = in_features
self.out_features = out_features
if partially_freeze:
self.weight.requires_grad_(False)
if bias:
self.bias.requires_grad_(False)
if out_additional_features > 0:
self.additional_fc = nn.Linear(
in_features=in_features,
out_features=out_additional_features,
bias=bias,
device=device,
dtype=dtype,
)
def forward(self, input: torch.Tensor) -> torch.Tensor:
output = F.linear(input, self.weight, self.bias)
if self.out_additional_features > 0:
additional_features = self.additional_fc(input)
output = torch.cat((output, additional_features), -1)
return output
def extra_repr(self) -> str:
"""Overwriting `nn.Linear.extra_repr` to include new parameters."""
return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
self.in_features,
self.out_features,
self.out_additional_features,
self.bias is not None,
self.partially_freeze,
)
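Analogously, a short sketch for the decoupled linear layer (toy sizes, assuming the class is in scope): the frozen base projection contributes `out_features` columns and the trainable extra head appends `out_additional_features` more on the last dimension.
```
import torch

lin = IdeficsDecoupledLinear(in_features=8, out_features=16, out_additional_features=4)
print(lin(torch.randn(2, 8)).shape)                                       # torch.Size([2, 20])
print(lin.weight.requires_grad, lin.additional_fc.weight.requires_grad)   # False True
```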
class IdeficsRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
IdeficsRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
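As a quick numeric sketch (assuming the class above is in scope): RMSNorm divides each feature vector by its root mean square, without subtracting the mean, and then applies a learned per-feature gain that starts at 1.
```
import torch

norm = IdeficsRMSNorm(hidden_size=4)
h = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
print(norm(h))   # ≈ h / 2.7386, since sqrt(mean(h**2)) = sqrt(7.5) ≈ 2.7386
```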
class IdeficsEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
def rotate_half(x):
"""将输入的隐藏维度的一半进行旋转。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
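A one-line check of the rotation used by RoPE (assuming `rotate_half` above is in scope): the second half of the last dimension is negated and moved in front of the first half.
```
import torch

print(rotate_half(torch.tensor([1.0, 2.0, 3.0, 4.0])))   # tensor([-3., -4.,  1.,  2.])
```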
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""将旋转位置编码应用到查询和键张量上。
# 通过使用位置索引从余弦向量中选择对应的值,并在指定的维度上进行unsqueeze操作,以便与q和k的形状进行广播。
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
# 通过使用位置索引从正弦向量中选择对应的值,并在指定的维度上进行unsqueeze操作,以便与q和k的形状进行广播。
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
# 将查询向量q与余弦向量cos相乘并加上查询向量q与正弦向量sin经过rotate_half函数后的乘积,生成旋转后的查询向量。
q_embed = (q * cos) + (rotate_half(q) * sin)
# 将键向量k与余弦向量cos相乘并加上键向量k与正弦向量sin经过rotate_half函数后的乘积,生成旋转后的键向量。
k_embed = (k * cos) + (rotate_half(k) * sin)
# 返回旋转后的查询向量和键向量组成的元组。
return q_embed, k_embed
# this code was adapted from LlamaMLP
class IdeficsMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
):
super().__init__()
# gate projection: bias-free linear layer from hidden_size to intermediate_size
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
# down projection: bias-free linear layer from intermediate_size back to hidden_size
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
# up projection: bias-free linear layer from hidden_size to intermediate_size
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
# activation function selected by hidden_act from the global ACT2FN mapping
self.act_fn = ACT2FN[hidden_act]
def forward(self, x):
# forward pass: gated projection through the activation, multiplied by the up projection, then the down projection
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
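A small shape check for the gated (SwiGLU-style) MLP above (toy sizes, assuming the class is in scope): the intermediate size is only internal, so the hidden size is preserved on the way out.
```
import torch

mlp = IdeficsMLP(hidden_size=16, intermediate_size=64, hidden_act="silu")
print(mlp(torch.randn(2, 5, 16)).shape)   # torch.Size([2, 5, 16])
```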
# this code was adapted from LlamaAttention
class IdeficsAttention(nn.Module):
"""Multi-headed attention from the 'Attention Is All You Need' paper"""
def __init__(
self,
hidden_size: int,
num_heads: int,
dropout: float = 0.0,
is_cross_attention: bool = False,
config: PretrainedConfig = None,
qk_layer_norms: bool = False,
):
super().__init__()
self.hidden_size = hidden_size # the model's hidden size
self.num_heads = num_heads # number of attention heads
self.head_dim = hidden_size // num_heads # dimension of each attention head
self.dropout = dropout # dropout probability
self.is_causal = True # this is causal attention
if (self.head_dim * num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {num_heads})."
) # raise if hidden_size is not divisible by num_heads
self.is_cross_attention = is_cross_attention # whether this layer is cross-attention
if not hasattr(nn.functional, "scaled_dot_product_attention"):
raise ValueError("this model requires pytorch 2.0 or higher") # check for the required PyTorch version
if self.is_cross_attention:
kv_input_dim = (
self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim
)
self.q_proj = nn.Linear(
self.hidden_size,
num_heads * self.head_dim,
bias=False,
) # query projection
self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False) # key projection
self.v_proj = nn.Linear(
kv_input_dim,
num_heads * self.head_dim,
bias=False,
) # value projection
else:
self.q_proj = nn.Linear(
self.hidden_size,
num_heads * self.head_dim,
bias=False,
) # query projection
self.k_proj = nn.Linear(
self.hidden_size,
num_heads * self.head_dim,
bias=False,
) # key projection
self.v_proj = nn.Linear(
self.hidden_size,
num_heads * self.head_dim,
bias=False,
) # value projection
self.o_proj = nn.Linear(
num_heads * self.head_dim,
hidden_size,
bias=False,
) # output projection
self.rotary_emb = IdeficsEmbedding(self.head_dim) # rotary embedding
self.qk_layer_norms = qk_layer_norms # whether to layer-normalize queries and keys
if self.qk_layer_norms:
self.q_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps) # query layer norm
self.k_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps) # key layer norm
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
# reshape to (batch_size, seq_len, num_heads, head_dim), then transpose so the head dimension comes before the sequence dimension, as the attention op expects
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
# this was adapted from LlamaDecoderLayer
# IdeficsDecoderLayer: one decoder block
class IdeficsDecoderLayer(nn.Module):
def __init__(self, config: IdeficsConfig):
super().__init__()
# hidden size from the config
self.hidden_size = config.hidden_size
# self-attention layer, built from the config parameters
self.self_attn = IdeficsAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.dropout,
config=config,
)
# MLP, built from the config parameters
self.mlp = IdeficsMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
)
# pre-attention RMS layer norm (hidden size and epsilon from the config)
self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# post-attention RMS layer norm (hidden size and epsilon from the config)
self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# dropout probability from the config
self.dropout = config.dropout
# forward pass
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states # keep the original input for the residual connection
hidden_states = self.input_layernorm(hidden_states) # layer-normalize the input
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # apply dropout to the attention output
hidden_states = residual + hidden_states # residual connection: add the pre-normalization input to the attention output
# Fully Connected
residual = hidden_states # keep the current value for the next residual connection
hidden_states = self.post_attention_layernorm(hidden_states) # layer-normalize before the MLP
hidden_states = self.mlp(hidden_states) # apply the MLP
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # apply dropout to the MLP output
hidden_states = residual + hidden_states # residual connection: add the saved value to the MLP output
outputs = (hidden_states,) # put the processed hidden_states into the output tuple
if output_attentions:
outputs += (self_attn_weights,) # also return the attention weights if requested
if use_cache:
outputs += (present_key_value,) # also return the cached key/value states if requested
return outputs # return the output tuple
# custom gated cross-attention layer
class IdeficsGatedCrossAttentionLayer(nn.Module):
# forward pass
def forward(
self,
hidden_states: torch.Tensor, # input hidden states
attention_mask: Optional[torch.Tensor] = None, # optional attention mask
image_hidden_states: Optional[torch.Tensor] = None, # optional image hidden states
image_attention_mask: Optional[torch.Tensor] = None, # optional image attention mask
cross_attention_gate: Optional[torch.Tensor] = None, # optional cross-attention gate tensor
output_attentions: Optional[bool] = False, # whether to return attention weights
use_cache: Optional[bool] = False, # whether to use the cache
past_key_value: Optional[Tuple[torch.Tensor]] = None, # optional cached past key/value states
LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`IdeficsConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
# base class for all Idefics pretrained models
class IdeficsPreTrainedModel(PreTrainedModel):
config_class = IdeficsConfig # IdeficsConfig is the configuration class
base_model_prefix = "model" # the base model prefix is "model"
supports_gradient_checkpointing = True # gradient checkpointing is supported
_no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"] # modules that must not be split across devices
_supports_sdpa = True # supports SDPA (scaled dot-product attention)
def _init_weights(self, module):
# important: this ported version of Idefics isn't meant for training from scratch - only inference and fine-tuning,
# so the proper code for initializing the weights has been removed; the m4 codebase should be used for training
# from scratch, and it contains the correct code.
std = self.config.initializer_range
if isinstance(module, nn.Linear): # linear layers
module.weight.data.normal_(mean=0.0, std=std) # initialize the weight from a normal distribution
if module.bias is not None:
module.bias.data.zero_() # zero the bias if present
elif isinstance(module, nn.Embedding): # embedding layers
module.weight.data.normal_(mean=0.0, std=std) # initialize the weight from a normal distribution
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_() # zero the embedding at the padding index if present
# Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
@classmethod
# class method that checks for and, if requested, enables the SDPA attention implementation
def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
# if `use_bettertransformer` is set on the class, return the config unchanged
_is_bettertransformer = getattr(cls, "use_bettertransformer", False)
if _is_bettertransformer:
return config
# unless we are only doing a hard check, set the attention implementation to "sdpa"
if not hard_check_only:
config._attn_implementation = "sdpa"
# return the (possibly modified) config
return config
# multi-line string documenting the LLaMA-style inputs (left empty here)
LLAMA_INPUTS_DOCSTRING = r"""
"""
# decorator that prepends the standard docstring for a bare model outputting raw hidden states
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
# IdeficsModel, derived from IdeficsPreTrainedModel
class IdeficsModel(IdeficsPreTrainedModel):
"""
Transformer解码器,由`config.num_hidden_layers`层组成。每一层是一个[`IdeficsDecoderLayer`]
Args:
config: IdeficsConfig
"""
def __init__(self, config: IdeficsConfig):
# 调用父类的构造函数进行初始化
super().__init__(config)
# 将config参数保存在实例变量中
self.config = config
# 设置填充索引为config中定义的pad_token_id
self.padding_idx = config.pad_token_id
# 设置词汇表大小为config中定义的vocab_size
self.vocab_size = config.vocab_size
# 创建IdeficsDecoupledEmbedding实例,并保存在embed_tokens实例变量中
self.embed_tokens = IdeficsDecoupledEmbedding(
num_embeddings=config.vocab_size,
num_additional_embeddings=config.additional_vocab_size,
embedding_dim=config.hidden_size,
partially_freeze=config.freeze_text_layers,
padding_idx=self.padding_idx,
)
# 设置图像尺寸和视觉配置,从config参数中获取
self.image_size = config.vision_config.image_size
self.vision_config = config.vision_config
# 创建IdeficsVisionTransformer实例,并保存在vision_model实例变量中
self.vision_model = IdeficsVisionTransformer(config.vision_config)
# 如果config中设置了使用resampler,则创建IdeficsPerceiverResampler实例,并保存在perceiver_resampler实例变量中
if config.use_resampler:
perceiver_config = config.perceiver_config
self.perceiver_resampler = IdeficsPerceiverResampler(
config,
config.vision_config.embed_dim,
perceiver_config.resampler_depth,
perceiver_config.resampler_n_heads,
perceiver_config.resampler_head_dim,
perceiver_config.resampler_n_latents,
)
# 创建包含config.num_hidden_layers个IdeficsDecoderLayer实例的模块列表,并保存在layers实例变量中
self.layers = nn.ModuleList([IdeficsDecoderLayer(config) for _ in range(config.num_hidden_layers)])
# 设置跨层间隔为config中定义的cross_layer_interval
self.cross_layer_interval = config.cross_layer_interval
# 计算跨层注意力层的数量
num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
# 创建包含num_cross_layers个IdeficsGatedCrossAttentionLayer实例的模块列表,并保存在gated_cross_attn_layers实例变量中
self.gated_cross_attn_layers = nn.ModuleList(
[IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)]
)
# 设置梯度检查点标志为False
self.gradient_checkpointing = False
# 创建IdeficsRMSNorm实例,并保存在norm实例变量中
self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# 初始化权重并进行最终处理
self.post_init()
# 冻结相关参数
self.freeze_relevant_params(config)
# freeze the parameters the config asks to freeze
def freeze_relevant_params(self, config=None):
if config is None:
config = self.config
# if the config asks for it, freeze the text layers (honoring the exceptions)
if config.freeze_text_layers:
self.freeze_text_layers(config.freeze_text_module_exceptions)
# if the config asks for it, freeze the vision model (honoring the exceptions)
if config.freeze_vision_layers:
freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
# freeze the text layers
def freeze_text_layers(self, module_exceptions=[]):
# freeze the decoder layers and the final norm with freeze_model, honoring the exceptions
for module in [self.layers, self.norm]:
freeze_model(module, module_exceptions=module_exceptions)
# freeze the vision layers
def freeze_vision_layers(self, module_exceptions=[]):
# freeze the vision model with freeze_model, honoring the exceptions
freeze_model(self.vision_model, module_exceptions=module_exceptions)
# input embeddings getter
def get_input_embeddings(self):
return self.embed_tokens
# input embeddings setter
def set_input_embeddings(self, value):
self.embed_tokens = value
# attach the LLaMA inputs docstring to the forward pass
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None, # input token IDs
attention_mask: Optional[torch.Tensor] = None, # optional attention mask
position_ids: Optional[torch.LongTensor] = None, # optional position IDs
past_key_values: Optional[List[torch.FloatTensor]] = None, # optional cached key/value states
inputs_embeds: Optional[torch.FloatTensor] = None, # optional precomputed input embeddings
pixel_values: Optional[torch.FloatTensor] = None, # optional pixel values
image_encoder_embeddings: Optional[torch.FloatTensor] = None, # optional image-encoder embeddings
perceiver_embeddings: Optional[torch.FloatTensor] = None, # optional perceiver embeddings
image_attention_mask: Optional[torch.Tensor] = None, # optional image attention mask
use_cache: Optional[bool] = None, # whether to use the cache
output_attentions: Optional[bool] = None, # whether to return attention weights
output_hidden_states: Optional[bool] = None, # whether to return hidden states
interpolate_pos_encoding: Optional[bool] = False, # whether to interpolate the position encoding (defaults to False)
return_dict: Optional[bool] = None, # whether to return a ModelOutput instead of a plain tuple
class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
# keys to ignore if they are missing when loading a checkpoint
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
# keys whose weights are tied (shared parameters)
_tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config, vision_model=None):
# initialize the parent class with the config
super().__init__(config)
# the underlying IdeficsModel
self.model = IdeficsModel(config)
# the LM head is an IdeficsDecoupledLinear
self.lm_head = IdeficsDecoupledLinear(
in_features=config.hidden_size,
out_features=config.vocab_size,
out_additional_features=config.additional_vocab_size,
bias=False,
partially_freeze=config.freeze_lm_head,
)
# initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# return the model's embed_tokens layer (input embeddings)
return self.model.embed_tokens
def set_input_embeddings(self, value):
# set the model's embed_tokens layer (input embeddings)
self.model.embed_tokens = value
def get_output_embeddings(self):
# return the lm_head layer (output embeddings)
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# set the lm_head layer (output embeddings)
self.lm_head = new_embeddings
def set_decoder(self, decoder):
# set the underlying decoder
self.model = decoder
def get_decoder(self):
# return the underlying decoder
return self.model
def tie_weights(self):
"""
Overwrites `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
"""
output_embeddings = self.get_output_embeddings()
input_embeddings = self.get_input_embeddings()
# if the config ties word embeddings, point the output embedding weight at the input embedding weight
if getattr(self.config, "tie_word_embeddings", True):
output_embeddings.weight = input_embeddings.weight
# if there are additional embeddings, tie their weights as well
if input_embeddings.num_additional_embeddings > 0:
assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
# keep out_features / out_additional_features in sync with the input embedding sizes
if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
output_embeddings.out_features = input_embeddings.num_embeddings
if hasattr(output_embeddings, "out_additional_features") and hasattr(
input_embeddings, "num_additional_embeddings"
):
output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=IdeficsCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
# forward pass
def forward(
self,
input_ids: torch.LongTensor = None, # input token IDs
attention_mask: Optional[torch.Tensor] = None, # optional attention mask
position_ids: Optional[torch.LongTensor] = None, # optional position IDs
past_key_values: Optional[List[torch.FloatTensor]] = None, # optional cached key/value states
inputs_embeds: Optional[torch.FloatTensor] = None, # optional precomputed input embeddings
pixel_values: Optional[torch.FloatTensor] = None, # optional pixel values
image_encoder_embeddings: Optional[torch.FloatTensor] = None, # optional image-encoder embeddings
perceiver_embeddings: Optional[torch.FloatTensor] = None, # optional perceiver embeddings
image_attention_mask: Optional[torch.Tensor] = None, # optional image attention mask
labels: Optional[torch.LongTensor] = None, # optional labels for computing the LM loss
use_cache: Optional[bool] = None, # whether to use the cache
output_attentions: Optional[bool] = None, # whether to return attention weights
output_hidden_states: Optional[bool] = None, # whether to return hidden states
interpolate_pos_encoding: Optional[bool] = False, # whether to interpolate the position encoding (defaults to False)
return_dict: Optional[bool] = None, # whether to return a ModelOutput instead of a plain tuple
):
# prepare the inputs used at each generation step
def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
# pop image_hidden_states from kwargs if present
image_hidden_states = kwargs.pop("image_hidden_states", None)
if image_hidden_states is not None:
# if the config uses the resampler, pass them on as perceiver_embeddings, otherwise as image_encoder_embeddings
if self.config.use_resampler:
kwargs["perceiver_embeddings"] = image_hidden_states
else:
kwargs["image_encoder_embeddings"] = image_hidden_states
kwargs["pixel_values"] = None # the pixel values are no longer needed
# delegate to the module-level prepare_inputs_for_generation with input_ids, past and the remaining kwargs
inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
unwanted_kwargs = ["token_type_ids"] # kwargs this model does not accept
for kwarg in unwanted_kwargs:
inputs.pop(kwarg, None) # drop the unwanted kwargs
return inputs # return the processed inputs dict
@staticmethod
def _expand_inputs_for_generation(
*args,
**model_kwargs,
):
# delegate to the module-level expand_inputs_for_generation
return expand_inputs_for_generation(*args, **model_kwargs)
# update the model keyword arguments between generation steps
def _update_model_kwargs_for_generation(
self,
outputs: ModelOutput, # the model outputs
model_kwargs: Dict[str, Any], # dict of model keyword arguments
is_encoder_decoder: bool = False, # whether the model is an encoder-decoder (defaults to False)
standardize_cache_format: bool = False, # whether to standardize the cache format (defaults to False)
) -> Dict[str, Any]: # returns the updated model keyword arguments
# let the parent class do the generic updates first
model_kwargs = super()._update_model_kwargs_for_generation(
outputs,
model_kwargs,
is_encoder_decoder,
standardize_cache_format,
)
# if the kwargs contain an 'image_attention_mask'
if "image_attention_mask" in model_kwargs:
image_attention_mask = model_kwargs["image_attention_mask"]
# keep only the mask for the last position and add back the sequence dimension
last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
model_kwargs["image_attention_mask"] = last_mask # update 'image_attention_mask' with the last mask
# carry the precomputed image_hidden_states into the next step
model_kwargs["image_hidden_states"] = outputs.image_hidden_states
return model_kwargs # return the updated model keyword arguments
@staticmethod
def _reorder_cache(past, beam_idx):
reordered_past = ()
# reorder each layer's cached states along the batch dimension according to beam_idx
for layer_past in past:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past # return the reordered past states
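Finally, a minimal sketch of what `_reorder_cache` does during beam search (toy tensors, assuming the class above is in scope): every cached key/value tensor of every layer is re-indexed along the batch dimension with `beam_idx`.
```
import torch

past = ((torch.arange(6).view(3, 2), torch.arange(6).view(3, 2) + 10),)   # one layer, toy "key"/"value"
beam_idx = torch.tensor([2, 0, 0])
reordered = IdeficsForVisionText2Text._reorder_cache(past, beam_idx)
print(reordered[0][0])   # rows of the first tensor picked in order 2, 0, 0
```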