Transformers Source Code Analysis (57)
.\models\groupvit\modeling_tf_groupvit.py
""" TF 2.0 GroupViT model."""
from __future__ import annotations
import collections.abc
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_tensorflow_probability_available,
logging,
replace_return_docstrings,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
logger = logging.get_logger(__name__)
if is_tensorflow_probability_available():
try:
import tensorflow_probability as tfp
_ = tfp.distributions.Normal(loc=0.0, scale=1.0)
except ImportError:
logger.error(
"GroupViT models are not usable since `tensorflow_probability` can't be loaded. "
"It seems you have `tensorflow_probability` installed with the wrong tensorflow version."
"Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability."
)
_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/groupvit-gcc-yfcc",
]
LARGE_NEGATIVE = -1e8
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
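# Illustrative sketch, not part of the original file: `_expand_mask` converts a
# `[batch, seq_len]` padding mask (1 = keep, 0 = pad) into an additive bias of shape
# `[batch, 1, tgt_len, src_len]` that can be added to raw attention scores.
_demo_mask = tf.constant([[1.0, 1.0, 0.0]])  # one sequence of length 3, last token padded
_demo_bias = _expand_mask(_demo_mask)  # shape (1, 1, 3, 3)
# every query row now carries LARGE_NEGATIVE in the padded key column:
# _demo_bias[0, 0, :, 2] == LARGE_NEGATIVE, all other entries are 0.0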
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean(
keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
def groupvit_loss(similarity: tf.Tensor) -> tf.Tensor:
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(tf.transpose(similarity))
return (caption_loss + image_loss) / 2.0
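# Illustrative sketch, not part of the original file: the contrastive loss treats the
# i-th text as the positive match of the i-th image, so a strongly diagonal similarity
# matrix gives a loss near zero while a uniform one gives roughly log(batch_size).
_demo_sim = tf.constant([[10.0, 0.0], [0.0, 10.0]])  # near-perfect image/text alignment
_demo_loss = groupvit_loss(_demo_sim)  # close to 0
_demo_loss_uniform = groupvit_loss(tf.zeros((2, 2)))  # about 0.693 (= log 2)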
def hard_softmax(logits: tf.Tensor, dim: int) -> tf.Tensor:
y_soft = stable_softmax(logits, dim)
index = tf.argmax(y_soft, dim)
y_hard = tf.one_hot(
index,
depth=shape_list(logits)[dim],
axis=range(len(shape_list(logits)))[dim],
dtype=y_soft.dtype,
)
ret = y_hard - tf.stop_gradient(y_soft) + y_soft
return ret
def gumbel_softmax(logits: tf.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> tf.Tensor:
gumbel_dist = tfp.distributions.Gumbel(0.0, 1.0)
gumbels = gumbel_dist.sample(tf.shape(logits), dtype=logits.dtype)
gumbels = (logits + gumbels) / tau
y_soft = stable_softmax(gumbels, dim)
if hard:
index = tf.argmax(y_soft, dim)
y_hard = tf.one_hot(
index,
depth=shape_list(logits)[dim],
axis=range(len(shape_list(logits)))[dim],
dtype=y_soft.dtype,
)
ret = y_hard - tf.stop_gradient(y_soft) + y_soft
else:
ret = y_soft
return ret
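# Illustrative note, not part of the original file: both `hard_softmax` and `gumbel_softmax`
# rely on the straight-through trick `y_hard - tf.stop_gradient(y_soft) + y_soft`: the
# forward pass sees the one-hot assignment `y_hard`, while gradients flow through the soft
# distribution `y_soft`. A minimal check with `hard_softmax`, which needs no
# `tensorflow_probability` sampling:
_demo_logits = tf.constant([[2.0, 0.5, -1.0]])
_demo_onehot = hard_softmax(_demo_logits, dim=-1)  # forward value is the one-hot [[1., 0., 0.]] (up to floating point)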
def resize_attention_map(attentions: tf.Tensor, height: int, width: int, align_corners: bool = False) -> tf.Tensor:
"""
Args:
attentions (`tf.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
height (`int`): height of the output attention map
width (`int`): width of the output attention map
align_corners (`bool`, *optional*): the `align_corner` argument of `nn.functional.interpolate`.
Returns:
`tf.Tensor`: the resized attention map of shape [batch_size, groups, height, width]
"""
scale = (height * width // attentions.shape[2]) ** 0.5
if height > width:
feat_width = int(np.round(width / scale))
feat_height = shape_list(attentions)[2] // feat_width
else:
feat_height = int(np.round(height / scale))
feat_width = shape_list(attentions)[2] // feat_height
batch_size = shape_list(attentions)[0]
groups = shape_list(attentions)[1]
attentions = tf.reshape(attentions, (batch_size, groups, feat_height, feat_width))
attentions = tf.transpose(attentions, perm=(0, 2, 3, 1))
if align_corners:
attentions = tf.compat.v1.image.resize(
attentions,
size=(height, width),
method="bilinear",
align_corners=align_corners,
)
else:
attentions = tf.image.resize(attentions, size=(height, width), method="bilinear")
attentions = tf.transpose(attentions, perm=(0, 3, 1, 2))
return attentions
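# Illustrative sketch, not part of the original file: a flattened attention map over a
# 14x14 feature grid (196 positions) is reshaped to (batch, groups, 14, 14) and then
# bilinearly resized to the requested resolution, e.g. the 224x224 input image.
_demo_attn = tf.random.uniform((1, 8, 14 * 14))  # batch=1, 8 groups, 196 patch positions
_demo_resized = resize_attention_map(_demo_attn, height=224, width=224)
# _demo_resized has shape (1, 8, 224, 224)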
def get_grouping_from_attentions(attentions: Tuple[tf.Tensor], hw_shape: Tuple[int]) -> tf.Tensor:
"""
Args:
attentions (`tuple(tf.Tensor)`): tuple of attention maps returned by `TFGroupViTVisionTransformer`
hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
`tf.Tensor`: the attention map of shape [batch_size, groups, height, width]
"""
attn_maps = []
prev_attn_masks = None
for attn_masks in attentions:
attn_masks = tf.transpose(attn_masks, perm=(0, 2, 1))
if prev_attn_masks is None:
prev_attn_masks = attn_masks
else:
prev_attn_masks = tf.matmul(prev_attn_masks, attn_masks)
cur_attn_map = resize_attention_map(tf.transpose(prev_attn_masks, perm=(0, 2, 1)), *hw_shape)
attn_maps.append(cur_attn_map)
final_grouping = attn_maps[-1]
return tf.stop_gradient(final_grouping)
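# Illustrative note, not part of the original file: each grouping stage returns an
# assignment of shape (batch, num_output_groups, seq_len). With the default GroupViT
# vision configuration (196 patches -> 64 groups -> 8 groups) the loop above chains them:
#   stage 1 attention (batch, 64, 196) -> transposed to (batch, 196, 64)
#   stage 2 attention (batch, 8, 64)   -> transposed to (batch, 64, 8)
#   matmul gives (batch, 196, 8), transposed back to (batch, 8, 196)
# so the final grouping maps every original patch to one of the 8 output groups before
# being reshaped and resized by `resize_attention_map`.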
@dataclass
class TFGroupViTModelOutput(ModelOutput):
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`TFGroupViTTextModel`].
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`TFGroupViTVisionModel`].
text_model_output (`TFBaseModelOutputWithPooling`):
The output of the [`TFGroupViTTextModel`].
vision_model_output (`TFBaseModelOutputWithPooling`):
The output of the [`TFGroupViTVisionModel`].
"""
# Initialize optional attributes with None
loss: tf.Tensor | None = None
logits_per_image: tf.Tensor = None
logits_per_text: tf.Tensor = None
segmentation_logits: tf.Tensor = None
text_embeds: tf.Tensor = None
image_embeds: tf.Tensor = None
text_model_output: TFBaseModelOutputWithPooling = None
vision_model_output: TFBaseModelOutputWithPooling = None
# Define method to convert attributes to a tuple, excluding specific complex types
def to_tuple(self) -> Tuple[Any]:
return tuple(
# If key is not in exclusion list, return the attribute value; otherwise, convert complex type to tuple
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
# TFGroupViTCrossAttentionLayer, a keras.layers.Layer
class TFGroupViTCrossAttentionLayer(keras.layers.Layer):
# The constructor takes a GroupViTVisionConfig plus any extra keyword arguments
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# Cross-attention block, named "attn"
self.attn = TFGroupViTAttention(config, name="attn")
# LayerNormalization applied before the MLP, named "norm2"
self.norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2")
# Feed-forward block, named "mlp"
self.mlp = TFGroupViTMLP(config, name="mlp")
# Final LayerNormalization, named "norm_post"
self.norm_post = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post")
self.config = config
# Forward pass: the query attends to the key, followed by an MLP and a final LayerNorm
def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor:
x = query
# Cross-attention: the query (group tokens) attends to the key (image tokens), with a residual connection
x = x + self.attn(query, encoder_hidden_states=key)[0]
# Pre-norm MLP with a residual connection
x = x + self.mlp(self.norm2(x))
# Final LayerNormalization of the output
x = self.norm_post(x)
return x
# build 方法,用于构建层,设置内部变量和子层的建立过程
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 检查并建立 self.attn 对象
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
# 检查并建立 self.norm2 对象
if getattr(self, "norm2", None) is not None:
with tf.name_scope(self.norm2.name):
self.norm2.build([None, None, self.config.hidden_size])
# 检查并建立 self.mlp 对象
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
# 检查并建立 self.norm_post 对象
if getattr(self, "norm_post", None) is not None:
with tf.name_scope(self.norm_post.name):
self.norm_post.build([None, None, self.config.hidden_size])
# TFGroupViTAssignAttention: the attention block that assigns image tokens to groups
class TFGroupViTAssignAttention(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# Scaling factor applied to the raw attention scores
self.scale = config.hidden_size**-0.5
# Query, key, value and output projections
self.q_proj = keras.layers.Dense(config.hidden_size, name="q_proj")
self.k_proj = keras.layers.Dense(config.hidden_size, name="k_proj")
self.v_proj = keras.layers.Dense(config.hidden_size, name="v_proj")
self.proj = keras.layers.Dense(config.hidden_size, name="proj")
# Small epsilon used when normalizing the assignment weights
self.assign_eps = config.assign_eps
self.config = config
# Turn raw scores into assignment weights; Gumbel sampling is only used during training
def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor:
if gumbel and training:
attn = gumbel_softmax(attn, dim=-2, hard=hard)
else:
# Otherwise pick the plain hard or soft softmax depending on `hard`
if hard:
attn = hard_softmax(attn, dim=-2)
else:
attn = stable_softmax(attn, axis=-2)
return attn
# Compute the group assignment: returns the pooled group features and the soft assignment
def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False):
# The key doubles as the value
value = key
# Project query (group tokens), key and value (image tokens) into the attention space
query = self.q_proj(query)
key = self.k_proj(key)
value = self.v_proj(value)
# Raw scores: scaled dot product between group tokens and image tokens
raw_attn = tf.matmul(query, key, transpose_b=True) * self.scale
# Hard/Gumbel assignment used for the forward pass, plus a soft version returned for visualization
attn = self.get_attn(raw_attn, training=training)
soft_attn = self.get_attn(raw_attn, training=training, gumbel=False, hard=False)
# Normalize each group's weights over its assigned tokens (eps avoids division by zero)
attn = attn / (tf.math.reduce_sum(attn, axis=-1, keepdims=True) + self.assign_eps)
# Pool the value features per group and project the result
out = tf.matmul(attn, value)
out = self.proj(out)
return out, soft_attn
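# Illustrative note, not part of the original file, on the two normalizations above:
# `raw_attn` has shape (batch, num_output_groups, seq_len). `get_attn(..., dim=-2)` applies
# the (hard/Gumbel) softmax over the *group* axis, so every image token picks the group it
# is assigned to. The subsequent division by the sum over the *token* axis (-1) turns each
# group row into an average over its assigned tokens, and `attn @ value` then pools the
# token features group by group.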
# 构建注意力层,设置各投影操作的维度
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果q_proj存在,则设置其输入形状
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.config.hidden_size])
# 如果k_proj存在,则设置其输入形状
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.config.hidden_size])
# 如果v_proj存在,则设置其输入形状
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.config.hidden_size])
# 如果proj存在,则设置其输入形状
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.config.hidden_size])
class TFGroupViTTokenAssign(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs):
super().__init__(**kwargs)
self.num_output_group = num_output_group
# 对 group_tokens 进行层归一化
self.norm_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens")
# 根据配置计算 MLP 的维度
assign_mlp_ratio = (
config.assign_mlp_ratio
if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
else (config.assign_mlp_ratio, config.assign_mlp_ratio)
)
tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
# 创建用于中间 MLP 的层
self.mlp_inter = TFGroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group, name="mlp_inter")
# 对 post_tokens 进行层归一化
self.norm_post_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post_tokens")
# 对输入 x 进行层归一化
self.norm_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x")
# 创建用于前分配注意力的层
self.pre_assign_attn = TFGroupViTCrossAttentionLayer(config, name="pre_assign_attn")
# 创建分配注意力的层
self.assign = TFGroupViTAssignAttention(config, name="assign")
# 对新的 x 进行层归一化
self.norm_new_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x")
# 创建用于通道的 MLP 层
self.mlp_channels = TFGroupViTMLP(
config, config.hidden_size, channels_dim, config.hidden_size, name="mlp_channels"
)
self.config = config
def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor:
"""
Args:
group_tokens (tf.Tensor): group tokens, [batch_size, num_group_tokens, channels]
Returns:
projected_group_tokens (tf.Tensor): [batch_size, num_output_groups, channels]
"""
# 使用中间 MLP 层对 group_tokens 进行投影
projected_group_tokens = self.mlp_inter(group_tokens)
# 对投影后的 group_tokens 进行层归一化
projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
return projected_group_tokens
def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool = False):
"""
Args:
image_tokens (`tf.Tensor`): image tokens, of shape [batch_size, input_length, channels]
group_tokens (`tf.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
"""
# Normalize the group tokens and the image tokens
group_tokens = self.norm_tokens(group_tokens)
image_tokens = self.norm_x(image_tokens)
# Project the group tokens to the number of output groups with the mixer MLP
projected_group_tokens = self.project_group_token(group_tokens)
# Let the projected group tokens attend to the image tokens before the assignment
projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
# Assign the image tokens to the groups; `attention` is the soft assignment map
new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
# Residual connection from the projected group tokens
new_image_tokens += projected_group_tokens
# Channel MLP with pre-norm and a residual connection
new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
return new_image_tokens, attention
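# Illustrative shape sketch, not part of the original file (assuming the second grouping
# stage of the default vision config, hidden size C):
#   image_tokens:           (batch, 64, C)   # the 64 grouped tokens produced by stage 1
#   group_tokens:           (batch, 8, C)
#   projected_group_tokens: (batch, 8, C)    # mlp_inter maps num_group_token -> num_output_group
#   new_image_tokens:       (batch, 8, C)    # one pooled token per output group
#   attention:              (batch, 8, 64)   # soft assignment of stage-1 tokens to the 8 groups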
# 定义 build 方法,用于构建模型结构
def build(self, input_shape=None):
# 如果模型已经构建完成,直接返回,避免重复构建
if self.built:
return
# 将模型标记为已构建状态
self.built = True
# 如果存在 norm_tokens 属性,则构建与其相关的操作
if getattr(self, "norm_tokens", None) is not None:
# 使用 tf.name_scope 为 norm_tokens 创建命名空间
with tf.name_scope(self.norm_tokens.name):
# 使用 norm_tokens 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_tokens.build([None, None, self.config.hidden_size])
# 如果存在 mlp_inter 属性,则构建与其相关的操作
if getattr(self, "mlp_inter", None) is not None:
# 使用 tf.name_scope 为 mlp_inter 创建命名空间
with tf.name_scope(self.mlp_inter.name):
# 使用 mlp_inter 属性构建操作,输入形状为 None
self.mlp_inter.build(None)
# 如果存在 norm_post_tokens 属性,则构建与其相关的操作
if getattr(self, "norm_post_tokens", None) is not None:
# 使用 tf.name_scope 为 norm_post_tokens 创建命名空间
with tf.name_scope(self.norm_post_tokens.name):
# 使用 norm_post_tokens 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_post_tokens.build([None, None, self.config.hidden_size])
# 如果存在 norm_x 属性,则构建与其相关的操作
if getattr(self, "norm_x", None) is not None:
# 使用 tf.name_scope 为 norm_x 创建命名空间
with tf.name_scope(self.norm_x.name):
# 使用 norm_x 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_x.build([None, None, self.config.hidden_size])
# 如果存在 pre_assign_attn 属性,则构建与其相关的操作
if getattr(self, "pre_assign_attn", None) is not None:
# 使用 tf.name_scope 为 pre_assign_attn 创建命名空间
with tf.name_scope(self.pre_assign_attn.name):
# 使用 pre_assign_attn 属性构建操作,输入形状为 None
self.pre_assign_attn.build(None)
# 如果存在 assign 属性,则构建与其相关的操作
if getattr(self, "assign", None) is not None:
# 使用 tf.name_scope 为 assign 创建命名空间
with tf.name_scope(self.assign.name):
# 使用 assign 属性构建操作,输入形状为 None
self.assign.build(None)
# 如果存在 norm_new_x 属性,则构建与其相关的操作
if getattr(self, "norm_new_x", None) is not None:
# 使用 tf.name_scope 为 norm_new_x 创建命名空间
with tf.name_scope(self.norm_new_x.name):
# 使用 norm_new_x 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_new_x.build([None, None, self.config.hidden_size])
# 如果存在 mlp_channels 属性,则构建与其相关的操作
if getattr(self, "mlp_channels", None) is not None:
# 使用 tf.name_scope 为 mlp_channels 创建命名空间
with tf.name_scope(self.mlp_channels.name):
# 使用 mlp_channels 属性构建操作,输入形状为 None
self.mlp_channels.build(None)
# Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT
class TFGroupViTPatchEmbeddings(keras.layers.Layer):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
# 从配置中获取图像大小和补丁大小
image_size, patch_size = config.image_size, config.patch_size
num_channels = config.num_channels
# hidden_size 作为成员变量保存,因为在调用方法中会用到
self.hidden_size = config.hidden_size
# 如果图像大小和补丁大小不是可迭代对象,则转换为元组
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# 计算图像中的补丁数量
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# 保存图像大小、补丁大小、补丁数量、通道数和配置
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.num_channels = num_channels
self.config = config
# 创建投影层,用于将像素值投影到隐藏状态的大小
self.projection = keras.layers.Conv2D(
filters=self.hidden_size, # 输出通道数即隐藏状态的维度
kernel_size=patch_size, # 卷积核大小设置为补丁大小
strides=patch_size, # 步长设置为补丁大小,用于不重叠地提取补丁
padding="valid", # 使用有效填充方式
data_format="channels_last", # 数据格式为通道在最后
use_bias=True, # 使用偏置
kernel_initializer=get_initializer(self.config.initializer_range), # 使用指定初始化器初始化权重
bias_initializer="zeros", # 偏置初始化为零
name="projection", # 层的名称为 projection
)
def call(
self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
) -> tf.Tensor:
# Read batch size, channels, height and width from the input pixel values
batch_size, num_channels, height, width = shape_list(pixel_values)
# In eager mode, check that the channel dimension matches the configuration
if tf.executing_eagerly() and num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Unless position encodings are interpolated, the spatial size must match the configured image size
if (
not interpolate_pos_encoding
and tf.executing_eagerly()
and (height != self.image_size[0] or width != self.image_size[1])
):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
# When running on CPU, `keras.layers.Conv2D` does not support `NCHW`, so convert the input
# from `NCHW` to `NHWC`: shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
# Project the patches to the hidden size with the convolution
projection = self.projection(pixel_values)
# Collapse the 2D spatial dimensions into a single patch dimension
# shape = (batch_size, num_patches, out_channels=embed_dim)
num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
# In TFGroupViTVisionEmbeddings these embeddings are layer-normalized afterwards; LayerNormalization
# needs a static last dimension (otherwise `test_keras_save_load` fails with symbolic tensors),
# which is why `hidden_size` is used in the reshape below
embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, self.hidden_size))
return embeddings
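# Illustrative note, not part of the original file: with the default vision config
# (image_size=224, patch_size=16) the convolution above produces a 14x14 grid, i.e.
# (224 // 16) * (224 // 16) = 196 patch embeddings per image, each of dimension hidden_size.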
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
self.built = True
# 如果 projection 属性已存在,则构建 projection 层
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
# Adapted from transformers.vit.modeling_tf_vit.TFViTEmbeddings
class TFGroupViTVisionEmbeddings(keras.layers.Layer):
"""
Construct the position and patch embeddings.
"""
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# 初始化补丁嵌入层对象
self.patch_embeddings = TFGroupViTPatchEmbeddings(config, name="patch_embeddings")
# 添加 dropout 层,使用配置中的 dropout 率
self.dropout = keras.layers.Dropout(rate=config.dropout, name="dropout")
# 添加 LayerNormalization 层,使用配置中的 epsilon 值
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# 保存配置对象
self.config = config
def build(self, input_shape=None):
# 获取补丁数量
num_patches = self.patch_embeddings.num_patches
# 添加位置嵌入权重,形状为 (1, num_patches, hidden_size),使用零初始化
self.position_embeddings = self.add_weight(
shape=(1, num_patches, self.config.hidden_size),
initializer="zeros",
trainable=True,
name="position_embeddings",
)
if self.built:
return
self.built = True
# 如果已经构建,直接返回
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
# 构建 LayerNormalization 层,输入形状为 [None, None, hidden_size]
self.layernorm.build([None, None, self.config.hidden_size])
def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py
"""
# 获取 embeddings 的形状信息
batch_size, num_patches, dim = shape_list(embeddings)
# 获取位置编码的数量
num_positions = shape_list(self.position_embeddings)[1]
# 如果补丁数量与位置编码数量相等,并且高度与宽度也相等,则直接返回位置编码
if num_patches == num_positions and height == width:
return self.position_embeddings
# 否则,进行插值处理
patch_pos_embed = self.position_embeddings
h0 = height // self.config.patch_size
w0 = width // self.config.patch_size
# 使用双三次插值方法调整位置编码的大小
patch_pos_embed = tf.image.resize(
images=tf.reshape(
patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
),
size=(h0, w0),
method="bicubic",
)
patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
return patch_pos_embed
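# Illustrative note, not part of the original file: the pre-trained position table covers
# sqrt(num_positions) x sqrt(num_positions) patches (14 x 14 for the default 224/16 setup).
# For a larger input, e.g. 320x320, the table is reshaped to (1, 14, 14, dim), bicubically
# resized to (1, 320 // 16, 320 // 16, dim) = (1, 20, 20, dim) and flattened back to
# (1, 400, dim) so that it matches the new number of patches.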
def call(
self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
) -> tf.Tensor:
# Read height and width from the pixel values
_, _, height, width = shape_list(pixel_values)
# Turn the pixel values into patch embeddings
embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
# Layer-normalize the patch embeddings
embeddings = self.layernorm(embeddings)
# Add position embeddings, interpolated to the input resolution if requested
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embeddings
# Apply dropout and return the embeddings
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->GroupViT
class TFGroupViTTextEmbeddings(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size # 设置嵌入维度为配置文件中的隐藏大小
self.config = config # 保存配置信息
def build(self, input_shape: tf.TensorShape = None):
with tf.name_scope("token_embedding"):
# 添加权重矩阵,形状为 (词汇表大小, 嵌入维度),根据配置的初始化因子和范围进行初始化
self.weight = self.add_weight(
shape=(self.config.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="weight",
)
with tf.name_scope("position_embedding"):
# 添加位置嵌入矩阵,形状为 (最大位置嵌入数, 嵌入维度),根据配置的初始化因子和范围进行初始化
self.position_embedding = self.add_weight(
shape=(self.config.max_position_embeddings, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="embeddings",
)
super().build(input_shape)
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
) -> tf.Tensor:
"""
根据输入张量应用嵌入。
返回:
final_embeddings (`tf.Tensor`): 输出的嵌入张量。
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("You have to specify either input_ids or inputs_embeds") # 抛出数值错误,要求指定 input_ids 或 inputs_embeds
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size) # 检查嵌入索引是否在范围内
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) # 根据输入的 input_ids 从权重中获取嵌入向量
input_shape = shape_list(inputs_embeds)[:-1] # 获取输入嵌入张量的形状
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) # 如果位置嵌入为空,则创建一个位置张量
position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) # 根据位置索引获取位置嵌入向量
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) # 按指定倍数复制位置嵌入向量
final_embeddings = inputs_embeds + position_embeds # 最终的嵌入向量为输入嵌入向量加上位置嵌入向量
return final_embeddings
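# Illustrative sketch, not part of the original file: for `input_ids` of shape
# (batch, seq_len) the token embeddings are gathered from `self.weight` and added to the
# first `seq_len` rows of the position table, e.g.
#   input_ids (2, 5) -> inputs_embeds (2, 5, embed_dim)
#   position_ids default to [[0, 1, 2, 3, 4]]; the gathered position embeddings are tiled
#   to the batch, so final_embeddings also has shape (2, 5, embed_dim).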
class TFGroupViTStage(keras.layers.Layer):
"""这对应于 GroupViT 实现中的 `GroupingLayer` 类。"""
def __init__(
self,
config: GroupViTVisionConfig,
depth: int,
num_prev_group_token: int,
num_group_token: int,
num_output_group: int,
**kwargs,
):
super().__init__(**kwargs) # 调用父类的构造方法,传递任意关键字参数
self.config = config # 设置当前对象的config属性为传入的config参数
self.depth = depth # 设置当前对象的depth属性为传入的depth参数
self.num_group_token = num_group_token # 设置当前对象的num_group_token属性为传入的num_group_token参数
self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(depth)] # 根据depth参数创建TFGroupViTEncoderLayer对象的列表,每个对象命名为layers_._{i}
if num_group_token > 0:
self.downsample = TFGroupViTTokenAssign(
config=config,
num_group_token=num_group_token,
num_output_group=num_output_group,
name="downsample",
) # 如果num_group_token大于0,则创建TFGroupViTTokenAssign对象赋值给self.downsample属性,使用传入的config、num_group_token、num_output_group参数
else:
self.downsample = None # 否则将self.downsample属性设为None
if num_prev_group_token > 0 and num_group_token > 0:
self.group_projector = [
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"),
TFGroupViTMixerMLP(
config, num_prev_group_token, config.hidden_size // 2, num_group_token, name="group_projector.1"
),
] # 如果num_prev_group_token和num_group_token均大于0,则创建包含LayerNormalization和TFGroupViTMixerMLP对象的列表,赋值给self.group_projector属性,使用传入的config、num_prev_group_token、config.hidden_size和num_group_token参数
else:
self.group_projector = None # 否则将self.group_projector属性设为None
def build(self, input_shape=None):
if self.num_group_token > 0:
self.group_token = self.add_weight(
shape=(1, self.num_group_token, self.config.hidden_size),
initializer="zeros",
trainable=True,
name="group_token",
) # 如果num_group_token大于0,则创建形状为(1, num_group_token, config.hidden_size)的可训练权重,赋值给self.group_token属性
else:
self.group_token = None # 否则将self.group_token属性设为None
if self.built:
return # 如果已经构建过,则直接返回
self.built = True # 标记已经构建过
if getattr(self, "downsample", None) is not None:
with tf.name_scope(self.downsample.name):
self.downsample.build(None) # 如果self.downsample不为None,则使用其名称作为作用域,在作用域内调用其build方法
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None) # 遍历self.layers列表中的每个层对象,使用其名称作为作用域,在作用域内调用其build方法
if getattr(self, "group_projector", None) is not None:
with tf.name_scope(self.group_projector[0].name):
self.group_projector[0].build([None, None, self.config.hidden_size]) # 如果self.group_projector不为None,则使用其第一个元素的名称作为作用域,在作用域内调用其build方法,传入形状为[None, None, config.hidden_size]的参数
with tf.name_scope(self.group_projector[1].name):
self.group_projector[1].build(None) # 使用self.group_projector的第二个元素的名称作为作用域,在作用域内调用其build方法,不传入任何参数
@property
def with_group_token(self):
return self.group_token is not None # 返回self.group_token是否不为None的布尔值
def split_x(self, x: tf.Tensor) -> tf.Tensor:
if self.with_group_token:
return x[:, : -self.num_group_token], x[:, -self.num_group_token :] # 如果self.with_group_token为True,则返回x张量的前部分(去掉最后self.num_group_token列)和后部分(最后self.num_group_token列)
else:
return x, None # 否则返回x张量和None
def concat_x(self, x: tf.Tensor, group_token: tf.Tensor | None = None) -> tf.Tensor:
if group_token is None:
return x # 如果group_token为None,则直接返回x张量
return tf.concat([x, group_token], axis=1) # 否则在axis=1的维度上连接x张量和group_token张量,并返回结果张量
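# Illustrative note, not part of the original file: inside a stage the learnable group
# tokens are simply appended to the patch sequence, e.g. with the default config
# 196 patch tokens + 64 group tokens form a sequence of length 260 that the Transformer
# layers process jointly; `split_x` slices the last `num_group_token` positions back out
# before the grouping step.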
def call(
self,
hidden_states: tf.Tensor,
prev_group_token: tf.Tensor | None = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): 输入层的张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): 注意力掩码,形状为 `(batch, 1, tgt_len, src_len)`,其中填充元素由极大负值指示。
output_attentions (`bool`, *可选*):
是否返回 Grouping block 的分组张量。
"""
# If this stage owns learnable group tokens
if self.with_group_token:
# Tile the group tokens so that every sample in the batch gets its own copy
group_token = tf.tile(self.group_token, multiples=(shape_list(hidden_states)[0], 1, 1))
# Project the group tokens of the previous stage and add them to the current ones
if self.group_projector is not None:
for layer in self.group_projector:
prev_group_token = layer(prev_group_token)
group_token = group_token + prev_group_token
else:
group_token = None
x = hidden_states
# Concatenate the image tokens and the group tokens into one sequence
cat_x = self.concat_x(x, group_token)
# Run the concatenated sequence through the Transformer layers of this stage
for layer in self.layers:
layer_out = layer(
cat_x,
attention_mask=None,
causal_attention_mask=None,
output_attentions=None,
)
cat_x = layer_out[0]
# Split the sequence back into image tokens and group tokens
x, group_token = self.split_x(cat_x)
attention = None
# Run the grouping (token assignment) step if this stage has one
if self.downsample is not None:
x, attention = self.downsample(x, group_token)
# The outputs are (x, group_token), plus the grouping attention if requested
outputs = (x, group_token)
if output_attentions:
outputs = outputs + (attention,)
return outputs
class TFGroupViTMLP(keras.layers.Layer):
# TFGroupViTMLP 类,继承自 keras.layers.Layer
def __init__(
self,
config: GroupViTVisionConfig,
hidden_size: Optional[int] = None,
intermediate_size: Optional[int] = None,
output_size: Optional[int] = None,
**kwargs,
):
# 初始化函数,接受配置 config 和可选的隐藏大小、中间大小、输出大小等参数
super().__init__(**kwargs)
self.config = config
# 获取激活函数
self.activation_fn = get_tf_activation(config.hidden_act)
# 设置隐藏大小,默认从配置中获取
hidden_size = hidden_size if hidden_size is not None else config.hidden_size
# 设置中间大小,默认从配置中获取
intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
# 设置输出大小,默认为隐藏大小
output_size = output_size if output_size is not None else hidden_size
# 创建 Dense 层 fc1,用于中间层
self.fc1 = keras.layers.Dense(intermediate_size, name="fc1")
# 创建 Dense 层 fc2,用于输出层
self.fc2 = keras.layers.Dense(output_size, name="fc2")
self.intermediate_size = intermediate_size
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
# 调用函数,传入隐藏状态 hidden_states 和训练标志 training
# 将 hidden_states 输入到 fc1 中
hidden_states = self.fc1(hidden_states)
# 使用激活函数处理 fc1 输出
hidden_states = self.activation_fn(hidden_states)
# 将处理后的 hidden_states 输入到 fc2 中
hidden_states = self.fc2(hidden_states)
return hidden_states
def build(self, input_shape=None):
# 构建函数,在第一次调用时构建层的内部变量
if self.built:
return
self.built = True
# 构建 fc1 层
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.hidden_size])
# 构建 fc2 层
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.intermediate_size])
class TFGroupViTMixerMLP(TFGroupViTMLP):
# Same MLP, but applied along the token dimension instead of the channel dimension
def call(self, x, training: bool = False):
# Transpose to (batch, channels, tokens), run the parent MLP, then transpose back
x = super().call(hidden_states=tf.transpose(x, perm=(0, 2, 1)))
return tf.transpose(x, perm=(0, 2, 1))
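# Illustrative note, not part of the original file: because `TFGroupViTMixerMLP` transposes
# (batch, tokens, channels) to (batch, channels, tokens) before calling the parent MLP,
# `fc1`/`fc2` mix information *across tokens* rather than across channels. This is how
# `mlp_inter` in `TFGroupViTTokenAssign` maps `num_group_token` input tokens to
# `num_output_group` output tokens while leaving the channel dimension untouched.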
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPAttention
class TFGroupViTAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size # 设置嵌入维度为配置中的隐藏大小
self.num_attention_heads = config.num_attention_heads # 设置注意力头的数量为配置中的注意力头数
self.attention_head_size = self.embed_dim // self.num_attention_heads # 计算每个注意力头的大小
if self.attention_head_size * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
f"{self.num_attention_heads})."
)
factor = config.initializer_factor # 从配置中获取初始化因子
# 计算输入投影的标准差
in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
# 计算输出投影的标准差
out_proj_std = (self.embed_dim**-0.5) * factor
self.sqrt_att_head_size = math.sqrt(self.attention_head_size) # 计算注意力头大小的平方根
# 初始化查询投影层,使用自定义的初始化器
self.q_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
)
# 初始化键投影层,使用自定义的初始化器
self.k_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
)
# 初始化值投影层,使用自定义的初始化器
self.v_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
)
# 初始化 dropout 层,设定丢弃率为配置中的注意力丢弃率
self.dropout = keras.layers.Dropout(rate=config.attention_dropout)
# 初始化输出投影层,使用自定义的初始化器
self.out_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
)
# 从 transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores 复制而来
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# 将张量从 [batch_size, seq_length, all_head_size] 重塑为 [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# 将张量从 [batch_size, seq_length, num_attention_heads, attention_head_size] 转置为 [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor = None,
causal_attention_mask: tf.Tensor = None,
output_attentions: bool = None,
encoder_hidden_states: tf.Tensor = None,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""Input shape: Batch x Time x Channel"""
# 获取隐藏状态的批次大小
batch_size = shape_list(hidden_states)[0]
# 判断是否为跨注意力机制
is_cross_attention = encoder_hidden_states is not None
# 计算混合后的查询向量
mixed_query_layer = self.q_proj(inputs=hidden_states)
if is_cross_attention:
# 若为跨注意力,计算混合后的键和值向量
mixed_key_layer = self.k_proj(inputs=encoder_hidden_states)
mixed_value_layer = self.v_proj(inputs=encoder_hidden_states)
else:
# 否则,计算混合后的键和值向量
mixed_key_layer = self.k_proj(inputs=hidden_states)
mixed_value_layer = self.v_proj(inputs=hidden_states)
# 调整张量形状以进行注意力得分计算
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# 计算注意力分数,即查询向量和键向量的点积
# 结果维度为(batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
# 将注意力分数除以 sqrt(注意力头大小)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
# 先应用因果注意力掩码
if causal_attention_mask is not None:
# 加上因果注意力掩码(在 TFCLIPModel call() 函数中预先计算)
attention_scores = tf.add(attention_scores, causal_attention_mask)
# 若存在普通注意力掩码,则也应用
if attention_mask is not None:
# 加上普通注意力掩码(在 TFCLIPModel call() 函数中预先计算)
attention_scores = tf.add(attention_scores, attention_mask)
# 将注意力分数归一化为概率
_attention_probs = stable_softmax(logits=attention_scores, axis=-1)
# 对注意力概率进行 dropout 处理
attention_probs = self.dropout(inputs=_attention_probs)
# 计算注意力输出值
attention_output = tf.matmul(attention_probs, value_layer)
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
# 重新整形注意力输出张量的形状
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
# 通过输出投影层处理注意力输出
attention_output = self.out_proj(attention_output)
# 根据模型不同的输出设置,返回注意力输出和可能的注意力权重
outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
return outputs
# 构建方法用于构造模型,根据输入形状初始化模型的各个组件
def build(self, input_shape=None):
# 如果已经构建过,直接返回,避免重复构建
if self.built:
return
# 设置标志位为已构建
self.built = True
# 如果存在查询投影层,则构建查询投影层
if getattr(self, "q_proj", None) is not None:
# 在命名空间内构建查询投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
# 如果存在键投影层,则构建键投影层
if getattr(self, "k_proj", None) is not None:
# 在命名空间内构建键投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
# 如果存在值投影层,则构建值投影层
if getattr(self, "v_proj", None) is not None:
# 在命名空间内构建值投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
# 如果存在输出投影层,则构建输出投影层
if getattr(self, "out_proj", None) is not None:
# 在命名空间内构建输出投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT
class TFGroupViTEncoderLayer(keras.layers.Layer):
# 初始化函数,接受 GroupViTConfig 对象作为配置参数
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
# 设定嵌入维度为隐藏大小
self.embed_dim = config.hidden_size
# 创建自注意力层,使用 TFGroupViTAttention 类,并命名为 "self_attn"
self.self_attn = TFGroupViTAttention(config, name="self_attn")
# 创建第一个层规范化层,使用 LayerNormalization,设定 epsilon 为 config.layer_norm_eps,命名为 "layer_norm1"
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
# 创建 MLP 层,使用 TFGroupViTMLP 类,命名为 "mlp"
self.mlp = TFGroupViTMLP(config, name="mlp")
# 创建第二个层规范化层,使用 LayerNormalization,设定 epsilon 为 config.layer_norm_eps,命名为 "layer_norm2"
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
# Forward pass of the encoder layer
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
causal_attention_mask (`tf.Tensor`): causal attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`):
Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned tensors for more detail.
"""
residual = hidden_states
# 应用第一个层规范化层
hidden_states = self.layer_norm1(inputs=hidden_states)
# 使用 self_attn 进行自注意力计算
attention_outputs = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
training=training,
)
# 取自注意力输出的第一个张量作为新的隐藏状态
hidden_states = attention_outputs[0]
# 添加残差连接
hidden_states = residual + hidden_states
residual = hidden_states
# 应用第二个层规范化层
hidden_states = self.layer_norm2(inputs=hidden_states)
# 应用 MLP 层
hidden_states = self.mlp(hidden_states=hidden_states)
# 添加残差连接
hidden_states = residual + hidden_states
# 如果需要输出注意力张量,则将其添加到输出中
outputs = (hidden_states,) + attention_outputs[1:] # 如果输出注意力张量,则添加它们
return outputs
# 构建神经网络层次结构。如果已经构建过,则直接返回。
def build(self, input_shape=None):
if self.built:
return
# 标记该层次已经构建
self.built = True
# 如果存在 self_attn 属性,则构建 self_attn 层
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
# 如果存在 layer_norm1 属性,则构建 layer_norm1 层
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
# 如果存在 mlp 属性,则构建 mlp 层
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
# 如果存在 layer_norm2 属性,则构建 layer_norm2 层
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPEncoder
class TFGroupViTTextEncoder(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化多层 TFGroupViTEncoderLayer,根据配置创建指定数量的编码器层
self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states,  # input hidden states
attention_mask: tf.Tensor,  # attention mask
causal_attention_mask: tf.Tensor,  # causal attention mask
output_attentions: bool,  # whether to return the attention tensors
output_hidden_states: bool,  # whether to return the hidden states of every layer
return_dict: bool,  # whether to return a dict-style output
training: bool = False,  # whether the layer is run in training mode
) -> Union[Tuple, TFBaseModelOutput]:
# 如果需要输出隐藏状态,则初始化空元组存储编码器状态
encoder_states = () if output_hidden_states else None
# 如果需要输出注意力矩阵,则初始化空元组存储注意力矩阵
all_attentions = () if output_attentions else None
# 遍历每个编码器层进行前向传播
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
# 如果需要输出隐藏状态,将当前隐藏状态添加到状态元组中
encoder_states = encoder_states + (hidden_states,)
# 调用编码器层的前向传播,获取层的输出
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
output_attentions=output_attentions,
)
# 更新隐藏状态为编码器层的输出的第一个元素
hidden_states = layer_outputs[0]
if output_attentions:
# 如果需要输出注意力矩阵,将当前层的注意力矩阵添加到 all_attentions 元组中
all_attentions = all_attentions + (layer_outputs[1],)
# 如果需要输出隐藏状态,将最终的隐藏状态添加到状态元组中
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# 如果不需要返回字典格式的输出,根据需要返回隐藏状态、编码器状态和注意力矩阵
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
# 否则,返回 TFBaseModelOutput 对象,包含最终的隐藏状态、编码器状态和注意力矩阵
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
self.built = True
# 如果存在 self.layers,则遍历每个层并构建它们
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# TFGroupViTVisionEncoder 类
class TFGroupViTVisionEncoder(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None:
super().__init__(**kwargs)
# 初始化多个 TFGroupViTStage,根据配置创建多个视觉编码阶段
self.stages = [
TFGroupViTStage(
config=config,
depth=config.depths[i],
num_group_token=config.num_group_tokens[i],
num_output_group=config.num_output_groups[i],
num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
name=f"stages_._{i}",
)
for i in range(len(config.depths))
]
def call(
self,
hidden_states: tf.Tensor,  # input hidden states
output_hidden_states: bool,  # whether to return the hidden states of every stage
output_attentions: bool,  # whether to return the grouping attentions
return_dict: bool,  # whether to return a dict-style output
training: bool = False,  # whether the layer is run in training mode
) -> Union[tuple, TFBaseModelOutput]:
# 如果输出隐藏状态,则初始化一个空元组,否则设为 None
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,则初始化一个空元组,否则设为 None
all_groupings = () if output_attentions else None
# 初始化 group_tokens 为 None
group_tokens = None
# 遍历 self.stages 中的每个阶段
for stage in self.stages:
# 如果输出隐藏状态,则将当前隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用当前阶段的处理函数,获取当前层的输出
layer_outputs = stage(hidden_states, group_tokens, output_attentions)
# 更新隐藏状态为当前层输出的第一个元素
hidden_states = layer_outputs[0]
# 更新 group_tokens 为当前层输出的第二个元素
group_tokens = layer_outputs[1]
# 如果输出注意力权重且当前层有有效的注意力权重输出,则将其加入 all_groupings 中
if output_attentions and layer_outputs[2] is not None:
all_groupings = all_groupings + (layer_outputs[2],)
# 如果输出隐藏状态,则将最终隐藏状态加入 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不要求返回字典形式的输出,则按顺序返回非空的结果元组
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
# 如果需要返回字典形式的输出,则构造 TFBaseModelOutput 对象返回
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
)
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记当前模型已经构建
self.built = True
# 如果模型已经定义了 stages 属性,则对每个层进行构建
if getattr(self, "stages", None) is not None:
for layer in self.stages:
# 使用当前层的名称为其创建命名空间,并进行构建
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder
class TFGroupViTTextTransformer(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化 GroupViTTextEmbeddings 层,用于处理输入文本的嵌入表示
self.embeddings = TFGroupViTTextEmbeddings(config, name="embeddings")
# 初始化 GroupViTTextEncoder 层,用于对嵌入表示进行编码得到输出特征
self.encoder = TFGroupViTTextEncoder(config, name="encoder")
# 最终的层归一化,用于规范化最终输出的特征向量
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
# 用于计算 `pooled_output` 的相关属性
self.eos_token_id = config.eos_token_id # EOS(结束符)的 token ID
self.embed_dim = config.hidden_size # 嵌入维度大小
def call(
self,
input_ids: TFModelInputType,
attention_mask: tf.Tensor,
position_ids: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
# Arguments: input_ids, attention_mask and position_ids, plus flags controlling attentions, hidden states and dict-style output
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 获取输入 `input_ids` 的形状信息
input_shape = shape_list(input_ids)
# 使用输入的 `input_ids` 和 `position_ids` 作为参数,调用嵌入层对象 `self.embeddings` 进行嵌入操作
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
# 从输入形状信息中提取批大小和序列长度
batch_size, seq_length = input_shape
# CLIP's text model uses a causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
# check attention mask and expand its dimensions
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask)
# 调用编码器 `self.encoder`,传入嵌入输出、注意力掩码等参数,并接收编码器的输出
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从编码器输出中提取序列输出
sequence_output = encoder_outputs[0]
# 对序列输出进行最终层归一化处理
sequence_output = self.final_layer_norm(inputs=sequence_output)
# If `eos_token_id` was incorrect before PR #24773, keep the previous behaviour
if self.eos_token_id == 2:
# Pool `sequence_output` at the position of the highest token id in each sequence
# (the EOS token, since it used to be the largest id in the vocabulary)
pooled_output = tf.gather_nd(
params=sequence_output,
indices=tf.stack(
values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
),
)
else:
# `eos_token_id` was updated in PR #24773, which allows extra new tokens to be added
# Pool `sequence_output` at the position of the first `self.eos_token_id` token in each sequence
pooled_output = tf.gather_nd(
params=sequence_output,
indices=tf.stack(
values=(
tf.range(input_shape[0], dtype=tf.int64),
tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
),
axis=1,
),
)
# 如果不返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
# 返回 TFBaseModelOutputWithPooling 对象,包含序列输出、池化输出、编码器的隐藏状态和注意力权重
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# Build the causal attention mask used by the self-attention layers
def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
# If seq_length is a runtime value it cannot be handled by tf.constant. Per the TensorFlow
# docs, tf.fill handles dynamic shapes: https://www.tensorflow.org/api_docs/python/tf/fill
diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)
# set an additive 2D attention mask with all positions masked
to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)
# set the diagonal and the lower triangular part to 0 (i.e. the positions that are not masked)
# TIP: think of the 2D matrix as the space of (query_seq, key_seq)
to_mask = tf.linalg.band_part(to_mask, 0, -1)
# to_mask = tf.linalg.band_part(to_mask, -1, 0)  # commented-out alternative
to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
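# Illustrative sketch, not part of the original file: for seq_length = 3 the mask built
# above is broadcast to a (batch, 1, 3, 3) tensor whose strict upper triangle is -10000
# and whose diagonal and lower triangle are 0, i.e. each position may only attend to
# itself and to earlier positions:
#   [[    0., -10000., -10000.],
#    [    0.,      0., -10000.],
#    [    0.,      0.,      0.]]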
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer
class TFGroupViTVisionTransformer(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# 初始化视觉嵌入层对象,使用GroupViTVisionEmbeddings处理视觉嵌入
self.embeddings = TFGroupViTVisionEmbeddings(config, name="embeddings")
# 初始化视觉编码器对象,使用GroupViTVisionEncoder处理视觉编码
self.encoder = TFGroupViTVisionEncoder(config, name="encoder")
# 初始化层归一化对象,设置epsilon值为config中定义的层归一化epsilon值
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# 设置嵌入维度为config中定义的隐藏层大小
self.embed_dim = config.hidden_size
# 定义模型调用方法,接收像素值和其他配置参数,并返回模型输出
def call(
self,
pixel_values: TFModelInputType,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutputWithPooling]:
# 获取嵌入层的输出,即像素值的嵌入表示
embedding_output = self.embeddings(pixel_values)
# 将嵌入输出传入编码器,获取编码器的输出
encoder_outputs = self.encoder(
hidden_states=embedding_output,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
# 获取编码器输出的最后隐藏状态
last_hidden_state = encoder_outputs[0]
# 对最后隐藏状态进行层归一化处理
last_hidden_state = self.layernorm(last_hidden_state)
# 计算池化输出,通过对最后隐藏状态在第1维度上求均值得到
pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
# 如果不需要返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# 如果需要返回字典形式的结果,则构建TFBaseModelOutputWithPooling对象返回
return TFBaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# 构建方法,在首次调用时构建嵌入层、编码器和层归一化对象
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已定义嵌入层,则使用tf.name_scope构建嵌入层
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# 如果已定义编码器,则使用tf.name_scope构建编码器
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# 如果已定义层归一化对象,则使用tf.name_scope构建层归一化对象
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.embed_dim])
@keras_serializable
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT
class TFGroupViTTextMainLayer(keras.layers.Layer):
config_class = GroupViTTextConfig
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化配置对象
self.config = config
# 初始化文本模型对象,使用TFGroupViTTextTransformer处理文本信息
self.text_model = TFGroupViTTextTransformer(config, name="text_model")
# 获取输入嵌入层对象的方法,返回文本模型的嵌入层对象
def get_input_embeddings(self) -> keras.layers.Layer:
return self.text_model.embeddings
# 设置输入嵌入层对象的方法,设置文本模型的嵌入层权重和词汇大小
def set_input_embeddings(self, value: tf.Variable):
self.text_model.embeddings.weight = value
self.text_model.embeddings.vocab_size = shape_list(value)[0]
# 对输入参数进行解包处理的装饰器
@unpack_inputs
# 定义一个方法 `call`,用于执行模型的前向传播
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 如果 `input_ids` 为空,则抛出数值错误
if input_ids is None:
raise ValueError("You have to specify input_ids")
# 获取 `input_ids` 的形状
input_shape = shape_list(input_ids)
# 如果 `attention_mask` 为空,则创建一个形状与 `input_shape` 相同的张量,填充值为 1
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
# 调用 `text_model` 进行文本模型的前向传播,并传递相应的参数
text_model_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回文本模型的输出结果
return text_model_outputs
# 定义一个方法 `build`,用于构建模型
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回
if self.built:
return
# 标记模型已构建
self.built = True
# 如果存在 `text_model` 属性,则在其命名空间内构建模型
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
@keras_serializable
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT
class TFGroupViTVisionMainLayer(keras.layers.Layer):
# 指定配置类为 GroupViTVisionConfig
config_class = GroupViTVisionConfig
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 创建 TFGroupViTVisionTransformer 模型,并命名为 vision_model
self.vision_model = TFGroupViTVisionTransformer(config, name="vision_model")
# 返回 vision_model 的 embeddings 层作为输入嵌入
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings
# 对输入进行解包,并调用 vision_model 进行前向传播
@unpack_inputs
def call(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 如果 pixel_values 为 None,则抛出数值错误
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 调用 vision_model 进行前向传播,并返回其输出
vision_model_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return vision_model_outputs
# 构建层次结构,如果已经构建过,则直接返回
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果 vision_model 存在,则在其命名空间下构建模型
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
@keras_serializable
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer
class TFGroupViTMainLayer(keras.layers.Layer):
# 指定配置类为 GroupViTConfig
config_class = GroupViTConfig
# 初始化方法,接受一个配置对象 config 和其他关键字参数
def __init__(self, config: GroupViTConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 检查 config.text_config 是否为 GroupViTTextConfig 类型,否则引发 ValueError 异常
if not isinstance(config.text_config, GroupViTTextConfig):
raise ValueError(
"config.text_config is expected to be of type GroupViTTextConfig but is of type"
f" {type(config.text_config)}."
)
# 检查 config.vision_config 是否为 GroupViTVisionConfig 类型,否则引发 ValueError 异常
if not isinstance(config.vision_config, GroupViTVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# 将传入的 config 对象赋值给实例变量 self.config
self.config = config
# 从 config 对象中获取 text_config 和 vision_config 对象,并分别赋值给 text_config 和 vision_config 变量
text_config = config.text_config
vision_config = config.vision_config
# 设置实例变量,分别为投影维度和投影中间维度
self.projection_dim = config.projection_dim
self.projection_intermediate_dim = config.projection_intermediate_dim
# 设置文本嵌入维度和视觉嵌入维度
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
# 创建 TFGroupViTTextTransformer 对象,用于文本模型的转换,命名为 "text_model"
self.text_model = TFGroupViTTextTransformer(text_config, name="text_model")
# 创建 TFGroupViTVisionTransformer 对象,用于视觉模型的转换,命名为 "vision_model"
self.vision_model = TFGroupViTVisionTransformer(vision_config, name="vision_model")
# 定义视觉投影层,包括 Dense 层、BatchNormalization 层和 ReLU 激活函数层
self.visual_projection = [
keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"),
keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5),
keras.layers.ReLU(name="visual_projection.2"),
keras.layers.Dense(self.projection_dim, name="visual_projection.3"),
]
# 定义文本投影层,包括 Dense 层、BatchNormalization 层和 ReLU 激活函数层
self.text_projection = [
keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"),
keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5),
keras.layers.ReLU(name="text_projection.2"),
keras.layers.Dense(self.projection_dim, name="text_projection.3"),
]
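Both projection heads are plain Python lists of Keras layers (Dense → BatchNorm → ReLU → Dense) rather than `keras.Sequential` objects, so the forward methods below apply them one by one. A minimal sketch of that pattern with made-up dimensions (the real values come from `projection_intermediate_dim` and `projection_dim` in the config):

```python
import tensorflow as tf
from tensorflow import keras

hidden_size, intermediate_dim, projection_dim = 768, 4096, 256  # hypothetical sizes

projection = [
    keras.layers.Dense(intermediate_dim),
    keras.layers.BatchNormalization(momentum=0.9, epsilon=1e-5),
    keras.layers.ReLU(),
    keras.layers.Dense(projection_dim),
]

pooled = tf.random.normal((2, hidden_size))  # stand-in for a pooled encoder output
for layer in projection:                     # same loop used by get_text_features / get_image_features below
    pooled = layer(pooled)
print(pooled.shape)                          # (2, 256)
```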
def build(self, input_shape=None):
# 添加一个可训练的名为logit_scale的权重,初始值为config中的logit_scale_init_value
self.logit_scale = self.add_weight(
shape=(1,),
initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
name="logit_scale",
)
# 如果模型已经建立,则直接返回
if self.built:
return
# 标记模型已经建立
self.built = True
# 如果存在text_model,则构建text_model
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
# 如果存在vision_model,则构建vision_model
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
# 如果存在visual_projection,则分别构建其各个层
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection[0].name):
self.visual_projection[0].build([None, None, None, self.vision_embed_dim])
with tf.name_scope(self.visual_projection[1].name):
self.visual_projection[1].build((None, self.projection_intermediate_dim))
with tf.name_scope(self.visual_projection[3].name):
self.visual_projection[3].build([None, None, None, self.projection_intermediate_dim])
# 如果存在text_projection,则分别构建其各个层
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection[0].name):
self.text_projection[0].build([None, None, None, self.text_embed_dim])
with tf.name_scope(self.text_projection[1].name):
self.text_projection[1].build((None, self.projection_intermediate_dim))
with tf.name_scope(self.text_projection[3].name):
self.text_projection[3].build([None, None, None, self.projection_intermediate_dim])
@unpack_inputs
def get_text_features(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
# 如果未提供input_ids,则抛出数值错误异常
if input_ids is None:
raise ValueError("You have to specify either input_ids")
# 获取input_ids的形状
input_shape = shape_list(input_ids)
# 如果未提供attention_mask,则使用全1填充
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
# 使用text_model处理输入,获取文本输出
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从文本输出中获取汇总的输出
pooled_output = text_outputs[1]
# 将汇总的输出依次经过text_projection的每一层处理
for layer in self.text_projection:
pooled_output = layer(pooled_output)
# 返回文本特征
text_features = pooled_output
return text_features
@unpack_inputs
# 定义一个方法,用于获取图像特征
def get_image_features(
self,
pixel_values: TFModelInputType | None = None, # 像素值输入,可以为 None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,默认为 None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,默认为 None
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,默认为 None
training: bool = False, # 是否处于训练模式,默认为 False
) -> tf.Tensor: # 返回类型为 TensorFlow 的张量
# 如果像素值为 None,则抛出 ValueError 异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 使用视觉模型处理像素值,根据参数选择是否返回注意力权重和隐藏状态
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取视觉模型输出的汇总特征(一般是第二个输出)
pooled_output = vision_outputs[1]
# 通过每层的可调用层对汇总特征进行变换
for layer in self.visual_projection:
pooled_output = layer(pooled_output)
# 将处理后的图像特征赋给变量 image_features
image_features = pooled_output
# 返回图像特征
return image_features
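The `call` method (whose body is truncated in this walkthrough) combines the two feature types CLIP-style: both embeddings are L2-normalized and their cosine similarities are scaled by the exponentiated `logit_scale` weight created in `build`. A hedged sketch of that computation with random stand-in embeddings:

```python
import tensorflow as tf

text_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)   # stand-in text features
image_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)  # stand-in image features

logit_scale = tf.Variable([2.6592])  # e.g. log(1 / 0.07), a common contrastive init value

# One row per text, one column per image; higher logits mean more similar pairs.
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * tf.math.exp(logit_scale)
logits_per_image = tf.transpose(logits_per_text)
```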
# 使用装饰器 unpack_inputs 定义一个方法,用于调用模型
def call(
self,
input_ids: TFModelInputType | None = None, # 输入的 token IDs,可以为 None
pixel_values: TFModelInputType | None = None, # 像素值输入,可以为 None
attention_mask: np.ndarray | tf.Tensor | None = None, # 注意力掩码,可以为 None
position_ids: np.ndarray | tf.Tensor | None = None, # 位置 IDs,可以为 None
return_loss: Optional[bool] = None, # 是否返回损失,默认为 None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,默认为 None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,默认为 None
output_segmentation: Optional[bool] = None, # 是否输出分割结果,默认为 None
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,默认为 None
training: bool = False, # 是否处于训练模式,默认为 False
# GROUPVIT_TEXT_INPUTS_DOCSTRING 变量,包含了关于输入格式的文档字符串,用于说明 TF 2.0 模型接受的输入格式。
GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
training (`bool`, *optional*, defaults to `False`):
"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
GROUPVIT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range
`[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
# TFGroupViTTextModel defines a text model built on top of TFGroupViTPreTrainedModel.
class TFGroupViTTextModel(TFGroupViTPreTrainedModel):
# 设置配置类
config_class = GroupViTTextConfig
# 主输入名称
main_input_name = "input_ids"
def __init__(self, config: GroupViTTextConfig, *inputs, **kwargs):
# 调用父类构造函数
super().__init__(config, *inputs, **kwargs)
# 初始化 TFGroupViTTextMainLayer 实例作为模型的主要组件
self.groupvit = TFGroupViTTextMainLayer(config, name="groupvit")
@unpack_inputs
# 将输入参数解包后,添加文档字符串到模型的前向传播方法
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 替换模型前向传播方法的返回文档字符串
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
模型的前向传播函数,接受输入参数并返回模型的输出。
Returns:
TFBaseModelOutputWithPooling 或者包含 tf.Tensor 的元组
Examples:
示例用法,展示了如何使用模型进行推理。
```
>>> from transformers import CLIPTokenizer, TFGroupViTTextModel
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = TFGroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```
"""
# 调用 self.groupvit 的前向传播方法并返回结果
outputs = self.groupvit(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
# 如果已经构建完成,则直接返回
if self.built:
return
# 设置构建完成标志
self.built = True
# 如果 self.groupvit 存在,则在 TensorFlow 的命名空间内构建组件
if getattr(self, "groupvit", None) is not None:
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
class TFGroupViTVisionModel(TFGroupViTPreTrainedModel):
# 设置配置类
config_class = GroupViTVisionConfig
# 主输入名称
main_input_name = "pixel_values"
def __init__(self, config: GroupViTVisionConfig, *inputs, **kwargs):
# 调用父类构造函数
super().__init__(config, *inputs, **kwargs)
# 初始化 TFGroupViTVisionMainLayer 实例作为模型的主要组件
self.groupvit = TFGroupViTVisionMainLayer(config, name="groupvit")
@unpack_inputs
# 添加文档字符串到模型的前向传播方法
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
# 替换模型前向传播方法的返回文档字符串
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def call(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
返回模型的输出结果。
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTVisionModel
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = TFGroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```
"""
outputs = self.groupvit(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "groupvit", None) is not None:
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
# 使用装饰器添加文档字符串,指定类的起始文档字符串
@add_start_docstrings(GROUPVIT_START_DOCSTRING)
# 定义 TFGroupViTModel 类,继承自 TFGroupViTPreTrainedModel 类
class TFGroupViTModel(TFGroupViTPreTrainedModel):
# 指定配置类为 GroupViTConfig
config_class = GroupViTConfig
# 初始化方法,接受 GroupViTConfig 类型的配置对象和其他参数
def __init__(self, config: GroupViTConfig, *inputs, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *inputs, **kwargs)
# 创建 TFGroupViTMainLayer 实例,命名为 groupvit
self.groupvit = TFGroupViTMainLayer(config, name="groupvit")
# 使用装饰器添加文档字符串到模型前向传播方法
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def get_text_features(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
r"""
Returns:
text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
the projection layer to the pooled output of [`TFGroupViTTextModel`].
Examples:
```
>>> from transformers import CLIPTokenizer, TFGroupViTModel
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> text_features = model.get_text_features(**inputs)
```"""
# 调用 TFGroupViTMainLayer 的 get_text_features 方法,返回文本特征张量
text_features = self.groupvit.get_text_features(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回文本特征张量
return text_features
# 使用装饰器添加文档字符串到模型前向传播方法
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
r"""
"""
) -> tf.Tensor:
r"""
Returns:
image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
the projection layer to the pooled output of [`TFGroupViTVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTModel
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> image_features = model.get_image_features(**inputs)
```"""
# 调用 TFGroupViTVisionModel 的方法获取图像特征
image_features = self.groupvit.get_image_features(
pixel_values=pixel_values, # 图像像素值
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
return_dict=return_dict, # 是否以字典形式返回结果
training=training, # 是否处于训练模式
)
return image_features
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFGroupViTModelOutput, config_class=GroupViTConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
pixel_values: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_segmentation: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTModel
>>> import tensorflow as tf
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = tf.math.softmax(logits_per_image, axis=1)
```"""
# 调用模型的 forward 方法,传递输入参数进行推理
outputs = self.groupvit(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
return_loss=return_loss,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_segmentation=output_segmentation,
return_dict=return_dict,
training=training,
)
return outputs
def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput:
# TODO: As is this currently fails with saved_model=True, because
# TensorFlow cannot trace through nested dataclasses. Reference:
# https://github.com/huggingface/transformers/pull/16886
# 返回模型输出作为服务端输出
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果模型已经构建,则直接返回
if getattr(self, "groupvit", None) is not None:
# 使用 TensorFlow 的命名空间来构建模型组件
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
.\models\groupvit\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_groupvit": [
"GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"GroupViTConfig",
"GroupViTOnnxConfig",
"GroupViTTextConfig",
"GroupViTVisionConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_groupvit"] = [
"GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"GroupViTModel",
"GroupViTPreTrainedModel",
"GroupViTTextModel",
"GroupViTVisionModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_groupvit"] = [
"TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFGroupViTModel",
"TFGroupViTPreTrainedModel",
"TFGroupViTTextModel",
"TFGroupViTVisionModel",
]
if TYPE_CHECKING:
from .configuration_groupvit import (
GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
GroupViTConfig,
GroupViTOnnxConfig,
GroupViTTextConfig,
GroupViTVisionConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_groupvit import (
GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
GroupViTModel,
GroupViTPreTrainedModel,
GroupViTTextModel,
GroupViTVisionModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_groupvit import (
TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFGroupViTModel,
TFGroupViTPreTrainedModel,
TFGroupViTTextModel,
TFGroupViTVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
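Because of this `_LazyModule` indirection, nothing heavier than the configuration module is imported until an attribute is actually requested, so the config classes stay usable even when only one backend is installed. A short usage note (assumes `transformers` is installed; the model import additionally needs TensorFlow):

```python
from transformers import GroupViTConfig      # configurations never need a DL backend

config = GroupViTConfig()

from transformers import TFGroupViTModel     # the lazy import of modeling_tf_groupvit happens here
```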
.\models\herbert\tokenization_herbert.py
import json
import os
import re
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
},
"merges_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
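For a word already split into symbols (with the usual `</w>` end-of-word marker), this simply returns every adjacent pair; a quick illustration:

```python
print(get_pairs(("l", "o", "w", "er</w>")))
# {('l', 'o'), ('o', 'w'), ('w', 'er</w>')}   (set order is arbitrary)
```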
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
这个函数用于移除文本中的非打印字符。
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
def whitespace_tokenize(text):
"""对文本进行基本的空白符号清理和分割。"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BasicTokenizer(object):
"""
构造一个BasicTokenizer对象,用于运行基本的分词(标点符号分割、小写化等)。
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在分词时将输入转换为小写。
never_split (`Iterable`, *optional*):
在分词时不会被拆分的token集合。仅在`do_basic_tokenize=True`时生效。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日语,应该将其停用(见此问题)。
strip_accents (`bool`, *optional*):
是否去除所有的重音符号。如果未指定此选项,则将根据`lowercase`的值确定(与原始BERT相同)。
do_split_on_punc (`bool`, *optional*, defaults to `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便稍后的分词可以捕捉到单词的完整上下文,如缩写。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
"""
Construct a BPE tokenizer for HerBERT.
Peculiarities:
- uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
punctuation character will be treated separately.
- Such pretokenized input is BPE subtokenized
This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
tokenizer_file=None,
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sep_token="</s>",
bos_token="<s>",
do_lowercase_and_remove_accent=False,
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
**kwargs,
):
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use HerbertTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
lang2id=lang2id,
id2lang=id2lang,
do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
tokenizer_file=None,
**kwargs,
)
self.bert_pre_tokenizer = BasicTokenizer(
do_lower_case=False,
never_split=self.all_special_tokens,
tokenize_chinese_chars=False,
strip_accents=False,
)
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea(
f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
)
except (AttributeError, ImportError):
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
" (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
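How `bpe` consumes the merge ranks can be seen on a toy instance. The sketch below is hedged: the merge table is invented (real ranks come from `merges.txt`), and the duck-typed stand-in only carries the two attributes `bpe` actually reads, so the unbound method can be called directly (assuming the surrounding class is `HerbertTokenizer`, as it is imported elsewhere in this package):

```python
class _Toy:
    # Invented merge ranks; a lower rank is merged first.
    bpe_ranks = {("e", "r</w>"): 0, ("l", "o"): 1, ("lo", "w"): 2}
    cache = {}

print(HerbertTokenizer.bpe(_Toy(), "lower"))
# 'low er</w>'  -- merges applied in rank order: (e, r</w>), then (l, o), then (lo, w)
```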
def _tokenize(self, text):
pre_tokens = self.bert_pre_tokenizer.tokenize(text)
split_tokens = []
for token in pre_tokens:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. Like BERT, a HerBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
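Concretely, with made-up ID lists (and assuming `tok` is a `HerbertTokenizer` instance loaded via `from_pretrained`), the segment IDs cover `<s> A </s>` with zeros and `B </s>` with ones:

```python
ids_a = [11, 12, 13]   # hypothetical IDs of the first sentence
ids_b = [21, 22]       # hypothetical IDs of the second sentence

print(tok.create_token_type_ids_from_sequences(ids_a, ids_b))
# [0, 0, 0, 0, 0, 1, 1, 1]  -> len(<s> + ids_a + </s>) zeros, len(ids_b + </s>) ones
```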
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def __getstate__(self):
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
.\models\herbert\tokenization_herbert_fast.py
from typing import List, Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_herbert import HerbertTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
},
"merges_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
a punctuation character will be treated separately.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = HerbertTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sep_token="</s>",
**kwargs,
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
sep_token=sep_token,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从一个序列或者一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。像BERT和HerBERT序列有如下格式:
- 单个序列: `<s> X </s>`
- 一对序列: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的ID列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的ID列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入ID列表。
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从没有添加特殊标记的标记列表中提取序列ID。当使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的ID列表,用于序列对。
already_has_special_tokens (`bool`, *可选*, 默认为 `False`):
标记列表是否已经按模型的要求格式化为特殊标记。
Returns:
`List[int]`: 一个整数列表,范围为 [0, 1]:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
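For reference, with hypothetical ID lists (and `tok` a loaded `HerbertTokenizerFast`), the mask flags only the `<s>`/`</s>` positions that `build_inputs_with_special_tokens` would add:

```python
print(tok.get_special_tokens_mask([11, 12, 13]))
# [1, 0, 0, 0, 1]
print(tok.get_special_tokens_mask([11, 12], [21, 22, 23]))
# [1, 0, 0, 1, 0, 0, 0, 1]
```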
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
BERT sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\herbert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available
_import_structure = {"tokenization_herbert": ["HerbertTokenizer"]}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_herbert_fast"] = ["HerbertTokenizerFast"]
if TYPE_CHECKING:
from .tokenization_herbert import HerbertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_herbert_fast import HerbertTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\hubert\configuration_hubert.py
""" Hubert model configuration"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/hubert-base-ls960": "https://huggingface.co/facebook/hubert-base-ls960/resolve/main/config.json",
}
class HubertConfig(PretrainedConfig):
r"""
这是用于存储 [`HubertModel`] 配置的类。用于根据指定的参数实例化 Hubert 模型,
定义模型架构。使用默认值实例化配置将产生类似于 Hubert
[facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) 架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。更多信息请阅读
[`PretrainedConfig`] 的文档。
Example:
```
>>> from transformers import HubertModel, HubertConfig
>>> # 初始化一个 Hubert facebook/hubert-base-ls960 风格的配置
>>> configuration = HubertConfig()
>>> # 使用配置初始化一个模型,其模型风格为 facebook/hubert-base-ls960
>>> model = HubertModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "hubert"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_layer_norm=True,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
do_stable_layer_norm=False,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_layer_norm = feat_proj_layer_norm
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
self.do_stable_layer_norm = do_stable_layer_norm
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
" `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
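With the default strides `(5, 2, 2, 2, 2, 2, 2)` this product is 5 · 2⁶ = 320, i.e. the feature encoder emits one frame per 320 input samples, which is 20 ms (50 frames per second) of 16 kHz audio. A quick check:

```python
from transformers import HubertConfig

config = HubertConfig()                         # default, hubert-base-ls960-style configuration
print(config.inputs_to_logits_ratio)            # 320
print(16000 / config.inputs_to_logits_ratio)    # 50.0 output frames per second of audio
```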
.\models\hubert\convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
"""Convert Hubert checkpoint."""
import argparse
import torch
from s3prl.hub import distilhubert
from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
"""
递归设置模型参数的函数。
Args:
hf_pointer (object): 要设置的模型参数的指针对象。
key (str): 参数的名称路径。
value (torch.Tensor): 要设置的参数值。
full_name (str): 参数的完整名称。
weight_type (str): 参数类型,如'weight', 'bias'等。
Raises:
AssertionError: 如果要设置的参数形状与预期不符合,抛出异常。
"""
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model):
"""
递归加载权重函数。
Args:
fairseq_model: Fairseq模型对象。
hf_model: HuggingFace模型对象。
"""
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = mapped_key
if key in name:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model):
config = HubertConfig()
fs_config = model.config
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = False
config.attention_dropout = fs_config.attention_dropout
config.conv_bias = False
conv_layers = eval(fs_config.extractor_conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.feat_proj_layer_norm = False
config.feat_proj_dropout = 0.0
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn
config.hidden_dropout = fs_config.dropout
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = 0.0
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
return config
@torch.no_grad()
def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None):
model = distilhubert().model.model
if config_path is not None:
config = HubertConfig.from_pretrained(config_path)
else:
config = convert_config(model)
model = model.eval()
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=False,
return_attention_mask=False,
)
hf_model = HubertModel(config)
recursively_load_weights(model, hf_model)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path)
.\models\hubert\convert_hubert_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from transformers import (
HubertConfig,
HubertForCTC,
HubertModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
"""
递归设置指针指向的属性值,并记录日志。
Args:
hf_pointer (object): Transformers 模型中的属性指针
key (str): 属性名称,用点分隔表示层次结构
value (torch.Tensor): 设置的值
full_name (str): 完整名称,用于日志记录
weight_type (str): 权重类型,如 'weight', 'bias' 等
Raises:
AssertionError: 如果设置的值的形状与预期不符合
"""
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
"""
递归加载 Fairseq 模型的权重到 Transformers 模型中。
Args:
fairseq_model (FairseqModel): Fairseq 模型对象
hf_model (PreTrainedModel): Transformers 模型对象
is_finetuned (bool): 是否为微调模型
Returns:
None
"""
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "hubert." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned):
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
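The sketch below shows how `load_conv_layer` splits a feature-extractor parameter name into `layer_id` and `type_id` (the name is a typical fairseq convention, shown here only for illustration):

```python
# type_id 0 -> conv weights/biases, type_id 2 -> (group/layer) norm weights/biases.
full_name = "feature_extractor.conv_layers.0.2.weight"   # illustrative name
name = full_name.split("conv_layers.")[-1]               # "0.2.weight"
items = name.split(".")
layer_id, type_id = int(items[0]), int(items[1])
print(layer_id, type_id, "weight" in name)               # 0 2 True
```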
@torch.no_grad()
def convert_hubert_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = HubertConfig.from_pretrained(config_path)
else:
config = HubertConfig()
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
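# Note: bos and pad token ids are deliberately swapped here because the CTC blank symbol is fairseq's <pad>, not <s>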
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_wav2vec = HubertForCTC(config)
else:
hf_wav2vec = HubertModel(config)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
model = model[0].eval()
recursively_load_weights(model, hf_wav2vec, is_finetuned)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_hubert_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
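For reference, the converter can also be invoked directly from Python instead of via the CLI; the paths below are placeholders, not real files:

```python
# Placeholder paths; adjust to your local fairseq checkpoint and fine-tuning dictionary.
convert_hubert_checkpoint(
    checkpoint_path="/path/to/hubert_ft.pt",
    pytorch_dump_folder_path="./hubert-converted",
    config_path=None,               # fall back to the default HubertConfig
    dict_path="/path/to/dict.ltr.txt",
    is_finetuned=True,              # builds tokenizer/processor and a HubertForCTC head
)
```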
.\models\hubert\convert_hubert_original_s3prl_checkpoint_to_pytorch.py
"""Convert Hubert checkpoint."""
import argparse
import torch
from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SUPPORTED_MODELS = ["UtteranceLevel"]
@torch.no_grad()
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
"""
Copy/paste/tweak model's weights to transformers design.
"""
checkpoint = torch.load(checkpoint_path, map_location="cpu")
if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS:
raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}")
downstream_dict = checkpoint["Downstream"]
hf_config = HubertConfig.from_pretrained(config_path)
hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
base_model_name, return_attention_mask=True, do_normalize=False
)
if hf_config.use_weighted_layer_sum:
hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]
hf_model.projector.weight.data = downstream_dict["projector.weight"]
hf_model.projector.bias.data = downstream_dict["projector.bias"]
hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
hf_feature_extractor.save_pretrained(model_dump_path)
hf_model.save_pretrained(model_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
)
parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
args = parser.parse_args()
convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
.\models\hubert\modeling_hubert.py
""" PyTorch Hubert model."""
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_hubert import HubertConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "HubertConfig"
_CHECKPOINT_FOR_DOC = "facebook/hubert-large-ls960-ft"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 22.68
_SEQ_CLASS_CHECKPOINT = "superb/hubert-base-superb-ks"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 8.53
HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/hubert-base-ls960",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
计算给定形状的随机掩码跨度。用于实现 ASR(自动语音识别)中的 SpecAugment 数据增强方法。
注意,此方法未经过优化,应在 CPU 上作为训练期间的预处理的一部分运行,而不是在 TPU 上运行。
"""
return np.ndarray
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包形状参数
batch_size, sequence_length = shape
# 检查是否小于1
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查是否大于序列长度
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该被遮罩的 span 数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保遮罩的 span 数量不低于最小要求
num_masked_span = max(num_masked_span, min_masks)
# 确保遮罩的 span 不超过序列长度
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保遮罩的 span 不超过 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个 batch 中的遮罩 span 的数量
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建用于 SpecAugment 的遮罩数组
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# 计算最大允许的遮罩 span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果最大允许的遮罩 span 数量为 0,则直接返回空的遮罩数组
if max_num_masked_span == 0:
return spec_aug_mask
# 对于每个输入长度进行循环处理
for input_length in input_lengths:
# 计算当前输入的被遮挡(masked)span的数量
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要遮挡的索引位置
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个被抽样的索引,用作填充向量的虚拟索引
# 确保所有批次的维度一致,因为可能存在概率舍入
# 选择第一个样本只是将这些向量填充两次。
if len(spec_aug_mask_idx) == 0:
# 这种情况只能发生在`input_length`严格小于`sequence_length`的情况下,
# 此时最后一个标记必须是填充标记,我们可以将其用作虚拟掩码ID
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟索引添加到`spec_aug_mask_idx`数组末尾,使其达到最大遮挡span数量
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将列表转换为NumPy数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将遮挡的索引扩展为遮挡span
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
# 将形状重新整理为(batch_size, max_num_masked_span * mask_length)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,以创建遮挡span
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保索引不会超过序列长度
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 将1散布到遮挡的索引位置
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回生成的遮挡mask
return spec_aug_mask
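A quick toy run of `_compute_mask_indices` (numbers chosen arbitrarily) makes the output format obvious: a boolean array in which masked positions are `True`:

```python
import numpy as np

np.random.seed(0)                         # only so the toy run is repeatable
mask = _compute_mask_indices(shape=(2, 10), mask_prob=0.4, mask_length=2)
print(mask.dtype, mask.shape)             # bool (2, 10)
print(mask.sum(axis=-1))                  # ~mask_prob * 10 masked steps per row (an upper bound, spans may overlap)
```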
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个 LayerNorm 层,对输出维度进行归一化
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度换位,以便于 LayerNorm 的应用
hidden_states = hidden_states.transpose(-2, -1)
# 应用 LayerNorm
hidden_states = self.layer_norm(hidden_states)
# 恢复维度的排列顺序
hidden_states = hidden_states.transpose(-2, -1)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个 GroupNorm 层,分组数为输出维度,通道数为输出维度
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 应用 GroupNorm
hidden_states = self.layer_norm(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一维卷积层,用于位置编码
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
# 初始化权重归一化函数
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
# 如果启用了 DeepSpeed zero3 加速
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用 GatheredParameters 将权重进行分组
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
# 注册外部参数以便 DeepSpeed 管理
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 否则正常进行权重归一化
self.conv = weight_norm(self.conv, name="weight", dim=2)
# 创建用于填充的层
self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings)
# 选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 转置隐藏状态张量的维度
hidden_states = hidden_states.transpose(1, 2)
# 进行卷积操作
hidden_states = self.conv(hidden_states)
# 进行填充操作
hidden_states = self.padding(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
# 再次转置隐藏状态张量的维度
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置编码的数量确定是否需要移除填充
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
# 如果需要移除填充,则进行切片操作
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
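The padding trim above exists because an even kernel with `padding=k//2` produces one extra time step; a minimal check (kernel size 128 is the usual `num_conv_pos_embeddings` default, assumed here):

```python
import torch
import torch.nn as nn

conv = nn.Conv1d(4, 4, kernel_size=128, padding=64)   # even kernel, padding = k // 2
x = torch.randn(1, 4, 50)
print(conv(x).shape)                                   # torch.Size([1, 4, 51]) -> one step too long
print(conv(x)[:, :, :-1].shape)                        # torch.Size([1, 4, 50]) -> what HubertSamePadLayer restores
```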
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertFeatureEncoder(nn.Module):
"""从原始音频波形构建特征"""
# 构造函数留空,直接继承 nn.Module 的构造函数
# 初始化方法,接受一个配置对象作为参数
def __init__(self, config):
# 调用父类(superclass)的初始化方法
super().__init__()
# 根据配置文件中的特征提取归一化方式进行不同处理
if config.feat_extract_norm == "group":
# 如果是"group"方式,创建一组卷积层,第一个使用组归一化
conv_layers = [HubertGroupNormConvLayer(config, layer_id=0)] + [
HubertNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是"layer"方式,创建一组卷积层,全部使用层归一化
conv_layers = [HubertLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 如果归一化方式不是合法值,抛出数值错误异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将创建的卷积层列表转换为 nn.ModuleList,使其成为 nn.Module 的一部分
self.conv_layers = nn.ModuleList(conv_layers)
# 梯度检查点技术默认关闭
self.gradient_checkpointing = False
# 默认所有参数需要梯度计算
self._requires_grad = True
# 冻结模型参数,使其不再计算梯度
def _freeze_parameters(self):
# 遍历所有参数,设置其 requires_grad 属性为 False
for param in self.parameters():
param.requires_grad = False
# 同时设置模型的 _requires_grad 属性为 False
self._requires_grad = False
# 前向传播方法
def forward(self, input_values):
# 将输入数据转换为二维张量
hidden_states = input_values[:, None]
# 如果需要梯度并且当前处于训练模式,确保 hidden_states 的 requires_grad 属性为 True
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层并逐层进行前向传播计算
for conv_layer in self.conv_layers:
# 如果需要梯度、开启了梯度检查点并且处于训练模式,则使用梯度检查点技术
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__, # 调用当前卷积层的前向传播方法
hidden_states, # 当前隐藏状态作为输入
)
else:
# 否则,直接调用当前卷积层的前向传播方法
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态作为输出
return hidden_states
class HubertFeatureExtractor(HubertFeatureEncoder):
# 继承自HubertFeatureEncoder类的特征提取器类
def __init__(self, config):
super().__init__(config)
# 警告:该类已被弃用,将在Transformers v5中移除,请使用`HubertFeatureEncoder`代替。
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class HubertFeatureProjection(nn.Module):
# Hubert特征投影模块
def __init__(self, config):
super().__init__()
self.feat_proj_layer_norm = config.feat_proj_layer_norm
if self.feat_proj_layer_norm:
# 如果配置中包含特征投影层标准化,则初始化LayerNorm
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 线性映射投影到隐藏层大小
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# 随机失活层
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
# 非投影的隐藏状态用于量化
if self.feat_proj_layer_norm:
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
# 从transformers.models.bart.modeling_bart.BartAttention复制到HubertAttention,将Bart->Hubert
class HubertAttention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力机制"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[HubertConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
# 线性映射层,用于查询、键、值和输出投影
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 重新塑造张量形状,以便进行多头注意力计算
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
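The `_shape` helper above is the key reshaping step of the multi-head attention; a small check with arbitrary sizes shows the layout it produces:

```python
import torch

bsz, seq_len, embed_dim, num_heads = 2, 7, 768, 12
head_dim = embed_dim // num_heads
x = torch.randn(bsz, seq_len, embed_dim)
# Same view/transpose as HubertAttention._shape: (bsz, seq, embed) -> (bsz, heads, seq, head_dim)
per_head = x.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(per_head.shape)  # torch.Size([2, 12, 7, 64])
```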
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward复制到HubertFeedForward,用Hubert替换Wav2Vec2
class HubertFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 定义中间层全连接层,将隐藏大小转换为中间大小
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择隐藏层激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 定义输出全连接层,将中间大小转换回隐藏大小
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
def forward(self, hidden_states):
# 中间全连接层和激活函数
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states)
# 输出全连接层和dropout
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer复制到HubertEncoderLayer,用Hubert替换Wav2Vec2
class HubertEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 定义注意力层,使用HubertAttention
self.attention = HubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义FeedForward层,使用HubertFeedForward
self.feed_forward = HubertFeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 记录注意力残差
attn_residual = hidden_states
# 执行注意力计算
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = self.dropout(hidden_states)
# 添加注意力残差到隐藏状态
hidden_states = attn_residual + hidden_states
# 层归一化
hidden_states = self.layer_norm(hidden_states)
# 使用FeedForward层处理隐藏状态
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AttnAdapterLayer复制到HubertAttnAdapterLayer,用Hubert替换Wav2Vec2
class HubertAttnAdapterLayer(nn.Module):
def __init__(self, config):
"""
Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
up training throughput.
"""
# 调用父类的初始化方法
super().__init__()
# 设置输入维度为配置文件中的适配器注意力维度
self.input_dim = config.adapter_attn_dim
# 设置隐藏维度为配置文件中的隐藏大小
self.hidden_dim = config.hidden_size
# 使用LayerNorm对隐藏状态进行归一化
self.norm = nn.LayerNorm(self.hidden_dim)
# 第一个线性层,将隐藏状态映射到适配器注意力维度
self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
# 激活函数ReLU
self.act_fn = nn.ReLU()
# 第二个线性层,将适配器注意力维度映射回隐藏维度
self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)
def forward(self, hidden_states: torch.FloatTensor):
# 对输入的隐藏状态进行LayerNorm归一化
hidden_states = self.norm(hidden_states)
# 第一个线性层的前向传播
hidden_states = self.linear_1(hidden_states)
# 应用ReLU激活函数
hidden_states = self.act_fn(hidden_states)
# 第二个线性层的前向传播
hidden_states = self.linear_2(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayerStableLayerNorm 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertEncoderLayerStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
# 定义自注意力层 HubertAttention,使用配置中的隐藏尺寸、注意力头数和注意力丢弃率,作为编码器而非解码器
self.attention = HubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
# 定义 Dropout 层,使用配置中的隐藏层丢弃率
self.dropout = nn.Dropout(config.hidden_dropout)
# 定义 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义前馈神经网络 HubertFeedForward
self.feed_forward = HubertFeedForward(config)
# 定义最终的 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 如果配置中有 adapter_attn_dim 属性,则定义 HubertAttnAdapterLayer,否则为 None
if getattr(config, "adapter_attn_dim", None) is not None:
self.adapter_layer = HubertAttnAdapterLayer(config)
else:
self.adapter_layer = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# 保存注意力残差
attn_residual = hidden_states
# 应用 LayerNorm 层
hidden_states = self.layer_norm(hidden_states)
# 应用自注意力层 HubertAttention,获取注意力权重(如果需要),输出新的隐藏状态
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# 应用 Dropout
hidden_states = self.dropout(hidden_states)
# 加上注意力残差,形成新的隐藏状态
hidden_states = attn_residual + hidden_states
# 应用前馈神经网络,并加上最终的 LayerNorm
hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
# 如果存在 adapter_layer,则应用它
if self.adapter_layer is not None:
hidden_states = hidden_states + self.adapter_layer(hidden_states)
# 输出包含最终隐藏状态的元组 outputs
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将注意力权重加入 outputs 元组
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# 定义位置卷积嵌入层 HubertPositionalConvEmbedding
self.pos_conv_embed = HubertPositionalConvEmbedding(config)
# 定义 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义 Dropout 层,使用配置中的隐藏层丢弃率
self.dropout = nn.Dropout(config.hidden_dropout)
# 定义多层 HubertEncoderLayer 层,并放入 nn.ModuleList 中
self.layers = nn.ModuleList([HubertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# 是否启用渐变检查点
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 初始化隐藏状态输出,根据需要创建空元组或者None
all_hidden_states = () if output_hidden_states else None
# 初始化自注意力输出,根据需要创建空元组或者None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
# 确保填充的标记输出为0
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展注意力掩码
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 将位置嵌入加到隐藏状态上
hidden_states = hidden_states + position_embeddings
# LayerNorm 归一化
hidden_states = self.layer_norm(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# 检查是否启用了deepspeed zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 遍历所有层进行处理
for layer in self.layers:
if output_hidden_states:
# 如果需要输出隐藏状态,则将当前层的隐藏状态添加到all_hidden_states中
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加LayerDrop(参见https://arxiv.org/abs/1909.11556 进行描述)
dropout_probability = torch.rand([])
# 根据LayerDrop的概率决定是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果不跳过当前层或者启用了deepspeed zero3,则进行前向传播
if self.gradient_checkpointing and self.training:
# 使用梯度检查点进行前向传播(checkpointing)
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 普通的前向传播
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
# 如果跳过当前层,则输出设置为None
layer_outputs = (None, None)
if output_attentions:
# 如果需要输出注意力权重,则将当前层的注意力权重添加到all_self_attentions中
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
# 如果需要输出隐藏状态,则将最终的隐藏状态添加到all_hidden_states中
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
# 如果不需要返回字典形式的输出,则返回一个元组,过滤掉为None的部分
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 如果需要返回字典形式的输出,则创建BaseModelOutput对象并返回
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
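The LayerDrop rule used in the loop above is easy to see in isolation: during training, each layer is skipped independently with probability `config.layerdrop` (0.1 below is only an illustrative value, not read from a real config):

```python
import torch

layerdrop, num_layers = 0.1, 12            # illustrative values
skip = [bool(torch.rand([]) < layerdrop) for _ in range(num_layers)]
print(skip)                                # on average ~1 of 12 layers is skipped per forward pass
```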
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderStableLayerNorm 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertEncoderStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# 初始化位置编码卷积嵌入层,使用 HubertPositionalConvEmbedding 类
self.pos_conv_embed = HubertPositionalConvEmbedding(config)
# 初始化层归一化层,归一化隐藏状态特征向量
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 初始化丢弃层,以减少隐藏状态特征向量中的部分信息,防止过拟合
self.dropout = nn.Dropout(config.hidden_dropout)
# 初始化层列表,包含 HubertEncoderLayerStableLayerNorm 类的隐藏层,数量由配置中的 num_hidden_layers 决定
self.layers = nn.ModuleList(
[HubertEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
)
# 梯度检查点设置为关闭状态
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
# 初始化所有隐藏状态为一个空元组,如果不输出隐藏状态则为 None
all_hidden_states = () if output_hidden_states else None
# 初始化所有自注意力权重为一个空元组,如果不输出注意力权重则为 None
all_self_attentions = () if output_attentions else None
# 如果存在注意力遮罩
if attention_mask is not None:
# 确保填充的标记不被注意到
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展注意力遮罩
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.dropout(hidden_states)
# 检查是否启用了 DeepSpeed Zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 对每个层进行循环
for layer in self.layers:
# 如果输出隐藏状态,则记录当前隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556)
dropout_probability = torch.rand([])
# 根据 LayerDrop 的概率决定是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果启用了梯度检查点和处于训练模式,则使用梯度检查点来调用当前层
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 否则直接调用当前层
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
# 如果跳过当前层,则层输出为空
if skip_the_layer:
layer_outputs = (None, None)
# 如果输出注意力权重,则记录当前层的自注意力权重
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# 对最终的隐藏状态进行 Layer Norm 处理
hidden_states = self.layer_norm(hidden_states)
# 如果输出隐藏状态,则记录最终的隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典形式的结果,则按顺序返回相关结果
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 否则返回 Base Model Output 对象
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 指定配置类为 HubertConfig
config_class = HubertConfig
# 模型的前缀名为 "hubert"
base_model_prefix = "hubert"
# 主要输入的名称为 "input_values"
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# 如果是线性层,则使用正态分布初始化权重
if isinstance(module, nn.Linear):
# 与 TensorFlow 版本略有不同,后者使用截断正态分布进行初始化
# 参考 https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果是 LayerNorm 或 GroupNorm,则初始化偏置为零,权重为1
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
# 如果是 1D 卷积层
elif isinstance(module, nn.Conv1d):
# 检查是否启用了 DeepSpeed 的 Zero3 模式
if is_deepspeed_zero3_enabled():
import deepspeed
# 如果模块有 weight_v 和 weight_g 属性
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
# 使用 GatheredParameters 进行初始化
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 GatheredParameters 进行初始化
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 kaiming_normal_ 方法初始化权重
nn.init.kaiming_normal_(module.weight.data)
# 如果是线性层或 1D 卷积层,并且有偏置,则将偏置初始化为零
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride):
# 计算 1D 卷积层的输出长度,使用公式来源于 PyTorch 文档
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步长
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
# 更新输入长度为卷积层输出长度
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
return input_lengths
# 定义一个方法,用于生成特征向量的注意力掩码
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# 根据注意力掩码的长度信息计算输出长度,并转换为长整型
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 获取当前批次的大小
batch_size = attention_mask.shape[0]
# 初始化一个全零的注意力掩码张量,形状为(batch_size, feature_vector_length)
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 设置注意力掩码的部分值为1,确保在输出长度之前的所有位置都被关注
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 翻转张量,并对每行进行累积求和,然后再次翻转,并将结果转换为布尔类型
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回生成的注意力掩码张量
return attention_mask
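Applying the output-length formula above layer by layer shows the overall downsampling of the feature encoder. The kernel/stride values below are the usual HubertConfig defaults (assumed here, not read from an actual config):

```python
def conv_out_length(length, kernel, stride):
    return (length - kernel) // stride + 1            # same formula as _conv_out_length

length = 16000                                        # one second of 16 kHz audio
for kernel, stride in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
    length = conv_out_length(length, kernel, stride)
print(length)                                         # 49 frames, i.e. roughly one frame per 20 ms
```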
HUBERT_START_DOCSTRING = r"""
Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden
Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
Ruslan Salakhutdinov, Abdelrahman Mohamed.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HUBERT_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of the raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `List[float]` or a `numpy.ndarray`, e.g. via the `soundfile` library (`pip install soundfile`).
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion into
a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
<Tip warning={true}>
`attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == True`.
For all models whose processor has `config.return_attention_mask == False`, such as
[hubert-base](https://huggingface.co/facebook/hubert-base-ls960), `attention_mask` should **not** be passed to
avoid degraded performance when doing batched inference. For such models `input_values` should simply be padded
with 0 and passed without `attention_mask`. Be aware that these models also yield slightly different results
depending on whether `input_values` is padded or not.
</Tip>
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Hubert Model transformer outputting raw hidden-states without any specific head on top.",
HUBERT_START_DOCSTRING,
)
class HubertModel(HubertPreTrainedModel):
def __init__(self, config: HubertConfig):
super().__init__(config)
self.config = config # 初始化模型配置
self.feature_extractor = HubertFeatureEncoder(config) # 使用给定配置创建特征提取器
self.feature_projection = HubertFeatureProjection(config) # 使用给定配置创建特征投影器
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 如果配置中有时间或特征掩码概率大于零,则初始化一个可学习的掩码嵌入向量
if config.do_stable_layer_norm:
self.encoder = HubertEncoderStableLayerNorm(config)
# 如果配置要求稳定的层归一化,则使用稳定层归一化版本的编码器
else:
self.encoder = HubertEncoder(config)
# 否则使用普通版本的编码器
# 初始化权重并应用最终处理
self.post_init()
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states复制而来
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
# 检查配置中是否允许应用 SpecAugment,如果不允许,则直接返回隐藏状态
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
# 根据给定的 mask_time_indices 在时间轴上应用 SpecAugment
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# calculate mask_time_indices if not provided explicitly
# 如果未明确提供 mask_time_indices,则计算它
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
# 生成索引并沿特征轴应用 SpecAugment
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
"""
Returns a tuple containing model outputs or a BaseModelOutput.
Example:
```
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values
>>> hidden_states = model(input_values).last_hidden_state
```"""
# Initialize variables with default values if not provided
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Extract features from input_values using feature_extractor
extract_features = self.feature_extractor(input_values)
extract_features = extract_features.transpose(1, 2) # Transpose dimensions for further processing
# Compute attention mask specific to feature vectors if provided
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
# Project features into hidden states
hidden_states = self.feature_projection(extract_features)
# Mask certain time indices in hidden states if specified
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
# Encode hidden states using the encoder
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract last hidden states from encoder outputs
hidden_states = encoder_outputs[0]
# Return model outputs based on return_dict flag
if not return_dict:
return (hidden_states,) + encoder_outputs[1:] # Return tuple of hidden states and additional outputs
# Return BaseModelOutput object with specified attributes
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
HUBERT_START_DOCSTRING,
)
# 定义了一个名为 HubertForCTC 的类,继承自 HubertPreTrainedModel
# 此类实现了带有语言建模头部的 Hubert 模型,用于连接主义时间分类(CTC)任务。
class HubertForCTC(HubertPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# 初始化 Hubert 模型
self.hubert = HubertModel(config)
# Dropout 层
self.dropout = nn.Dropout(config.final_dropout)
# 可选的目标语言设定
self.target_lang = target_lang
# 检查配置中是否定义了词汇表大小,如果没有则抛出异常
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# 根据配置定义线性层作为语言建模头部
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# 初始化权重并应用最终处理
self.post_init()
# 覆盖 PreTrainedModel 的 tie_weights 方法,以便在通过 from_pretrained(...) 传递 target_lang=... 时能正确加载适配器权重
def tie_weights(self):
"""
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.
This method is **not** supposed to be called by the user and is prone to be changed in the future.
"""
# 注意,tie_weights 通常用于绑定输入和输出嵌入权重。在这里重新定义它的目的是为了正确加载 Hubert 的适配器层,
# 以便不需要引入新的 API 到 PreTrainedModel。虽然有些技巧性,但 Hubert 永远不需要绑定输入和输出嵌入,因此在这里重新用于适配器加载是可以接受的。
# 获取目标语言
target_lang = self.target_lang
# 如果 target_lang 不为 None,且配置中未定义 adapter_attn_dim,则抛出异常
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
# 如果 target_lang 为 None,且配置中定义了 adapter_attn_dim,则记录日志提示用户默认 target_lang 为 'eng'
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
# 如果 target_lang 不为 None,则加载适配器
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
# 调用此函数将冻结特征编码器的梯度计算,使其在训练过程中不会更新参数。
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告提示,说明函数 `freeze_feature_extractor` 将在 Transformers v5 中移除,并建议使用 `freeze_feature_encoder` 替代。
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 `freeze_feature_encoder` 方法来实现特征编码器参数的冻结。
self.freeze_feature_encoder()
# 调用此函数将冻结特征编码器的梯度计算,使其在训练过程中不会更新参数。
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 调用 Hubert 模型中的特征提取器的 `_freeze_parameters` 方法来冻结参数。
self.hubert.feature_extractor._freeze_parameters()
# 调用此函数将冻结基础模型的梯度计算,使其在训练过程中不会更新参数,仅分类头会更新。
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历 Hubert 模型的所有参数,并将其 `requires_grad` 设置为 False,以冻结基础模型的参数。
for param in self.hubert.parameters():
param.requires_grad = False
# 重写 `forward` 方法,将其注解添加到模型的前向传播文档中,并附上代码示例的文档字符串。
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# 初始化返回字典,如果未指定则使用配置中的返回字典设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 Hubert 模型,获取输出结果
outputs = self.hubert(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从输出中获取隐藏状态,并应用 dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# 将隐藏状态传入语言模型头部,生成预测 logits
logits = self.lm_head(hidden_states)
# 初始化损失为 None
loss = None
if labels is not None:
# 如果标签存在,检查标签值是否超出词汇表大小,如果是则引发 ValueError
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# 根据注意力掩码计算输入长度
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 假设填充的标记用 -100 填充,不被注意到时
# 创建标签掩码以计算目标长度
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# 使用 log_softmax 计算对数概率
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# 禁用 cuDNN 以确保兼容性
with torch.backends.cudnn.flags(enabled=False):
# 计算 CTC 损失
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# 如果不要求返回字典,则根据输出格式构建返回结果
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典,则创建 CausalLMOutput 对象并返回
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
# 使用 Hubert 模型进行序列分类,该模型在顶部有一个用于分类的线性层(基于池化输出)
@add_start_docstrings(
"""
Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
""",
HUBERT_START_DOCSTRING,
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification 复制而来,将 Wav2Vec2 改为 Hubert,wav2vec2 改为 hubert,WAV_2_VEC_2 改为 HUBERT
class HubertForSequenceClassification(HubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 如果配置中存在 `add_adapter` 属性且为 True,则抛出异常,因为序列分类不支持使用 Hubert 适配器
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)"
)
# 创建 HubertModel 对象
self.hubert = HubertModel(config)
# 计算层数,包括变换器层和输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置指定使用加权层求和,则初始化层权重
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 创建投影层,将隐藏状态映射到分类器投影空间
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 创建分类器层,将投影后的特征映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征提取器,不再更新其参数
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
# 冻结特征编码器,不再更新其参数
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.hubert.feature_extractor._freeze_parameters()
# 冻结基础模型,不再更新其参数,只更新分类头
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.hubert.parameters():
param.requires_grad = False
# 将 HUBERT_INPUTS_DOCSTRING 添加到模型前向传播函数的文档字符串中
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
# 将代码示例的文档字符串添加到模型前向传播函数的文档字符串中,指定了检查点、输出类型、配置类、模态(audio)、预期输出和预期损失
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 设置是否返回字典形式的输出结果,默认为模型配置中指定的值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果配置中指定使用加权层求和的隐藏状态,则设置为True,否则使用传入参数
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 使用 Hubert 模型进行前向传播,获取输出结果
outputs = self.hubert(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置中指定使用加权层求和的隐藏状态
if self.config.use_weighted_layer_sum:
# 从输出结果中提取隐藏状态
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
# 将隐藏状态堆叠起来,按照加权向量进行加权求和
hidden_states = torch.stack(hidden_states, dim=1)
# 对加权向量进行 softmax 归一化处理
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
# 按加权向量加权求和隐藏状态
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
# 否则直接使用第一个输出的隐藏状态
hidden_states = outputs[0]
# 使用投影层进行映射
hidden_states = self.projector(hidden_states)
# 如果没有传入注意力掩码,则计算平均池化输出
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
# 否则根据注意力掩码生成填充掩码,将填充位置的隐藏状态置为0
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
# 计算填充掩码后的池化输出
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
# 使用分类器计算 logits
logits = self.classifier(pooled_output)
# 初始化损失为 None
loss = None
# 如果传入了标签,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
# 如果不要求返回字典形式的输出
if not return_dict:
# 组装输出元组,包括 logits 和隐藏状态
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
# 如果有损失,则将损失加入输出元组
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典形式的输出结果
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
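To close, a toy version of the weighted-layer-sum pooling used by the classification head above (shapes are arbitrary; in the real model `layer_weights` is a learnable `nn.Parameter` and the hidden states come from `output_hidden_states=True`):

```python
import torch
import torch.nn.functional as F

num_layers, batch, frames, hidden = 13, 2, 49, 768    # 12 transformer layers + the input embeddings
hidden_states = torch.stack([torch.randn(batch, frames, hidden) for _ in range(num_layers)], dim=1)
layer_weights = torch.ones(num_layers) / num_layers   # uniform init, learned during fine-tuning

norm_weights = F.softmax(layer_weights, dim=-1)
pooled = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, frames, hidden)
print(pooled.shape)                                                  # torch.Size([2, 49, 768])
```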