Transformers Source Code Walkthrough (103)
.\models\sew\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_sew"] = [
"SEW_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWForCTC",
"SEWForSequenceClassification",
"SEWModel",
"SEWPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_sew import (
SEW_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWForCTC,
SEWForSequenceClassification,
SEWModel,
SEWPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
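# Quick illustration (not part of the original file) of what the `_LazyModule` wiring above buys us:
# importing the package is cheap, and a heavy submodule such as `modeling_sew` is only imported the
# first time one of its attributes is accessed. Assumes `transformers` and `torch` are installed.
import importlib
sew = importlib.import_module("transformers.models.sew")
config_cls = sew.SEWConfig  # the first attribute access triggers the real import of `configuration_sew`
print(config_cls.__module__)  # "transformers.models.sew.configuration_sew"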
.\models\sew_d\configuration_sew_d.py
""" SEW-D model configuration"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"asapp/sew-d-tiny-100k": "https://huggingface.co/asapp/sew-d-tiny-100k/resolve/main/config.json",
}
class SEWDConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SEWDModel`]. It is used to instantiate a SEW-D
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the SEW-D
[asapp/sew-d-tiny-100k](https://huggingface.co/asapp/sew-d-tiny-100k) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import SEWDConfig, SEWDModel
>>> # Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
>>> configuration = SEWDConfig()
>>> # Initializing a model (with random weights) from the asapp/sew-d-tiny-100k style configuration
>>> model = SEWDModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "sew-d"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
squeeze_factor=2,
max_position_embeddings=512,
position_buckets=256,
share_att_key=True,
relative_attention=True,
pos_att_type=("p2c", "c2p"),
norm_rel_ebd="layer_norm",
hidden_act="gelu_python",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
initializer_range=0.02,
layer_norm_eps=1e-7,
feature_layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
@property
def hidden_dropout(self):
logger.warning_once("hidden_dropout is not used by the model and will be removed as config attribute in v4.35")
return self._hidden_dropout
def to_dict(self):
"""
Serializes this instance to a Python dictionary.
"""
output = super().to_dict()
output["hidden_dropout"] = output.pop("_hidden_dropout")
return output
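# Small sanity check (illustrative, not part of the library code): with the default `conv_stride`
# (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1) the feature extractor downsamples the waveform by
# 5 * 2**6 = 320, which is exactly what the `inputs_to_logits_ratio` property computes.
from transformers import SEWDConfig

config = SEWDConfig()
print(config.inputs_to_logits_ratio)  # 320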
.\models\sew_d\convert_sew_d_original_pytorch_checkpoint_to_pytorch.py
"""Convert SEW checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks
from transformers import (
SEWDConfig,
SEWDForCTC,
SEWDModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"attention.self.query_proj": "encoder.encoder.layer.*.attention.self.query_proj",
"attention.self.key_proj": "encoder.encoder.layer.*.attention.self.key_proj",
"attention.self.value_proj": "encoder.encoder.layer.*.attention.self.value_proj",
"attention.output.dense": "encoder.encoder.layer.*.attention.output.dense",
"attention.output.LayerNorm": "encoder.encoder.layer.*.attention.output.LayerNorm",
"intermediate.dense": "encoder.encoder.layer.*.intermediate.dense",
"output.dense": "encoder.encoder.layer.*.output.dense",
"output.LayerNorm": "encoder.encoder.layer.*.output.LayerNorm",
"encoder.encoder.rel_embeddings": "encoder.encoder.rel_embeddings",
"encoder.encoder.LayerNorm": "encoder.encoder.LayerNorm",
"encoder.upsample.0": "encoder.upsample.projection",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.sew_d.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "sew_d." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
if not layer_index.isnumeric():
continue
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model, is_finetuned):
config = SEWDConfig()
if is_finetuned:
fs_config = model.w2v_encoder.w2v_model.cfg
else:
fs_config = model.cfg
config.conv_bias = fs_config.conv_bias
conv_layers = eval(fs_config.conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn.name
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = fs_config.encoder_layerdrop
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
config.squeeze_factor = fs_config.squeeze_factor
config.max_position_embeddings = fs_config.max_position_embeddings
config.position_buckets = fs_config.position_buckets
config.share_att_key = fs_config.share_att_key
config.relative_attention = fs_config.relative_attention
config.position_biased_input = fs_config.position_biased_input
config.pos_att_type = tuple(fs_config.pos_att_type.split("|"))
config.norm_rel_ebd = fs_config.norm_rel_ebd
if is_finetuned:
fs_config = model.cfg
config.final_dropout = fs_config.final_dropout
config.layerdrop = fs_config.layerdrop
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
config.attention_dropout = fs_config.attention_dropout
config.feat_proj_dropout = fs_config.dropout_input
config.hidden_dropout = fs_config.dropout
config.mask_feature_length = fs_config.mask_channel_length
config.mask_feature_prob = fs_config.mask_channel_prob
config.mask_time_length = fs_config.mask_length
config.mask_time_prob = fs_config.mask_prob
config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
config.tokenizer_class = "Wav2Vec2CTCTokenizer"
return config
@torch.no_grad()
def convert_sew_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if is_finetuned:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
else:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
if config_path is not None:
config = SEWDConfig.from_pretrained(config_path)
else:
config = convert_config(model[0], is_finetuned)
model = model[0].eval()
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
target_dict.indices[target_dict.bos_word] = target_dict.pad_index
target_dict.indices[target_dict.pad_word] = target_dict.bos_index
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_model = SEWDForCTC(config)
else:
hf_model = SEWDModel(config)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
recursively_load_weights(model, hf_model, is_finetuned)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
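# Example invocation (paths are placeholders; the flags mirror the argparse arguments defined above):
# python convert_sew_d_original_pytorch_checkpoint_to_pytorch.py \
#     --checkpoint_path /path/to/fairseq_checkpoint.pt \
#     --dict_path /path/to/dict.ltr.txt \
#     --pytorch_dump_folder_path ./sew-d-converted \
#     --is_finetuned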
.\models\sew_d\modeling_sew_d.py
""" PyTorch SEW model."""
import math
import warnings
from collections.abc import Sequence
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, LayerNorm
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_sew_d import SEWDConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SEWDConfig"
_CHECKPOINT_FOR_DOC = "asapp/sew-d-tiny-100k-ft-ls100h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 384]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTIL OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 0.21
_SEQ_CLASS_CHECKPOINT = "anton-l/sew-d-mid-400k-ft-keyword-spotting"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 3.16
SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST = [
"asapp/sew-d-tiny-100k",
"asapp/sew-d-small-100k",
"asapp/sew-d-mid-100k",
"asapp/sew-d-mid-k127-100k",
"asapp/sew-d-base-100k",
"asapp/sew-d-base-plus-100k",
"asapp/sew-d-mid-400k",
"asapp/sew-d-mid-k127-400k",
"asapp/sew-d-base-plus-400k",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement "SpecAugment: A Simple Data
Augmentation Method for ASR". https://arxiv.org/abs/1904.08779
Note that this method is not optimized to run on TPU and should be run on CPU as part of the
preprocessing during training.
"""
batch_size, sequence_length = shape
# raise an error if the mask length is smaller than 1
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# raise an error if the mask length is larger than the sequence length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given an input length, compute how many spans should be masked"""
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
num_masked_span = max(num_masked_span, min_masks)
# make sure the number of masked spans is <= sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# make sure the number of masked spans is <= input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# compute the number of masked spans for every example in the batch
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# maximum number of masked spans, computed for the full sequence length
max_num_masked_span = compute_num_masked_span(sequence_length)
# if no span can be masked, return the all-False mask directly
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths:
# number of masked spans for this example
num_masked_span = compute_num_masked_span(input_length)
# randomly pick the start indices to mask
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# pick the first sampled index as a dummy index used to pad the vector so that every example in the
# batch has the same number of indices; due to probabilistic rounding the dummy simply gets picked twice
if len(spec_aug_mask_idx) == 0:
# this can only happen if `input_length` is strictly smaller than `sequence_length`, in which case
# the last token has to be a padding token that can be used as the dummy mask id
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# pad the randomly drawn indices with the dummy index
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# convert the list of index vectors into a numpy array
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# expand the start indices into full mask spans
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offsets to the start indices so that each span covers `mask_length` consecutive positions
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# make sure the indices do not exceed the sequence length
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# scatter ones at the selected indices to create the final mask
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# return the final mask
return spec_aug_mask
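# Illustrative call (not part of the original file): mask a batch of 2 sequences of 100 time steps with
# spans of length 10 and at least 2 spans per example. The result is a boolean numpy array of shape
# (batch_size, sequence_length).
example_mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=2)
print(example_mask.shape, example_mask.dtype)  # (2, 100) bool
print(example_mask.sum(axis=-1))  # number of masked time steps per example (at most 20 here)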
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.make_log_bucket_position
def make_log_bucket_position(relative_pos, bucket_size, max_position):
# sign of the relative position
sign = torch.sign(relative_pos)
# middle of the bucket range
mid = bucket_size // 2
# absolute position; positions inside the middle range are clamped to mid - 1
abs_pos = torch.where(
(relative_pos < mid) & (relative_pos > -mid),
torch.tensor(mid - 1).type_as(relative_pos),
torch.abs(relative_pos),
)
# logarithmic position used for positions outside the middle range
log_pos = (
torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid
)
# keep the exact relative position inside the middle range, otherwise use the signed log position
bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign)
return bucket_pos
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.build_relative_position
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None):
"""
Build relative position according to the query and key.
Assume the absolute position of query \\(P_q\\) is in range (0, query_size) and the absolute position of key
\\(P_k\\) is in range (0, key_size). The relative position from query to key is
\\(R_{q \\rightarrow k} = P_q - P_k\\).
Args:
query_size (int): the length of the query
key_size (int): the length of the key
bucket_size (int): the size of the position buckets
max_position (int): the maximum allowed absolute position
device (`torch.device`): the device on which the tensors will be created
Return:
`torch.LongTensor`: a tensor of shape [1, query_size, key_size]
"""
# build the query and key position id sequences
q_ids = torch.arange(0, query_size, device=device)
k_ids = torch.arange(0, key_size, device=device)
# relative position ids: query position minus key position
rel_pos_ids = q_ids[:, None] - k_ids[None, :]
# apply log bucketing if a bucket size and a maximum position are given
if bucket_size > 0 and max_position > 0:
rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
rel_pos_ids = rel_pos_ids.to(torch.long)
# truncate to the query length and add a batch dimension
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = rel_pos_ids.unsqueeze(0)
return rel_pos_ids
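# Quick sanity check (illustrative only): without bucketing, entry (q, k) of the returned tensor is
# simply the difference of the absolute positions, q - k.
rel = build_relative_position(query_size=4, key_size=4)
print(rel.shape)  # torch.Size([1, 4, 4])
print(rel[0, 0])  # tensor([ 0, -1, -2, -3])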
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
# Copied from transformers.models.deberta.modeling_deberta.get_mask
def get_mask(input, local_context):
if not isinstance(local_context, DropoutContext):
# if local_context is not a DropoutContext, it is simply the dropout probability and no mask is reused
dropout = local_context
mask = None
# otherwise, read the settings from the DropoutContext
else:
# dropout probability stored in the context
dropout = local_context.dropout
# scale it by the context's scale factor
dropout *= local_context.scale
# reuse the stored mask if the context asks for it, otherwise start from scratch
mask = local_context.mask if local_context.reuse_mask else None
# sample a fresh Bernoulli mask when dropout is active and no mask is available yet
if dropout > 0 and mask is None:
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)
# store the mask back into the DropoutContext so that it can be reused later
if isinstance(local_context, DropoutContext):
if local_context.mask is None:
local_context.mask = mask
# return the computed mask and the dropout probability
return mask, dropout
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEWD
class SEWDNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将卷积后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEWD
class SEWDLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 创建一个LayerNorm层,对输出卷积维度进行归一化
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将hidden_states的维度进行转置,将倒数第二维与倒数第一维交换
hidden_states = hidden_states.transpose(-2, -1)
# 将转置后的hidden_states通过LayerNorm层self.layer_norm进行归一化
hidden_states = self.layer_norm(hidden_states)
# 再次将hidden_states的维度进行转置,将倒数第二维与倒数第一维交换回来
hidden_states = hidden_states.transpose(-2, -1)
# 将归一化后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEWD
class SEWDGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个GroupNorm层,对输出卷积维度进行分组归一化
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将卷积后的hidden_states通过GroupNorm层self.layer_norm进行归一化
hidden_states = self.layer_norm(hidden_states)
# 将归一化后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.sew.modeling_sew.SEWPositionalConvEmbedding with SEW->SEWD
class SEWDPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一个一维卷积层,用于位置编码的卷积
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
stride=config.squeeze_factor,
)
# 如果启用了deepspeed的zero3功能
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用zero3的gathered parameters将权重进行分布式处理
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 注册卷积层的权重变量给deepspeed.zero
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 对卷积层的权重进行权重归一化处理
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 创建一个用于卷积后padding的层
self.padding = SEWDSamePadLayer(config.num_conv_pos_embeddings)
# 激活函数选择,根据配置选择不同的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 进行一维卷积
hidden_states = self.conv(hidden_states)
# 进行padding处理
hidden_states = self.padding(hidden_states)
# 使用选择的激活函数进行激活
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->SEW
class SEWDSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置编码数目确定是否需要移除的padding数量
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
# 如果需要移除padding,则进行裁剪
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
# Copied from transformers.models.sew.modeling_sew.SEWUpsampling with SEW->SEWD
class SEWDUpsampling(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一个线性层,用于上采样投影
self.projection = nn.Linear(config.hidden_size, config.hidden_size * config.squeeze_factor)
# 根据配置选择不同的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 保存下采样倍数
self.squeeze_factor = config.squeeze_factor
def forward(self, hidden_states):
# 进行线性投影
hidden_states = self.projection(hidden_states)
# 使用选择的激活函数进行激活
hidden_states = self.activation(hidden_states)
# 如果下采样因子大于1
if self.squeeze_factor > 1:
# 将嵌入通道转换为序列长度
bsz, src_len, src_embed_dim = hidden_states.size()
tgt_len = src_len * self.squeeze_factor
tgt_embed_dim = src_embed_dim // self.squeeze_factor
hidden_states = hidden_states.reshape(bsz, src_len, self.squeeze_factor, tgt_embed_dim)
hidden_states = hidden_states.reshape(bsz, tgt_len, tgt_embed_dim)
return hidden_states
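# Illustrative shape trace (assuming squeeze_factor=2 and hidden_size=4): after the projection the
# channel dimension holds `squeeze_factor` groups of `hidden_size` features, and the two reshapes trade
# those groups for a sequence that is `squeeze_factor` times longer.
projected = torch.randn(1, 5, 8)  # (batch, src_len, hidden_size * squeeze_factor)
upsampled = projected.reshape(1, 5, 2, 4).reshape(1, 10, 4)
print(upsampled.shape)  # torch.Size([1, 10, 4]) -> (batch, src_len * squeeze_factor, hidden_size)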
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
class SEWDFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
super().__init__()
# 根据配置选择特征提取的归一化方式
if config.feat_extract_norm == "group":
# 如果是group归一化,则创建一系列卷积层
conv_layers = [SEWDGroupNormConvLayer(config, layer_id=0)] + [
SEWDNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是layer归一化,则创建一系列卷积层
conv_layers = [SEWDLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 若配置不匹配则抛出异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将卷积层列表转换为ModuleList
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
# 冻结所有参数,使其不需要梯度更新
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
# 将输入值添加一个维度,用于处理
hidden_states = input_values[:, None]
# 如果需要梯度并且正在训练,确保hidden_states需要梯度
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层进行前向传播
for conv_layer in self.conv_layers:
if self._requires_grad and self.gradient_checkpointing and self.training:
# 如果开启了梯度检查点功能,使用梯度检查点函数进行前向传播
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
# 否则直接通过卷积层进行前向传播
hidden_states = conv_layer(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
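# Hedged shape check (illustrative, assuming the default SEWDConfig from the library): one second of
# 16 kHz audio per example leaves the convolutional stack as roughly 49 frames with conv_dim[-1] = 512
# channels.
feature_encoder = SEWDFeatureEncoder(SEWDConfig())
features = feature_encoder(torch.randn(2, 16000))
print(features.shape)  # torch.Size([2, 512, 49])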
class SEWDFeatureExtractor(SEWDFeatureEncoder):
def __init__(self, config):
super().__init__(config)
# warn that this class is deprecated and will be removed in a future version; use the base class instead
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# 从transformers.models.deberta.modeling_deberta.ContextPooler复制而来
class ContextPooler(nn.Module):
def __init__(self, config):
super().__init__()
# 创建线性层和稳定的dropout层
self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
self.dropout = StableDropout(config.pooler_dropout)
self.config = config
def forward(self, hidden_states):
# 通过简单地获取第一个token的隐藏状态来“池化”模型
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token)
pooled_output = self.dense(context_token)
pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
# 返回池化后的输出
return pooled_output
@property
def output_dim(self):
# 返回输出维度大小,与隐藏大小相同
return self.config.hidden_size
# 从transformers.models.deberta.modeling_deberta.XSoftmax复制而来
class XSoftmax(torch.autograd.Function):
"""
Masked Softmax which is optimized for saving memory
Args:
input (`torch.tensor`): The input tensor that will apply softmax.
mask (`torch.IntTensor`):
The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
Example:
```
>>> import torch
>>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
>>>
>>> x = torch.randn([4, 20, 100])
>>>
>>> mask = (x > 0).int()
>>>
>>> dim = -1
>>> y = XSoftmax.apply(x, mask, dim)
```"""
@staticmethod
def forward(self, input, mask, dim):
# remember the softmax dimension on the autograd context
self.dim = dim
# build the inverse mask: True marks positions that should be ignored
rmask = ~(mask.to(torch.bool))
# fill the masked positions with the most negative representable value
output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
# apply softmax along the requested dimension
output = torch.softmax(output, self.dim)
# zero out the masked positions again
output.masked_fill_(rmask, 0)
# save the output for the backward pass
self.save_for_backward(output)
return output
@staticmethod
def backward(self, grad_output):
# retrieve the saved softmax output
(output,) = self.saved_tensors
# compute the input gradient with the dedicated softmax backward helper
inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
return inputGrad, None, None
@staticmethod
def symbolic(g, self, mask, dim):
import torch.onnx.symbolic_helper as sym_help
from torch.onnx.symbolic_opset9 import masked_fill, softmax
# cast the mask to int64
mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
# build the inverse mask with ONNX operators
r_mask = g.op(
"Cast",
g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
to_i=sym_help.cast_pytorch_to_onnx["Bool"],
)
# fill the masked positions of the input with the minimum value, using ONNX operators
output = masked_fill(
g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
)
# apply softmax along the requested dimension, then zero out the masked positions again
output = softmax(g, output, dim)
return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
# Copied from transformers.models.deberta.modeling_deberta.DropoutContext
class DropoutContext(object):
def __init__(self):
self.dropout = 0
self.mask = None
self.scale = 1
self.reuse_mask = True
# Copied from transformers.models.deberta.modeling_deberta.XDropout
class XDropout(torch.autograd.Function):
"""Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@staticmethod
def forward(ctx, input, local_ctx):
# get the dropout mask and the dropout probability
mask, dropout = get_mask(input, local_ctx)
# compute the rescaling factor
ctx.scale = 1.0 / (1 - dropout)
if dropout > 0:
# save the mask for the backward pass
ctx.save_for_backward(mask)
# zero out the masked positions and rescale
return input.masked_fill(mask, 0) * ctx.scale
else:
return input
# static method computing the gradient in the backward pass
@staticmethod
def backward(ctx, grad_output):
# only rescale if dropout was actually applied (scale > 1)
if ctx.scale > 1:
# retrieve the mask saved in the forward pass
(mask,) = ctx.saved_tensors
# zero out the gradient at the masked positions, rescale and return
return grad_output.masked_fill(mask, 0) * ctx.scale, None
else:
# otherwise pass the gradient through unchanged
return grad_output, None
# static method used to emit the Dropout node in the symbolic ONNX graph
@staticmethod
def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
from torch.onnx import symbolic_opset12
# if local_ctx is a DropoutContext, read the dropout probability from it
dropout_p = local_ctx
if isinstance(local_ctx, DropoutContext):
dropout_p = local_ctx.dropout
# StableDropout only calls this function when training, so train is hard-coded during export
train = True
# TODO: we should check opset_version > 12 here, but there is currently no good way to do so.
# As things stand, export fails with a CheckerError if opset_version < 12.
# Once https://github.com/pytorch/pytorch/issues/78391 is resolved, this could become:
# if opset_version < 12:
#     return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
# emit the dropout node using opset 12
return symbolic_opset12.dropout(g, input, dropout_p, train)
# Copied from transformers.models.deberta.modeling_deberta.StableDropout
class StableDropout(nn.Module):
"""
Optimized dropout module for stabilizing the training
Args:
drop_prob (float): the dropout probabilities
"""
def __init__(self, drop_prob):
super().__init__()
# 初始化稳定的dropout模块
self.drop_prob = drop_prob # 设置dropout概率
self.count = 0 # 上下文堆栈计数
self.context_stack = None # 上下文堆栈初始化为空
def forward(self, x):
"""
Call the module
Args:
x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context()) # 如果处于训练状态且dropout概率大于0,则应用自定义的dropout操作
return x # 否则直接返回输入
def clear_context(self):
# 清空上下文堆栈
self.count = 0
self.context_stack = None
def init_context(self, reuse_mask=True, scale=1):
if self.context_stack is None:
self.context_stack = [] # 如果上下文堆栈为空,则初始化为空列表
self.count = 0 # 计数器归零
for c in self.context_stack:
c.reuse_mask = reuse_mask # 设置重用掩码标志
c.scale = scale # 设置比例
def get_context(self):
if self.context_stack is not None:
if self.count >= len(self.context_stack):
self.context_stack.append(DropoutContext()) # 如果计数超过堆栈长度,则添加新的dropout上下文
ctx = self.context_stack[self.count] # 获取当前计数对应的dropout上下文
ctx.dropout = self.drop_prob # 设置dropout概率
self.count += 1 # 计数器加一
return ctx # 返回dropout上下文
else:
return self.drop_prob # 如果上下文堆栈为空,则返回dropout概率本身
# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaV2->SEWD, DebertaLayerNorm->LayerNorm, hidden_dropout_prob->activation_dropout
class SEWDSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size) # 使用线性层变换隐藏状态
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) # 应用LayerNorm进行归一化
self.dropout = StableDropout(config.activation_dropout) # 使用稳定的dropout模块
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states) # 线性变换隐藏状态
hidden_states = self.dropout(hidden_states) # 应用稳定的dropout
hidden_states = self.LayerNorm(hidden_states + input_tensor) # 使用LayerNorm对变换后的隐藏状态进行归一化
return hidden_states # 返回处理后的隐藏状态
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.DisentangledSelfAttention with attention_probs_dropout_prob->attention_dropout, hidden_dropout_prob->activation_dropout
class DisentangledSelfAttention(nn.Module):
"""
Disentangled self-attention module
Parameters:
config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaV2Config`]
"""
# 定义类和其中的初始化方法,包含Transformer注意力机制相关参数和组件
def __init__(self, config):
# 调用基类初始化方法,默认调用具有模型特定特征的方法
super().__init__()
# 验证隐藏维度是否是 attention head 的倍数,否则会抛出错误
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
# 初始化注意力头数量
self.num_attention_heads = config.num_attention_heads
_attention_head_size = config.hidden_size // config.num_attention_heads # 默认计算每个头的大小
# 根据配置中self.attention_head_size的设置进行可能的调整
self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
# 计算头数乘以每个头的大小,用于计算总头大小
self.all_head_size = self.num_attention_heads * self.attention_head_size
# 创建线性投影层以将输入映射到所需的输出维度
self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
# 检查是否共享注意力键
self.share_att_key = getattr(config, "share_att_key", False)
# 设置注意力类型的参数列表
self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
# 检查是否使用相对注意力机制
self.relative_attention = getattr(config, "relative_attention", False)
# 使用相对注意力时,将 position_buckets 和 max_relative_positions 等参数的默认值设定
if self.relative_attention:
self.position_buckets = getattr(config, "position_buckets", -1)
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
# 设置 max_relative_positions 初始值为 max_position_embeddings,除非使用 position_buckets 或者其小于 1
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
# 计算实际的相对位置嵌入大小
self.pos_ebd_size = self.max_relative_positions
# 如果 position_buckets 参数已配置,调整 pos_ebd_size 大小
if self.position_buckets > 0:
self.pos_ebd_size = self.position_buckets
# 初始化位置 dropout 层
self.pos_dropout = StableDropout(config.activation_dropout)
# 如果不共享attention键,则创建额外的线性投影层用于处理位置相关的输入
if "c2p" in self.pos_att_type:
self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
if "p2c" in self.pos_att_type:
self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
# 初始化模型的下垂dropout层
self.dropout = StableDropout(config.attention_dropout)
# 随后定义了 batch 处理数据的内部步骤 x 转换函数
def transpose_for_scores(self, x, attention_heads):
# 获取数据和头数维度形状
new_x_shape = x.size()[:-1] + (attention_heads, -1)
# 重塑数据的形状以准备在循环过程中使用
x = x.view(new_x_shape)
# 转置以将数据按注意力头划分
return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
# 随后定义了前向传播方法,处理输入数据
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->SEWD
class SEWDAttention(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化自注意力层,使用SEWD版本的DisentangledSelfAttention
self.self = DisentangledSelfAttention(config)
# 初始化自注意力层输出层,使用SEWD版本的SEWDSelfOutput
self.output = SEWDSelfOutput(config)
self.config = config
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
):
# 执行自注意力计算,调用SEWD版本的DisentangledSelfAttention模型
self_output = self.self(
hidden_states,
attention_mask,
output_attentions,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
)
if output_attentions:
self_output, att_matrix = self_output
if query_states is None:
query_states = hidden_states
# 执行自注意力输出层计算,调用SEWD版本的SEWDSelfOutput模型
attention_output = self.output(self_output, query_states)
if output_attentions:
return (attention_output, att_matrix) # 返回注意力输出和注意力矩阵(如果有的话)
else:
return attention_output # 返回注意力输出结果
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->SEWD
class SEWDIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# 使用线性层将隐藏状态的大小转换为中间状态大小
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 使用线性层进行转换
hidden_states = self.dense(hidden_states)
# 应用中间激活函数
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states # 返回转换后的中间状态
# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm, hidden_dropout_prob->activation_dropout
class SEWDOutput(nn.Module):
def __init__(self, config):
super().__init__()
# 使用线性层将中间状态大小转换为隐藏状态大小
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# 使用SEWD版本的LayerNorm,初始化LayerNorm层
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
# 使用SEWD版本的StableDropout,初始化稳定Dropout层
self.dropout = StableDropout(config.activation_dropout)
self.config = config
def forward(self, hidden_states, input_tensor):
# 使用线性层进行转换
hidden_states = self.dense(hidden_states)
# 应用稳定Dropout
hidden_states = self.dropout(hidden_states)
# 应用LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states # 返回处理后的隐藏状态
# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->SEWD
class SEWDLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化SEWD版本的注意力层、中间层和输出层
self.attention = SEWDAttention(config)
self.intermediate = SEWDIntermediate(config)
self.output = SEWDOutput(config)
# 定义神经网络模型中的前向传播函数,用于计算每个层的输出
def forward(
self,
hidden_states, # 输入的隐藏状态,通常是模型中前一层的输出
attention_mask, # 注意力掩码,指定哪些位置需要进行注意力计算
query_states=None, # 查询状态,用于多头注意力机制中的查询
relative_pos=None, # 相对位置编码,用于自注意力机制中的位置编码
rel_embeddings=None, # 相对位置嵌入,用于计算相对位置偏移
output_attentions=False, # 是否输出注意力矩阵
):
# 调用注意力层计算注意力输出
attention_output = self.attention(
hidden_states, # 输入的隐藏状态
attention_mask, # 注意力掩码
output_attentions=output_attentions, # 是否输出注意力矩阵的标志
query_states=query_states, # 查询状态
relative_pos=relative_pos, # 相对位置编码
rel_embeddings=rel_embeddings, # 相对位置嵌入
)
# 如果需要输出注意力矩阵,则解包注意力输出
if output_attentions:
attention_output, att_matrix = attention_output
# 将注意力输出传入中间层进行处理
intermediate_output = self.intermediate(attention_output)
# 将中间层的输出传入输出层,生成最终层的输出
layer_output = self.output(intermediate_output, attention_output)
# 如果需要输出注意力矩阵,则返回输出层的输出和注意力矩阵
if output_attentions:
return (layer_output, att_matrix)
else:
# 否则,仅返回输出层的输出
return layer_output
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.ConvLayer
# 定义一个名为 ConvLayer 的类,继承自 nn.Module
class ConvLayer(nn.Module):
# 初始化方法,接受一个 config 对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 从 config 中获取卷积核大小,默认为 3
kernel_size = getattr(config, "conv_kernel_size", 3)
# 从 config 中获取卷积的分组数,默认为 1
groups = getattr(config, "conv_groups", 1)
# 从 config 中获取卷积激活函数,默认为 "tanh"
self.conv_act = getattr(config, "conv_act", "tanh")
# 创建一个 1 维卷积层,输入和输出通道数都为 config.hidden_size,卷积核大小为 kernel_size
# padding 设置为 (kernel_size - 1) // 2 保证卷积后维度不变
# groups 参数控制分组卷积
self.conv = nn.Conv1d(
config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
)
# 创建一个 LayerNorm 层,输入维度为 config.hidden_size,eps 参数为 config.layer_norm_eps
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
# 创建一个稳定 Dropout 层,dropout 概率为 config.hidden_dropout_prob
self.dropout = StableDropout(config.hidden_dropout_prob)
# 将 config 对象保存到当前对象的 config 属性中
self.config = config
# 前向传播方法,接受 hidden_states、residual_states 和 input_mask 作为输入
def forward(self, hidden_states, residual_states, input_mask):
# 对 hidden_states 进行维度变换,将第二维和第三维交换,然后做卷积操作
out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
# 创建一个逻辑张量 rmask,标识 input_mask 为 0 的位置
rmask = (1 - input_mask).bool()
# 将 out 张量中 rmask 为 True 的位置置为 0
out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
# 对 out 张量应用指定的激活函数 ACT2FN[self.conv_act],然后加上 dropout 处理
out = ACT2FN[self.conv_act](self.dropout(out))
# 计算 layer_norm_input,即 residual_states 和 out 的和
layer_norm_input = residual_states + out
# 对 layer_norm_input 应用 LayerNorm 层,然后赋值给 output
output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
# 如果 input_mask 为 None,则直接将 output 赋值给 output_states
if input_mask is None:
output_states = output
else:
# 如果 input_mask 的维度与 layer_norm_input 的维度不同,进行维度调整
if input_mask.dim() != layer_norm_input.dim():
if input_mask.dim() == 4:
input_mask = input_mask.squeeze(1).squeeze(1)
input_mask = input_mask.unsqueeze(2)
# 将 input_mask 转换为与 output 相同的数据类型,并与 output 相乘,得到 output_states
input_mask = input_mask.to(output.dtype)
output_states = output * input_mask
# 返回 output_states
return output_states
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.DebertaV2Encoder with DebertaV2->SEWD
# 定义一个名为 SEWDTransformerEncoder 的类,继承自 nn.Module
class SEWDTransformerEncoder(nn.Module):
"""Modified BertEncoder with relative position bias support"""
# 初始化方法,接受一个 config 对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 创建一个包含多个 SEWDLayer 的 ModuleList,层数为 config.num_hidden_layers
self.layer = nn.ModuleList([SEWDLayer(config) for _ in range(config.num_hidden_layers)])
# 从 config 中获取是否支持相对位置偏置的标志,默认为 False
self.relative_attention = getattr(config, "relative_attention", False)
# 如果支持相对位置偏置
if self.relative_attention:
# 从 config 中获取最大相对位置的范围,默认为 -1
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
# 如果最大相对位置小于 1,则设置为 config.max_position_embeddings
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
# 从 config 中获取位置桶的数量,默认为 -1
self.position_buckets = getattr(config, "position_buckets", -1)
# 计算位置嵌入的尺寸
pos_ebd_size = self.max_relative_positions * 2
# 如果指定了位置桶的数量,则重新计算位置嵌入的尺寸
if self.position_buckets > 0:
pos_ebd_size = self.position_buckets * 2
# 创建一个 nn.Embedding 层用于存储相对位置嵌入
self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
# 从 config 中获取并解析 norm_rel_ebd 字符串,设置是否使用 LayerNorm 进行相对位置嵌入的归一化
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
# 如果设置了 "layer_norm",则创建一个 LayerNorm 层,用于相对位置嵌入的归一化
if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
# 如果 config 中指定了卷积核大小大于 0,则创建一个 ConvLayer
self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None
# 默认关闭梯度检查点
self.gradient_checkpointing = False
# Return the relative position embeddings if relative attention is enabled, otherwise None
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
# optionally layer-normalize the relative position embeddings
if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
rel_embeddings = self.LayerNorm(rel_embeddings)
return rel_embeddings
# Expand the attention mask to 4D, depending on its number of dimensions
def get_attention_mask(self, attention_mask):
if attention_mask.dim() <= 2:
# expand a 2D padding mask to (batch, 1, 1, seq_len) ...
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# ... and combine it with its transpose to obtain a (batch, 1, seq_len, seq_len) mask
attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
elif attention_mask.dim() == 3:
# a 3D mask only needs an extra head dimension
attention_mask = attention_mask.unsqueeze(1)
return attention_mask
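# Illustrative shape check (not in the original source): a 2D padding mask of shape (batch, seq_len)
# becomes a (batch, 1, seq_len, seq_len) mask in which position (i, j) is attendable only when both
# token i and token j are real (non-padding) tokens.
mask_2d = torch.tensor([[1, 1, 1, 0]])
extended = mask_2d.unsqueeze(1).unsqueeze(2)  # (1, 1, 1, 4)
mask_4d = extended * extended.squeeze(-2).unsqueeze(-1)  # (1, 1, 4, 4)
print(mask_4d.shape)  # torch.Size([1, 1, 4, 4])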
# Build the relative positions if relative attention is enabled and none were provided
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
# derive the query length from `query_states` if given, otherwise from `hidden_states`
q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
relative_pos = build_relative_position(
q,
hidden_states.size(-2),
bucket_size=self.position_buckets,
max_position=self.max_relative_positions,
device=hidden_states.device,
)
return relative_pos
# 前向传播函数,接收输入的隐藏状态和注意力遮罩等参数,并返回模型的输出
def forward(
self,
hidden_states,
attention_mask,
output_hidden_states=True,
output_attentions=False,
query_states=None,
relative_pos=None,
return_dict=True,
):
# 如果注意力掩码的维度小于等于2,直接使用作为输入掩码
if attention_mask.dim() <= 2:
input_mask = attention_mask
else:
# 否则,将注意力掩码在倒数第二个维度上求和,并检查大于0的部分作为输入掩码
input_mask = attention_mask.sum(-2) > 0
# 获取注意力掩码,根据模型定义的方法
attention_mask = self.get_attention_mask(attention_mask)
# 获取相对位置编码,用于当前层的注意力计算
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
# 初始化用于存储所有隐藏状态和注意力权重的变量,根据输出设置决定是否需要存储
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
# 如果隐藏状态是一个序列,取第一个作为下一步的键值对
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[0]
else:
next_kv = hidden_states
# 获取相对位置编码矩阵
rel_embeddings = self.get_rel_embedding()
# 初始化输出状态为当前的键值对
output_states = next_kv
# 遍历每一层的神经网络模块
for i, layer_module in enumerate(self.layer):
# 如果需要输出隐藏状态,则将当前状态加入到所有隐藏状态中
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
# 如果开启了梯度检查点且正在训练阶段,使用梯度检查点函数计算当前层输出状态
if self.gradient_checkpointing and self.training:
output_states = self._gradient_checkpointing_func(
layer_module.__call__,
next_kv,
attention_mask,
query_states,
relative_pos,
rel_embeddings,
output_attentions,
)
else:
# 否则,正常调用当前层的前向传播函数
output_states = layer_module(
next_kv,
attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
)
# 如果需要输出注意力权重,从输出状态中提取注意力权重
if output_attentions:
output_states, att_m = output_states
# 如果是第一层且存在卷积模块,将当前隐藏状态与输入掩码传递给卷积模块
if i == 0 and self.conv is not None:
output_states = self.conv(hidden_states, output_states, input_mask)
# 如果有查询状态,更新为当前输出状态,并更新下一步的键值对
if query_states is not None:
query_states = output_states
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
else:
next_kv = output_states
# 如果需要输出注意力权重,将当前层计算得到的注意力权重加入到所有注意力中
if output_attentions:
all_attentions = all_attentions + (att_m,)
# 如果需要输出隐藏状态,将最后一层的输出状态加入到所有隐藏状态中
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
# 如果不需要以字典形式返回结果,则返回元组,过滤掉值为None的项
if not return_dict:
return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
# 否则,以BaseModelOutput形式返回结果,包括最后隐藏状态、所有隐藏状态和所有注意力权重
return BaseModelOutput(
last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
)
# 定义 SEWDEncoder 类,继承自 nn.Module,用于实现一个自定义的编码器模型
class SEWDEncoder(nn.Module):
# 初始化方法,接受一个 config 参数
def __init__(self, config):
super().__init__()
self.config = config
# 初始化位置卷积嵌入层对象
self.pos_conv_embed = SEWDPositionalConvEmbedding(config)
# 初始化一维平均池化层
self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
# 初始化 SEWDTransformerEncoder 编码器
self.encoder = SEWDTransformerEncoder(config)
# 初始化 SEWDUpsampling 上采样层
self.upsample = SEWDUpsampling(config)
# 梯度检查点设置为 False
self.gradient_checkpointing = False
# 前向传播方法,接受多个参数
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 计算最大编码器长度
max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor
# 如果没有给定注意力掩码,则创建一个全为 1 的张量作为默认注意力掩码
if attention_mask is None:
attention_mask = torch.ones(
(hidden_states.shape[0], max_encoder_length), dtype=torch.long, device=hidden_states.device
)
else:
# 将注意力掩码为 False 的位置对应的隐藏状态设为 0
hidden_states[~attention_mask.bool()] = 0.0
# 计算输入长度并应用池化公式以获取真实的输出长度
input_lengths = (attention_mask.long()).sum(-1)
output_lengths = input_lengths // self.config.squeeze_factor
# 生成注意力掩码,限制注意力范围在有效输出长度内
attention_ids = (
torch.arange(0, max_encoder_length, device=output_lengths.device)
.view(1, -1)
.expand(output_lengths.shape[0], -1)
)
attention_mask = (attention_ids < output_lengths.view(-1, 1)).long()
# 记录输入时间步数
n_input_timesteps = hidden_states.shape[1]
# 将隐藏状态维度转置,以适应位置嵌入计算
hidden_states = hidden_states.transpose(1, 2)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 对隐藏状态进行池化操作
pooled_hidden_states = self.pool(hidden_states)
# 选择较小的长度作为最终的隐藏状态长度
min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1))
# 将池化后的隐藏状态和位置嵌入相加得到最终的隐藏状态表示
hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length]
# 将隐藏状态维度再次转置为输出形状
hidden_states = hidden_states.transpose(1, 2)
# 将最终隐藏状态传入编码器进行编码,获取编码器输出
encoder_outputs = self.encoder(hidden_states, attention_mask, output_hidden_states, output_attentions)
# 对编码器输出进行上采样操作
hidden_states = self.upsample(encoder_outputs.last_hidden_state)
# 如果上采样后的长度小于输入长度,则进行填充操作
if hidden_states.shape[1] < n_input_timesteps:
hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1]))
# 如果 return_dict 为 False,则返回非空的元组
if not return_dict:
return tuple(
v for v in [hidden_states, encoder_outputs.hidden_states, encoder_outputs.attentions] if v is not None
)
# 返回 BaseModelOutput 对象,包含最终的隐藏状态、编码器的隐藏状态和注意力权重
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# SEWDPreTrainedModel is an abstract class, derived from PreTrainedModel, that handles weight initialization and provides a simple interface for downloading and loading pretrained models
class SEWDPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 定义 SEWDConfig 类作为配置类
config_class = SEWDConfig
# 设置基础模型前缀为 "sew-d"
base_model_prefix = "sew-d"
# 设置主输入名称为 "input_values"
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
# 如果模块是 SEWDPositionalConvEmbedding 的实例
if isinstance(module, SEWDPositionalConvEmbedding):
# 初始化卷积层的权重为正态分布
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
# 初始化卷积层的偏置为常数0
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, nn.Linear):
# 对线性层的权重进行初始化,使用正态分布,标准差为配置中的初始化范围
# 这里与 TensorFlow 版本略有不同,后者使用截断正态分布进行初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 对层归一化和分组归一化的偏置初始化为零
module.bias.data.zero_()
# 对层归一化和分组归一化的权重初始化为1
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
# 如果启用了 DeepSpeed Zero3
if is_deepspeed_zero3_enabled():
import deepspeed
# 如果模块有权重分布,使用 GatheredParameters 进行初始化
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
else:
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
elif isinstance(module, nn.Embedding):
# 使用正态分布初始化嵌入层的权重,标准差为配置中的初始化范围
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果设置了填充索引,将对应索引的权重初始化为零
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# 如果模块是线性层或卷积层且有偏置,则将偏置初始化为零
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
计算卷积层的输出长度
"""
def _conv_out_length(input_length, kernel_size, stride):
# 从 https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html 取得的一维卷积层输出长度公式
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步长,计算每一层卷积的输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 返回最终的输入长度
return input_lengths
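# Illustrative sketch (not part of the library): applying the formula above with the
# default SEW-D conv_kernel / conv_stride values gives the number of feature frames
# produced for a raw-audio input, e.g. one second of 16 kHz audio.
import torch

conv_kernel = (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)
conv_stride = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)

def conv_out_length(length, kernel, stride):
    return torch.div(length - kernel, stride, rounding_mode="floor") + 1

length = torch.tensor(16000)  # 1 s of audio at 16 kHz
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)  # tensor(49): roughly one feature frame every 20 ms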
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# compute the number of valid output frames for every sample from the input attention mask
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
batch_size = attention_mask.shape[0]
# start from an all-zero mask of shape (batch_size, feature_vector_length) on the same dtype/device
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# set a 1 at the last valid frame of every sample ...
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# ... then flip, cumsum and flip again so that every position up to and including it becomes True
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
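# Illustrative sketch: the flip-cumsum-flip trick above turns "a single 1 at the last valid
# frame" into "1s everywhere up to and including that frame".
import torch

output_lengths = torch.tensor([3, 5])
mask = torch.zeros(2, 6, dtype=torch.long)
mask[(torch.arange(2), output_lengths - 1)] = 1
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
print(mask.long())
# tensor([[1, 1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1, 0]])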
@add_start_docstrings(
"The bare SEW-D Model transformer outputting raw hidden-states without any specific head on top.",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.sew.modeling_sew.SEWModel with SEW->SEWD, layer_norm_eps->feature_layer_norm_eps
class SEWDModel(SEWDPreTrainedModel):
def __init__(self, config: SEWDConfig):
super().__init__(config)
self.config = config
# convolutional feature encoder followed by a layer norm over the last conv dimension
self.feature_extractor = SEWDFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
# project the extracted features to hidden_size only when the dimensions differ
self.project_features = config.conv_dim[-1] != config.hidden_size
if self.project_features:
self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
# learnable embedding used to replace masked frames when SpecAugment is enabled
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
self.encoder = SEWDEncoder(config)
# initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# compute mask indices if not provided and apply SpecAugment along time axis
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
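# Illustrative sketch (toy stand-in, not the library's _compute_mask_indices): SpecAugment-style
# time masking as used above -- boolean indices select frames whose hidden states are replaced
# by a single learned `masked_spec_embed` vector.
import torch

batch_size, seq_len, hidden_size = 2, 10, 4
hidden_states = torch.randn(batch_size, seq_len, hidden_size)
masked_spec_embed = torch.zeros(hidden_size)  # stands in for the learned parameter

# pick one random span of length 3 per example
mask = torch.zeros(batch_size, seq_len, dtype=torch.bool)
starts = torch.randint(0, seq_len - 3, (batch_size,))
for b, s in enumerate(starts):
    mask[b, s : s + 3] = True

hidden_states[mask] = masked_spec_embed  # broadcast over the masked positions
print(mask.sum(dim=1))  # 3 masked frames per example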
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
# fall back to the config defaults when the output options are not given explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# extract the convolutional features, normalize them and (optionally) project them to hidden_size
extract_features = self.feature_extractor(input_values)
extract_features = extract_features.transpose(1, 2)
extract_features = self.layer_norm(extract_features)
if self.project_features:
extract_features = self.feature_projection(extract_features)
hidden_states = self.feature_dropout(extract_features)
# reduce the attention mask to the feature frame rate
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
# apply SpecAugment masking to the hidden states
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
# return a BaseModelOutput with the last hidden state, all hidden states and attentions
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
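# Illustrative usage sketch (assuming the `asapp/sew-d-tiny-100k` checkpoint ships a feature
# extractor config; adjust the name to whatever SEW-D checkpoint you actually use): running
# raw 16 kHz audio through SEWDModel and inspecting the output shape.
import torch
from transformers import AutoFeatureExtractor, SEWDModel

feature_extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-d-tiny-100k")
model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k")

waveform = torch.randn(16000)  # 1 s of fake audio at 16 kHz
inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, num_frames, hidden_size)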
@add_start_docstrings(
"""SEW-D Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV_2_VEC_2->SEWD
class SEWDForCTC(SEWDPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# SEW-D backbone followed by dropout and a linear CTC head
self.sew_d = SEWDModel(config)
self.dropout = nn.Dropout(config.final_dropout)
self.target_lang = target_lang
# the CTC head needs a vocabulary size to be defined on the config
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `SEWDForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# the head operates on the adapter output size when an adapter is configured, otherwise on hidden_size
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def tie_weights(self):
"""
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.
This method is **not** supposed to be called by the user and is prone to be changed in the future.
"""
# Note that `tie_weights` is usually used to tie input and output embedding weights. It is re-purposed here
# so that adapter weights can be loaded for SEWD without introducing a new API to `PreTrainedModel`:
# - if `target_lang` is set but `config.adapter_attn_dim` is not defined, a ValueError is raised;
# - if `target_lang` is None while `config.adapter_attn_dim` is defined, only an info message is logged;
# - otherwise the requested adapter layers are force-loaded.
target_lang = self.target_lang
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters
will not be updated during training. Deprecated in favor of `freeze_feature_encoder`.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters
will not be updated during training.
"""
self.sew_d.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will
not be updated during training. Only the classification head will be updated.
"""
for param in self.sew_d.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# forward pass of the CTC model
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Determine if return_dict is explicitly provided; otherwise, use the default from model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform the forward pass through the model's sequence to sequence decoder
outputs = self.sew_d(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from the model outputs and apply dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Compute logits from the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
# Calculate loss only if labels are provided
if labels is not None:
# Check if any label value exceeds the vocabulary size, which would be invalid
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Retrieve input lengths from attention_mask, defaulting to all ones if mask is None
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Determine target lengths and flatten the targets tensor
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Compute log probabilities using log_softmax for the logits
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Disable cudnn for this section due to compatibility issues with fp16
with torch.backends.cudnn.flags(enabled=False):
# Compute the connectionist temporal classification (CTC) loss
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, return output tuple without loss
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return CausalLMOutput object with all relevant outputs
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
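# Illustrative sketch of the CTC loss computation above, outside the model: log-probs over
# the vocabulary are laid out as (time, batch, vocab), labels padded with -100 are dropped,
# and the blank token is the pad token id.
import torch
import torch.nn as nn

batch_size, time_steps, vocab_size, pad_token_id = 2, 50, 32, 0
logits = torch.randn(batch_size, time_steps, vocab_size)
labels = torch.tensor([[5, 6, 7, -100], [3, 4, -100, -100]])  # -100 marks label padding

input_lengths = torch.full((batch_size,), time_steps, dtype=torch.long)
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)

log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
loss = nn.functional.ctc_loss(
    log_probs, flattened_targets, input_lengths, target_lengths,
    blank=pad_token_id, reduction="mean", zero_infinity=False,
)
print(loss)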
@add_start_docstrings(
"""
SEWD Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
Keyword Spotting.
""",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV_2_VEC_2->SEWD
class SEWDForSequenceClassification(SEWDPreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of SEWD adapters (config.add_adapter=True)"
)
self.sew_d = SEWDModel(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.sew_d.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.sew_d.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the weighted layer sum needs all hidden states, so force them on in that case
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
outputs = self.sew_d(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.config.use_weighted_layer_sum:
# stack all layer outputs and combine them with softmax-normalized learned weights
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
hidden_states = self.projector(hidden_states)
# mean-pool over time; with an attention mask, average only over the valid frames
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# return a SequenceClassifierOutput with loss, logits, hidden states and attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
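# Illustrative sketch of the two pooling steps above: a softmax-weighted sum over the per-layer
# hidden states, followed by a masked mean pool over the time axis.
import torch
import torch.nn.functional as F

num_layers, batch, time, hidden = 13, 2, 6, 8
layer_outputs = torch.randn(num_layers, batch, time, hidden).unbind(0)
layer_weights = torch.ones(num_layers) / num_layers  # learned in the real model

stacked = torch.stack(layer_outputs, dim=1)                          # (batch, layers, time, hidden)
norm_weights = F.softmax(layer_weights, dim=-1)
hidden_states = (stacked * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, time, hidden)

padding_mask = torch.tensor([[1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1]]).bool()
hidden_states[~padding_mask] = 0.0
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)  # torch.Size([2, 8])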
.\models\sew_d\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_sew_d"] = [
"SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWDForCTC",
"SEWDForSequenceClassification",
"SEWDModel",
"SEWDPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_sew_d import (
SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWDForCTC,
SEWDForSequenceClassification,
SEWDModel,
SEWDPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\siglip\configuration_siglip.py
""" Siglip model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
}
class SiglipTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "siglip_text_model"
def __init__(
self,
vocab_size=32000,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
max_position_embeddings=64,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "siglip":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "siglip":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class SiglipConfig(PretrainedConfig):
r"""
[`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
instantiate a Siglip model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a configuration similar to that of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`SiglipTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import SiglipConfig, SiglipModel
>>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipConfig()
>>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
>>> from transformers import SiglipTextConfig, SiglipVisionConfig
>>> # Initializing SiglipText and SiglipVision configurations
>>> config_text = SiglipTextConfig()
>>> config_vision = SiglipVisionConfig()
>>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
```"""
model_type = "siglip"
def __init__(self, text_config=None, vision_config=None, **kwargs):
super().__init__(**kwargs)
if text_config is None:
text_config = {}
logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
if vision_config is None:
vision_config = {}
logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.")
self.text_config = SiglipTextConfig(**text_config)
self.vision_config = SiglipVisionConfig(**vision_config)
self.initializer_factor = 1.0
@classmethod
def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
r"""
Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
model configuration.
Returns:
[`SiglipConfig`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
.\models\siglip\convert_siglip_to_hf.py
def get_siglip_config(model_name):
config = SiglipConfig()
vocab_size = 250000 if "i18n" in model_name else 32000
image_size = model_name_to_image_size[model_name]
patch_size = 16 if "patch16" in model_name else 14
config.vision_config.image_size = image_size
config.vision_config.patch_size = patch_size
config.text_config.vocab_size = vocab_size
if "base" in model_name:
pass
elif "large" in model_name:
config.text_config.hidden_size = 1024
config.text_config.intermediate_size = 4096
config.text_config.num_hidden_layers = 24
config.text_config.num_attention_heads = 16
config.vision_config.hidden_size = 1024
config.vision_config.intermediate_size = 4096
config.vision_config.num_hidden_layers = 24
config.vision_config.num_attention_heads = 16
elif "so400m" in model_name:
config.text_config.hidden_size = 1152
config.text_config.intermediate_size = 4304
config.text_config.num_hidden_layers = 27
config.text_config.num_attention_heads = 16
config.vision_config.hidden_size = 1152
config.vision_config.intermediate_size = 4304
config.vision_config.num_hidden_layers = 27
config.vision_config.num_attention_heads = 16
else:
raise ValueError("Model not supported")
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("params/img/embedding/kernel", "vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("params/img/embedding/bias", "vision_model.embeddings.patch_embedding.bias"))
rename_keys.append(("params/img/pos_embedding", "vision_model.embeddings.position_embedding.weight"))
for i in range(config.vision_config.num_hidden_layers):
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/scale", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/scale", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append(("params/img/Transformer/encoder_norm/scale", "vision_model.post_layernorm.weight"))
rename_keys.append(("params/img/Transformer/encoder_norm/bias", "vision_model.post_layernorm.bias"))
rename_keys.append(("params/img/MAPHead_0/probe", "vision_model.head.probe"))
rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/scale", "vision_model.head.layernorm.weight"))
rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/bias", "vision_model.head.layernorm.bias"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/kernel", "vision_model.head.mlp.fc1.weight"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/bias", "vision_model.head.mlp.fc1.bias"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/kernel", "vision_model.head.mlp.fc2.weight"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/bias", "vision_model.head.mlp.fc2.bias"))
rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/kernel", "vision_model.head.attention.out_proj.weight"))
rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/bias", "vision_model.head.attention.out_proj.bias"))
rename_keys.append(("params/txt/Embed_0/embedding", "text_model.embeddings.token_embedding.weight"))
rename_keys.append(("params/txt/pos_embedding", "text_model.embeddings.position_embedding.weight"))
for i in range(config.text_config.num_hidden_layers):
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/scale", f"text_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/bias", f"text_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/scale", f"text_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/bias", f"text_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"text_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"text_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"text_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"text_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"text_model.encoder.layers.{i}.self_attn.k_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"text_model.encoder.layers.{i}.self_attn.k_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"text_model.encoder.layers.{i}.self_attn.v_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"text_model.encoder.layers.{i}.self_attn.v_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"text_model.encoder.layers.{i}.self_attn.q_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"text_model.encoder.layers.{i}.self_attn.q_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"text_model.encoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"text_model.encoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append(("params/txt/Encoder_0/encoder_norm/scale", "text_model.final_layer_norm.weight"))
rename_keys.append(("params/txt/Encoder_0/encoder_norm/bias", "text_model.final_layer_norm.bias"))
rename_keys.append(("params/txt/head/kernel", "text_model.head.weight"))
rename_keys.append(("params/txt/head/bias", "text_model.head.bias"))
rename_keys.append(("params/t", "logit_scale"))
rename_keys.append(("params/b", "logit_bias"))
return rename_keys
def rename_key(dct, old, new, config):
val = dct.pop(old)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if "patch_embedding.weight" in new:
val = val.transpose(3, 2, 0, 1)
elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
val = val.T
if "position_embedding" in new and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if "position_embedding" in new and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if new.endswith("bias"):
val = val.reshape(-1)
dct[new] = torch.from_numpy(val)
def read_in_q_k_v_head(state_dict, config):
key_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
key_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/bias").reshape(-1)
value_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
value_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/bias").reshape(-1)
query_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
query_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/bias").reshape(-1)
state_dict["vision_model.head.attention.in_proj_weight"] = torch.from_numpy(
np.concatenate([query_proj_weight, key_proj_weight, value_proj_weight], axis=0)
)
state_dict["vision_model.head.attention.in_proj_bias"] = torch.from_numpy(
np.concatenate([query_proj_bias, key_proj_bias, value_proj_bias], axis=0)
)
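# Illustrative sketch: the target of the conversion above is a torch.nn.MultiheadAttention,
# which stores its query/key/value projections concatenated into a single `in_proj_weight`
# of shape (3 * embed_dim, embed_dim) in the order query, key, value -- hence the
# concatenation order used in read_in_q_k_v_head.
import torch
import torch.nn as nn

embed_dim, num_heads = 8, 2
mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
print(mha.in_proj_weight.shape)  # torch.Size([24, 8]) == (3 * embed_dim, embed_dim)
print(mha.in_proj_bias.shape)    # torch.Size([24])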
def convert_siglip_checkpoint(model_name, pytorch_dump_folder_path, verify_logits=True, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our SigLIP structure.
"""
config = get_siglip_config(model_name)
checkpoint = model_name_to_checkpoint[model_name]
if "i18n" in model_name:
vocab_file = "/Users/nielsrogge/Documents/SigLIP/multilingual_vocab/sentencepiece.model"
else:
vocab_file = "/Users/nielsrogge/Documents/SigLIP/english_vocab/sentencepiece.model"
data = load(checkpoint)
state_dict = flatten_nested_dict(data)
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest, config)
read_in_q_k_v_head(state_dict, config)
model = SiglipModel(config).eval()
model.load_state_dict(state_dict)
image_size = config.vision_config.image_size
size = {"height": image_size, "width": image_size}
image_processor = SiglipImageProcessor(size=size)
tokenizer = SiglipTokenizer(vocab_file=vocab_file, model_input_names=["input_ids"])
processor = SiglipProcessor(image_processor=image_processor, tokenizer=tokenizer)
url_1 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg"
image_1 = Image.open(requests.get(url_1, stream=True).raw).convert("RGB")
url_2 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg"
image_2 = Image.open(requests.get(url_2, stream=True).raw).convert("RGB")
texts = ["an apple", "a picture of an apple"]
inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt", padding="max_length")
if image_size == 224:
filename = "siglip_pixel_values.pt"
elif image_size == 256:
filename = "siglip_pixel_values_256.pt"
elif image_size == 384:
filename = "siglip_pixel_values_384.pt"
elif image_size == 512:
filename = "siglip_pixel_values_512.pt"
else:
raise ValueError("Image size not supported")
filepath = hf_hub_download(repo_id="nielsr/test-image", filename=filename, repo_type="dataset")
original_pixel_values = torch.load(filepath)
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="siglip_input_ids.pt", repo_type="dataset")
original_input_ids = torch.load(filepath)
if "i18n" not in model_name:
assert inputs.input_ids.tolist() == original_input_ids.tolist()
print("Mean of original pixel values:", original_pixel_values.mean())
print("Mean of new pixel values:", inputs.pixel_values.mean())
with torch.no_grad():
outputs = model(input_ids=inputs.input_ids, pixel_values=original_pixel_values)
print(outputs.logits_per_image[:3, :3])
probs = torch.sigmoid(outputs.logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'")
if verify_logits:
if model_name == "siglip-base-patch16-224":
expected_slice = torch.tensor(
[[-2.9621, -2.1672], [-0.2713, 0.2910]],
)
elif model_name == "siglip-base-patch16-256":
expected_slice = torch.tensor(
[[-3.1146, -1.9894], [-0.7312, 0.6387]],
)
elif model_name == "siglip-base-patch16-384":
expected_slice = torch.tensor(
[[-2.8098, -2.1891], [-0.4242, 0.4102]],
)
elif model_name == "siglip-base-patch16-512":
expected_slice = torch.tensor(
[[-2.7899, -2.2668], [-0.4295, -0.0735]],
)
elif model_name == "siglip-large-patch16-256":
expected_slice = torch.tensor(
[[-1.5827, -0.5801], [-0.9153, 0.1363]],
)
elif model_name == "siglip-large-patch16-384":
expected_slice = torch.tensor(
[[-2.1523, -0.2899], [-0.2959, 0.7884]],
)
elif model_name == "siglip-so400m-patch14-384":
expected_slice = torch.tensor([[-1.2441, -0.6649], [-0.7060, 0.7374]])
elif model_name == "siglip-base-patch16-256-i18n":
expected_slice = torch.tensor(
[[-0.9064, 0.1073], [-0.0299, 0.5304]],
)
assert torch.allclose(outputs.logits_per_image[:3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model.push_to_hub(f"nielsr/{model_name}")
processor.push_to_hub(f"nielsr/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="siglip-base-patch16-224",
type=str,
choices=model_name_to_checkpoint.keys(),
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
help="Whether to verify logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_siglip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub)
.\models\siglip\image_processing_siglip.py
from typing import Dict, List, Optional, Union
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
class SiglipImageProcessor(BaseImageProcessor):
r"""
Constructs a SigLIP image processor.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\siglip\modeling_siglip.py
import math
import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SiglipConfig"
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
_IMAGE_CLASS_CHECKPOINT = "google/siglip-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_1"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
]
def _trunc_normal_(tensor, mean, std, a, b):
def norm_cdf(x):
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
tensor.uniform_(2 * l - 1, 2 * u - 1)
tensor.erfinv_()
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsequently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
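# Illustrative check of the 'tf' truncated-normal initializer defined above (run inside this
# module, since it uses trunc_normal_tf_): the [-2, 2] cutoffs are applied in unit-normal
# space before scaling, so the result lands in [mean - 2*std, mean + 2*std]. This makes
# trunc_normal_tf_(t, mean=m, std=s) behave like nn.init.trunc_normal_(t, m, s, a=m-2*s, b=m+2*s).
import torch

t = torch.empty(10000)
trunc_normal_tf_(t, mean=1.0, std=0.5, a=-2.0, b=2.0)
print(t.min().item() >= 0.0, t.max().item() <= 2.0)  # True True
print(round(t.mean().item(), 2))                     # close to 1.0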
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
@dataclass
class SiglipVisionModelOutput(ModelOutput):
"""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
Args:
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooled output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
plus one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the softmax, used to compute the weighted average in the self-attention heads.
"""
image_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SiglipTextModelOutput(ModelOutput):
"""
Base class for text model outputs that also contains a pooling of the last hidden states.
Args:
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The text embeddings obtained by applying the projection layer to the pooled output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
plus one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the softmax, used to compute the weighted average in the self-attention heads.
"""
text_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SiglipOutput(ModelOutput):
"""
Siglip 输出的基类。
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image: (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text: (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of `SiglipTextModel`.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of `SiglipVisionModel`.
text_model_output (`BaseModelOutputWithPooling`):
The output of the `SiglipTextModel`.
vision_model_output (`BaseModelOutputWithPooling`):
The output of the `SiglipVisionModel`.
"""
# 定义一个类,用于封装对比损失和模型输出
loss: Optional[torch.FloatTensor] = None
logits_per_image: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
# return a tuple of the fields; `text_model_output` and `vision_model_output` are themselves converted to tuples
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
# non-overlapping patch embedding: a convolution with kernel size == stride == patch size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
# one position per patch; the position ids buffer is registered so it moves with the module
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
# embed the patches, flatten the spatial grid into a sequence and add learned position embeddings
patch_embeds = self.patch_embedding(pixel_values)
embeddings = patch_embeds.flatten(2).transpose(1, 2)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
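# Illustrative sketch of the shapes produced above with the default 224x224 image / patch 16
# setup: (224 / 16)^2 = 196 patches, so the vision tower sees a sequence of 196 patch embeddings.
import torch
import torch.nn as nn

image_size, patch_size, embed_dim = 224, 16, 768
patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(1, 3, image_size, image_size)
patch_embeds = patch_embedding(pixel_values)          # (1, 768, 14, 14)
embeddings = patch_embeds.flatten(2).transpose(1, 2)  # (1, 196, 768)
print(embeddings.shape)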
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
class SiglipTextEmbeddings(nn.Module):
def __init__(self, config: SiglipTextConfig):
super().__init__()
embed_dim = config.hidden_size
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
# default to the registered position ids, truncated to the sequence length
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# embed the input ids unless embeddings were passed in directly
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
# token embeddings and position embeddings are simply summed
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
# the embedding dimension must be evenly split across the attention heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
# scores are scaled by 1/sqrt(head_dim)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
# separate linear projections for keys, values, queries and the output
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# 前向传播函数,执行输入张量的注意力计算
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
# shapes of the hidden states
batch_size, q_len, _ = hidden_states.size()
# project the hidden states to queries, keys and values
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# reshape to (batch_size, num_heads, q_len, head_dim)
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
# key/value sequence length
k_v_seq_len = key_states.shape[-2]
# attention weights: scaled dot product of queries and keys
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
# sanity-check the attention weight shape
if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
raise ValueError(
f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
f" {attn_weights.size()}"
)
# if an attention mask is given, add it to the attention weights
if attention_mask is not None:
if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
raise ValueError(
f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# softmax in float32 for numerical stability, then cast back and apply dropout
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
# weighted sum of the value vectors
attn_output = torch.matmul(attn_weights, value_states)
# sanity-check the attention output shape
if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
# move the head dimension back and make the tensor contiguous
attn_output = attn_output.transpose(1, 2).contiguous()
# merge the heads into a (batch_size, q_len, embed_dim) tensor
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
# final output projection
attn_output = self.out_proj(attn_output)
# return the attention output and the attention weights
return attn_output, attn_weights
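# Illustrative aside (not part of the library source): the same scaled-dot-product computation on
# toy tensors, to make the shapes used above concrete.
import torch

batch, heads, q_len, head_dim = 2, 4, 5, 8
q = torch.randn(batch, heads, q_len, head_dim)
k = torch.randn(batch, heads, q_len, head_dim)
v = torch.randn(batch, heads, q_len, head_dim)

weights = torch.softmax(torch.matmul(q, k.transpose(2, 3)) * head_dim**-0.5, dim=-1)  # (batch, heads, q_len, q_len)
out = torch.matmul(weights, v)                                                        # (batch, heads, q_len, head_dim)
out = out.transpose(1, 2).reshape(batch, q_len, heads * head_dim)                     # (batch, q_len, embed_dim)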
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]  # activation function looked up from the config
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first fully connected layer
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second fully connected layer
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)  # expand to the intermediate size
hidden_states = self.activation_fn(hidden_states)  # apply the activation
hidden_states = self.fc2(hidden_states)  # project back to the hidden size
return hidden_states
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipConfig):
super().__init__()
self.embed_dim = config.hidden_size  # embedding dimension
self.self_attn = SiglipAttention(config)  # self-attention block
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first layer norm
self.mlp = SiglipMLP(config)  # feed-forward block
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second layer norm
# Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`):
Input tensor of shape `(batch, seq_len, embed_dim)`.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states  # keep the input for the residual connection
hidden_states = self.layer_norm1(hidden_states)  # pre-norm before attention
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)  # self-attention block
hidden_states = residual + hidden_states  # residual connection
residual = hidden_states  # new residual for the MLP block
hidden_states = self.layer_norm2(hidden_states)  # pre-norm before the MLP
hidden_states = self.mlp(hidden_states)  # feed-forward block
hidden_states = residual + hidden_states  # residual connection
outputs = (hidden_states,)  # main output
if output_attentions:
outputs += (attn_weights,)  # optionally also return the attention weights
return outputs
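# Illustrative aside (not part of the library source): the pre-norm residual pattern used above,
# with plain linear layers standing in for the attention and MLP sub-layers.
import torch
import torch.nn as nn

embed_dim = 16
norm1, norm2 = nn.LayerNorm(embed_dim), nn.LayerNorm(embed_dim)
sublayer1 = nn.Linear(embed_dim, embed_dim)  # stand-in for self-attention
sublayer2 = nn.Linear(embed_dim, embed_dim)  # stand-in for the MLP

x = torch.randn(2, 5, embed_dim)
x = x + sublayer1(norm1(x))  # normalize first, apply the sub-layer, then add the residual
x = x + sublayer2(norm2(x))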
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class that handles weight initialization and provides a simple interface for downloading and loading pretrained models.
"""
config_class = SiglipConfig  # configuration class
base_model_prefix = "siglip"  # prefix of the base model
supports_gradient_checkpointing = True  # gradient checkpointing is supported
def _init_weights(self, module):
"""Initialize the weights"""
# vision embeddings: position embeddings get a normal init scaled by the hidden width
if isinstance(module, SiglipVisionEmbeddings):
width = (
self.config.vision_config.hidden_size
if isinstance(self.config, SiglipConfig)
else self.config.hidden_size
)
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
# plain embeddings: use the default Flax embedding initializer
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
# attention projections: Xavier-uniform weights, zero biases
elif isinstance(module, SiglipAttention):
nn.init.xavier_uniform_(module.q_proj.weight)
nn.init.xavier_uniform_(module.k_proj.weight)
nn.init.xavier_uniform_(module.v_proj.weight)
nn.init.xavier_uniform_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
# MLP: Xavier-uniform weights, biases drawn from a tiny normal distribution
elif isinstance(module, SiglipMLP):
nn.init.xavier_uniform_(module.fc1.weight)
nn.init.xavier_uniform_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
# attention pooling head: Xavier-uniform probe and in-projection weights, zero bias
elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
nn.init.xavier_uniform_(module.probe.data)
nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
nn.init.zeros_(module.attention.in_proj_bias.data)
# full SigLIP model: logit_scale starts at log(1.0), logit_bias at zero
elif isinstance(module, SiglipModel):
logit_scale_init = torch.log(torch.tensor(1.0))
module.logit_scale.data.fill_(logit_scale_init)
module.logit_bias.data.zero_()
# remaining linear and convolution layers: LeCun-normal weights, zero biases
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
# layer norm: zero bias, unit weight
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
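# Illustrative aside (not part of the library source): `lecun_normal_` is a helper defined earlier in
# modeling_siglip.py. A sketch of what a LeCun-normal initializer of this kind typically does, under
# the assumption that it draws from a truncated normal with std = sqrt(1 / fan_in):
import math
import torch.nn as nn

def lecun_normal_sketch(weight):
    fan_in = weight[0].numel()  # in_features (Linear) or in_channels * kh * kw (Conv2d)
    nn.init.trunc_normal_(weight, std=math.sqrt(1.0 / fan_in))

linear = nn.Linear(64, 32)
lecun_normal_sketch(linear.weight)
nn.init.zeros_(linear.bias)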
# SIGLIP_START_DOCSTRING is a raw string with the generic model introduction: the model inherits from
# PreTrainedModel, and the superclass documentation covers common methods (downloading or saving a model,
# resizing the input embeddings, pruning heads, etc.).
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# SIGLIP_TEXT_INPUTS_DOCSTRING is a raw string documenting the text inputs (argument names and types).
SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# SIGLIP_VISION_INPUTS_DOCSTRING documents the vision inputs; the per-argument descriptions are omitted in this excerpt.
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
SIGLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a [`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipConfig):
super().__init__()
self.config = config
# stack of `config.num_hidden_layers` SiglipEncoderLayer modules
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# gradient checkpointing is disabled by default
self.gradient_checkpointing = False
# Ignore copy
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SiglipTextTransformer(nn.Module):
def __init__(self, config: SiglipTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipTextEmbeddings(config)  # text embedding layer
self.encoder = SiglipEncoder(config)  # transformer encoder
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # final layer norm
self.head = nn.Linear(embed_dim, embed_dim)  # linear head applied to the pooled output
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify input_ids")  # input ids are required
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])  # flatten the input ids to two dimensions
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)  # embed the input tokens
# note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
# expand attention_mask
if attention_mask is not None:
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)  # expand to a 4D additive mask
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)  # run the encoder
last_hidden_state = encoder_outputs[0]  # last hidden state of the encoder
last_hidden_state = self.final_layer_norm(last_hidden_state)  # apply the final layer norm
# Assuming "sticky" EOS tokenization, last token is always EOS.
pooled_output = last_hidden_state[:, -1, :]  # pool by taking the last token
pooled_output = self.head(pooled_output)  # project the pooled output through the linear head
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]  # tuple output
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)  # dict-style output with the last hidden state, pooled output, hidden states and attentions
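# Illustrative aside (not part of the library source): a conceptual sketch of what expanding the
# attention mask to 4D does, i.e. turning a (batch, seq_len) 0/1 mask into an additive
# (batch, 1, seq_len, seq_len) mask with large negative values at padded positions. The helper
# `_prepare_4d_attention_mask` used above is the library's; this function is only an approximation.
import torch

def expand_mask_sketch(mask, dtype):
    batch, src_len = mask.shape
    expanded = mask[:, None, None, :].expand(batch, 1, src_len, src_len).to(dtype)
    return (1.0 - expanded) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(expand_mask_sketch(mask, torch.float32).shape)  # torch.Size([2, 1, 4, 4])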
# The text model from SigLIP, without any extra head or projection on top
@add_start_docstrings(
"""The text model from SigLIP without any head or projection on top.""",
SIGLIP_START_DOCSTRING,  # prepend the generic SigLIP docstring
)
class SiglipTextModel(SiglipPreTrainedModel):
config_class = SiglipTextConfig  # configuration class
_no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]  # modules that must not be split across devices
def __init__(self, config: SiglipTextConfig):
super().__init__(config)
self.text_model = SiglipTextTransformer(config)  # the underlying SiglipTextTransformer
# initialize the weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding  # return the input embedding layer
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value  # replace the input embedding layer
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)  # document the forward inputs
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)  # document the return type
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from transformers import AutoTokenizer, SiglipTextModel
>>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
>>>
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class SiglipVisionTransformer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)  # vision embedding layer
self.encoder = SiglipEncoder(config)  # transformer encoder
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # layer norm applied after the encoder
self.head = SiglipMultiheadAttentionPoolingHead(config)  # multi-head attention pooling head
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)  # document the forward inputs
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
# the decorator fills in the Returns section from BaseModelOutputWithPooling and SiglipVisionConfig
def forward(
self,
pixel_values,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
The Returns section is generated by the `replace_return_docstrings` decorator from `BaseModelOutputWithPooling`.
"""
# fall back to the config defaults when the flags are not passed explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# embed the pixel values into patch tokens
hidden_states = self.embeddings(pixel_values)
# run the encoder on the embedded patches
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the encoder's last hidden state and apply the post layer norm
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
# pool the patch tokens with the attention pooling head
pooled_output = self.head(last_hidden_state)
# tuple output when return_dict is False
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# otherwise return a BaseModelOutputWithPooling
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class SiglipMultiheadAttentionPoolingHead(nn.Module):
"""Multihead Attention Pooling."""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
def forward(self, hidden_state):
batch_size = hidden_state.shape[0]
probe = self.probe.repeat(batch_size, 1, 1)
# attend with the learned probe as the query and the hidden states as keys and values
hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
residual = hidden_state
# layer norm followed by an MLP with a residual connection
hidden_state = self.layernorm(hidden_state)
hidden_state = residual + self.mlp(hidden_state)
# return the single pooled token (the probe position) for each batch element
return hidden_state[:, 0]
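# Illustrative aside (not part of the library source): attention pooling with a learned probe on toy
# tensors; a single query token attends over the whole sequence and its output is the pooled vector.
import torch
import torch.nn as nn

hidden_size, num_heads, batch, seq_len = 16, 4, 2, 9
probe = nn.Parameter(torch.randn(1, 1, hidden_size))
attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

tokens = torch.randn(batch, seq_len, hidden_size)
pooled, _ = attention(probe.repeat(batch, 1, 1), tokens, tokens)  # (batch, 1, hidden_size)
pooled = pooled[:, 0]                                             # (batch, hidden_size)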
@add_start_docstrings(
"""The vision model from SigLIP without any head or projection on top.""",
SIGLIP_START_DOCSTRING,
)
class SiglipVisionModel(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.vision_model = SiglipVisionTransformer(config)
# initialize the weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
# return the patch embedding layer of the vision model
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
def forward(
self,
pixel_values,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, SiglipVisionModel
>>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# attach the generic SigLIP docstring to the class
@add_start_docstrings(SIGLIP_START_DOCSTRING)
class SiglipModel(SiglipPreTrainedModel):
config_class = SiglipConfig  # configuration class
def __init__(self, config: SiglipConfig):
super().__init__(config)
# the text config must be a SiglipTextConfig
if not isinstance(config.text_config, SiglipTextConfig):
raise ValueError(
"config.text_config is expected to be of type SiglipTextConfig but is of type"
f" {type(config.text_config)}."
)
# the vision config must be a SiglipVisionConfig
if not isinstance(config.vision_config, SiglipVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type SiglipVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# unpack the text and vision configs
text_config = config.text_config
vision_config = config.vision_config
# build the text and vision towers
self.text_model = SiglipTextTransformer(text_config)
self.vision_model = SiglipVisionTransformer(vision_config)
# learnable scale and bias applied to the image-text logits
self.logit_scale = nn.Parameter(torch.randn(1))
self.logit_bias = nn.Parameter(torch.randn(1))
# initialize the weights and apply final processing
self.post_init()
# attach the text-inputs docstring to the method below
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
applying the projection layer to the pooled output of [`SiglipTextModel`].
Examples:
```
>>> from transformers import AutoTokenizer, AutoModel
>>> import torch
>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
>>>
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
>>> with torch.no_grad():
... text_features = model.get_text_features(**inputs)
```"""
# fall back to the config defaults for attentions, hidden states and return type
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the SigLIP text model on the inputs
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# the pooled output of the text model is used as the text features
pooled_output = text_outputs[1]
return pooled_output
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
applying the projection layer to the pooled output of [`SiglipVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... image_features = model.get_image_features(**inputs)
```"""
# Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the vision model on the pixel values
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# the pooled output of the vision model is used as the image features
pooled_output = vision_outputs[1]
return pooled_output
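# Illustrative aside (not part of the library source): SiglipModel.forward is not shown in this
# excerpt, but the SigLIP recipe pairs the two pooled feature sets with the learnable logit_scale and
# logit_bias and a pairwise sigmoid loss. A toy sketch of that pairing (shapes and formula only; the
# real forward also handles normalization, devices and optional outputs):
import torch
import torch.nn.functional as F

batch, dim = 4, 8
image_embeds = F.normalize(torch.randn(batch, dim), dim=-1)
text_embeds = F.normalize(torch.randn(batch, dim), dim=-1)
logit_scale, logit_bias = torch.tensor(0.0), torch.tensor(-2.0)  # toy values

logits = image_embeds @ text_embeds.t() * logit_scale.exp() + logit_bias  # (batch, batch)
labels = 2 * torch.eye(batch) - 1                                         # +1 for matching pairs, -1 otherwise
loss = -F.logsigmoid(labels * logits).mean()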
# SigLIP vision encoder with an image classification head on top (a linear layer over the pooled patch-token hidden states), e.g. for ImageNet.
@add_start_docstrings(
"""
SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
the patch tokens) e.g. for ImageNet.
""",
SIGLIP_START_DOCSTRING,
)
class SiglipForImageClassification(SiglipPreTrainedModel):
main_input_name = "pixel_values"  # the main input is the pixel values
def __init__(self, config: SiglipConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels  # number of classification labels
self.vision_model = SiglipVisionTransformer(config.vision_config)  # vision backbone
# classifier head (identity when there are no labels)
self.classifier = (
nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# initialize the weights and apply final processing
self.post_init()
# forward pass of the classification model
@add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# fall back to the config defaults when the flags are not passed explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the vision backbone on the pixel values
outputs = self.vision_model(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# sequence of patch-token hidden states
sequence_output = outputs[0]
# average pool the patch tokens
sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
# classify the pooled representation
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# move labels to the same device as the logits to enable model parallelism
labels = labels.to(logits.device)
# determine the problem type (regression, single-label or multi-label classification)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# pick the loss function that matches the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# tuple output when return_dict is False
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# otherwise return an ImageClassifierOutput
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
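# Illustrative aside (not part of the library source): the classification head above on toy tensors,
# i.e. mean-pooling the patch tokens and applying a linear layer plus cross-entropy.
import torch
import torch.nn as nn

batch, num_patches, hidden_size, num_labels = 2, 9, 16, 3
classifier = nn.Linear(hidden_size, num_labels)

patch_tokens = torch.randn(batch, num_patches, hidden_size)
pooled = patch_tokens.mean(dim=1)                           # (batch, hidden_size)
logits = classifier(pooled)                                 # (batch, num_labels)
loss = nn.CrossEntropyLoss()(logits, torch.tensor([0, 2]))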
.\models\siglip\processing_siglip.py
"""
Image/Text processor class for SigLIP.
"""
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class SiglipProcessor(ProcessorMixin):
r"""
Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor.
[`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the
[`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information.
Args:
image_processor ([`SiglipImageProcessor`]):
The image processor is a required input.
tokenizer ([`SiglipTokenizer`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "SiglipImageProcessor"
tokenizer_class = "SiglipTokenizer"
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: int = None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
):
"""
This method combines the functionalities of both image processing and tokenization. It processes input text
and/or images according to specified padding, truncation, and max length parameters, and returns processed data
in a format based on the return_tensors argument.
"""
raise NotImplementedError
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
@property
def model_input_names(self):
"""
Property that lists the model input names by merging those of the tokenizer and the image processor.
Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip.
"""
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
.\models\siglip\tokenization_siglip.py
""" Tokenization class for SigLIP model."""
import os
import re
import string
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
if TYPE_CHECKING:
from ...tokenization_utils_base import TextInput
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/siglip-base-patch16-224": 256,
}
SPIECE_UNDERLINE = "▁"
class SiglipTokenizer(PreTrainedTokenizer):
"""
Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=False,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
pass
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"</s>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
model_max_length (`int`, *optional*, defaults to 64):
The maximum length (in number of tokens) for model inputs.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="</s>",
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
model_max_length=64,
do_lower_case=True,
**kwargs,
):
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
else:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
else:
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
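# Illustrative aside (not part of the library source): the `X </s>` / `A </s> B </s>` format on
# made-up token IDs, with eos_token_id assumed to be 1 purely for illustration.
eos_token_id = 1

def add_eos(ids):
    return ids if ids and ids[-1] == eos_token_id else ids + [eos_token_id]

ids_a, ids_b = [284, 93, 12], [77, 45]
print(add_eos(ids_a))                   # single sequence: [284, 93, 12, 1]
print(add_eos(ids_a) + add_eos(ids_b))  # pair of sequences: [284, 93, 12, 1, 77, 45, 1]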
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def remove_punctuation(self, text: str) -> str:
return text.translate(str.maketrans("", "", string.punctuation))
def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
"""Returns canonicalized `text` (puncuation removed).
Args:
text (`str`):
String to be canonicalized.
keep_punctuation_exact_string (`str`, *optional*):
If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
(but will still remove '{' and '}' that appear separately).
"""
if keep_punctuation_exact_string:
text = keep_punctuation_exact_string.join(
self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
)
else:
text = self.remove_punctuation(text)
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text
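# Illustrative aside (not part of the library source): a standalone version of the same
# canonicalization logic, showing the effect of `keep_punctuation_exact_string`.
import re
import string

def canonicalize(text, keep_punctuation_exact_string=None):
    remove = lambda s: s.translate(str.maketrans("", "", string.punctuation))
    if keep_punctuation_exact_string:
        text = keep_punctuation_exact_string.join(
            remove(part) for part in text.split(keep_punctuation_exact_string)
        )
    else:
        text = remove(text)
    return re.sub(r"\s+", " ", text).strip()

print(canonicalize("Fill in the {} please!"))                                      # "Fill in the please"
print(canonicalize("Fill in the {} please!", keep_punctuation_exact_string="{}"))  # "Fill in the {} please"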
def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
"""
Converts a string to a list of tokens.
"""
tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
@property
def unk_token_length(self):
return len(self.sp_model.encode(str(self.unk_token)))
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE.
For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.
Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
text = self.canonicalize_text(text, keep_punctuation_exact_string=None)
tokens = self.sp_model.encode(text, out_type=str)
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)