Transformers Source Code Analysis (20)
.\models\blenderbot_small\tokenization_blenderbot_small.py
"""BlenderbotSmall 的分词类。"""
import json
import os
from typing import Dict, List, Optional, Tuple
import regex as re
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": (
"https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
)
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}
def get_pairs(word):
"""
Return the set of symbol pairs in a word.
A word is represented as a tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
pairs = set(pairs)
return pairs
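As a quick illustration of what `get_pairs` returns (a small sketch, not part of the original file), the adjacent-symbol pairs it produces are exactly the candidates that the BPE loop below ranks against `bpe_ranks`:
# Illustrative only: adjacent symbol pairs for a word already split into symbols.
word = ("h", "e", "l", "l", "o</w>")
print(get_pairs(word))
# -> {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}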
class BlenderbotSmallTokenizer(PreTrainedTokenizer):
"""
Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair Encoding).
This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
to the superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
bos_token="__start__",
eos_token="__end__",
unk_token="__unk__",
pad_token="__null__",
**kwargs,
):
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token: str) -> str:
if token in self.cache:
return self.cache[token]
token = re.sub("([.,!?()])", r" \1", token)
token = re.sub("(')", r" \1 ", token)
token = re.sub(r"\s{2,}", " ", token)
if "\n" in token:
token = token.replace("\n", " __newln__")
tokens = token.split(" ")
words = []
for token in tokens:
if not len(token):
continue
token = token.lower()
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
words.append(token)
continue
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
words.append(word)
return " ".join(words)
def _tokenize(self, text: str) -> List[str]:
"""Split a string into tokens using BPE."""
split_tokens = []
words = re.findall(r"\S+\n?", text)
for token in words:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token to an id using the vocab."""
token = token.lower()
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens in a single string."""
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
@property
def default_chat_template(self):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
"{{ message['content'] }}"
"{% if not loop.last %}{{ ' ' }}{% endif %}"
"{% endfor %}"
"{{ eos_token }}"
)
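A short usage sketch of the slow tokenizer above (hedged: it assumes the `facebook/blenderbot_small-90M` files referenced in `PRETRAINED_VOCAB_FILES_MAP` are available; the printed pieces are illustrative, not verified output):
# Minimal sketch: load the BlenderbotSmall tokenizer and round-trip a sentence.
from transformers import BlenderbotSmallTokenizer

tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
pieces = tok.tokenize("Sample text with a newline.\nSecond line")  # lowercased, "__newln__" marks the newline
ids = tok.convert_tokens_to_ids(pieces)
print(pieces)
print(tok.convert_tokens_to_string(pieces))  # "@@ " continuation markers are stripped back out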
.\models\blenderbot_small\tokenization_blenderbot_small_fast.py
"""Fast tokenization class for BlenderbotSmall."""
from typing import List, Optional
from tokenizers import ByteLevelBPETokenizer
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": (
"https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
)
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/blenderbot_small-90M": 512,
}
class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = BlenderbotSmallTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
trim_offsets=True,
**kwargs,
):
super().__init__(
ByteLevelBPETokenizer(
vocab=vocab_file,
merges=merges_file,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
),
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
**kwargs,
)
self.add_prefix_space = add_prefix_space
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. BlenderbotSmall
does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def default_chat_template(self):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
"{{ message['content'] }}"
"{% if not loop.last %}{{ ' ' }}{% endif %}"
"{% endfor %}"
"{{ eos_token }}"
)
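To make the special-token logic above concrete, here is a minimal sketch (assumption: plain integer lists stand in for real token ids) of what `build_inputs_with_special_tokens` produces for one and two sequences:
# Illustrative only: the fast tokenizer wraps sequences in bos/eos ids.
# Single sequence:   [bos] + ids_0 + [eos]
# Pair of sequences: [bos] + ids_0 + [eos] + [eos] + ids_1 + [eos]
ids_0, ids_1 = [11, 12, 13], [21, 22]
bos, eos = 0, 2  # placeholder ids for the example
single = [bos] + ids_0 + [eos]
pair = single + [eos] + ids_1 + [eos]
print(single)  # [0, 11, 12, 13, 2]
print(pair)    # [0, 11, 12, 13, 2, 2, 21, 22, 2]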
.\models\blenderbot_small\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_blenderbot_small": [
"BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
"BlenderbotSmallConfig",
"BlenderbotSmallOnnxConfig",
],
"tokenization_blenderbot_small": ["BlenderbotSmallTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_blenderbot_small_fast"] = ["BlenderbotSmallTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_blenderbot_small"] = [
"BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST",
"BlenderbotSmallForCausalLM",
"BlenderbotSmallForConditionalGeneration",
"BlenderbotSmallModel",
"BlenderbotSmallPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_blenderbot_small"] = [
"TFBlenderbotSmallForConditionalGeneration",
"TFBlenderbotSmallModel",
"TFBlenderbotSmallPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_blenderbot_small"] = [
"FlaxBlenderbotSmallForConditionalGeneration",
"FlaxBlenderbotSmallModel",
"FlaxBlenderbotSmallPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_blenderbot_small import (
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
BlenderbotSmallConfig,
BlenderbotSmallOnnxConfig,
)
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_blenderbot_small_fast import BlenderbotSmallTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_blenderbot_small import (
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
BlenderbotSmallForCausalLM,
BlenderbotSmallForConditionalGeneration,
BlenderbotSmallModel,
BlenderbotSmallPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_blenderbot_small import (
TFBlenderbotSmallForConditionalGeneration,
TFBlenderbotSmallModel,
TFBlenderbotSmallPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_blenderbot_small import (
FlaxBlenderbotSmallForConditionalGeneration,
FlaxBlenderbotSmallModel,
FlaxBlenderbotSmallPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
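The `_LazyModule` wiring above means submodules are only imported on first attribute access. A hedged sketch of how this looks from user code (nothing here beyond the names the file already exports):
# The heavy modeling files are not imported until one of these names is touched.
from transformers.models.blenderbot_small import BlenderbotSmallConfig, BlenderbotSmallTokenizer

config = BlenderbotSmallConfig()      # triggers configuration_blenderbot_small on first use
tok_cls = BlenderbotSmallTokenizer    # triggers tokenization_blenderbot_small
# Torch/TF/Flax classes are only resolvable when the corresponding backend is installed;
# otherwise the try/except blocks above simply leave them out of _import_structure.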
.\models\blip\configuration_blip.py
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Salesforce/blip-vqa-base": "https://huggingface.co/Salesforce/blip-vqa-base/resolve/main/config.json",
"Salesforce/blip-vqa-capfit-large": (
"https://huggingface.co/Salesforce/blip-vqa-base-capfit/resolve/main/config.json"
),
"Salesforce/blip-image-captioning-base": (
"https://huggingface.co/Salesforce/blip-image-captioning-base/resolve/main/config.json"
),
"Salesforce/blip-image-captioning-large": (
"https://huggingface.co/Salesforce/blip-image-captioning-large/resolve/main/config.json"
),
"Salesforce/blip-itm-base-coco": "https://huggingface.co/Salesforce/blip-itm-base-coco/resolve/main/config.json",
"Salesforce/blip-itm-large-coco": "https://huggingface.co/Salesforce/blip-itm-large-coco/resolve/main/config.json",
"Salesforce/blip-itm-base-flikr": "https://huggingface.co/Salesforce/blip-itm-base-flikr/resolve/main/config.json",
"Salesforce/blip-itm-large-flikr": (
"https://huggingface.co/Salesforce/blip-itm-large-flikr/resolve/main/config.json"
),
}
class BlipTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP
text model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the `BlipText` used by the [base
architectures](https://huggingface.co/Salesforce/blip-vqa-base).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import BlipTextConfig, BlipTextModel
>>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipTextConfig()
>>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipTextModel(configuration)
>>> # Accessing the model configuration
"""
>>> configuration = model.config
```"""
# 获取模型的配置信息
configuration = model.config
model_type = "blip_text_model"
# 设置模型类型为文本模型
def __init__(
self,
vocab_size=30524,
hidden_size=768,
encoder_hidden_size=768,
intermediate_size=3072,
projection_dim=768,
num_hidden_layers=12,
num_attention_heads=8,
max_position_embeddings=512,
hidden_act="gelu",
layer_norm_eps=1e-12,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
bos_token_id=30522,
eos_token_id=2,
pad_token_id=0,
sep_token_id=102,
is_decoder=True,
use_cache=True,
label_smoothing=0.0,
**kwargs,
):
# call the parent constructor, setting the special token IDs and passing any extra keyword arguments
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
sep_token_id=sep_token_id,
**kwargs,
)
# store the various model configuration parameters
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_hidden_size = encoder_hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.hidden_dropout_prob = hidden_dropout_prob
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.is_decoder = is_decoder
self.use_cache = use_cache
self.label_smoothing = label_smoothing
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# set the token in kwargs
cls._set_token_in_kwargs(kwargs)
# get the config dict and the remaining kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# if loading from a BlipConfig, pull out the text config dict
if config_dict.get("model_type") == "blip":
config_dict = config_dict["text_config"]
# warn if the config's model_type differs from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# 从配置字典和关键字参数创建实例
return cls.from_dict(config_dict, **kwargs)
# BlipVisionConfig stores the configuration of a [`BlipVisionModel`]. It is used to instantiate a BLIP vision model
# according to the specified arguments, defining the model architecture. Instantiating it with the defaults yields a
# configuration similar to that of the Blip-base architecture.
class BlipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a
BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Blip-base
[Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 384):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 1e-10):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example:
```
>>> from transformers import BlipVisionConfig, BlipVisionModel
>>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipVisionConfig()
>>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "blip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
image_size=384,
patch_size=16,
hidden_act="gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=1e-10,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "blip":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class BlipConfig(PretrainedConfig):
r"""
[`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate
a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
a configuration with the defaults will yield a similar configuration to that of the BLIP-base
[Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`BlipTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`BlipVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of the text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. The default follows the original BLIP implementation.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden state of the image-text fusion layer.
label_smoothing (`float`, *optional*, defaults to 0.0):
A float in [0.0, 1.0] specifying the amount of smoothing when computing the loss, where 0.0 means no
smoothing. The targets become a mixture of the original ground truth and a uniform distribution, as described
in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import BlipConfig, BlipModel
>>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipConfig()
>>> # Initializing a BlipModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig
>>> # Initializing a BLIPText and BLIPVision configuration
>>> config_text = BlipTextConfig()
>>> config_vision = BlipVisionConfig()
>>> config = BlipConfig.from_text_vision_configs(config_text, config_vision)
```
"""
model_type = "blip"
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=512,
logit_scale_init_value=2.6592,
image_text_hidden_size=256,
label_smoothing=0.0,
**kwargs,
):
super().__init__(**kwargs)
if text_config is None:
text_config = {}
logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.")
if vision_config is None:
vision_config = {}
logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.")
self.text_config = BlipTextConfig(**text_config)
self.vision_config = BlipVisionConfig(**vision_config)
self.text_config.encoder_hidden_size = self.vision_config.hidden_size
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.initializer_factor = 1.0
self.initializer_range = 0.02
self.image_text_hidden_size = image_text_hidden_size
self.label_smoothing = label_smoothing
@classmethod
def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
r"""
Instantiate a [`BlipConfig`] (or a derived class) from a BLIP text model configuration and a BLIP vision model
configuration.
Returns:
[`BlipConfig`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
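A brief sketch tying the pieces above together: `BlipConfig` builds its sub-configs from dicts and, notably, overwrites `text_config.encoder_hidden_size` with the vision hidden size so the text cross-attention matches the vision tower (values shown are the defaults from this file):
# Minimal sketch of composing a BlipConfig from sub-configs.
from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig

text_cfg = BlipTextConfig(hidden_size=768)
vision_cfg = BlipVisionConfig(hidden_size=768, image_size=384, patch_size=16)
cfg = BlipConfig.from_text_vision_configs(text_cfg, vision_cfg, projection_dim=512)
print(cfg.text_config.encoder_hidden_size == cfg.vision_config.hidden_size)  # True, tied in __init__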
.\models\blip\convert_blip_original_pytorch_to_hf.py
@torch.no_grad()
def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = BlipConfig.from_pretrained(config_path)
else:
config = BlipConfig(projection_dim=512, text_config={}, vision_config={})
hf_model = BlipForConditionalGeneration(config).eval()
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base")
pt_model = pt_model.eval()
modified_state_dict = pt_model.state_dict()
for key in modified_state_dict.copy():
value = modified_state_dict.pop(key)
renamed_key = rename_key(key)
modified_state_dict[renamed_key] = value
hf_model.load_state_dict(modified_state_dict)
image_size = 384
image = load_demo_image(image_size=image_size, device="cpu")
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
input_ids = tokenizer(["a picture of"]).input_ids
out = hf_model.generate(image, input_ids)
assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
out = hf_model.generate(image)
assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
if pytorch_dump_folder_path is not None:
hf_model.save_pretrained(pytorch_dump_folder_path)
model_url = (
"https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
)
vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base")
vqa_model.eval()
modified_state_dict = vqa_model.state_dict()
for key in modified_state_dict.copy():
value = modified_state_dict.pop(key)
renamed_key = rename_key(key)
modified_state_dict[renamed_key] = value
hf_vqa_model = BlipForQuestionAnswering(config)
hf_vqa_model.load_state_dict(modified_state_dict)
question = ["How many dogs are in this image?"]
question_input_ids = tokenizer(question, return_tensors="pt").input_ids
answer = hf_vqa_model.generate(question_input_ids, image)
print(tokenizer.decode(answer[0]))
assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]"
if pytorch_dump_folder_path is not None:
hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa")
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"
itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base")
itm_model.eval()
modified_state_dict = itm_model.state_dict()
for key in modified_state_dict.copy():
value = modified_state_dict.pop(key)
renamed_key = rename_key(key)
modified_state_dict[renamed_key] = value
hf_itm_model = BlipForImageTextRetrieval(config)
hf_itm_model.load_state_dict(modified_state_dict)
question = ["A picture of a woman with a dog sitting in a beach"]
question_input_ids = tokenizer(
question,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=35,
).input_ids
hf_itm_model.load_state_dict(modified_state_dict)
hf_itm_model.eval()
out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True)
out = hf_itm_model(question_input_ids, image, use_itm_head=False)
assert out[0].item() == 0.2110687494277954
assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127
if pytorch_dump_folder_path is not None:
hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path)
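For reference, the script is meant to be run from the command line roughly as follows (hedged: it additionally requires the original BLIP repository on the Python path for `blip_decoder`, `blip_vqa` and `blip_itm`, plus the `rename_key` and `load_demo_image` helpers not shown in this excerpt):
# Example invocation (sketch):
#   python convert_blip_original_pytorch_to_hf.py --pytorch_dump_folder_path ./blip-hf --config_path ./config.json
# The converted captioning, VQA and ITM checkpoints are written to
# ./blip-hf, ./blip-hf_vqa and ./blip-hf_itm respectively.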
.\models\blip\image_processing_blip.py
"""Image processor class for BLIP."""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class BlipImageProcessor(BaseImageProcessor):
r"""
Constructs a BLIP image processor.
"""
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
是否将图像的(高度,宽度)尺寸调整为指定的 `size`。可以在 `preprocess` 方法的 `do_resize` 参数中被覆盖。
size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
调整后的输出图像尺寸。可以在 `preprocess` 方法的 `size` 参数中被覆盖。
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
在调整图像大小时使用的重采样滤波器。仅在 `do_resize` 设置为 `True` 时有效。可以在 `preprocess` 方法的 `resample` 参数中被覆盖。
do_rescale (`bool`, *optional*, defaults to `True`):
是否按指定的缩放比例 `rescale_factor` 进行图像缩放。可以在 `preprocess` 方法的 `do_rescale` 参数中被覆盖。
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
如果进行图像缩放,则使用的缩放因子。仅在 `do_rescale` 设置为 `True` 时有效。可以在 `preprocess` 方法的 `rescale_factor` 参数中被覆盖。
do_normalize (`bool`, *optional*, defaults to `True`):
是否对图像进行归一化处理。可以在 `preprocess` 方法的 `do_normalize` 参数中被覆盖。
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
如果进行图像归一化,则使用的均值。这是一个浮点数或与图像通道数相同长度的浮点数列表。可以在 `preprocess` 方法的 `image_mean` 参数中被覆盖。
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
如果进行图像归一化,则使用的标准差。这是一个浮点数或与图像通道数相同长度的浮点数列表。可以在 `preprocess` 方法的 `image_std` 参数中被覆盖。
do_convert_rgb (`bool`, *optional*, defaults to `True`):
是否将图像转换为 RGB 格式。
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
**kwargs,
) -> None:
"""
初始化方法,设置图像预处理参数。
Args:
do_resize (`bool`, *optional*, defaults to `True`): 是否将图像的(高度,宽度)尺寸调整为指定的 `size`。
可以在 `preprocess` 方法的 `do_resize` 参数中被覆盖。
size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): 调整后的输出图像尺寸。
可以在 `preprocess` 方法的 `size` 参数中被覆盖。
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
在调整图像大小时使用的重采样滤波器。仅在 `do_resize` 设置为 `True` 时有效。
可以在 `preprocess` 方法的 `resample` 参数中被覆盖。
do_rescale (`bool`, *optional*, defaults to `True`): 是否按指定的缩放比例 `rescale_factor` 进行图像缩放。
可以在 `preprocess` 方法的 `do_rescale` 参数中被覆盖。
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
如果进行图像缩放,则使用的缩放因子。仅在 `do_rescale` 设置为 `True` 时有效。
可以在 `preprocess` 方法的 `rescale_factor` 参数中被覆盖。
do_normalize (`bool`, *optional*, defaults to `True`): 是否对图像进行归一化处理。
可以在 `preprocess` 方法的 `do_normalize` 参数中被覆盖。
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
如果进行图像归一化,则使用的均值。这是一个浮点数或与图像通道数相同长度的浮点数列表。
可以在 `preprocess` 方法的 `image_mean` 参数中被覆盖。
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
如果进行图像归一化,则使用的标准差。这是一个浮点数或与图像通道数相同长度的浮点数列表。
可以在 `preprocess` 方法的 `image_std` 参数中被覆盖。
do_convert_rgb (`bool`, *optional*, defaults to `True`): 是否将图像转换为 RGB 格式。
**kwargs: 其他未明确指定的参数,以字典形式传递。
"""
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 384, "width": 384}
size = get_size_dict(size, default_to_square=True)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_convert_rgb",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
do_convert_rgb: bool = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\blip\modeling_blip.py
import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.functional import normalize
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/blip-vqa-base",
"Salesforce/blip-vqa-capfilt-large",
"Salesforce/blip-image-captioning-base",
"Salesforce/blip-image-captioning-large",
"Salesforce/blip-itm-base-coco",
"Salesforce/blip-itm-large-coco",
"Salesforce/blip-itm-base-flickr",
"Salesforce/blip-itm-large-flickr",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
"""
Compute the contrastive loss with the cross-entropy loss function.
Args:
logits (torch.Tensor): logits predicted by the model.
Returns:
torch.Tensor: the contrastive loss value.
"""
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
"""
Compute the BLIP loss as the mean of the text and image contrastive losses.
Args:
similarity (torch.Tensor): the similarity matrix predicted by the model.
Returns:
torch.Tensor: the BLIP loss value.
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
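A small sketch of how `blip_loss` is typically fed (assumptions: `text_embeds` and `image_embeds` are already L2-normalized projections, and `logit_scale` plays the role of the learned temperature, initialized from `logit_scale_init_value` in `BlipConfig`):
# Illustrative only: a symmetric contrastive loss over a batch of paired embeddings.
import torch
from torch.nn.functional import normalize

batch, dim = 4, 256
text_embeds = normalize(torch.randn(batch, dim), dim=-1)
image_embeds = normalize(torch.randn(batch, dim), dim=-1)
logit_scale = torch.tensor(2.6592).exp()  # exponentiated, CLIP-style temperature
logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
loss = blip_loss(logits_per_text)  # averages the text->image and image->text cross-entropies
print(loss)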
@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
"""
Adapted from the base class for vision model outputs that also contains the image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.
Args:
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the text decoder.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
Prediction scores of the language modeling head of the text decoder model.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
The image embeddings obtained after applying the Vision Transformer model to the input image.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[Tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None
image_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@property
def decoder_logits(self):
"""
Deprecated property to access the logits. Use the `logits` attribute instead.
"""
warnings.warn(
"`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the `logits` attribute to retrieve the final output instead.",
FutureWarning,
)
return self.logits
# Data class for the output of the BLIP text-vision model, inheriting from the ModelOutput base class
@dataclass
class BlipTextVisionModelOutput(ModelOutput):
"""
Adapted from the base class for vision model outputs that also contains the image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None  # language modeling loss, None by default
image_embeds: Optional[torch.FloatTensor] = None  # image embeddings, None by default
last_hidden_state: torch.FloatTensor = None  # last-layer hidden states, None by default
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # tuple of hidden states, None by default
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None  # tuple of attention weights, None by default
# Data class for the output of the BLIP image-text matching model, inheriting from the ModelOutput base class
@dataclass
class BlipImageTextMatchingModelOutput(ModelOutput):
"""
Adapted from the base class for vision model outputs that also contains the image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
scores.
Args:
itm_score (`torch.FloatTensor`):
The image-text similarity scores.
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
Last layer hidden-state of the vision-only branch of the model.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
question_embeds (`torch.FloatTensor`):
The question embeddings obtained by the text projection layer.
"""
itm_score: Optional[torch.FloatTensor] = None
loss: Optional[torch.FloatTensor] = None
image_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
vision_pooler_output: Optional[torch.FloatTensor] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
question_embeds: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BlipOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image: (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text: (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
image_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
text_model_output: (`BaseModelOutputWithPooling`):
The output of the [`BlipTextModel`].
vision_model_output: (`BaseModelOutputWithPooling`):
The output of the [`BlipVisionModel`].
"""
loss: Optional[torch.FloatTensor] = None  # contrastive loss for image-text similarity
logits_per_image: torch.FloatTensor = None  # dot-product scores between image and text embeds, i.e. image-text similarity
logits_per_text: torch.FloatTensor = None  # dot-product scores between text and image embeds, i.e. text-image similarity
text_embeds: torch.FloatTensor = None  # text embeddings: projection layer applied to the pooled output of [`BlipTextModel`]
image_embeds: torch.FloatTensor = None  # image embeddings: projection layer applied to the pooled output of [`BlipVisionModel`]
text_model_output: BaseModelOutputWithPooling = None  # output of [`BlipTextModel`], including the pooled output
vision_model_output: BaseModelOutputWithPooling = None  # output of [`BlipVisionModel`], including the pooled output
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class BlipVisionEmbeddings(nn.Module):
"""
A module for handling vision embeddings in the Blip model.
Args:
config (BlipVisionConfig): Configuration object for the BlipVisionEmbeddings module.
"""
def __init__(self, config: BlipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size  # embedding dimension = hidden size from the config
self.image_size = config.image_size  # image size from the config
self.patch_size = config.patch_size  # patch size from the config
self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))  # learnable class embedding
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)  # convolution that extracts patch embeddings from the image
self.num_patches = (self.image_size // self.patch_size) ** 2  # number of patches in the image
self.num_positions = self.num_patches + 1  # number of positions, including the extra class embedding
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))  # learnable position embeddings
# Forward pass: takes pixel values and returns the patch + class embeddings
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
# batch size of the input tensor
batch_size = pixel_values.shape[0]
# target dtype, matching the patch embedding weights
target_dtype = self.patch_embedding.weight.dtype
# run the patch embedding on the pixel values, cast to the target dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
# flatten the patch embeddings and swap dimensions for the following steps
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
# expand the class embedding to the batch size and cast it to the target dtype
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
# concatenate the class embedding with the patch embeddings to form the final embeddings
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
# add the position embeddings (sliced to the current sequence length)
embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
# return the final embeddings as the output of the forward pass
return embeddings
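With the default vision config above (`image_size=384`, `patch_size=16`, `hidden_size=768`), the shape bookkeeping works out as follows (a sketch, just arithmetic):
# Patch grid: 384 // 16 = 24 per side -> 576 patches, plus one class token.
image_size, patch_size, hidden_size = 384, 16, 768
num_patches = (image_size // patch_size) ** 2   # 576
num_positions = num_patches + 1                 # 577, class token included
print(num_patches, num_positions)
# forward maps pixel_values (batch, 3, 384, 384) -> embeddings (batch, 577, 768)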
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip
class BlipTextEmbeddings(nn.Module):
def __init__(self, config: BlipTextConfig):
super().__init__()
embed_dim = config.hidden_size
# token_embedding: the word embedding table
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
# position_embedding: the positional encoding table
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# non-persistent position_ids buffer used for the positional encodings
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
# sequence length
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
# if position_ids are not provided, use the pre-registered position_ids
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# if inputs_embeds are not provided, look them up through token_embedding
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
# positional encodings
position_embeddings = self.position_embedding(position_ids)
# the final embedding is the sum of the input embeddings and the positional encodings
embeddings = inputs_embeds + position_embeddings
return embeddings
class BlipAttention(nn.Module):
"""来自 'Attention Is All You Need' 论文的多头注意力机制"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
# embed_dim must be divisible by num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
# scaling factor: head_dim ** -0.5
self.scale = self.head_dim**-0.5
# dropout layer
self.dropout = nn.Dropout(config.attention_dropout)
# qkv linear layer producing queries, keys and values in a single projection
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)
# output projection layer for the final linear mapping
self.projection = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# reshape the tensor into the multi-head attention layout
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
# dimensions of the hidden states tensor
bsz, tgt_len, embed_dim = hidden_states.size()
# project the hidden states with self.qkv and split into query, key and value tensors
mixed_qkv = (
self.qkv(hidden_states)
.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
.permute(2, 0, 3, 1, 4)
)
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
# attention scores: dot product of queries and keys
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
# scale the attention scores
attention_scores = attention_scores * self.scale
# normalize the attention scores into a probability distribution
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# apply dropout to the attention probabilities
attention_probs = self.dropout(attention_probs)
# apply the head mask to the attention probabilities, if given
if head_mask is not None:
attention_probs = attention_probs * head_mask
# weighted sum of the values gives the context layer
context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
# reshape the context layer to match the input expected by self.projection
new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
context_layer = context_layer.reshape(new_context_layer_shape)
# map the context layer to the output space with self.projection
output = self.projection(context_layer)
# optionally include the attention probabilities in the outputs
outputs = (output, attention_probs) if output_attentions else (output, None)
return outputs
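To trace the reshapes in the attention block above, a hedged shape walk-through with the default vision config (`hidden_size=768`, `num_attention_heads=12`, so `head_dim=64`):
# qkv output:      (bsz, tgt_len, 3 * 768)
# after reshape:   (bsz, tgt_len, 3, 12, 64)
# after permute:   (3, bsz, 12, tgt_len, 64) -> split into query/key/value, each (bsz, 12, tgt_len, 64)
# attention probs: (bsz, 12, tgt_len, tgt_len)
# context layer:   (bsz, tgt_len, 12, 64) after the final permute, reshaped to (bsz, tgt_len, 768)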
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip
class BlipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]  # activation function selected from the config
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first fully-connected layer
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second fully-connected layer
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)  # first fully-connected layer
hidden_states = self.activation_fn(hidden_states)  # activation
hidden_states = self.fc2(hidden_states)  # second fully-connected layer
return hidden_states
class BlipEncoderLayer(nn.Module):
def __init__(self, config: BlipConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = BlipAttention(config)  # self-attention block
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first layer norm
self.mlp = BlipMLP(config)  # MLP block
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second layer norm
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
`(config.encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states  # keep the residual
hidden_states = self.layer_norm1(hidden_states)  # first layer norm (pre-norm)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
head_mask=attention_mask,
output_attentions=output_attentions,
)  # self-attention
hidden_states = hidden_states + residual  # add the residual
residual = hidden_states  # update the residual
hidden_states = self.layer_norm2(hidden_states)  # second layer norm
hidden_states = self.mlp(hidden_states)  # MLP
hidden_states = hidden_states + residual  # add the residual again
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)  # also return the attention weights if requested
return outputs
class BlipPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BlipConfig  # configuration class for this model
base_model_prefix = "blip"  # prefix of the base model
supports_gradient_checkpointing = True  # gradient checkpointing is supported
# Initialize the weights and biases of specific modules in the model
def _init_weights(self, module):
"""Initialize the weights"""
# initialization factor
factor = self.config.initializer_range
# for Conv2d, Embedding and Linear modules, initialize weights from a normal distribution and zero the bias
if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=factor)
if hasattr(module, "bias") and module.bias is not None:
module.bias.data.zero_()
# for BlipVisionEmbeddings, initialize the position and class embeddings using the vision config
if isinstance(module, BlipVisionEmbeddings):
if hasattr(self.config, "vision_config"):
factor = self.config.vision_config.initializer_range
nn.init.trunc_normal_(
module.position_embedding,
mean=0.0,
std=factor,
)
nn.init.trunc_normal_(
module.class_embedding,
mean=0.0,
std=factor,
)
# for LayerNorm, zero the bias and fill the weight with 1.0
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
# for Linear layers with a bias, zero the bias
elif isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
# BLIP_START_DOCSTRING is a raw string with the model description. It notes that the model inherits from
# PreTrainedModel and points to that class's documentation for the generic methods (downloading, saving,
# resizing the input embeddings, pruning heads, etc.).
BLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# BLIP_TEXT_INPUTS_DOCSTRING describes the text input arguments of the model (input_ids, attention_mask,
# position_ids, ...): their dtypes and shapes, how to obtain input IDs, and how the attention mask is used.
BLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# BLIP_VISION_INPUTS_DOCSTRING defines the docstring for the model's vision inputs; no content is provided here.
BLIP_VISION_INPUTS_DOCSTRING = r"""
"""
BLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class BlipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`BlipEncoderLayer`].
Args:
config (`BlipConfig`):
The corresponding vision configuration for the `BlipEncoder`.
"""
def __init__(self, config: BlipConfig):
super().__init__()
self.config = config
# 创建一个包含多个 BlipEncoderLayer 实例的列表,列表长度为 config.num_hidden_layers
self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# 是否使用梯度检查点,默认为 False
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Determine whether to use the provided `output_attentions` value or fallback to the model's default
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Determine whether to use the provided `output_hidden_states` value or fallback to the model's default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Determine whether to use the provided `return_dict` value or fallback to the model's default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Initialize empty tuples based on output configuration to store encoder states and attentions
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
# Start with the embedded inputs as the initial hidden states
hidden_states = inputs_embeds
# Iterate through each encoder layer in the model
for idx, encoder_layer in enumerate(self.layers):
# If configured to return hidden states, append current hidden states to encoder states
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# Perform gradient checkpointing if enabled during training
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
encoder_layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# Otherwise, directly pass inputs to the encoder layer
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions=output_attentions,
)
# Update hidden states with the output from the encoder layer
hidden_states = layer_outputs[0]
# If configured to return attentions, append current layer's attentions to all_attentions
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# If configured to return hidden states, append final hidden states to encoder states
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# If return_dict is False, return a tuple of relevant outputs; otherwise, return a ModelOutput object
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
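# A minimal, hypothetical sketch (not from the source): driving BlipEncoder directly with random
# embeddings to show that the tuple returned with return_dict=False carries the same tensors that the
# BaseModelOutput above exposes as named fields. The tiny BlipVisionConfig values are made up for illustration.
import torch
from transformers import BlipVisionConfig
from transformers.models.blip.modeling_blip import BlipEncoder

cfg = BlipVisionConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, intermediate_size=64)
encoder = BlipEncoder(cfg).eval()
embeds = torch.randn(1, 10, cfg.hidden_size)  # (batch, seq_len, hidden_size)
as_dict = encoder(inputs_embeds=embeds, output_hidden_states=True, return_dict=True)
as_tuple = encoder(inputs_embeds=embeds, output_hidden_states=True, return_dict=False)
# as_dict.last_hidden_state corresponds to as_tuple[0]; as_dict.hidden_states to as_tuple[1]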
class BlipVisionModel(BlipPreTrainedModel):
main_input_name = "pixel_values" # 设置主要输入名称为"pixel_values"
config_class = BlipVisionConfig # 指定配置类为BlipVisionConfig
def __init__(self, config: BlipVisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = BlipVisionEmbeddings(config) # 初始化图像嵌入模块
self.encoder = BlipEncoder(config) # 初始化编码器模块
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) # 初始化后层归一化模块
self.post_init() # 执行额外的初始化步骤
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=BlipVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
前向传播函数
Returns:
根据return_dict返回相应的输出对象
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values") # 如果未提供pixel_values则抛出数值错误
hidden_states = self.embeddings(pixel_values) # 将输入的pixel_values转换为嵌入向量
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
) # 使用编码器处理嵌入向量,得到编码器的输出
last_hidden_state = encoder_outputs[0] # 获取编码器输出的最后隐藏状态
last_hidden_state = self.post_layernorm(last_hidden_state) # 对最后隐藏状态进行层归一化处理
pooled_output = last_hidden_state[:, 0, :] # 获取池化输出
pooled_output = self.post_layernorm(pooled_output) # 对池化输出进行层归一化处理
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:] # 如果不返回字典,则返回元组形式的输出
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
) # 返回包含池化输出和其他编码器输出的BaseModelOutputWithPooling对象
def get_input_embeddings(self):
return self.embeddings # 返回嵌入模块的实例
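# Hypothetical stand-alone sketch (not from the source): running a randomly initialized BlipVisionModel
# with a small made-up config, just to make the shapes produced by the forward pass above concrete.
# pooler_output is the layer-normalized CLS token.
import torch
from transformers import BlipVisionConfig
from transformers.models.blip.modeling_blip import BlipVisionModel

cfg = BlipVisionConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4,
                       intermediate_size=64, image_size=48, patch_size=16)
vision = BlipVisionModel(cfg).eval()
pixel_values = torch.randn(1, 3, cfg.image_size, cfg.image_size)
out = vision(pixel_values=pixel_values)
print(out.last_hidden_state.shape)  # torch.Size([1, 10, 32]) -> 1 CLS token + (48 // 16) ** 2 patches
print(out.pooler_output.shape)      # torch.Size([1, 32])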
class BlipModel(BlipPreTrainedModel):
config_class = BlipConfig
def __init__(self, config: BlipConfig):
# Call the parent initializer with the configuration object
super().__init__(config)
# 检查配置对象中的文本配置是否为BlipTextConfig类型,如果不是则抛出数值错误异常
if not isinstance(config.text_config, BlipTextConfig):
raise ValueError(
"config.text_config is expected to be of type BlipTextConfig but is of type"
f" {type(config.text_config)}."
)
# 检查配置对象中的视觉配置是否为BlipVisionConfig类型,如果不是则抛出数值错误异常
if not isinstance(config.vision_config, BlipVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type BlipVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# 从配置对象中获取文本配置和视觉配置
text_config = config.text_config
vision_config = config.vision_config
# 初始化模型的投影维度、文本嵌入维度和视觉嵌入维度
self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
# 初始化文本模型和视觉模型,分别使用文本配置和视觉配置
self.text_model = BlipTextModel(text_config)
self.vision_model = BlipVisionModel(vision_config)
# 初始化视觉投影层和文本投影层,分别映射视觉和文本嵌入到投影维度空间,无偏置
self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
# 初始化对数尺度参数,使用配置中的初始值
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
# 调用后初始化函数,用于权重初始化和最终处理
self.post_init()
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`BlipTextModel`].
Examples:
```
>>> from transformers import AutoProcessor, BlipModel
>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
# 如果未指定返回字典,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用文本模型,传入输入的ids、注意力掩码、位置ids和是否返回字典的标志
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
return_dict=return_dict,
)
# 获取文本模型的汇总输出(pooled output)
pooled_output = text_outputs[1]
# 将汇总输出投影到文本投影层,得到文本特征
text_features = self.text_projection(pooled_output)
return text_features
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
applying the projection layer to the pooled output of [`BlipVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipModel
>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 获取视觉模型的输出,可以选择是否返回字典格式的输出
vision_outputs = self.vision_model(pixel_values=pixel_values, return_dict=return_dict)
# 从视觉模型的输出中获取池化后的特征向量
pooled_output = vision_outputs[1] # pooled_output
# 将池化后的特征向量应用于视觉投影层,得到最终的图像特征表示
image_features = self.visual_projection(pooled_output)
# 返回图像特征表示
return image_features
@add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipOutput, config_class=BlipConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
BLIP模型的前向传播方法。
Args:
input_ids (Optional[torch.LongTensor], optional): 输入的token IDs. Defaults to None.
pixel_values (Optional[torch.FloatTensor], optional): 输入的像素值. Defaults to None.
attention_mask (Optional[torch.Tensor], optional): 注意力遮罩. Defaults to None.
position_ids (Optional[torch.LongTensor], optional): 位置 IDs. Defaults to None.
return_loss (Optional[bool], optional): 是否返回损失值. Defaults to None.
output_attentions (Optional[bool], optional): 是否返回注意力权重. Defaults to None.
output_hidden_states (Optional[bool], optional): 是否返回隐藏状态. Defaults to None.
return_dict (Optional[bool], optional): 是否以字典格式返回输出. Defaults to None.
Returns:
BLIP模型的输出,类型为`BlipOutput`,根据`return_dict`参数决定返回方式.
"""
@add_start_docstrings(
"""
BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
`input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt and generate
the caption from that text input. If no text input is provided, the decoder starts generating from the [BOS]
(beginning-of-sequence) token only.
""",
BLIP_START_DOCSTRING,
)
class BlipForConditionalGeneration(BlipPreTrainedModel):
# 定义配置类为 BlipConfig
config_class = BlipConfig
# 定义权重共享的键列表
_tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
# 主要输入名称为 "pixel_values"
main_input_name = "pixel_values"
def __init__(self, config: BlipConfig):
# 调用父类的初始化方法
super().__init__(config)
# 使用 BlipVisionModel 初始化视觉模型
self.vision_model = BlipVisionModel(config.vision_config)
# 使用 BlipTextLMHeadModel 初始化文本解码器
self.text_decoder = BlipTextLMHeadModel(config.text_config)
# 设置解码器的起始输入为 BOS 标记的 ID
self.decoder_input_ids = config.text_config.bos_token_id
# 设置解码器的填充标记的 ID
self.decoder_pad_token_id = config.text_config.pad_token_id
# 初始化权重并应用最终处理
self.post_init()
def get_input_embeddings(self) -> nn.Module:
# 返回视觉模型的嵌入模块
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
# The decorators above add BLIP_VISION_INPUTS_DOCSTRING to the forward signature and replace the
# return docstring with BlipForConditionalGenerationModelOutput / BlipVisionConfig.
) -> Union[Tuple, BlipForConditionalGenerationModelOutput]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForConditionalGeneration
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A picture of"
>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
# 如果 return_dict 参数未指定,则使用模型配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 output_attentions 参数未指定,则使用模型配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果 output_hidden_states 参数未指定,则使用模型配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 使用视觉模型处理像素值,根据参数返回不同的结果
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取视觉输出的第一个元素,即图像嵌入
image_embeds = vision_outputs[0]
# 使用文本解码器处理输入的信息,生成输出结果
outputs = self.text_decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
labels=labels,
return_dict=return_dict,
reduction="mean",
)
# 如果 return_dict 为 False,则返回多个输出元组
if not return_dict:
outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
return tuple(output for output in outputs if output is not None)
# 如果 return_dict 为 True,则返回 BlipForConditionalGenerationModelOutput 对象
return BlipForConditionalGenerationModelOutput(
loss=outputs.loss,
logits=outputs.logits,
image_embeds=image_embeds,
last_hidden_state=vision_outputs.last_hidden_state,
hidden_states=vision_outputs.hidden_states,
attentions=vision_outputs.attentions,
)
@torch.no_grad()
def generate(
self,
pixel_values: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
**generate_kwargs,
) -> torch.LongTensor:
r"""
Overrides *generate* function to be able to use the model as a conditional generator
Parameters:
pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*):
Input image to be processed
input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
The sequence used as a prompt for the generation.
attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForConditionalGeneration
>>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
two cats sleeping on a couch
```
"""
# 获取批处理大小
batch_size = pixel_values.shape[0]
# 使用视觉模型处理输入图像,获取视觉输出
vision_outputs = self.vision_model(pixel_values=pixel_values)
# 从视觉输出中提取图像嵌入
image_embeds = vision_outputs[0]
# 创建图像注意力掩码,用于避免在填充标记索引上执行注意力
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device)
# 如果输入的input_ids是列表,则转换为torch.LongTensor
if isinstance(input_ids, list):
input_ids = torch.LongTensor(input_ids)
# 如果input_ids为None,则创建包含开始和结束标记的输入序列
elif input_ids is None:
input_ids = (
torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]])
.repeat(batch_size, 1)
.to(image_embeds.device)
)
# 设置输入序列的开始标记为配置中的开始标记
input_ids[:, 0] = self.config.text_config.bos_token_id
# 调整注意力掩码,移除最后一个标记以对齐输入序列
attention_mask = attention_mask[:, :-1] if attention_mask is not None else None
# 使用文本解码器生成文本输出
outputs = self.text_decoder.generate(
input_ids=input_ids[:, :-1],
eos_token_id=self.config.text_config.sep_token_id,
pad_token_id=self.config.text_config.pad_token_id,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
**generate_kwargs,
)
# 返回生成的输出
return outputs
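# Hedged usage note continuing the docstring example above (not from the source): passing text together
# with the image makes generate() continue the prompt instead of starting from the BOS token alone.
prompt_inputs = processor(images=image, text="a photography of", return_tensors="pt")
prompt_outputs = model.generate(**prompt_inputs)
print(processor.decode(prompt_outputs[0], skip_special_tokens=True))  # e.g. "a photography of two cats ..."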
@add_start_docstrings(
"""
BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
with the encoding of the image, and the text decoder will output the answer to the question.
""",
BLIP_START_DOCSTRING,
)
class BlipForQuestionAnswering(BlipPreTrainedModel):
config_class = BlipConfig
_tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
def __init__(self, config: BlipConfig):
super().__init__(config)
# Initialize the vision encoder model using the provided vision configuration
self.vision_model = BlipVisionModel(config.vision_config)
# Initialize the text encoder model using the provided text configuration,
# with pooling layer excluded
self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)
# Initialize the text decoder model using the provided text configuration
self.text_decoder = BlipTextLMHeadModel(config.text_config)
# Store special token IDs for decoder inputs
self.decoder_pad_token_id = config.text_config.pad_token_id
self.decoder_start_token_id = config.text_config.bos_token_id
# Initialize weights and perform any necessary post-initialization steps
self.post_init()
def get_input_embeddings(self) -> nn.Module:
# Return the patch embedding module from the vision encoder's embeddings
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
def forward(
self,
input_ids: torch.LongTensor,
pixel_values: torch.FloatTensor,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the BLIP model for question answering.
Args:
input_ids (:obj:`torch.LongTensor`):
Indices of input sequence tokens in the vocabulary.
pixel_values (:obj:`torch.FloatTensor`):
Pixel values of images (shape batch_size x channels x height x width).
decoder_input_ids (:obj:`torch.LongTensor`, optional):
Optional input for decoder. If provided, computes the loss and returns the logits.
decoder_attention_mask (:obj:`torch.LongTensor`, optional):
Optional attention mask for the decoder input.
attention_mask (:obj:`torch.LongTensor`, optional):
Optional attention mask for the input.
output_attentions (:obj:`bool`, optional):
Whether to return attentions weights.
output_hidden_states (:obj:`bool`, optional):
Whether to return hidden states.
labels (:obj:`torch.LongTensor`, optional):
Labels for computing the cross-entropy loss.
return_dict (:obj:`bool`, optional):
Whether to return a dictionary.
Returns:
:class:`~transformers.BlipTextVisionModelOutput`: A subclass of :class:`~transformers.ModelOutput`.
"""
# The body of the forward pass is omitted here; a hedged sketch of its flow follows.
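# A hedged outline of the flow BlipForQuestionAnswering.forward implements (illustrative only, not the
# verbatim source): the image is encoded, the question attends to the image through the text encoder's
# cross-attention, and the text decoder is trained to produce the answer tokens.
def _vqa_forward_sketch(self, input_ids, pixel_values, labels=None, attention_mask=None):
    image_embeds = self.vision_model(pixel_values=pixel_values)[0]
    image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
    question_embeds = self.text_encoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
        return_dict=True,
    ).last_hidden_state
    # the decoder consumes the answer tokens and cross-attends to the question states
    answer_output = self.text_decoder(
        input_ids=labels,
        encoder_hidden_states=question_embeds,
        encoder_attention_mask=attention_mask,
        labels=labels,
        return_dict=True,
        reduction="mean",
    )
    return answer_output.loss, answer_output.logits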
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor,
pixel_values: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
**generate_kwargs,
) -> torch.LongTensor:
r"""
重写 *generate* 函数以便将模型用作条件生成器
Parameters:
input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
用作生成提示的序列。
pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
要处理的输入图像。
attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
遮罩,避免在填充令牌索引上执行注意力。遮罩值选在 `[0, 1]` 中。`1` 表示未被掩盖的令牌,`0` 表示被掩盖的令牌。
**generate_kwargs:
传递给解码器 *generate* 函数的额外参数
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForQuestionAnswering
>>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "How many cats are in the picture?"
>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
2
```
"""
vision_outputs = self.vision_model(pixel_values=pixel_values)
image_embeds = vision_outputs[0] # 提取视觉模型的输出中的图像嵌入表示
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device) # 创建图像的注意力遮罩
if isinstance(input_ids, list):
input_ids = torch.LongTensor(input_ids) # 如果输入的是列表,将其转换为 torch.LongTensor
question_outputs = self.text_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
return_dict=False,
) # 使用文本编码器处理输入的文本和图像嵌入
question_embeds = question_outputs[0] # 提取文本编码器的输出中的问题嵌入表示
question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long).to(question_embeds.device) # 创建问题的注意力遮罩
bos_ids = torch.full(
(question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device
) # 创建包含起始标记的张量
outputs = self.text_decoder.generate(
input_ids=bos_ids,
eos_token_id=self.config.text_config.sep_token_id,
pad_token_id=self.config.text_config.pad_token_id,
encoder_hidden_states=question_embeds,
encoder_attention_mask=question_attention_mask,
**generate_kwargs,
) # 使用文本解码器生成输出序列
return outputs # 返回生成的输出序列
# BLIP image-text retrieval model: a vision and a text projector with a classification head on top. Given an image and a text, the model returns the probability that the text is relevant to the image.
@add_start_docstrings(
"""
BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
the image.
""",
BLIP_START_DOCSTRING,
)
class BlipForImageTextRetrieval(BlipPreTrainedModel):
# 使用 BlipConfig 类型的配置
config_class = BlipConfig
def __init__(self, config: BlipConfig):
# 调用父类构造函数,传入配置
super().__init__(config)
# 初始化视觉模型,使用 BlipVisionModel 和视觉配置
self.vision_model = BlipVisionModel(config.vision_config)
# 初始化文本编码器,使用 BlipTextModel 和文本配置,不添加池化层
self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)
# 视觉投影层,线性变换视觉隐藏状态的维度到图像文本隐藏大小
self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size)
# 文本投影层,线性变换文本隐藏状态的维度到图像文本隐藏大小
self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size)
# 图像文本匹配头部,线性层输出大小为 2,用于二分类任务
self.itm_head = nn.Linear(config.text_config.hidden_size, 2)
# 解码器的填充标记 ID,根据配置的填充标记 ID 初始化
self.decoder_pad_token_id = (
config.text_config.pad_token_id
if not hasattr(config, "decoder_pad_token_id")
else config.decoder_pad_token_id
)
# 解码器的起始标记 ID,根据配置的起始标记 ID 初始化
self.decoder_start_token_id = (
config.text_config.bos_token_id
if not hasattr(config, "decoder_start_token_id")
else config.decoder_start_token_id
)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入嵌入,返回视觉模型的 patch 嵌入层
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
# 重写 forward 方法,使用 BLIP_VISION_INPUTS_DOCSTRING 和 BlipTextVisionModelOutput 来替换返回值的文档字符串
def forward(
self,
input_ids: torch.LongTensor,
pixel_values: torch.FloatTensor,
use_itm_head: Optional[bool] = True,
attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 输入参数:input_ids 是文本输入的 token ID,pixel_values 是视觉输入的像素值
# use_itm_head 控制是否使用图像文本匹配头部,attention_mask 控制注意力机制的掩码
# output_attentions 和 output_hidden_states 控制是否输出注意力权重和隐藏状态
# return_dict 控制是否返回字典形式的输出
#
# 输出类型为 BlipTextVisionModelOutput,配置类为 BlipVisionConfig
) -> Union[Tuple, BlipTextVisionModelOutput]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForImageTextRetrieval
>>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "an image of a cat"
>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> outputs = model(**inputs)
```
"""
# 如果 return_dict 参数不为 None,则使用该值;否则使用 self.config.use_return_dict 的设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 output_attentions 参数不为 None,则使用该值;否则使用 self.config.output_attentions 的设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果 output_hidden_states 参数不为 None,则使用该值;否则使用 self.config.output_hidden_states 的设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 使用 vision_model 处理图像数据,获取视觉模型的输出
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取图像的嵌入表示
image_embeds = vision_outputs[0]
# 创建与图像嵌入相同大小的注意力掩码
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
# 如果 use_itm_head 为真,则使用 text_encoder 处理输入问题文本,并应用 itm_head 进行匹配分数计算
if use_itm_head:
# 使用 text_encoder 处理文本数据,将图像嵌入作为 encoder_hidden_states 提供给文本编码器
question_embeds = self.text_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=return_dict,
)
# 如果 return_dict 为 False,则使用第一个元素作为输出;否则使用 last_hidden_state
question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state
# 使用 itm_head 计算问题嵌入的匹配分数
output = self.itm_head(question_embeds[:, 0, :])
else:
# 使用 text_encoder 处理文本数据,获取问题文本的嵌入表示
question_embeds = self.text_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
return_dict=return_dict,
)
# 如果 return_dict 为 False,则使用第一个元素作为输出;否则使用 last_hidden_state
question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state
# 规范化图像嵌入,并通过 vision_proj 将其投影到与问题文本嵌入相同的空间中
image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
# 规范化问题文本嵌入,并通过 text_proj 进行同样的投影
text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1)
# 计算图像嵌入与问题文本嵌入之间的相似度分数
output = image_feat @ text_feat.t()
# 如果 return_dict 为 False,则返回多个元组,确保输出中没有 None 值
if not return_dict:
outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
return tuple(output for output in outputs if output is not None)
# 如果 return_dict 为 True,则返回 BlipImageTextMatchingModelOutput 对象,包含 ITM 计算的结果和相关信息
return BlipImageTextMatchingModelOutput(
itm_score=output,
last_hidden_state=vision_outputs.last_hidden_state,
hidden_states=vision_outputs.hidden_states,
attentions=vision_outputs.attentions,
question_embeds=question_embeds,
)
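# Hedged usage note continuing the docstring example above (not from the source): the same call scores
# the image-text pair in two ways depending on use_itm_head.
itm_out = model(**inputs, use_itm_head=True)
match_prob = itm_out.itm_score.softmax(dim=-1)[:, 1]  # index 1 is conventionally the "matched" class (assumption)
cosine_out = model(**inputs, use_itm_head=False)
cosine_score = cosine_out.itm_score                   # cosine similarity between projected image/text features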
.\models\blip\modeling_blip_text.py
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, device, nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import logging
from .configuration_blip import BlipTextConfig
logger = logging.get_logger(__name__)
class BlipTextEmbeddings(nn.Module):
"""根据单词和位置嵌入构建嵌入层。"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.config = config
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: int = 0,
) -> torch.Tensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if inputs_embeds is None:
input_ids = input_ids.to(self.word_embeddings.weight.device)
inputs_embeds = self.word_embeddings(input_ids)
embeddings = inputs_embeds
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BlipTextSelfAttention(nn.Module):
def __init__(self, config, is_cross_attention):
super().__init__()
self.config = config
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
% (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
if is_cross_attention:
self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
else:
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
):
pass
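# The attention forward above is elided ("pass"). A condensed, hedged sketch of the BERT-style computation
# it performs (relative-position terms and past_key_value caching omitted; not the verbatim source):
def _attention_sketch(self, hidden_states, attention_mask=None, encoder_hidden_states=None):
    is_cross_attention = encoder_hidden_states is not None
    kv_input = encoder_hidden_states if is_cross_attention else hidden_states
    query_layer = self.transpose_for_scores(self.query(hidden_states))
    key_layer = self.transpose_for_scores(self.key(kv_input))
    value_layer = self.transpose_for_scores(self.value(kv_input))
    scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / math.sqrt(self.attention_head_size)
    if attention_mask is not None:
        scores = scores + attention_mask  # additive mask: large negative values on padded positions
    probs = self.dropout(nn.functional.softmax(scores, dim=-1))
    context = torch.matmul(probs, value_layer).permute(0, 2, 1, 3).contiguous()
    return context.view(context.size()[:-2] + (self.all_head_size,))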
class BlipTextSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BlipTextAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.self = BlipTextSelfAttention(config, is_cross_attention)
self.output = BlipTextSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class BlipTextIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BlipTextOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BlipTextLayer(nn.Module):
def __init__(self, config, layer_num):
super().__init__()
self.config = config
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BlipTextAttention(config)
self.layer_num = layer_num
if self.config.is_decoder:
self.crossattention = BlipTextAttention(config, is_cross_attention=self.config.is_decoder)
self.intermediate = BlipTextIntermediate(config)
self.output = BlipTextOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if encoder_hidden_states is not None:
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BlipTextEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.is_decoder else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class BlipTextPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BlipTextPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class BlipTextLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = BlipTextPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BlipTextOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BlipTextLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class BlipTextPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
"""
pass
models.
"""
# 定义配置类为BlipTextConfig
config_class = BlipTextConfig
# 设置基础模型前缀为"bert"
base_model_prefix = "bert"
def _init_weights(self, module):
"""Initialize the weights"""
# 如果模块是线性层或嵌入层
if isinstance(module, (nn.Linear, nn.Embedding)):
# 使用正态分布初始化权重,均值为0.0,标准差为配置中的初始化范围
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果模块是 LayerNorm 层
elif isinstance(module, nn.LayerNorm):
# 将偏置项初始化为零
module.bias.data.zero_()
# 将权重初始化为1.0
module.weight.data.fill_(1.0)
# 如果模块是线性层并且具有偏置项
if isinstance(module, nn.Linear) and module.bias is not None:
# 将偏置项初始化为零
module.bias.data.zero_()
# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
class BlipTextModel(BlipTextPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
# Initialize embeddings layer for text
self.embeddings = BlipTextEmbeddings(config)
# Initialize encoder layer for processing text
self.encoder = BlipTextEncoder(config)
# Optionally initialize pooling layer if specified
self.pooler = BlipTextPooler(config) if add_pooling_layer else None
# Perform any post-initialization steps
self.post_init()
def get_input_embeddings(self):
# Return the word embeddings from the embeddings layer
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Set new word embeddings in the embeddings layer
self.embeddings.word_embeddings = value
# Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
# Prune specified heads in the attention mechanism of each layer
self.encoder.layer[layer].attention.prune_heads(heads)
def get_extended_attention_mask(
self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool
):
# Create an extended attention mask to handle different attention scenarios
# Not fully implemented in the provided snippet
pass
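# get_extended_attention_mask is elided above. A hedged sketch of the usual conversion from a (batch, seq_len)
# padding mask to the additive (batch, 1, query_len, key_len) mask the attention layers expect, with a causal
# triangle applied in decoder mode (illustrative, not the verbatim source):
def _extended_attention_mask_sketch(attention_mask: Tensor, is_decoder: bool, dtype=torch.float32) -> Tensor:
    extended = attention_mask[:, None, None, :].to(dtype=dtype)        # (batch, 1, 1, key_len)
    if is_decoder:
        seq_len = attention_mask.shape[1]
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=dtype, device=attention_mask.device))
        extended = extended * causal[None, None, :, :]                 # (batch, 1, query_len, key_len)
    return (1.0 - extended) * torch.finfo(dtype).min                   # 0 where attended, very negative where masked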
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
is_decoder: Optional[bool] = False,
):
# Forward pass through the model, not fully implemented here
pass
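# BlipTextModel.forward is elided above. A hedged outline of what it does (illustrative, not the verbatim
# source): embed the tokens, extend the padding mask (causal in decoder mode), run the encoder with optional
# cross-attention onto encoder_hidden_states, then pool the first token.
def _text_model_forward_sketch(self, input_ids, attention_mask=None,
                               encoder_hidden_states=None, encoder_attention_mask=None, is_decoder=False):
    embedding_output = self.embeddings(input_ids=input_ids)
    if attention_mask is None:
        attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
    extended_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape, input_ids.device, is_decoder)
    if encoder_hidden_states is not None and encoder_attention_mask is not None:
        # turn the cross-attention padding mask into the same additive form
        encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        return_dict=True,
    )
    sequence_output = encoder_outputs.last_hidden_state
    pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
    return sequence_output, pooled_output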
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class BlipTextLMHeadModel(BlipTextPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Initialize the base BlipTextModel for language modeling
self.bert = BlipTextModel(config, add_pooling_layer=False)
# Initialize the MLM (Masked Language Modeling) head
self.cls = BlipTextOnlyMLMHead(config)
# Define label smoothing factor
self.label_smoothing = config.label_smoothing
def get_output_embeddings(self):
# Return the decoder part of the MLM head's predictions
return self.cls.predictions.decoder
# 设置新的输出嵌入到模型预测的解码器中
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# 模型的前向传播函数,接受多个输入参数并返回模型输出或损失
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token ID序列,默认为None
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,用于指示哪些token需要注意力,默认为None
position_ids: Optional[torch.Tensor] = None, # 位置ID,用于指示每个token的位置信息,默认为None
head_mask: Optional[torch.Tensor] = None, # 头部掩码,用于控制不同头部的注意力,默认为None
inputs_embeds: Optional[torch.Tensor] = None, # 输入的嵌入表示,默认为None
encoder_hidden_states: Optional[torch.Tensor] = None, # 编码器的隐藏状态,默认为None
encoder_attention_mask: Optional[torch.Tensor] = None, # 编码器的注意力掩码,默认为None
labels: Optional[torch.Tensor] = None, # 真实标签,默认为None
past_key_values: Optional[List[torch.Tensor]] = None, # 过去的键值对,用于生成,默认为None
use_cache: Optional[bool] = None, # 是否使用缓存,默认为None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,默认为None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,默认为None
return_dict: Optional[bool] = None, # 是否以字典形式返回,默认为None
return_logits: Optional[bool] = False, # 是否返回logits,默认为False
is_decoder: Optional[bool] = True, # 是否作为解码器,默认为True
reduction: Optional[str] = "mean", # 损失函数的减少方式,默认为"mean"
):
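# The forward body is omitted above. A hedged sketch of its core (illustrative, not the verbatim source):
# run the text model as a decoder, apply the prediction head, then compute a shifted next-token
# cross-entropy with label smoothing.
def _lm_forward_sketch(self, input_ids, labels=None, reduction="mean", **text_model_kwargs):
    sequence_output = self.bert(input_ids, is_decoder=True, **text_model_kwargs)[0]
    prediction_scores = self.cls(sequence_output)                       # (batch, seq_len, vocab_size)
    lm_loss = None
    if labels is not None:
        shifted_scores = prediction_scores[:, :-1, :].contiguous()      # predict token t+1 from position t
        shifted_labels = labels[:, 1:].contiguous()
        loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
        lm_loss = loss_fct(shifted_scores.view(-1, self.config.vocab_size), shifted_labels.view(-1))
    return lm_loss, prediction_scores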
# 准备生成输入的函数,为生成模型准备输入数据
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# 如果注意力掩码为None,则创建一个全1的注意力掩码,形状与输入ID相同
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# 如果过去的键值对不为None,则根据过去的长度截取输入ID
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 一些生成方法已经只传递了最后一个输入ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认保留最后一个ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# 返回准备好的输入数据作为字典形式
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
"is_decoder": True,
}
# 重新排序缓存中的过去键值对,根据beam索引重排
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\blip\modeling_tf_blip.py
""" TensorFlow BLIP model."""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import tensorflow as tf
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
TFPreTrainedModel,
get_initializer,
get_tf_activation,
keras,
keras_serializable,
shape_list,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"
TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/blip-vqa-base",
"Salesforce/blip-vqa-capfilt-large",
"Salesforce/blip-image-captioning-base",
"Salesforce/blip-image-captioning-large",
"Salesforce/blip-itm-base-coco",
"Salesforce/blip-itm-large-coco",
"Salesforce/blip-itm-base-flickr",
"Salesforce/blip-itm-large-flickr",
]
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
"""
Computes the contrastive loss based on the sparse categorical crossentropy.
Args:
logits (tf.Tensor): Logits tensor representing predictions.
Returns:
tf.Tensor: Mean contrastive loss value.
"""
return tf.math.reduce_mean(
keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
def blip_loss(similarity: tf.Tensor) -> tf.Tensor:
"""
Computes the BLIP loss, which is an average of contrastive losses calculated for captions and images.
Args:
similarity (tf.Tensor): Tensor representing similarity between captions and images.
Returns:
tf.Tensor: Computed BLIP loss value.
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(tf.transpose(similarity))
return (caption_loss + image_loss) / 2.0
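# Quick illustrative check (an assumption, not from the source): a similarity matrix whose diagonal dominates
# should give a near-zero symmetric loss, since every caption's largest logit lines up with its own image and
# vice versa.
example_similarity = tf.constant([[10.0, 0.0], [0.0, 10.0]])
print(float(blip_loss(example_similarity)))  # ~0 (about 4.5e-5)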
@dataclass
class TFBlipForConditionalGenerationModelOutput(ModelOutput):
"""
Output data structure for TFBlipForConditionalGenerationModel, inheriting from ModelOutput.
Attributes:
This class inherits attributes and methods from ModelOutput and adds none in this implementation.
"""
pass
Args:
loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
Language modeling loss from the text decoder.
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
Prediction scores of the language modeling head of the text decoder model.
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*):
The image embeddings obtained after applying the Vision Transformer model to the input image.
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# 初始化各个属性为 None,这些属性用于存储模型推断的结果
loss: Tuple[tf.Tensor] | None = None
logits: Tuple[tf.Tensor] | None = None
image_embeds: tf.Tensor | None = None
last_hidden_state: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@property
def decoder_logits(self):
# 发出警告,提醒用户 `decoder_logits` 属性即将被移除,建议使用 `logits` 属性来获取最终输出
warnings.warn(
"`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the `logits` attribute to retrieve the final output instead.",
FutureWarning,
)
# 返回 `logits` 属性的值作为输出
return self.logits
# 定义一个用于 TFBlip 文本视觉模型输出的数据类,继承自 ModelOutput 基类
@dataclass
class TFBlipTextVisionModelOutput(ModelOutput):
"""
Adapted from the base class for vision model outputs; also contains the image embeddings of the last hidden
states, plus the language modeling loss from the text decoder.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler output.
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
# 损失项,类型为 tf.Tensor 或 None
loss: tf.Tensor | None = None
# 图像嵌入,类型为 tf.Tensor 或 None
image_embeds: tf.Tensor | None = None
# 最后一层隐藏状态,类型为 tf.Tensor 或 None
last_hidden_state: tf.Tensor = None
# 隐藏状态元组,包含模型每层的隐藏状态,类型为 Tuple[tf.Tensor] 或 None
hidden_states: Tuple[tf.Tensor, ...] | None = None
# 注意力权重元组,包含每层的注意力权重,类型为 Tuple[tf.Tensor] 或 None
attentions: Tuple[tf.Tensor, ...] | None = None
# 定义一个用于 TFBlip 图像文本匹配模型输出的数据类,继承自 ModelOutput 基类
@dataclass
class TFBlipImageTextMatchingModelOutput(ModelOutput):
"""
Adapted from the base class for vision model outputs; also contains the image embeddings of the last hidden
states, plus the loss term from the text decoder and the image-text similarity scores.
Args:
itm_score (`tf.Tensor`):
The image-text similarity scores.
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler output.
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of the model's hidden states at the output of each layer, plus the optional initial embedding outputs.
vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*):
Last-layer hidden state of the vision pooler in the model's vision branch.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of attention weights, used to compute the weighted average in the self-attention heads.
question_embeds (`tf.Tensor`):
The question embeddings obtained from the text projection layer.
"""
itm_score: tf.Tensor | None = None
loss: tf.Tensor | None = None
image_embeds: tf.Tensor | None = None
last_hidden_state: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
vision_pooler_output: tf.Tensor | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
question_embeds: Tuple[tf.Tensor] | None = None
@dataclass
class TFBlipOutput(ModelOutput):
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image: (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text: (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds: (`tf.Tensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
image_embeds: (`tf.Tensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
text_model_output: (`BaseModelOutputWithPooling`):
The output of the [`BlipTextModel`].
vision_model_output: (`BaseModelOutputWithPooling`):
The output of the [`BlipVisionModel`].
"""
loss: tf.Tensor | None = None
logits_per_image: tf.Tensor = None
logits_per_text: tf.Tensor = None
text_embeds: tf.Tensor = None
image_embeds: tf.Tensor = None
text_model_output: TFBaseModelOutputWithPooling = None
vision_model_output: TFBaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
"""
Convert TFBlipOutput object to a tuple, excluding `text_model_output` and `vision_model_output` which are
converted to tuples separately.
Returns:
Tuple[Any]: A tuple representation of the object.
"""
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class TFBlipVisionEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipVisionConfig, **kwargs):
"""
Initialize the TFBlipVisionEmbeddings layer.
Args:
config (BlipVisionConfig): Configuration object for BlipVisionModel.
**kwargs: Additional keyword arguments passed to the Layer constructor.
"""
super().__init__(**kwargs)
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = keras.layers.Conv2D(
filters=self.embed_dim,
kernel_size=self.patch_size,
strides=self.patch_size,
kernel_initializer=get_initializer(self.config.initializer_range),
data_format="channels_last",
name="patch_embedding",
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
def build(self, input_shape=None):
self.class_embedding = self.add_weight(
shape=(1, 1, self.embed_dim),
initializer=get_initializer(self.config.initializer_range),
trainable=True,
name="class_embedding",
)
self.position_embedding = self.add_weight(
shape=(1, self.num_positions, self.embed_dim),
initializer=get_initializer(self.config.initializer_range),
trainable=True,
name="position_embedding",
)
if self.built:
return
self.built = True
if getattr(self, "patch_embedding", None) is not None:
with tf.name_scope(self.patch_embedding.name):
self.patch_embedding.build([None, None, None, 3])
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
batch_size = tf.shape(pixel_values)[0]
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
patch_embeds = self.patch_embedding(pixel_values)
patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1))
class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim))
embeddings = tf.concat([class_embeds, patch_embeds], axis=1)
embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :]
return embeddings
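# --- Illustrative sketch (not part of the original modeling file) ---
# The patch/position-embedding arithmetic used by TFBlipVisionEmbeddings above, with hypothetical
# sizes (image_size=384, patch_size=16, embed_dim=768 are assumptions for illustration only).
_image_size, _patch_size, _embed_dim = 384, 16, 768
_num_patches = (_image_size // _patch_size) ** 2   # 24 * 24 = 576 patch tokens per image
_num_positions = _num_patches + 1                  # +1 for the prepended class embedding
# After the Conv2D patch embedding + reshape: (batch, 576, 768)
# After concatenating the class embedding:    (batch, 577, 768), matched by position_embedding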
class TFBlipTextEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.config = config
def build(self, input_shape: tf.TensorShape = None):
with tf.name_scope("token_embedding"):
self.weight = self.add_weight(
shape=(self.config.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="weight",
)
with tf.name_scope("position_embedding"):
self.position_embedding = self.add_weight(
shape=(self.config.max_position_embeddings, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="embeddings",
)
super().build(input_shape)
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Args:
input_ids (tf.Tensor, optional): 输入的 token ID 张量
position_ids (tf.Tensor, optional): 输入的位置 ID 张量
inputs_embeds (tf.Tensor, optional): 输入的嵌入张量
Returns:
final_embeddings (`tf.Tensor`): 输出的嵌入张量.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
final_embeddings = inputs_embeds + position_embeds
return final_embeddings
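# --- Illustrative sketch (not part of the original modeling file) ---
# The gather-based token/position lookup performed in TFBlipTextEmbeddings.call, using a toy table
# (vocab_size=10, embed_dim=4, max_position_embeddings=16 are made-up values).
import tensorflow as tf
_weight = tf.random.normal((10, 4))                                      # token embedding table
_pos_table = tf.random.normal((16, 4))                                   # position embedding table
_input_ids = tf.constant([[1, 5, 7]])
_inputs_embeds = tf.gather(params=_weight, indices=_input_ids)           # (1, 3, 4)
_position_ids = tf.expand_dims(tf.range(start=0, limit=3), axis=0)
_position_embeds = tf.gather(params=_pos_table, indices=_position_ids)   # (1, 3, 4)
_final_embeddings = _inputs_embeds + _position_embeds                    # same shape as token embeddings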
class TFBlipAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")
self.qkv = keras.layers.Dense(
3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
)
self.projection = keras.layers.Dense(
self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
)
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: Optional[bool] = None,
) -> Tuple[tf.Tensor, tf.Tensor | None, Tuple[tf.Tensor] | None]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = shape_list(hidden_states)
mixed_qkv = self.qkv(hidden_states)
mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim))
mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4))
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2))
attention_scores = attention_scores * self.scale
attention_probs = stable_softmax(attention_scores, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3))
new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim]
context_layer = tf.reshape(context_layer, new_context_layer_shape)
output = self.projection(context_layer)
outputs = (output, attention_probs) if output_attentions else (output, None)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.embed_dim])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.embed_dim])
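# --- Illustrative sketch (not part of the original modeling file) ---
# How the fused qkv projection in TFBlipAttention.call is split into per-head query/key/value
# (batch=2, seq_len=5, num_heads=4, head_dim=8 are arbitrary toy sizes).
import tensorflow as tf
_bsz, _tgt_len, _num_heads, _head_dim = 2, 5, 4, 8
_embed_dim = _num_heads * _head_dim
_mixed_qkv = tf.random.normal((_bsz, _tgt_len, 3 * _embed_dim))             # output of the qkv Dense layer
_mixed_qkv = tf.reshape(_mixed_qkv, (_bsz, _tgt_len, 3, _num_heads, _head_dim))
_mixed_qkv = tf.transpose(_mixed_qkv, perm=(2, 0, 3, 1, 4))                 # (3, bsz, heads, seq, head_dim)
_query, _key, _value = _mixed_qkv[0], _mixed_qkv[1], _mixed_qkv[2]
_scores = (_query @ tf.transpose(_key, (0, 1, 3, 2))) * _head_dim**-0.5     # (2, 4, 5, 5) attention scores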
class TFBlipMLP(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs)
self.activation_fn = get_tf_activation(config.hidden_act)
in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
fc_std = (2 * config.hidden_size) ** -0.5
self.fc1 = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
)
self.fc2 = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.config.hidden_size])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.self_attn = TFBlipAttention(config, name="self_attn")
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFBlipMLP(config, name="mlp")
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
output_attentions: Optional[bool] = False,
training: Optional[bool] = None,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
head_mask=attention_mask,
output_attentions=output_attentions,
training=training,
)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
BLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Mask to avoid performing attention on padding tokens.
token_type_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Segment token indices to indicate first and second portions of the inputs. Only used by some models like BERT.
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of positions of each input token in the position embeddings.
inputs_embeds (`tf.Tensor`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated embeddings.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. [What are attention masks?](../glossary#attention-mask)
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input token in the position embeddings. [What are position IDs?](../glossary#position-ids)
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values of the input images.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@keras_serializable
class TFBlipEncoder(keras.layers.Layer):
config_class = BlipConfig
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`BlipEncoderLayer`].
Args:
config (`BlipConfig`):
The corresponding vision configuration for the `BlipEncoder`.
"""
def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 创建一个由多个 `TFBlipEncoderLayer` 组成的列表,每个层使用配置参数并命名
self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
@unpack_inputs
def call(
self,
inputs_embeds,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = None,
) -> Union[Tuple, TFBaseModelOutput]:
r"""
Args:
inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Determine if `output_attentions` should be overridden by `self.config.output_attentions`
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Determine if `output_hidden_states` should be overridden by `self.config.output_hidden_states`
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Determine if `return_dict` should be overridden by `self.config.use_return_dict`
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Collect hidden states in a tuple if `output_hidden_states` is True, otherwise keep None
encoder_states = () if output_hidden_states else None
# Collect attention weights in a tuple if `output_attentions` is True, otherwise keep None
all_attentions = () if output_attentions else None
# Start with the embedded inputs as the initial hidden states
hidden_states = inputs_embeds
# Iterate through each encoder layer
for idx, encoder_layer in enumerate(self.layers):
# Append current hidden states to encoder states if `output_hidden_states` is True
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# Pass the current hidden states through the encoder layer
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions=output_attentions,
training=training,
)
# Update hidden states with the output of the encoder layer
hidden_states = layer_outputs[0]
# Append attention weights of the current layer if `output_attentions` is True
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Append final hidden states to encoder states if `output_hidden_states` is True
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# Return outputs based on `return_dict` flag
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
# Check if the model is already built; if yes, return immediately
if self.built:
return
# Mark the model as built
self.built = True
# If `self.layers` attribute exists, iterate through each layer and build it
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
# Build each layer with `None` input shape
layer.build(None)
class TFBlipVisionModel(TFBlipPreTrainedModel):
# 主要输入名称为 "pixel_values"
main_input_name = "pixel_values"
# 配置类为 BlipVisionConfig
config_class = BlipVisionConfig
def __init__(self, config: BlipVisionConfig, *args, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *args, **kwargs)
# 保存配置对象
self.config = config
# 创建嵌入层对象,使用 TFBlipVisionEmbeddings 类
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
# 创建编码器对象,使用 TFBlipEncoder 类
self.encoder = TFBlipEncoder(config, name="encoder")
# 创建后层归一化层对象,使用给定的 epsilon 参数
self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
# 设置嵌入维度为配置中的隐藏大小
self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
# 如果配置要求输出隐藏状态,则将隐藏状态转换为张量
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
# 如果配置要求输出注意力,则将注意力转换为张量
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
# 返回包含指定属性的 TFBaseModelOutputWithPooling 对象
return TFBaseModelOutputWithPooling(
last_hidden_state=output.last_hidden_state,
pooler_output=output.pooler_output,
hidden_states=hs,
attentions=attns,
)
@unpack_inputs
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig)
def call(
self,
pixel_values: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = None,
) -> Union[Tuple, TFBaseModelOutputWithPooling]:
r"""
Returns: a tuple or a [`TFBaseModelOutputWithPooling`] instance, depending on `return_dict`.
"""
# 如果未指定 output_attentions 参数,则使用配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未指定 output_hidden_states 参数,则使用配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定 return_dict 参数,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 pixel_values 参数为 None,则抛出数值错误异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 将像素值通过嵌入层处理,得到隐藏状态
hidden_states = self.embeddings(pixel_values)
# 使用编码器处理隐藏状态,获取编码器的输出
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取编码器的最后隐藏状态,并通过后层归一化处理
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
# 提取汇聚输出,即编码器输出的第一个位置
pooled_output = last_hidden_state[:, 0, :]
# TensorFlow 对输入的秩(rank)不一致时可能会出错,因此插入一个单维度来确保一致性
pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1))
pooled_output = tf.squeeze(pooled_output, 1)
# 如果不要求返回字典形式,则返回一个元组
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# 否则,返回 TFBaseModelOutputWithPooling 对象,其中包含编码器输出的各项属性
return TFBaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def get_input_embeddings(self):
return self.embeddings
def build(self, input_shape=None):
# 如果模型已经构建,则直接返回
if self.built:
return
self.built = True
# 如果 embeddings 属性存在,则构建 embeddings 层
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# 如果 encoder 属性存在,则构建 encoder 层
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# 如果 post_layernorm 属性存在,则构建 post_layernorm 层,输入形状为 [None, None, self.embed_dim]
if getattr(self, "post_layernorm", None) is not None:
with tf.name_scope(self.post_layernorm.name):
self.post_layernorm.build([None, None, self.embed_dim])
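# --- Hedged usage sketch (not part of the original modeling file) ---
# Running the vision tower on random channels-first pixel values to inspect the output shapes;
# this assumes the default BlipVisionConfig and that TFBlipVisionModel is exported by transformers.
import tensorflow as tf
from transformers import BlipVisionConfig, TFBlipVisionModel
_vision_config = BlipVisionConfig()
_vision_model = TFBlipVisionModel(_vision_config)
_pixel_values = tf.random.uniform((1, 3, _vision_config.image_size, _vision_config.image_size))
_vision_outputs = _vision_model(pixel_values=_pixel_values)
# last_hidden_state: (1, num_patches + 1, hidden_size); pooler_output: (1, hidden_size), the
# post-layernormed [CLS]-style token extracted in the call above.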
# 定义 TFBlipMainLayer 类,继承自 keras.layers.Layer,用于实现主层逻辑
class TFBlipMainLayer(keras.layers.Layer):
# 设置类属性 config_class 为 BlipConfig 类型
config_class = BlipConfig
# 初始化方法,接受 BlipConfig 类型的 config 参数及其他位置和关键字参数
def __init__(self, config: BlipConfig, *args, **kwargs):
super().__init__(*args, **kwargs)
# 检查 config.text_config 是否为 BlipTextConfig 类型,若不是则抛出 ValueError 异常
if not isinstance(config.text_config, BlipTextConfig):
raise ValueError(
"config.text_config is expected to be of type BlipTextConfig but is of type"
f" {type(config.text_config)}."
)
# 检查 config.vision_config 是否为 BlipVisionConfig 类型,若不是则抛出 ValueError 异常
if not isinstance(config.vision_config, BlipVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type BlipVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# 从 config 中获取 text_config 和 vision_config 对象
text_config = config.text_config
vision_config = config.vision_config
# 设置实例变量,分别表示投影维度、文本嵌入维度和视觉嵌入维度
self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
# 创建 TFBlipTextModel 实例并赋给 self.text_model,命名为 "text_model"
self.text_model = TFBlipTextModel(text_config, name="text_model")
# 创建 TFBlipVisionModel 实例并赋给 self.vision_model,命名为 "vision_model"
self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")
# 创建 Dense 层实例 self.visual_projection,用于视觉投影,设置投影维度、不使用偏置、使用指定初始化器
self.visual_projection = keras.layers.Dense(
self.projection_dim,
use_bias=False,
kernel_initializer=get_initializer(config.initializer_range),
name="visual_projection",
)
# 创建 Dense 层实例 self.text_projection,用于文本投影,设置投影维度、不使用偏置、使用指定初始化器
self.text_projection = keras.layers.Dense(
self.projection_dim,
use_bias=False,
kernel_initializer=get_initializer(config.initializer_range),
name="text_projection",
)
# 将 config 参数赋给实例变量 self.config
self.config = config
# build 方法,用于构建层,接受 input_shape 参数
def build(self, input_shape=None):
# 创建并添加名为 logit_scale 的可训练权重,初始化为 config.logit_scale_init_value
self.logit_scale = self.add_weight(
name="logit_scale",
shape=[],
initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
)
# 如果已经构建过,则直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果存在 self.text_model,则构建 self.text_model
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
# 如果存在 self.vision_model,则构建 self.vision_model
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
# 如果存在 self.visual_projection,则构建 self.visual_projection
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
# 如果存在 self.text_projection,则构建 self.text_projection
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
# unpack_inputs 装饰器用于处理输入参数的解包操作
@unpack_inputs
# 定义 BLIP 模型的调用方法,接受多个输入参数和可选的输出参数,并返回 TFBlipOutput 或元组
def call(
self,
input_ids: tf.Tensor | None = None, # 输入的文本序列的张量,可选
pixel_values: tf.Tensor | None = None, # 输入的图像像素值的张量,可选
attention_mask: tf.Tensor | None = None, # 文本的注意力遮罩张量,可选
position_ids: tf.Tensor | None = None, # 文本的位置编码张量,可选
return_loss: Optional[bool] = None, # 是否返回损失值,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选
training: Optional[bool] = None, # 是否处于训练模式,可选
) -> Union[Tuple, TFBlipOutput]: # 返回值可以是元组或 TFBlipOutput 对象
# 如果没有显式指定,使用 BLIP 模型配置中的设定值来填充相应的输出参数
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用视觉模型处理图像输入,并根据指定参数输出相应的结果
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 使用文本模型处理文本输入,并根据指定参数输出相应的结果
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从视觉模型的输出中获取图像嵌入表示,并应用视觉投影层
image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(image_embeds)
# 从文本模型的输出中获取文本嵌入表示,并应用文本投影层
text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)
# 对图像嵌入进行 L2 范数归一化
image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True)
# 对文本嵌入进行 L2 范数归一化
text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True)
# 使用余弦相似度计算作为对数概率(logits)
logit_scale = tf.exp(self.logit_scale)
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
logits_per_image = tf.transpose(logits_per_text)
# 如果需要返回损失值,则计算 BLIP 损失
loss = None
if return_loss:
loss = blip_loss(logits_per_text)
loss = tf.reshape(loss, (1,))
# 如果不需要返回字典形式的输出,则返回一个包含多个输出的元组
if not return_dict:
output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,则创建 TFBlipOutput 对象并返回
return TFBlipOutput(
loss=loss,
logits_per_image=logits_per_image,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
image_embeds=image_embeds,
text_model_output=text_outputs,
vision_model_output=vision_outputs,
)
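# --- Illustrative sketch (not part of the original modeling file) ---
# The CLIP-style contrastive logits computed in TFBlipMainLayer.call, with random stand-in embeddings
# and a symmetric cross-entropy written out explicitly (this mimics what `blip_loss` computes; it is
# not the library function itself, and 2.6592 is a typical logit_scale_init_value, an assumption here).
import tensorflow as tf
_text_embeds = tf.math.l2_normalize(tf.random.normal((4, 512)), axis=-1)
_image_embeds = tf.math.l2_normalize(tf.random.normal((4, 512)), axis=-1)
_logit_scale = tf.exp(tf.constant(2.6592))
_logits_per_text = tf.matmul(_text_embeds, _image_embeds, transpose_b=True) * _logit_scale
_logits_per_image = tf.transpose(_logits_per_text)
# Matching image/text pairs sit on the diagonal, so the targets are simply 0..N-1 in both directions.
_labels = tf.range(tf.shape(_logits_per_text)[0])
_loss_t = tf.keras.losses.sparse_categorical_crossentropy(_labels, _logits_per_text, from_logits=True)
_loss_i = tf.keras.losses.sparse_categorical_crossentropy(_labels, _logits_per_image, from_logits=True)
_contrastive_loss = (tf.reduce_mean(_loss_t) + tf.reduce_mean(_loss_i)) / 2.0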
class TFBlipModel(TFBlipPreTrainedModel):
# 指定配置类为BlipConfig
config_class = BlipConfig
# 在加载模型时忽略的键列表
_keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
# 主输入名称为"input_ids"
main_input_name = "input_ids"
def __init__(self, config: BlipConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化TFBlipMainLayer作为模型的主要层,使用给定的配置
self.blip = TFBlipMainLayer(config, name="blip")
def serving_output(self, output: TFBlipOutput) -> TFBlipOutput:
# 用于模型服务输出,直接返回给定的TFBlipOutput对象
return TFBlipOutput(
logits_per_image=output.logits_per_image,
logits_per_text=output.logits_per_text,
text_embeds=output.text_embeds,
image_embeds=output.image_embeds,
)
@unpack_inputs
@add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig)
def call(
self,
input_ids: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = None,
) -> Union[Tuple, TFBlipOutput]:
r"""
模型的前向传播方法,接受多种输入参数并返回输出。
Args:
input_ids: 输入的token IDs张量,可以为None。
pixel_values: 图像像素值张量,可以为None。
attention_mask: 注意力遮罩张量,可以为None。
position_ids: 位置IDs张量,可以为None。
return_loss: 是否返回损失值,可选布尔值。
output_attentions: 是否输出注意力张量,可选布尔值。
output_hidden_states: 是否输出隐藏状态张量,可选布尔值。
return_dict: 是否返回字典格式输出,可选布尔值。
training: 是否处于训练模式,可选布尔值。
Returns:
模型的输出结果,类型为TFBlipOutput或一个元组。
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFBlipModel
>>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = tf.nn.softmax(logits_per_image, axis=1)
```
"""
# 调用self.blip对象的call方法,传递所有参数,并返回其输出
outputs = self.blip(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
return_loss=return_loss,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
) -> tf.Tensor:
# Text features: run the BLIP text model, take its pooled output and project it
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.blip.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
return_dict=return_dict,
)
pooled_output = text_outputs[1]
text_features = self.blip.text_projection(pooled_output)
return text_features
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
) -> tf.Tensor:
# Image features: run the BLIP vision model, take its pooled output and project it
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict)
pooled_output = vision_outputs[1]  # pooled_output
image_features = self.blip.visual_projection(pooled_output)
return image_features
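# --- Hedged usage sketch (not part of the original modeling file) ---
# Calling the two feature helpers above; the checkpoint and preprocessing mirror the doctest shown
# earlier for TFBlipModel, and both outputs have shape (batch_size, config.projection_dim).
from PIL import Image
import requests
from transformers import AutoProcessor, TFBlipModel
_blip_model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
_blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
_demo_image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
_text_features = _blip_model.get_text_features(**_blip_processor(text=["a photo of a cat"], return_tensors="tf", padding=True))
_image_features = _blip_model.get_image_features(**_blip_processor(images=_demo_image, return_tensors="tf"))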
@add_start_docstrings(
"""
BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
`input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt; in that case the
decoder generates the caption starting from the prompt. If no text input is provided, the decoder starts generating
from the [BOS] (beginning-of-sequence) token alone.
""",
BLIP_START_DOCSTRING,
)
class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
"""
TFBlipForConditionalGeneration 类,继承自 TFBlipPreTrainedModel,用于图像字幕生成任务。
Attributes:
config_class (BlipConfig): 配置类为 BlipConfig。
_keys_to_ignore_on_load_missing (list): 在加载时忽略的缺失键列表。
main_input_name (str): 主要输入名称为 "pixel_values"。
"""
config_class = BlipConfig
_keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
main_input_name = "pixel_values"
def __init__(self, config: BlipConfig, *args, **kwargs):
"""
初始化方法,接受 BlipConfig 类型的配置参数。
Args:
config (BlipConfig): BLIP 模型的配置参数。
*args: 位置参数。
**kwargs: 关键字参数。
"""
super().__init__(config, *args, **kwargs)
# Vision encoder, built from the vision sub-config
self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
# Text decoder (language-model head), built from the text sub-config
self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")
# BOS token ID used to start decoding
self.decoder_input_ids = config.text_config.bos_token_id
# Pad token ID used by the decoder
self.decoder_pad_token_id = config.text_config.pad_token_id
def get_input_embeddings(self) -> keras.layers.Layer:
"""
获取输入嵌入层的方法。
Returns:
keras.layers.Layer: 返回视觉模型的 patch_embedding 层作为输入嵌入层。
"""
return self.vision_model.embeddings.patch_embedding
@unpack_inputs
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig)
def call(
self,
pixel_values: tf.Tensor,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = None,
**kwargs
) -> Union[Tuple, TFBlipForConditionalGenerationModelOutput]:
r"""
Forward pass, used for both inference and training.
Args:
pixel_values (`tf.Tensor`): Input pixel values.
input_ids (`tf.Tensor`, *optional*): Input text token IDs used as an optional prompt.
attention_mask (`tf.Tensor`, *optional*): Attention mask for the text input.
output_attentions (`bool`, *optional*): Whether to return attention weights.
output_hidden_states (`bool`, *optional*): Whether to return hidden states.
labels (`tf.Tensor`, *optional*): Labels for computing the language modeling loss.
return_dict (`bool`, *optional*): Whether to return a [`~utils.ModelOutput`] instead of a plain tuple.
training (`bool`, *optional*): Whether the model is in training mode.
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFBlipForConditionalGeneration
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A picture of"
>>> inputs = processor(images=image, text=text, return_tensors="tf")
>>> outputs = model(**inputs)
```"""
# 检查是否需要返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用视觉模型处理输入的像素值,返回视觉特征
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 提取视觉特征的第一个输出,通常是图像嵌入
image_embeds = vision_outputs[0]
# 使用文本解码器生成文本输出
outputs = self.text_decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
labels=labels,
return_dict=False, # 强制不返回字典
training=training,
)
# 如果不需要返回字典,则按预期输出格式返回结果元组
if not return_dict:
outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
return tuple(output for output in outputs if output is not None)
# 如果有标签,提取损失和逻辑回归结果
if labels is not None:
loss = outputs[0]
logits = outputs[1]
else:
loss = None
logits = outputs[0]
# 如果存在损失并且其维度为0,则进行形状调整以保证一致性
if loss is not None and loss.shape.rank == 0:
loss = tf.reshape(loss, (1,))
# 返回模型输出的命名元组,包括损失、逻辑回归结果、图像嵌入和视觉模型的隐藏状态等
return TFBlipForConditionalGenerationModelOutput(
loss=loss,
logits=logits,
image_embeds=image_embeds,
last_hidden_state=vision_outputs.last_hidden_state,
hidden_states=vision_outputs.hidden_states,
attentions=vision_outputs.attentions,
)
def generate(
self,
pixel_values: tf.Tensor,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
**generate_kwargs,
) -> tf.Tensor:
r"""
Overrides *generate* function to be able to use the model as a conditional generator
Parameters:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
Input image to be processed
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
The sequence used as a prompt for the generation.
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFBlipForConditionalGeneration
>>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
two cats sleeping on a couch
```
"""
# 获取批次大小
batch_size = pixel_values.shape[0]
# 使用视觉模型处理输入图像,返回视觉输出
vision_outputs = self.vision_model(pixel_values=pixel_values)
# 从视觉输出中获取图像嵌入
image_embeds = vision_outputs[0]
# 创建图像注意力掩码,默认全为1,形状与图像嵌入维度相同
image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)
# 如果输入的input_ids是列表,则转换为张量
if isinstance(input_ids, list):
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
# 如果input_ids为None,则使用默认的decoder输入ID和结束标记创建张量
elif input_ids is None:
input_ids = tf.convert_to_tensor(
[[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32
)
# 扩展为与批次大小匹配的形状
input_ids = tf.tile(input_ids, (batch_size, 1))
# 添加起始标记到input_ids的开头,与PyTorch中的操作等效
input_ids = tf.concat(
[tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1
)
# 调整attention_mask的长度,与输入序列长度相匹配
attention_mask = attention_mask[:, :-1] if attention_mask is not None else None
# 调用文本解码器的generate方法生成文本序列
outputs = self.text_decoder.generate(
input_ids=input_ids[:, :-1],
eos_token_id=self.config.text_config.sep_token_id,
pad_token_id=self.config.text_config.pad_token_id,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
**generate_kwargs,
)
# 返回生成的输出序列
return outputs
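# --- Hedged usage sketch (not part of the original modeling file) ---
# Prompt-conditioned captioning with the generate() override above; the checkpoint mirrors the doctest
# in the method docstring and the prompt string is an arbitrary example.
from PIL import Image
import requests
from transformers import AutoProcessor, TFBlipForConditionalGeneration
_caption_model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
_caption_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
_caption_image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
_prompt_inputs = _caption_processor(images=_caption_image, text="a photography of", return_tensors="tf")
_caption_ids = _caption_model.generate(**_prompt_inputs)
print(_caption_processor.decode(_caption_ids[0], skip_special_tokens=True))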
# 定义模型构建方法,如果已经构建过则直接返回
def build(self, input_shape=None):
if self.built:
return
# 设置标志位,表示模型已经构建
self.built = True
# 如果存在视觉模型,则构建视觉模型
if getattr(self, "vision_model", None) is not None:
# 使用视觉模型的名称作为命名空间
with tf.name_scope(self.vision_model.name):
# 构建视觉模型,传入空的输入形状
self.vision_model.build(None)
# 如果存在文本解码器,则构建文本解码器
if getattr(self, "text_decoder", None) is not None:
# 使用文本解码器的名称作为命名空间
with tf.name_scope(self.text_decoder.name):
# 构建文本解码器,传入空的输入形状
self.text_decoder.build(None)
"""
BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
with the encoding of the image, and the text decoder will output the answer to the question.
"""
# The decorator below attaches the shared BLIP docstring to the class
@add_start_docstrings(
"""
BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
with the encoding of the image, and the text decoder will output the answer to the question.
""",
BLIP_START_DOCSTRING,
)
# 继承自 TFBlipPreTrainedModel 类
class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
# 使用 BlipConfig 类来配置模型
config_class = BlipConfig
# 在加载时忽略的关键字列表
_keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
# 模型初始化方法
def __init__(self, config: BlipConfig, *args, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *args, **kwargs)
# 创建视觉模型,使用 TFBlipVisionModel 类
self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
# 创建文本编码器,使用 TFBlipTextModel 类
self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
# 创建文本解码器,使用 TFBlipTextLMHeadModel 类
self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")
# 解码器的填充标记 ID
self.decoder_pad_token_id = config.text_config.pad_token_id
# 解码器的起始标记 ID
self.decoder_start_token_id = config.text_config.bos_token_id
# 获取输入嵌入的方法
def get_input_embeddings(self) -> keras.layers.Layer:
# 返回视觉模型的补丁嵌入层
return self.vision_model.embeddings.patch_embedding
# 定义的方法来实现标记右移,类似于 transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right 方法
def _shift_right(self, input_ids):
# 获取解码器的起始标记 ID 和填充标记 ID
decoder_start_token_id = self.decoder_start_token_id
pad_token_id = self.decoder_pad_token_id
# 如果起始标记 ID 或填充标记 ID 未定义,则抛出 ValueError
if decoder_start_token_id is None or pad_token_id is None:
raise ValueError("decoder_start_token_id and pad_token_id must be defined!")
# 创建起始标记序列,并确保与输入标记兼容的数据类型
start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
start_tokens = tf.cast(start_tokens, input_ids.dtype) # 确保拼接时数据类型兼容
# 将起始标记序列与输入标记序列右移一位进行拼接
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
# 将标签中可能存在的 -100 值替换为填充标记 ID
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype),
shifted_input_ids,
)
# 断言确保 `labels` 只包含正值和 -100
tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype))
return shifted_input_ids
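# --- Illustrative sketch (not part of the original modeling file) ---
# What _shift_right does to one row of labels, with made-up special tokens (bos_token_id=0,
# pad_token_id=1; -100 is the usual "ignore" index inside labels).
import tensorflow as tf
_bos, _pad = 0, 1
_labels = tf.constant([[5, 6, -100, -100]])
_start = tf.constant([[_bos]])                                                       # shape (batch=1, 1)
_shifted = tf.concat([tf.cast(_start, _labels.dtype), _labels[:, :-1]], axis=-1)     # [[0, 5, 6, -100]]
_shifted = tf.where(_shifted == -100, tf.cast(_pad, _labels.dtype), _shifted)        # [[0, 5, 6, 1]]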
# 装饰器函数,用于将输入拆包并添加模型前向传播的文档字符串
@unpack_inputs
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
# 替换返回值文档字符串的装饰器函数
@replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig)
# 定义一个方法 `call`,用于执行模型推理或训练过程
def call(
self,
input_ids: tf.Tensor, # 输入文本的 token IDs,作为模型的输入
pixel_values: tf.Tensor | None = None, # 图像像素值,可选,用于图像输入模型
decoder_input_ids: tf.Tensor | None = None, # 解码器的输入 token IDs,可选
decoder_attention_mask: tf.Tensor | None = None, # 解码器的注意力遮罩,可选
attention_mask: tf.Tensor | None = None, # 注意力遮罩,控制模型哪些部分需要关注,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
labels: tf.Tensor | None = None, # 标签,用于模型的监督学习,可选
return_dict: Optional[bool] = None, # 是否以字典形式返回输出,可选
training: Optional[bool] = None, # 是否处于训练模式,可选
):
# 定义一个方法 `generate`,用于生成模型输出(如文本生成)
def generate(
self,
input_ids: tf.Tensor, # 输入文本的 token IDs,作为生成器的输入
pixel_values: tf.Tensor, # 图像像素值,用于图像输入模型
attention_mask: tf.Tensor | None = None, # 注意力遮罩,控制模型哪些部分需要关注,可选
**generate_kwargs, # 其他生成参数,以字典形式传递
) -> tf.Tensor:
r"""
Overrides *generate* function to be able to use the model as a conditional generator
Parameters:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
Input image to be processed
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
tokens that are NOT MASKED, `0` for MASKED tokens.
generate_kwargs (dict, *optional*):
Additional arguments passed to the `generate` function of the decoder
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFBlipForQuestionAnswering
>>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "How many cats are in the picture?"
>>> inputs = processor(images=image, text=text, return_tensors="tf")
>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
2
```
"""
# 使用视觉模型处理输入图像,获取视觉输出
vision_outputs = self.vision_model(pixel_values=pixel_values)
# 提取图像嵌入表示
image_embeds = vision_outputs[0]
# 生成图像注意力掩码,形状与图像嵌入表示的前几维相同,最后一维是整数类型
image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)
# If input_ids is a Python list, convert it to a tensor (tf.Tensor(...) is not a constructor)
if isinstance(input_ids, list):
input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
# 使用文本编码器处理输入文本序列,得到文本输出
question_outputs = self.text_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
return_dict=False,
)
# 提取问题嵌入表示
question_embeds = question_outputs[0]
# 生成问题的注意力掩码,形状与问题嵌入表示的前几维相同,最后一维是整数类型
question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32)
# 构造起始标记的Tensor,形状为(batch_size, 1),值为self.decoder_start_token_id
bos_ids = tf.fill(
(tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype)
)
# 使用文本解码器生成输出序列
outputs = self.text_decoder.generate(
input_ids=bos_ids,
eos_token_id=self.config.text_config.sep_token_id,
pad_token_id=self.config.text_config.pad_token_id,
encoder_hidden_states=question_embeds,
encoder_attention_mask=question_attention_mask,
**generate_kwargs,
)
# 返回生成的输出序列
return outputs
# 定义神经网络层的构建方法,用于建立模型的输入形状
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回,避免重复构建
if self.built:
return
# 设置标志位,表示模型已经构建
self.built = True
# 如果存在视觉模型,使用 TensorFlow 的命名空间来构建视觉模型
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
# 调用视觉模型的build方法,传入空输入形状
self.vision_model.build(None)
# 如果存在文本编码器,使用 TensorFlow 的命名空间来构建文本编码器
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
# 调用文本编码器的build方法,传入空输入形状
self.text_encoder.build(None)
# 如果存在文本解码器,使用 TensorFlow 的命名空间来构建文本解码器
if getattr(self, "text_decoder", None) is not None:
with tf.name_scope(self.text_decoder.name):
# 调用文本解码器的build方法,传入空输入形状
self.text_decoder.build(None)
"""
BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
the image.
"""
# 继承自 TFBlipPreTrainedModel 的 BLIP 图像文本检索模型类
class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
# 使用 BlipConfig 类作为配置类
config_class = BlipConfig
def __init__(self, config: BlipConfig, *args, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *args, **kwargs)
# 创建 BLIP 视觉模型,使用配置中的视觉配置
self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
# 创建 BLIP 文本编码器,使用配置中的文本配置,并禁用池化层
self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
# 视觉投影层,用于将视觉特征投影到共享空间
self.vision_proj = keras.layers.Dense(
config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="vision_proj",
)
# 文本投影层,用于将文本特征投影到共享空间
self.text_proj = keras.layers.Dense(
config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="text_proj",
)
# 图像文本匹配头部,用于预测文本与图像相关性的概率
self.itm_head = keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
)
# 解码器的填充标记 ID,根据配置中的文本填充标记 ID 或解码器的开始标记 ID
self.decoder_pad_token_id = (
config.text_config.pad_token_id
if not hasattr(config, "decoder_pad_token_id")
else config.decoder_pad_token_id
)
self.decoder_start_token_id = (
config.text_config.bos_token_id
if not hasattr(config, "decoder_start_token_id")
else config.decoder_start_token_id
)
self.config = config
# 获取输入嵌入的方法,返回视觉模型的补丁嵌入层
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
# 调用方法,对输入数据进行前向传播
@unpack_inputs
@add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig)
def call(
self,
input_ids: tf.Tensor,
pixel_values: tf.Tensor | None = None,
use_itm_head: Optional[bool] = True,
attention_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = None,
# 其他参数用于模型前向传播,如像素值、注意力掩码、是否返回字典等
):
# 构建方法,用于构造模型结构。如果已经构建过,直接返回。
def build(self, input_shape=None):
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果存在视觉模型,使用视觉模型的名称作为命名空间,构建视觉模型
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
# 如果存在文本编码器,使用文本编码器的名称作为命名空间,构建文本编码器
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
self.text_encoder.build(None)
# 如果存在视觉投影层,使用视觉投影层的名称作为命名空间,构建视觉投影层
if getattr(self, "vision_proj", None) is not None:
with tf.name_scope(self.vision_proj.name):
self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
# 如果存在文本投影层,使用文本投影层的名称作为命名空间,构建文本投影层
if getattr(self, "text_proj", None) is not None:
with tf.name_scope(self.text_proj.name):
self.text_proj.build([None, None, self.config.text_config.hidden_size])
# 如果存在itm_head,使用itm_head的名称作为命名空间,构建itm_head
if getattr(self, "itm_head", None) is not None:
with tf.name_scope(self.itm_head.name):
self.itm_head.build([None, None, self.config.text_config.hidden_size])
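# --- Hedged usage sketch (not part of the original modeling file) ---
# Scoring how well a caption matches an image with the ITM head defined above; the checkpoint name
# is the standard BLIP ITM checkpoint on the Hub and is an assumption, not taken from this walkthrough.
from PIL import Image
import requests
import tensorflow as tf
from transformers import AutoProcessor, TFBlipForImageTextRetrieval
_itm_model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
_itm_processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
_itm_image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
_itm_inputs = _itm_processor(images=_itm_image, text="two cats sleeping on a couch", return_tensors="tf")
_itm_outputs = _itm_model(**_itm_inputs)
# itm_score holds the 2-way (no-match / match) logits produced by `itm_head`
_match_prob = tf.nn.softmax(_itm_outputs.itm_score, axis=-1)[:, 1]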