Transformers Source Code Analysis (56)
.\models\gpt_neox\tokenization_gpt_neox_fast.py
"""GPTNeoX的标记类。"""
import json
from typing import Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"tokenizer_file": {
"EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt-neox-20b": 2048,
}
class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library), based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces as part of the tokens (similar to sentencepiece), so a word is encoded differently depending on whether or not it is at the beginning of the sentence (without a preceding space):
```
>>> from transformers import GPTNeoXTokenizerFast
>>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("openai-community/gpt2")
>>> tokenizer("Hello world")["input_ids"]
[15496, 995]
>>> tokenizer(" Hello world")["input_ids"]
[18435, 995]
```
You can get around this behavior by passing `add_prefix_space=True` when instantiating the tokenizer, but since the model was not pretrained this way, it may lead to degraded performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer should be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should refer to that superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPTNeoX tokenizer detects the beginning of words by the preceding space.)
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
# Names of the vocabulary files used by this tokenizer
vocab_files_names = VOCAB_FILES_NAMES
# Map of pretrained vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input sizes of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs expected by the model
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs,
):
# Call the parent class initializer with the tokenizer configuration
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
# Read the backend tokenizer's pre-tokenizer state and update its add_prefix_space option if needed
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
# Look up the pre-tokenizer class by its type name and rebuild it with the updated state
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
# Store whether a prefix space is added
self.add_prefix_space = add_prefix_space
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Save the underlying tokenizer model to the given directory and return the saved file names as a tuple
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
def default_chat_template(self):
    """
    A simple chat template that ignores role information and concatenates messages with the EOS token.
    """
logger.warning_once(
# Warn once that this tokenizer has no chat template and that the class default is being used.
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return a template string that appends the EOS token after every message.
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
.\models\gpt_neox\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
from ...utils import OptionalDependencyNotAvailable
_import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neox"] = [
"GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoXForCausalLM",
"GPTNeoXForQuestionAnswering",
"GPTNeoXForSequenceClassification",
"GPTNeoXForTokenClassification",
"GPTNeoXLayer",
"GPTNeoXModel",
"GPTNeoXPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neox import (
GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoXForCausalLM,
GPTNeoXForQuestionAnswering,
GPTNeoXForSequenceClassification,
GPTNeoXForTokenClassification,
GPTNeoXLayer,
GPTNeoXModel,
GPTNeoXPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
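A standalone sketch of the optional-dependency pattern used by this `__init__.py`; the module and symbol names below are hypothetical and only illustrate the idea:

```python
# Hypothetical, self-contained sketch of the guard-plus-lazy-import pattern above.
import importlib.util

def backend_available(name: str) -> bool:
    # Rough stand-in for is_torch_available() / is_tokenizers_available().
    return importlib.util.find_spec(name) is not None

_import_structure = {"configuration_demo": ["DemoConfig"]}

if backend_available("tokenizers"):
    # Only advertise tokenizer symbols when the `tokenizers` backend is importable.
    _import_structure["tokenization_demo_fast"] = ["DemoTokenizerFast"]
if backend_available("torch"):
    # Only advertise model symbols when torch is importable.
    _import_structure["modeling_demo"] = ["DemoModel"]

# _LazyModule then defers the actual submodule imports until one of the
# registered names is accessed for the first time.
print(_import_structure)
```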
.\models\gpt_neox_japanese\configuration_gpt_neox_japanese.py
""" GPTNeoX Japanese model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/config.json",
}
class GPTNeoXJapaneseConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GPTNeoXJapaneseModel`]. It is used to instantiate
a GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPTNeoXJapanese
[abeja/gpt-neox-japanese-2.7b](https://huggingface.co/abeja/gpt-neox-japanese-2.7b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. The default configuration corresponds to the 2.7B model.
"""
model_type = "gpt_neox_japanese"
def __init__(
self,
vocab_size=32000,
hidden_size=2560,
num_hidden_layers=32,
num_attention_heads=32,
intermediate_multiple_size=4,
hidden_act="gelu",
rotary_pct=1.00,
rotary_emb_base=10000,
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
bos_token_id=31996,
eos_token_id=31999,
attention_dropout=0.1,
hidden_dropout=0.0,
**kwargs,
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_multiple_size = intermediate_multiple_size
self.hidden_act = hidden_act
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
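A minimal sketch instantiating the configuration above; the default values follow the `__init__` arguments shown here:

```python
# Sketch: the default configuration approximates abeja/gpt-neox-japanese-2.7b.
from transformers import GPTNeoXJapaneseConfig

config = GPTNeoXJapaneseConfig()
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 2560 32 32

# Any keyword argument above can be overridden, e.g. for a tiny debug model.
tiny_config = GPTNeoXJapaneseConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
```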
.\models\gpt_neox_japanese\modeling_gpt_neox_japanese.py
""" PyTorch GPTNeoX model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_gpt_neox_japanese import GPTNeoXJapaneseConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "abeja/gpt-neox-japanese-2.7b"
_CONFIG_FOR_DOC = "GPTNeoXJapaneseConfig"
GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST = {
"https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/config.json",
}
class GPTNeoXJapanesePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTNeoXJapaneseConfig
base_model_prefix = "gpt_neox_japanese"
_no_split_modules = ["GPTNeoXJapaneseLayer"]
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GPTNeoXJapaneseAttention(nn.Module):
def __init__(self, config, use_bias=False):
super().__init__()
self.num_attention_heads = config.num_attention_heads
self.hidden_size = config.hidden_size
self.head_size = self.hidden_size // self.num_attention_heads
self.rotary_ndims = int(self.head_size * config.rotary_pct)
self.rotary_emb = RotaryEmbedding(
self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base
)
self.max_positions = config.max_position_embeddings
self.attention_dropout = nn.Dropout(config.attention_dropout)
self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype())
self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.use_bias = use_bias
self.dense_bias = nn.Parameter(torch.zeros(config.hidden_size)) if use_bias else None
def forward(
self,
hidden_states,
attention_mask,
head_mask=None,
layer_past=None,
use_cache=False,
output_attentions=False,
):
has_layer_past = layer_past is not None and layer_past[0].numel() > 0
qkv = self.query_key_value(hidden_states)
new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
qkv = qkv.view(*new_qkv_shape)
query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
query_rot = query[..., : self.rotary_ndims]
query_pass = query[..., self.rotary_ndims :]
key_rot = key[..., : self.rotary_ndims]
key_pass = key[..., self.rotary_ndims :]
seq_len = key.shape[-2]
offset = 0
if has_layer_past:
offset = layer_past[0].shape[-2]
seq_len += offset
cos, sin = self.rotary_emb(value, seq_len=seq_len)
query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, offset=offset)
query = torch.cat((query, query_pass), dim=-1)
key = torch.cat((key, key_pass), dim=-1)
if has_layer_past:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
present = (key, value) if use_cache else None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
attn_output = self.dense(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs, self.dense_bias
@classmethod
def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
将隐藏维度分割为 attn_head_size 和 num_attention_heads
"""
new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
tensor = tensor.view(new_shape)
tensor = tensor.permute(0, 2, 1, 3)
return tensor
@classmethod
def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
return tensor
def _create_causal_mask(self, key_length, query_length):
causal_mask = torch.tril(
torch.ones((self.max_positions, self.max_positions), dtype=torch.bool).view(
1, 1, self.max_positions, self.max_positions
)
)
return causal_mask[:, :, key_length - query_length : key_length, :key_length]
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
batch_size, num_attention_heads, query_length, attn_head_size = query.size()
key_length = key.size(-2)
causal_mask = self._create_causal_mask(key_length, query_length)
query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
attn_scores = torch.zeros(
batch_size * num_attention_heads,
query_length,
key_length,
dtype=query.dtype,
device=key.device,
)
attn_scores = torch.baddbmm(
attn_scores,
query,
key.transpose(1, 2),
beta=1.0,
alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor),
)
attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)
mask_value = torch.finfo(attn_scores.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device)
causal_mask = causal_mask.to(attn_scores.device)
attn_scores = torch.where(causal_mask, attn_scores, mask_value)
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_weights = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = self.attention_dropout(attn_weights)
attn_weights = attn_weights.to(value.dtype)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
class RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len],
self.sin_cached[:seq_len],
)
def rotate_half(x):
"""将输入的一半隐藏维度旋转"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
"""应用旋转位置嵌入到查询和键中"""
cos = cos[..., offset : q.shape[-2] + offset, :]
sin = sin[..., offset : q.shape[-2] + offset, :]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
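A small sketch exercising `RotaryEmbedding`, `rotate_half` and `apply_rotary_pos_emb` with toy shapes (assumes the definitions above are in scope; values are arbitrary, only shapes matter):

```python
# Toy check of the rotary helpers above.
import torch

batch, heads, seq_len, rotary_dim = 1, 2, 4, 8
rope = RotaryEmbedding(dim=rotary_dim, max_position_embeddings=16)

q = torch.randn(batch, heads, seq_len, rotary_dim)
k = torch.randn(batch, heads, seq_len, rotary_dim)

cos, sin = rope(q, seq_len=seq_len)          # each has shape [seq_len, rotary_dim]
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, offset=0)
assert q_rot.shape == q.shape and k_rot.shape == k.shape

# rotate_half pairs the two halves of the last dimension: (x1, x2) -> (-x2, x1).
print(rotate_half(torch.arange(4.0)))        # tensor([-2., -3.,  0.,  1.])
```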
def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor:
"""为输入添加偏置,应用 dropout 和残差连接
Args:
x (Tensor): 主路径的输出
bias (Tensor): 最后一个注意力层的 attn_bias 或者 None
residual (Optional[Tensor]): 残差值
prob (float): dropout 概率
training (bool): 是否处于训练模式
Returns:
Tensor: dropout(x + bias) + residual
"""
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
if residual is not None:
out = residual + out
return out
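A quick sketch showing that, with dropout disabled, `bias_dropout_add` reduces to the documented `dropout(x + bias) + residual` (assumes the function above is in scope):

```python
# Sanity check of bias_dropout_add with training=False (dropout is then a no-op).
import torch

x = torch.ones(2, 3)
bias = torch.full((2, 3), 0.5)
residual = torch.zeros(2, 3)

out = bias_dropout_add(x, bias=bias, residual=residual, prob=0.1, training=False)
assert torch.allclose(out, x + bias + residual)

# bias=None is the MLP branch of GPTNeoXJapaneseLayer: only dropout and the residual apply.
out_no_bias = bias_dropout_add(x, bias=None, residual=residual, prob=0.1, training=False)
assert torch.allclose(out_no_bias, x + residual)
```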
class GPTNeoXJapaneseMLP(nn.Module):
def __init__(self, config):
super().__init__()
intermediate_size = int(config.hidden_size * config.intermediate_multiple_size)
self.dense_h_to_4h = nn.Linear(config.hidden_size, intermediate_size, bias=False)
self.dense_4h_to_h = nn.Linear(intermediate_size, config.hidden_size, bias=False)
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
intermediate = self.act(intermediate)
output = self.dense_4h_to_h(intermediate)
return output
class GPTNeoXJapaneseLayer(nn.Module):
def __init__(self, config, layer_number):
super().__init__()
self.layer_number = layer_number
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = GPTNeoXJapaneseAttention(config=config, use_bias=layer_number == config.num_hidden_layers - 1)
self.mlp = GPTNeoXJapaneseMLP(config)
self.hidden_dropout = config.hidden_dropout
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
use_cache=False,
layer_past=None,
output_attentions=False,
):
residual = hidden_states
ln_out = self.input_layernorm(hidden_states)
attention_layer_outputs, attn_bias = self.attention(
ln_out,
attention_mask=attention_mask,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attention_layer_outputs[0]
outputs = attention_layer_outputs[1:]
attn_output = bias_dropout_add(
attn_output,
bias=attn_bias.expand_as(residual) if attn_bias is not None else attn_bias,
residual=residual,
prob=self.hidden_dropout,
training=self.training,
)
mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
attn_output = bias_dropout_add(
mlp_output, bias=None, residual=attn_output, prob=self.hidden_dropout, training=self.training
)
if use_cache:
outputs = (attn_output,) + outputs
else:
outputs = (attn_output,) + outputs[1:]
return outputs
GPT_NEOX_JAPANESE_INPUTS_DOCSTRING = r"""
    Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare GPTNeoXJapaneseForCausalLM Model transformer with a causal language modeling head on top.",
GPT_NEOX_JAPANESE_START_DOCSTRING,
)
"""
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):
"""
Initialize the GPTNeoXJapaneseForCausalLM model.
Args:
config (GPTNeoXJapaneseConfig): Configuration class for the model.
"""
super().__init__(config)
self.config = config
self.gpt_neox_japanese = GPTNeoXJapaneseModel(config)
self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
"""
Retrieve the output embeddings.
Returns:
nn.Linear: The output embeddings layer.
"""
return self.embed_out
def set_output_embeddings(self, new_embeddings):
"""
Set new output embeddings.
Args:
new_embeddings (nn.Linear): New embeddings to be set.
"""
self.embed_out = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEOX_JAPANESE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the GPTNeoXJapaneseForCausalLM model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs. Default: None
attention_mask (torch.FloatTensor, optional): Attention mask. Default: None
head_mask (torch.FloatTensor, optional): Head mask. Default: None
inputs_embeds (torch.FloatTensor, optional): Embedded inputs. Default: None
past_key_values (Tuple[Tuple[torch.FloatTensor]], optional): Past key values for autoregressive generation. Default: None
use_cache (bool, optional): Whether to use cache for autoregressive generation. Default: None
output_attentions (bool, optional): Whether to output attentions weights. Default: None
output_hidden_states (bool, optional): Whether to output hidden states. Default: None
return_dict (bool, optional): Whether to return a dictionary as output. Default: None
Returns:
output (CausalLMOutputWithPast): Model output for language modeling.
"""
return self.gpt_neox_japanese(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values and past_key_values[0] is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
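A small sketch of what `_reorder_cache` does during beam search; only the tensor re-indexing is shown, so no model instance is needed (shapes and the beam selection are made up for illustration):

```python
# Each cached key/value tensor is re-indexed along the batch dimension to follow the chosen beams.
import torch

num_layers, batch, heads, seq, head_size = 2, 4, 2, 3, 8
past_key_values = tuple(
    (torch.randn(batch, heads, seq, head_size), torch.randn(batch, heads, seq, head_size))
    for _ in range(num_layers)
)
beam_idx = torch.tensor([2, 2, 0, 1])  # hypothetical beam selection

reordered = tuple(
    tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
    for layer_past in past_key_values
)
# Row 0 of the reordered cache comes from original batch entry beam_idx[0] == 2.
assert torch.equal(reordered[0][0][0], past_key_values[0][0][2])
```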
.\models\gpt_neox_japanese\tokenization_gpt_neox_japanese.py
"""GPTNeoXJapanese 的标记化类。"""
import collections
import json
import os
import re
from typing import Optional, Tuple
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/vocab.txt",
},
"emoji_file": {
"abeja/gpt-neox-japanese-2.7b": "https://huggingface.co/abeja/gpt-neox-japanese-2.7b/resolve/main/emoji.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"abeja/gpt-neox-japanese-2.7b": 2048,
}
def load_vocab_and_emoji(vocab_file, emoji_file):
"""加载词汇文件和表情文件到字典中。"""
with open(emoji_file, "r", encoding="utf-8") as f:
emoji = json.loads(f.read())
vocab = collections.OrderedDict()
raw_vocab = collections.OrderedDict()
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
for wd in b:
vocab[wd] = idx
return vocab, raw_vocab, ids_to_tokens, emoji
class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
"""
This tokenizer inherits from [`PreTrainedTokenizer`] and is based on the Japanese special sub-word encoding that is
used in this repository (https://github.com/tanreinama/Japanese-BPEEncoder_V2). Check the repository for details.
Japanese has a relatively large vocabulary and there is no separation between words. Furthermore, the language is a
combination of hiragana, katakana, and kanji, and variants such as "1" and "①" are often used. To cope with these,
this tokenizer has the following features:
- Subword-by-subword segmentation, which is intermediate between byte strings and morphological analysis.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
emoji_file,
unk_token="<|endoftext|>",
pad_token="<|endoftext|>",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
do_clean_text=False,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
if not os.path.isfile(emoji_file):
raise ValueError(
f"Can't find an emoji file at path '{emoji_file}'. To load the emoji information from a Google"
" pretrained model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.do_clean_text = do_clean_text
self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
do_clean_text=do_clean_text,
**kwargs,
)
@property
def vocab_size(self):
return len(self.raw_vocab)
def get_vocab(self):
return dict(self.raw_vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.subword_tokenizer.convert_id_to_token(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = "".join(tokens).strip()
return out_string
@property
def default_chat_template(self):
"""
A simple chat template that just adds BOS/EOS tokens around messages while discarding role information.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{{ bos_token + eos_token + message.content + eos_token }}"
"{% endfor %}"
"{% if add_generation_prompt %} {{ bos_token + eos_token }} {% endif %}"
)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
)
else:
vocab_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
)
with open(vocab_file, "w", encoding="utf-8") as writer:
for token_index, token in self.ids_to_tokens.items():
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(",".join(token) + "\n")
index += 1
with open(emoji_file, "w", encoding="utf-8") as writer:
json.dump(self.emoji, writer)
return vocab_file, emoji_file
class SubWordJapaneseTokenizer(object):
"""
https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under the MIT License according to the
original repository.
MIT License
Copyright (c) 2020 tanreinama
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of
the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab
self.ids_to_tokens = ids_to_tokens
self.emoji = emoji
self.maxlen = np.max([len(w) for w in self.vocab.keys()])
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
self.content_repatter4 = re.compile(
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter5 = re.compile(
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter6 = re.compile(
r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*"
)
keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
def __len__(self):
return len(self.ids_to_tokens)
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)
content = self.content_repatter2.sub("<EMAIL>", content)
content = self.content_repatter3.sub("<TEL>", content)
content = self.content_repatter4.sub("<DATE>", content)
content = self.content_repatter5.sub("<DATE>", content)
content = self.content_repatter6.sub("<PRICE>", content)
content = content.translate(self.content_trans1)
while "<BLOCK><BLOCK>" in content:
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
return content
def tokenize(self, text, clean=False):
text = text.replace(" ", "<SP>")
text = text.replace(" ", "<SP>")
text = text.replace("\r\n", "<BR>")
text = text.replace("\n", "<BR>")
text = text.replace("\r", "<BR>")
text = text.replace("\t", "<TAB>")
text = text.replace("—", "ー")
text = text.replace("−", "ー")
for k, v in self.emoji["emoji"].items():
if k in text:
text = text.replace(k, v)
if clean:
text = self.clean_text(text)
def check_simbol(x):
e = x.encode()
if len(x) == 1 and len(e) == 2:
c = (int(e[0]) << 8) + int(e[1])
if (
(c >= 0xC2A1 and c <= 0xC2BF)
or (c >= 0xC780 and c <= 0xC783)
or (c >= 0xCAB9 and c <= 0xCBBF)
or (c >= 0xCC80 and c <= 0xCDA2)
):
return True
return False
def checku2e(x):
e = x.encode()
if len(x) == 1 and len(e) == 3:
c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
if c >= 0xE28080 and c <= 0xE2B07F:
return True
return False
pos = 0
result = []
while pos < len(text):
end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
candidates = []
for e in range(end, pos, -1):
wd = text[pos:e]
if wd in self.vocab:
if wd[0] == "<" and len(wd) > 2:
candidates = [(self.vocab[wd], wd, e)]
break
else:
candidates.append((self.vocab[wd], wd, e))
if len(candidates) > 0:
_, wd, e = sorted(candidates, key=lambda x: x[0])[0]
result.append(wd)
pos = e
else:
end = pos + 1
wd = text[pos:end]
if check_simbol(wd):
result.append("<KIGOU>")
elif checku2e(wd):
result.append("<U2000U2BFF>")
else:
for i in wd.encode("utf-8"):
result.append("<|byte%d|>" % i)
pos = end
return result
def convert_id_to_token(self, index, breakline="\n"):
words = []
byte_tokens = []
word = self.ids_to_tokens[index][0]
if word[:6] == "<|byte" and word[-2:] == "|>":
byte_tokens.append(int(word[6:-2]))
else:
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
byte_tokens = []
if word[:7] == "<|emoji" and word[-2:] == "|>":
words.append(self.emoji["emoji_inv"][word])
elif word == "<SP>":
words.append(" ")
elif word == "<BR>":
words.append(breakline)
elif word == "<TAB>":
words.append("\t")
elif word == "<BLOCK>":
words.append("▀")
elif word == "<KIGOU>":
words.append("ǀ")
elif word == "<U2000U2BFF>":
words.append("‖")
else:
words.append(word)
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
text = "".join(words)
return text
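A minimal usage sketch of `GPTNeoXJapaneseTokenizer` (assumes the `abeja/gpt-neox-japanese-2.7b` vocab and emoji files can be downloaded; the example sentence is arbitrary):

```python
# Minimal usage sketch of the Japanese sub-word tokenizer above.
from transformers import GPTNeoXJapaneseTokenizer

tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")

text = "こんにちは、世界。"
ids = tokenizer(text)["input_ids"]
print(ids)
print(tokenizer.decode(ids))
```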
.\models\gpt_neox_japanese\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_torch_available
from ...utils import OptionalDependencyNotAvailable
_import_structure = {
"configuration_gpt_neox_japanese": ["GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXJapaneseConfig"],
"tokenization_gpt_neox_japanese": ["GPTNeoXJapaneseTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neox_japanese"] = [
"GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoXJapaneseForCausalLM",
"GPTNeoXJapaneseLayer",
"GPTNeoXJapaneseModel",
"GPTNeoXJapanesePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neox_japanese import GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXJapaneseConfig
from .tokenization_gpt_neox_japanese import GPTNeoXJapaneseTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neox_japanese import (
GPT_NEOX_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoXJapaneseForCausalLM,
GPTNeoXJapaneseLayer,
GPTNeoXJapaneseModel,
GPTNeoXJapanesePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_sw3\convert_megatron_to_pytorch.py
""" 将 GPT-SW3 Megatron 检查点转换为 PyTorch 格式 """
import argparse
import os
from os.path import isfile
import torch
from transformers import GPT2Config
def recursive_print(name, val, spaces=0):
if name is None:
msg = None
else:
fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
msg = fmt.format(name)
if isinstance(val, dict):
if msg is not None:
print(msg)
for k in val.keys():
recursive_print(k, val[k], spaces + 2)
elif isinstance(val, torch.Tensor):
print(msg, ":", val.size())
else:
print(msg, ":", val)
def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size):
input_shape = param.size()
saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
param = param.view(*saved_shape)
param = param.transpose(0, 1).contiguous()
param = param.view(*input_shape)
return param
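A shape sketch of `fix_query_key_value_ordering`: Megatron stores the fused QKV parameters as per-head `[q | k | v]` blocks, while the HF GPT-2 `c_attn` layout expects all queries, then all keys, then all values. The toy bias below (values chosen so the reordering is visible) follows directly from the `view`/`transpose` logic above:

```python
# Toy check: per-head [q|k|v] interleaving -> [all-q | all-k | all-v] layout.
import torch

num_heads, head_size, num_splits = 2, 3, 3
hidden = num_heads * head_size

megatron_bias = torch.arange(num_heads * num_splits * head_size, dtype=torch.float32)
hf_bias = fix_query_key_value_ordering(megatron_bias, num_splits, num_heads, head_size)

# After reordering, the first `hidden` entries are all the query slices (head 0, then head 1).
print(hf_bias[:hidden])  # tensor([ 0.,  1.,  2.,  9., 10., 11.])
```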
def convert_megatron_checkpoint(sd_megatron, config):
"""
将 Megatron 检查点转换为 HuggingFace GPT-SW3 检查点。
"""
n_positions = config.n_positions
layers = config.n_layer
vocab_size = config.vocab_size
heads = config.n_head
hidden_size_per_head = config.n_embd // config.n_head
word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :]
sd_hf = {
"transformer.wte.weight": word_embeddings,
"transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"],
"transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"],
"transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"],
}
pf = "model.language_model.encoder.layers."
for i in range(layers):
causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool))
causal_mask = causal_mask.view(1, 1, n_positions, n_positions)
sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask
sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16)
sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"]
sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"]
val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"]
val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head)
sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous()
val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"]
val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head)
sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2
sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"]
sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"]
sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"]
sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"]
sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose(0, 1)
sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"]
sd_hf["lm_head.weight"] = word_embeddings
return sd_hf
def main(args):
print(args)
checkpoint_path = args.checkpoint_path
save_path = args.save_path
if not isfile(checkpoint_path):
raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}")
checkpoint = torch.load(checkpoint_path, map_location="cpu")
config_megatron = checkpoint["hyper_parameters"]["cfg"]
config_hf = GPT2Config()
config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron)
config_hf.architectures = ["GPT2LMHeadModel"]
sd_megatron = checkpoint["state_dict"]
print("Converting")
sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf)
if args.print_checkpoint_structure:
recursive_print(None, sd_hf)
config_hf.tokenizer_class = "GPTSw3Tokenizer"
print("Saving config")
config_hf.save_pretrained(save_path)
output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin")
print(f'Saving checkpoint to "{output_checkpoint_file}"')
torch.save(sd_hf, output_checkpoint_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path",
type=str,
required=True,
help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000",
)
parser.add_argument("--save_path", type=str, required=True, help="e.g. /home/user/gpt-sw3/hf")
parser.add_argument("--print-checkpoint-structure", action="store_true")
_args = parser.parse_args()
main(_args)
.\models\gpt_sw3\tokenization_gpt_sw3.py
"""The tokenizer used by the GPT-SW3 models."""
import os
import re
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"AI-Sweden-Models/gpt-sw3-126m": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-126m/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-356m": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-356m/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-1.3b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-1.3b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-6.7b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-6.7b-v2": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-20b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b/resolve/main/spiece.model",
"AI-Sweden-Models/gpt-sw3-40b": "https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"AI-Sweden-Models/gpt-sw3-126m": 2048,
"AI-Sweden-Models/gpt-sw3-356m": 2048,
"AI-Sweden-Models/gpt-sw3-1.3b": 2048,
"AI-Sweden-Models/gpt-sw3-6.7b": 2048,
"AI-Sweden-Models/gpt-sw3-6.7b-v2": 2048,
"AI-Sweden-Models/gpt-sw3-20b": 2048,
"AI-Sweden-Models/gpt-sw3-40b": 2048,
}
class GPTSw3Tokenizer(PreTrainedTokenizer):
"""
Construct a GPTSw3 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Example usage:
```
>>> from transformers import GPTSw3Tokenizer
>>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")
>>> tokenizer("Svenska är kul!")["input_ids"]
[1814, 377, 3617, 63504]
```
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `False`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether or not to keep accents when tokenizing.
pad_token (`str`, *optional*):
The token used for padding, for example when batching sequences of different lengths. If not provided, will
default to '<pad>' or '<unk>' depending on model size.
unk_token (`str`, *optional*):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. If not provided, will default to '<unk>'.
eos_token (`str`, *optional*):
The end of sequence token seen during pretraining. If not provided, will default to '<|endoftext|>'
bos_token (`str`, *optional*):
The beginning of sequence token that can be used for downstream task, was not seen during pretraining. If
not provided, will default to '<s>' or '<|endoftext|>', depending on model size.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
whitespaces (`set`):
The whitespaces that are replaced in the whitespace normalization in preprocessing.
non_printing_characters_re (`Pattern`):
The compiled regular expression to remove non-printing characters in preprocessing.
"""
# List of vocabulary files' names
vocab_files_names = VOCAB_FILES_NAMES
# Map of pretrained vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Sizes of the positional embeddings of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs expected by the model
model_input_names = ["input_ids", "attention_mask"]
# Initializer: takes several arguments that configure the tokenizer
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=False,
keep_accents=False,
pad_token=None,
unk_token=None,
eos_token=None,
bos_token=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Default sp_model_kwargs to an empty dict when not provided
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# Read name_or_path from kwargs; warn and fall back to "None" if it is missing
name_or_path = kwargs.get("name_or_path")
if name_or_path is None:
logger.warning(
"name_or_path not provided, will work for all GPTSw3 models except gpt-sw3-7b,"
" you are testing the model, this can safely be ignored"
)
name_or_path = "None"
# Set defaults for eos_token and unk_token
eos_token = "<|endoftext|>" if eos_token is None else eos_token
unk_token = "<unk>" if unk_token is None else unk_token
# gpt-sw3-7b checkpoints use different defaults for pad_token and bos_token
if "gpt-sw3-7b" in name_or_path:
pad_token = unk_token if pad_token is None else pad_token
bos_token = eos_token if bos_token is None else bos_token
else:
pad_token = "<pad>" if pad_token is None else pad_token
bos_token = "<s>" if bos_token is None else bos_token
# Store the configuration on the instance
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
# Initialize self.sp_model with a SentencePieceProcessor and load the vocabulary file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
# Whitespace characters that are normalized in the input text
# fmt: off
self.whitespaces = {" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "", ""}
# fmt: on
# Regular expression that removes non-printing characters (e.g. certain Unicode control characters) during preprocessing
self.non_printing_characters_re = re.compile(
f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
)
# Call the parent class initializer
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
# Return the object's state with sp_model set to None (the SentencePiece processor is not picklable)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__setstate__
# Restore the object state from the given dict d by assigning it directly to __dict__
def __setstate__(self, d):
self.__dict__ = d
# For backward compatibility: ensure the sp_model_kwargs attribute exists
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# Re-create the SentencePieceProcessor with sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# Reload the vocabulary file into self.sp_model
self.sp_model.Load(self.vocab_file)
@property
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.vocab_size
# Return the size of the SentencePiece vocabulary (number of entries in the vocab)
def vocab_size(self) -> int:
return len(self.sp_model)
# Preprocess the given text and return the result
def preprocess_text(self, text: str) -> str:
"""
返回预处理后的文本。该过程与训练标记器时使用的过程相同。
"""
# Remove non-printing characters
text = self.non_printing_characters_re.sub("", text)
# Normalize whitespace characters
text = "".join([char if char not in self.whitespaces else " " for char in text])
# NFC Unicode normalization
text = unicodedata.normalize("NFC", text)
return text
# Tokenize the given text and return the list of tokens
def _tokenize(self, text: str, **kwargs) -> List[str]:
text = self.preprocess_text(text)
return self.sp_model.encode(text, out_type=str)
# Convert a token (str) to its id (int) using the vocabulary
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) into an id (int) using the vocab."""
return self.sp_model.PieceToId(token)
# Convert an id (int) to its token (str) using the vocabulary
def _convert_id_to_token(self, index: int) -> str:
"""Converts an id (int) into a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
@staticmethod
# Return the input string unchanged, overriding the default clean-up behavior
def clean_up_tokenization(out_string: str) -> str:
"""Returns the input string; this overrides and removes the default clean-up behavior."""
return out_string
# Convert a sequence of tokens (strings) into a single string, keeping special tokens intact
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings) into a single string. Special tokens remain intact."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# Make sure that special tokens are not decoded using the sentencepiece model
if token in self.all_special_tokens:
# TODO: Check if this is needed; it ensures that decode(encode(doc)) != doc by adding extra whitespace to the decoded document
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.get_vocab
# Return the vocabulary as a token-to-id mapping, including added tokens
def get_vocab(self) -> Dict[str, int]:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocab file differs from the target path and exists, copy it to the output path
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Otherwise, write the serialized sp_model contents to the output file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the output file path as a tuple
return (out_vocab_file,)
def encode_fast(
self, text: Union[str, List[str]], return_tensors: Union[str, bool] = False
) -> Union[List[int], List[List[int]], "torch.Tensor"]:
"""
Encodes a text or batch of texts to token ids using preprocessing and the raw SP tokenizer. This has reduced
functionality but is often much faster.
Does NOT handle special tokens correctly, these can manually be added as ids afterwards.
Does NOT support padding, these can manually be added as ids afterwards.
Use default HuggingFace tokenization methods for full functionality.
Args:
text (`str` or `List[str]`): One or several text(s) to convert to token ids.
return_tensors (`str` or `bool`): Returns PyTorch tensors if set to True or "pt"
Returns:
`List[int]`, `List[List[int]]`, or `torch.Tensor`: The encoded text(s) as token ids.
"""
# For a single string, preprocess the text and encode it with sp_model
if isinstance(text, str):
text = self.preprocess_text(text)
token_ids = self.sp_model.encode(text)
# For a list of strings, preprocess each text and encode them with sp_model
else:
text = [self.preprocess_text(t) for t in text]
token_ids = self.sp_model.encode(text)
# Convert the encoded result to a PyTorch tensor if requested
if return_tensors is True or return_tensors == "pt":
token_ids = torch.tensor(token_ids)
# Return the encoded token ids
return token_ids
def decode_fast(self, token_ids: Union[int, List[int]]) -> str:
"""
Decodes a text or batch of texts from token ids using preprocessing and the raw SP tokenizer. This has reduced
functionality but is often much faster.
Args:
token_ids (`int` or `List[int]`): Encoded token or text as token id(s).
Returns:
`str`: Decoded text
"""
# Decode token_ids with sp_model and return the resulting text
return self.sp_model.decode(token_ids)
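A usage sketch of the fast paths above (assumes the `AI-Sweden-Models/gpt-sw3-126m` files can be downloaded and that sentencepiece and torch are installed):

```python
# Sketch of encode_fast/decode_fast: no special-token handling, no padding, but fast.
from transformers import GPTSw3Tokenizer

tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")

ids = tokenizer.encode_fast("Svenska är kul!")
batch_ids = tokenizer.encode_fast(["Svenska är kul!", "Hej!"])
print(tokenizer.decode_fast(ids))

# return_tensors="pt" yields a torch.Tensor instead of a Python list
# (best used with a single text, since ragged batches cannot be stacked without padding).
tensor_ids = tokenizer.encode_fast("Svenska är kul!", return_tensors="pt")
```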
@property
# Default chat template that formats messages like an instant-messenger chat log, with "User:" and "Bot:"
# prefixes before each message and BOS tokens separating the messages.
def default_chat_template(self):
"""
This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings
preceding messages. BOS tokens are added between all messages.
"""
# Warn once that no chat template is defined for this tokenizer and that the class default template is used.
# See https://huggingface.co/docs/transformers/main/chat_templating for more information.
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return the chat template string: a BOS token between messages, a role prefix added to each message,
# and a trailing "Bot:" to prompt the next response.
return (
"{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
"{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
"{{ message['text'] }}{{ bos_token }}"
"{% endfor %}"
"Bot:" # 最后追加一个固定的 "Bot:" 字符串,表示消息结束
)
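A hedged sketch of rendering this template (assumes the tokenizer files can be downloaded and a transformers version that still falls back to `default_chat_template`; since the template also references `message['text']`, each message dict below carries an empty `'text'` key):

```python
# Sketch: rendering the default GPT-SW3 chat template as plain text.
from transformers import GPTSw3Tokenizer

tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")

messages = [
    {"role": "user", "content": "Hej!", "text": ""},
    {"role": "assistant", "content": "Hej hej!", "text": ""},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# Expected shape of the output: "User: ..." / "Bot: ..." turns separated by BOS tokens,
# ending with a trailing "Bot:" prompt.
print(prompt)
```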
.\models\gpt_sw3\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_gpt_sw3"] = ["GPTSw3Tokenizer"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_gpt_sw3 import GPTSw3Tokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\graphormer\collating_graphormer.py
from typing import Any, Dict, List, Mapping
import numpy as np
import torch
from ...utils import is_cython_available, requires_backends
if is_cython_available():
import pyximport
pyximport.install(setup_args={"include_dirs": np.get_include()})
from . import algos_graphormer
def convert_to_single_emb(x, offset: int = 512):
feature_num = x.shape[1] if len(x.shape) > 1 else 1
feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
x = x + feature_offset
return x
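A quick numeric sketch of `convert_to_single_emb`: each feature column is shifted into its own id range so that distinct columns never share embedding indices (values below follow directly from the offsets in the function):

```python
# Toy check of convert_to_single_emb with two nodes and two categorical features.
import numpy as np

x = np.array([[3, 7], [4, 8]], dtype=np.int64)
shifted = convert_to_single_emb(x, offset=512)
print(shifted)
# [[  4 520]
#  [  5 521]]   column 0 is shifted by +1, column 1 by +513
```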
def preprocess_item(item, keep_features=True):
requires_backends(preprocess_item, ["cython"])
if keep_features and "edge_attr" in item.keys():
edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
else:
edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64)
if keep_features and "node_feat" in item.keys():
node_feature = np.asarray(item["node_feat"], dtype=np.int64)
else:
node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64)
edge_index = np.asarray(item["edge_index"], dtype=np.int64)
input_nodes = convert_to_single_emb(node_feature) + 1
num_nodes = item["num_nodes"]
if len(edge_attr.shape) == 1:
edge_attr = edge_attr[:, None]
attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1
adj = np.zeros([num_nodes, num_nodes], dtype=bool)
adj[edge_index[0], edge_index[1]] = True
shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
max_dist = np.amax(shortest_path_result)
input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single)
item["input_nodes"] = input_nodes + 1
item["attn_bias"] = attn_bias
item["attn_edge_type"] = attn_edge_type
item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1
item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1
item["out_degree"] = item["in_degree"]
item["input_edges"] = input_edges + 1
if "labels" not in item:
item["labels"] = item["y"]
return item
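For readers without the Cython extension, here is a rough pure-numpy sketch of the shortest-path part of `algos_graphormer.floyd_warshall` on a tiny path graph. This is an approximation for illustration only: the real helper is Cython, it also returns the intermediate-node `path` matrix consumed by `gen_edge_input`, and the sentinel used for unreachable pairs here is an assumption.
```
import numpy as np

UNREACHABLE = 510  # assumed large sentinel for disconnected node pairs

def floyd_warshall_dist(adj: np.ndarray) -> np.ndarray:
    n = adj.shape[0]
    dist = np.where(adj, 1, UNREACHABLE).astype(np.int64)
    np.fill_diagonal(dist, 0)
    for k in range(n):
        # relax all pairs through intermediate node k
        dist = np.minimum(dist, dist[:, [k]] + dist[[k], :])
    return dist

adj = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=bool)  # path graph 0-1-2
print(floyd_warshall_dist(adj))
# [[0 1 2]
#  [1 0 1]
#  [2 1 0]]
```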
class GraphormerDataCollator:
def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
if not is_cython_available():
raise ImportError("Graphormer preprocessing needs Cython (pyximport)")
self.spatial_pos_max = spatial_pos_max
self.on_the_fly_processing = on_the_fly_processing
.\models\graphormer\configuration_graphormer.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"graphormer-base": "https://huggingface.co/clefourrier/graphormer-base-pcqm4mv2/resolve/main/config.json",
}
class GraphormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Graphormer
[graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "graphormer"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
num_classes: int = 1,
num_atoms: int = 512 * 9,
num_edges: int = 512 * 3,
num_in_degree: int = 512,
num_out_degree: int = 512,
num_spatial: int = 512,
num_edge_dis: int = 128,
multi_hop_max_dist: int = 5,
spatial_pos_max: int = 1024,
edge_type: str = "multi_hop",
max_nodes: int = 512,
share_input_output_embed: bool = False,
num_hidden_layers: int = 12,
embedding_dim: int = 768,
ffn_embedding_dim: int = 768,
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
apply_graphormer_init: bool = False,
activation_fn: str = "gelu",
embed_scale: float = None,
freeze_embeddings: bool = False,
num_trans_layers_to_freeze: int = 0,
traceable: bool = False,
q_noise: float = 0.0,
qn_block_size: int = 8,
kdim: int = None,
vdim: int = None,
bias: bool = True,
self_attention: bool = True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.num_classes = num_classes
self.num_atoms = num_atoms
self.num_in_degree = num_in_degree
self.num_out_degree = num_out_degree
self.num_edges = num_edges
self.num_spatial = num_spatial
self.num_edge_dis = num_edge_dis
self.edge_type = edge_type
self.multi_hop_max_dist = multi_hop_max_dist
self.spatial_pos_max = spatial_pos_max
self.max_nodes = max_nodes
self.num_hidden_layers = num_hidden_layers
self.embedding_dim = embedding_dim
self.hidden_size = embedding_dim
self.ffn_embedding_dim = ffn_embedding_dim
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
self.apply_graphormer_init = apply_graphormer_init
self.activation_fn = activation_fn
self.embed_scale = embed_scale
self.freeze_embeddings = freeze_embeddings
self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
self.share_input_output_embed = share_input_output_embed
self.traceable = traceable
self.q_noise = q_noise
self.qn_block_size = qn_block_size
self.kdim = kdim
self.vdim = vdim
self.self_attention = self_attention
self.bias = bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
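A small sanity-check sketch of the constructor above (assuming a transformers version that still ships Graphormer): `hidden_size` mirrors `embedding_dim`, and the token ids default to pad=0 / bos=1 / eos=2.
```
from transformers import GraphormerConfig

config = GraphormerConfig(embedding_dim=128, num_hidden_layers=4)
print(config.hidden_size, config.num_hidden_layers, config.pad_token_id)
# 128 4 0
```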
.\models\graphormer\modeling_graphormer.py
""" PyTorch Graphormer model."""
import math
from typing import Iterable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithNoAttention,
SequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_graphormer import GraphormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1"
_CONFIG_FOR_DOC = "GraphormerConfig"
GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"clefourrier/graphormer-base-pcqm4mv1",
"clefourrier/graphormer-base-pcqm4mv2",
]
def quant_noise(module: nn.Module, p: float, block_size: int):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py
Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product
Quantization as described in "Training with Quantization Noise for Extreme Model Compression"
Args:
- module: nn.Module
- p: amount of Quantization Noise
- block_size: size of the blocks for subsequent quantization with iPQ
Remarks:
- Module weights must have the right sizes wrt the block size
- Only Linear, Embedding and Conv2d modules are supported for the moment
- For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down:
Revisiting the Quantization of Neural Networks"
- We implement the simplest form of noise here as stated in the paper which consists in randomly dropping
blocks
"""
if p <= 0:
return module
if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)):
raise NotImplementedError("Module unsupported for quant_noise.")
is_conv = module.weight.ndim == 4
if not is_conv:
if module.weight.size(1) % block_size != 0:
raise AssertionError("Input features must be a multiple of block sizes")
else:
if module.kernel_size == (1, 1):
if module.in_channels % block_size != 0:
raise AssertionError("Input channels must be a multiple of block sizes")
else:
k = module.kernel_size[0] * module.kernel_size[1]
if k % block_size != 0:
raise AssertionError("Kernel size must be a multiple of block size")
def _forward_pre_hook(mod, input):
if mod.training:
if not is_conv:
weight = mod.weight
in_features = weight.size(1)
out_features = weight.size(0)
mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
else:
weight = mod.weight
in_channels = mod.in_channels
out_channels = mod.out_channels
if mod.kernel_size == (1, 1):
mask = torch.zeros(
int(in_channels // block_size * out_channels),
device=weight.device,
)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
else:
mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
mask.bernoulli_(p)
mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
mask = mask.to(torch.bool)
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
module.register_forward_pre_hook(_forward_pre_hook)
return module
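Minimal usage sketch of `quant_noise` as defined above: wrap a Linear layer whose `in_features` is a multiple of `block_size`; in training mode the forward pre-hook zeroes random weight blocks and rescales the survivors by 1/(1-p).
```
import torch
import torch.nn as nn

layer = quant_noise(nn.Linear(16, 8), p=0.25, block_size=4)  # 16 % 4 == 0
layer.train()
_ = layer(torch.randn(2, 16))              # the pre-hook fires here and masks whole blocks
print((layer.weight == 0).float().mean())  # roughly 0.25 of the weights are now zero
```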
class LayerDropModuleList(nn.ModuleList):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py
A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in
https://arxiv.org/abs/1909.11556.
We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During
evaluation we always iterate over all layers.
Usage:
```
layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3])
for layer in layers: # this might iterate over layers 1 and 3
x = layer(x)
for layer in layers: # this might iterate over all layers
x = layer(x)
for layer in layers: # this might not iterate over any layers
x = layer(x)
```
Args:
p (float): probability of dropping out each layer
modules (iterable, optional): an iterable of modules to add
"""
def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None):
super().__init__(modules)
self.p = p
def __iter__(self) -> Iterator[nn.Module]:
dropout_probs = torch.empty(len(self)).uniform_()
for i, m in enumerate(super().__iter__()):
if not self.training or (dropout_probs[i] > self.p):
yield m
class GraphormerGraphNodeFeature(nn.Module):
"""
Compute node features for each node in the graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.num_atoms = config.num_atoms
self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id)
self.in_degree_encoder = nn.Embedding(
config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.out_degree_encoder = nn.Embedding(
config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.graph_token = nn.Embedding(1, config.hidden_size)
def forward(
self,
input_nodes: torch.LongTensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
node_feature = (
self.atom_encoder(input_nodes).sum(dim=-2)
+ self.in_degree_encoder(in_degree)
+ self.out_degree_encoder(out_degree)
)
graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1)
graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1)
return graph_node_feature
class GraphormerGraphAttnBias(nn.Module):
"""
Compute attention bias for each head.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.multi_hop_max_dist = config.multi_hop_max_dist
self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0)
self.edge_type = config.edge_type
if self.edge_type == "multi_hop":
self.edge_dis_encoder = nn.Embedding(
config.num_edge_dis * config.num_attention_heads * config.num_attention_heads,
1,
)
self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0)
self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads)
def forward(
self,
input_nodes: torch.LongTensor,
attn_bias: torch.Tensor,
spatial_pos: torch.LongTensor,
input_edges: torch.LongTensor,
attn_edge_type: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
graph_attn_bias = attn_bias.clone()
graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(
1, self.num_heads, 1, 1
)
spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias
t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t
if self.edge_type == "multi_hop":
spatial_pos_ = spatial_pos.clone()
spatial_pos_[spatial_pos_ == 0] = 1
spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_)
if self.multi_hop_max_dist > 0:
spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist)
input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :]
input_edges = self.edge_encoder(input_edges).mean(-2)
max_dist = input_edges.size(-2)
edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads)
edge_input_flat = torch.bmm(
edge_input_flat,
self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :],
)
input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute(
1, 2, 3, 0, 4
)
input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2)
else:
input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges
graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1)
return graph_attn_bias
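# Note: the following methods belong to GraphormerMultiheadAttention; its constructor (which builds the q/k/v and output projections and sets `qkv_same_dim`) is not reproduced in this excerpt.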
def reset_parameters(self):
if self.qkv_same_dim:
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
nn.init.xavier_uniform_(self.k_proj.weight)
nn.init.xavier_uniform_(self.v_proj.weight)
nn.init.xavier_uniform_(self.q_proj.weight)
nn.init.xavier_uniform_(self.out_proj.weight)
if self.out_proj.bias is not None:
nn.init.constant_(self.out_proj.bias, 0.0)
def forward(
self,
query: torch.LongTensor,
key: Optional[torch.Tensor],
value: Optional[torch.Tensor],
attn_bias: Optional[torch.Tensor],
key_padding_mask: Optional[torch.Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[torch.Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
):
def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor:
return attn_weights
class GraphormerGraphEncoderLayer(nn.Module):
def __init__(self, config: GraphormerConfig) -> None:
super().__init__()
self.embedding_dim = config.embedding_dim
self.num_attention_heads = config.num_attention_heads
self.q_noise = config.q_noise
self.qn_block_size = config.qn_block_size
self.pre_layernorm = config.pre_layernorm
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False)
self.activation_fn = ACT2FN[config.activation_fn]
self.self_attn = GraphormerMultiheadAttention(config)
self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim)
self.fc1 = self.build_fc(
self.embedding_dim,
config.ffn_embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
self.fc2 = self.build_fc(
config.ffn_embedding_dim,
self.embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
def build_fc(
self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int
) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]:
return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
def forward(
self,
input_nodes: torch.Tensor,
self_attn_bias: Optional[torch.Tensor] = None,
self_attn_mask: Optional[torch.Tensor] = None,
self_attn_padding_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Returns a tuple whose first element is the hidden states (torch.Tensor) and whose second element is the optional attention weights (torch.Tensor or None).
"""
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
input_nodes, attn = self.self_attn(
query=input_nodes,
key=input_nodes,
value=input_nodes,
attn_bias=self_attn_bias,
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
input_nodes = self.activation_fn(self.fc1(input_nodes))
input_nodes = self.activation_dropout_module(input_nodes)
input_nodes = self.fc2(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
return input_nodes, attn
class GraphormerGraphEncoder(nn.Module):
def __init__(self, config: GraphormerConfig):
super().__init__()
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.layerdrop = config.layerdrop
self.embedding_dim = config.embedding_dim
self.apply_graphormer_init = config.apply_graphormer_init
self.traceable = config.traceable
self.graph_node_feature = GraphormerGraphNodeFeature(config)
self.graph_attn_bias = GraphormerGraphAttnBias(config)
self.embed_scale = config.embed_scale
if config.q_noise > 0:
self.quant_noise = quant_noise(
nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
config.q_noise,
config.qn_block_size,
)
else:
self.quant_noise = None
if config.encoder_normalize_before:
self.emb_layer_norm = nn.LayerNorm(self.embedding_dim)
else:
self.emb_layer_norm = None
if config.pre_layernorm:
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
if self.layerdrop > 0.0:
self.layers = LayerDropModuleList(p=self.layerdrop)
else:
self.layers = nn.ModuleList([])
self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)])
if config.freeze_embeddings:
raise NotImplementedError("Freezing embeddings is not implemented yet.")
for layer in range(config.num_trans_layers_to_freeze):
m = self.layers[layer]
if m is not None:
for p in m.parameters():
p.requires_grad = False
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb=None,
last_state_only: bool = False,
token_embeddings: Optional[torch.Tensor] = None,
attn_mask: Optional[torch.Tensor] = None,
) -> Tuple[Union[torch.Tensor, List[torch.LongTensor]], torch.Tensor]:
data_x = input_nodes
n_graph, n_node = data_x.size()[:2]
padding_mask = (data_x[:, :, 0]).eq(0)
padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype)
padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1)
attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type)
if token_embeddings is not None:
input_nodes = token_embeddings
else:
input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree)
if perturb is not None:
input_nodes[:, 1:, :] += perturb
if self.embed_scale is not None:
input_nodes = input_nodes * self.embed_scale
if self.quant_noise is not None:
input_nodes = self.quant_noise(input_nodes)
if self.emb_layer_norm is not None:
input_nodes = self.emb_layer_norm(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = input_nodes.transpose(0, 1)
inner_states = []
if not last_state_only:
inner_states.append(input_nodes)
for layer in self.layers:
input_nodes, _ = layer(
input_nodes,
self_attn_padding_mask=padding_mask,
self_attn_mask=attn_mask,
self_attn_bias=attn_bias,
)
if not last_state_only:
inner_states.append(input_nodes)
graph_rep = input_nodes[0, :, :]
if last_state_only:
inner_states = [input_nodes]
if self.traceable:
return torch.stack(inner_states), graph_rep
else:
return inner_states, graph_rep
class GraphormerDecoderHead(nn.Module):
def __init__(self, embedding_dim: int, num_classes: int):
super().__init__()
"""num_classes should be 1 for regression, or the number of classes for classification"""
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
self.classifier = nn.Linear(embedding_dim, num_classes, bias=False)
self.num_classes = num_classes
def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor:
input_nodes = self.classifier(input_nodes)
input_nodes = input_nodes + self.lm_output_learned_bias
return input_nodes
class GraphormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GraphormerConfig
base_model_prefix = "graphormer"
main_input_name_nodes = "input_nodes"
def normal_(self, data: torch.Tensor):
data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]):
"""
Initialize the weights specific to the Graphormer Model.
"""
if isinstance(module, nn.Linear):
self.normal_(module.weight.data)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
self.normal_(module.weight.data)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, GraphormerMultiheadAttention):
self.normal_(module.q_proj.weight.data)
self.normal_(module.k_proj.weight.data)
self.normal_(module.v_proj.weight.data)
def _init_weights(
self,
module: Union[
nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder
],
"""
初始化模型的权重
"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, GraphormerMultiheadAttention):
module.q_proj.weight.data.normal_(mean=0.0, std=0.02)
module.k_proj.weight.data.normal_(mean=0.0, std=0.02)
module.v_proj.weight.data.normal_(mean=0.0, std=0.02)
module.reset_parameters()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, GraphormerGraphEncoder):
if module.apply_graphormer_init:
module.apply(self.init_graphormer_params)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GraphormerModel(GraphormerPreTrainedModel):
"""The Graphormer model is a graph-encoder model.
It goes from a graph to its representation. If you want to use the model for a downstream classification task, use
GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine
this model with a downstream model of your choice, following the example in GraphormerForGraphClassification.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.max_nodes = config.max_nodes
self.graph_encoder = GraphormerGraphEncoder(config)
self.share_input_output_embed = config.share_input_output_embed
self.lm_output_learned_bias = None
self.load_softmax = not getattr(config, "remove_head", False)
self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim)
self.activation_fn = ACT2FN[config.activation_fn]
self.layer_norm = nn.LayerNorm(config.embedding_dim)
self.post_init()
def reset_output_layer_parameters(self):
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb: Optional[torch.FloatTensor] = None,
masked_tokens: None = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[Tuple[torch.LongTensor], BaseModelOutputWithNoAttention]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
inner_states, graph_rep = self.graph_encoder(
input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb
)
input_nodes = inner_states[-1].transpose(0, 1)
if masked_tokens is not None:
raise NotImplementedError
input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes)))
if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"):
input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight)
if not return_dict:
return tuple(x for x in [input_nodes, inner_states] if x is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states)
def max_nodes(self):
"""Maximum output length supported by the encoder."""
return self.max_nodes
class GraphormerForGraphClassification(GraphormerPreTrainedModel):
"""
This model can be used for graph-level classification or regression tasks.
It can be trained on
- regression (by setting config.num_classes to 1); there should be one float-type label per graph
- one task classification (by setting config.num_classes to the number of classes); there should be one integer
label per graph
- binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list
of integer labels for each graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.encoder = GraphormerModel(config)
self.embedding_dim = config.embedding_dim
self.num_classes = config.num_classes
self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes)
self.is_encoder_decoder = True
self.post_init()
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
input_nodes,
input_edges,
attn_bias,
in_degree,
out_degree,
spatial_pos,
attn_edge_type,
return_dict=True,
)
outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"]
head_outputs = self.classifier(outputs)
logits = head_outputs[:, 0, :].contiguous()
loss = None
if labels is not None:
mask = ~torch.isnan(labels)
if self.num_classes == 1:
loss_fct = MSELoss()
loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float())
elif self.num_classes > 1 and len(labels.shape) == 1:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1))
else:
loss_fct = BCEWithLogitsLoss(reduction="sum")
loss = loss_fct(logits[mask], labels[mask])
if not return_dict:
return tuple(x for x in [loss, logits, hidden_states] if x is not None)
return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None)
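The three labelling regimes described in the class docstring map onto the three loss branches above. A shape-only sketch with dummy tensors (not a real Graphormer forward pass):
```
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

logits = torch.randn(4, 3)  # pretend batch of 4 graphs with 3 output "classes"

# regression (num_classes == 1): one float label per graph -> MSE
print(MSELoss()(torch.randn(4), torch.randn(4)))

# single-task classification: one integer label per graph -> cross-entropy
print(CrossEntropyLoss()(logits, torch.tensor([0, 2, 1, 1])))

# binary multi-task classification: one 0/1 label per task and graph -> BCE-with-logits (summed)
print(BCEWithLogitsLoss(reduction="sum")(logits, torch.randint(0, 2, (4, 3)).float()))
```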
.\models\graphormer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_graphormer"] = [
"GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"GraphormerForGraphClassification",
"GraphormerModel",
"GraphormerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_graphormer import (
GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
GraphormerForGraphClassification,
GraphormerModel,
GraphormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\groupvit\configuration_groupvit.py
""" GroupViT model configuration"""
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
logger = logging.get_logger(__name__)
GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"nvidia/groupvit-gcc-yfcc": "https://huggingface.co/nvidia/groupvit-gcc-yfcc/resolve/main/config.json",
}
class GroupViTTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GroupViTTextModel`]. It is used to instantiate an
GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the GroupViT
[nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "groupvit_text_model"
def __init__(
self,
vocab_size=49408,
hidden_size=256,
intermediate_size=1024,
num_hidden_layers=12,
num_attention_heads=4,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "groupvit":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
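A hedged usage sketch of the classmethod above: loading from the composite `groupvit` checkpoint extracts the nested `text_config` dict. This requires network access, and the printed hidden size assumes the released checkpoint keeps the default text width of 256.
```
from transformers import GroupViTTextConfig

text_config = GroupViTTextConfig.from_pretrained("nvidia/groupvit-gcc-yfcc")
print(text_config.model_type, text_config.hidden_size)  # expected: groupvit_text_model 256
```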
class GroupViTVisionConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GroupViTVisionModel`]. It is used to instantiate a
GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the GroupViT [nvidia/groupvit-gcc-yfcc] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 384):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 1536):
Dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
depths (`List[int]`, *optional*, defaults to [6, 3, 3]):
The number of layers in each encoder block.
num_group_tokens (`List[int]`, *optional*, defaults to [64, 8, 0]):
The number of group tokens for each stage.
num_output_groups (`List[int]`, *optional*, defaults to [64, 8, 8]):
The number of output groups for each stage, 0 means no group.
num_attention_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. Supported strings are
`"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"`.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated-normal initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing).
Example:
```
>>> from transformers import GroupViTVisionConfig, GroupViTVisionModel

>>> # Initializing a GroupViTVisionConfig with the nvidia/groupvit-gcc-yfcc style configuration
>>> configuration = GroupViTVisionConfig()

>>> # Initializing a GroupViTVisionModel from the configuration defined above
>>> model = GroupViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""
# Define the GroupViTConfig class, which inherits from PretrainedConfig
class GroupViTConfig(PretrainedConfig):
r"""
[`GroupViTConfig`] is the configuration class to store the configuration of a [`GroupViTModel`]. It is used to
instantiate a GroupViT model according to the specified arguments, defining the text model and vision model
configurations. Instantiating a configuration with the defaults will yield a similar configuration to that of the
GroupViT [nvidia/groupvit-gcc-yfcc] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
projection_dim (`int`, *optional*, defaults to 256):
Dimensionality of the text and vision projection layers.
projection_intermediate_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the intermediate layer of the text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. The default matches the original GroupViT implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
# The model type is "groupvit"
model_type = "groupvit"
# Constructor: initialize a GroupViTConfig instance
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=256,
projection_intermediate_dim=4096,
logit_scale_init_value=2.6592,
**kwargs,
):
# Call the parent constructor to set the shared configuration options
super().__init__(**kwargs)
@classmethod
def from_text_vision_configs(cls, text_config: GroupViTTextConfig, vision_config: GroupViTVisionConfig, **kwargs):
r"""
Instantiate a [`GroupViTConfig`] (or a derived class) from a GroupViT text model configuration and a GroupViT
vision model configuration.
Returns:
[`GroupViTConfig`]: An instance of a configuration object
"""
# Create a new GroupViTConfig instance from the given text_config and vision_config
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
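Usage sketch for `from_text_vision_configs` (the constructor body that consumes the two config dicts is elided in this excerpt, but the classmethod is used as follows):
```
from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig

config = GroupViTConfig.from_text_vision_configs(GroupViTTextConfig(), GroupViTVisionConfig())
print(config.model_type)  # groupvit
```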
class GroupViTOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# Define the input names and their dynamic-axis mappings
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("attention_mask", {0: "batch", 1: "sequence"}),
]
)
@property
# Return an ordered dict mapping each output name to its dynamic axes (batch dimension only)
def outputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("logits_per_image", {0: "batch"}), # 输出标识"logits_per_image"对应值为{0: "batch"}
("logits_per_text", {0: "batch"}), # 输出标识"logits_per_text"对应值为{0: "batch"}
("text_embeds", {0: "batch"}), # 输出标识"text_embeds"对应值为{0: "batch"}
("image_embeds", {0: "batch"}), # 输出标识"image_embeds"对应值为{0: "batch"}
]
)
# Absolute error tolerance used when validating the exported ONNX model
@property
def atol_for_validation(self) -> float:
return 1e-4
# Generate a dummy input dict by combining dummy text inputs and dummy image inputs
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
# Call the parent method to generate the dummy text inputs
text_input_dict = super().generate_dummy_inputs(
processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
)
# Call the parent method to generate the dummy image inputs
image_input_dict = super().generate_dummy_inputs(
processor.image_processor, batch_size=batch_size, framework=framework
)
# Merge the text and image input dicts and return the result
return {**text_input_dict, **image_input_dict}
# Default ONNX opset version
@property
def default_onnx_opset(self) -> int:
return 14
.\models\groupvit\convert_groupvit_nvlab_to_hf.py
import argparse
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel
def rename_key(name):
if "img_encoder.pos_embed" in name:
name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings")
if "img_encoder.patch_embed.proj" in name:
name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection")
if "img_encoder.patch_embed.norm" in name:
name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm")
if "img_encoder.layers" in name:
name = name.replace("img_encoder.layers", "vision_model.encoder.stages")
if "blocks" in name and "res" not in name:
name = name.replace("blocks", "layers")
if "attn" in name and "pre_assign" not in name:
name = name.replace("attn", "self_attn")
if "proj" in name and "self_attn" in name and "text" not in name:
name = name.replace("proj", "out_proj")
if "pre_assign_attn.attn.proj" in name:
name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj")
if "norm1" in name:
name = name.replace("norm1", "layer_norm1")
if "norm2" in name and "pre_assign" not in name:
name = name.replace("norm2", "layer_norm2")
if "img_encoder.norm" in name:
name = name.replace("img_encoder.norm", "vision_model.layernorm")
if "text_encoder.token_embedding" in name:
name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding")
if "text_encoder.positional_embedding" in name:
name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight")
if "text_encoder.transformer.resblocks." in name:
name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.")
if "ln_1" in name:
name = name.replace("ln_1", "layer_norm1")
if "ln_2" in name:
name = name.replace("ln_2", "layer_norm2")
if "c_fc" in name:
name = name.replace("c_fc", "fc1")
if "c_proj" in name:
name = name.replace("c_proj", "fc2")
if "text_encoder" in name:
name = name.replace("text_encoder", "text_model")
if "ln_final" in name:
name = name.replace("ln_final", "final_layer_norm")
if "img_projector.linear_hidden." in name:
name = name.replace("img_projector.linear_hidden.", "visual_projection.")
if "img_projector.linear_out." in name:
name = name.replace("img_projector.linear_out.", "visual_projection.3.")
if "text_projector.linear_hidden" in name:
name = name.replace("text_projector.linear_hidden", "text_projection")
if "text_projector.linear_out" in name:
name = name.replace("text_projector.linear_out", "text_projection.3")
return name
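Two sample keys run through `rename_key` as defined above, to make the mapping rules concrete:
```
print(rename_key("img_encoder.patch_embed.proj.weight"))
# vision_model.embeddings.patch_embeddings.projection.weight
print(rename_key("text_encoder.transformer.resblocks.0.ln_1.weight"))
# text_model.encoder.layers.0.layer_norm1.weight
```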
def convert_state_dict(orig_state_dict, config):
return orig_state_dict
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_groupvit_checkpoint(
checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False
):
"""
Copy/paste/tweak the model's weights to fit the Transformers design.
"""
config = GroupViTConfig()
model = GroupViTModel(config).eval()
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
new_state_dict = convert_state_dict(state_dict, config)
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
assert missing_keys == ["text_model.embeddings.position_ids"]
assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = prepare_img()
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
if model_name == "groupvit-gcc-yfcc":
expected_logits = torch.tensor([[13.3523, 6.3629]])
elif model_name == "groupvit-gcc-redcaps":
expected_logits = torch.tensor([[16.1873, 8.6230]])
else:
raise ValueError(f"Model name {model_name} not supported.")
assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)
processor.save_pretrained(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
print("Successfully saved processor and model to", pytorch_dump_folder_path)
if push_to_hub:
print("Pushing to the hub...")
processor.push_to_hub(model_name, organization="nielsr")
model.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model."
)
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint")
parser.add_argument(
"--model_name",
default="groupvit-gccy-fcc",
type=str,
help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub using the provided `model_name`.",
)
args = parser.parse_args()
convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\groupvit\modeling_groupvit.py
import collections.abc
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/groupvit-gcc-yfcc",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
"""
Compute the contrastive (cross-entropy over matched pairs) loss.
Args:
logits (torch.Tensor): the similarity scores produced by the model
Returns:
torch.Tensor: the contrastive loss value
"""
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
"""
Compute the GroupViT loss as the average of the caption-to-image and image-to-caption contrastive losses.
Args:
similarity (torch.Tensor): the image-text similarity matrix
Returns:
torch.Tensor: the GroupViT loss value
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
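A worked example of the symmetric contrastive loss above: with a strongly diagonal similarity matrix (well-matched image/text pairs) the loss is near zero, while a completely flat matrix gives roughly log(N):
```
import torch

print(groupvit_loss(10.0 * torch.eye(4)))  # close to 0
print(groupvit_loss(torch.zeros(4, 4)))    # about log(4) ≈ 1.386
```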
def hard_softmax(logits: torch.Tensor, dim: int):
"""
Hard (straight-through) softmax: one-hot in the forward pass, with gradients taken from the soft softmax.
Args:
logits (torch.Tensor): the logits produced by the model
dim (int): the dimension along which softmax is applied
Returns:
torch.Tensor: the straight-through one-hot tensor
"""
y_soft = logits.softmax(dim)
index = y_soft.max(dim, keepdim=True)[1]
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
return ret
def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
"""
Gumbel-Softmax sampling.
Args:
logits (torch.Tensor): the logits produced by the model
tau (float): the temperature of the Gumbel distribution, defaults to 1
hard (bool): whether to use the hard (straight-through) variant, defaults to False
dim (int): the dimension along which softmax is applied, defaults to -1
Returns:
torch.Tensor: the (relaxed or straight-through) sample
"""
gumbel_dist = torch.distributions.gumbel.Gumbel(
torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
)
gumbels = gumbel_dist.sample(logits.shape)
gumbels = (logits + gumbels) / tau
y_soft = gumbels.softmax(dim)
if hard:
index = y_soft.max(dim, keepdim=True)[1]
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
ret = y_soft
return ret
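A short sketch of the straight-through behaviour of `gumbel_softmax` as defined above: with `hard=True` the forward output is one-hot, yet gradients still reach the logits through the soft relaxation (the weighting vector below is arbitrary, just to make the gradient non-trivial):
```
import torch

logits = torch.randn(2, 5, requires_grad=True)
values = torch.arange(5.0)
sample = gumbel_softmax(logits, tau=0.5, hard=True, dim=-1)
print(sample)                       # each row is one-hot
(sample * values).sum().backward()
print(logits.grad.abs().sum() > 0)  # True: gradient flows via the soft sample
```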
def resize_attention_map(attentions, height, width, align_corners=False):
"""
Args:
attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
height (`int`): height of the output attention map
width (`int`): width of the output attention map
align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.
Returns:
`torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
"""
scale = (height * width // attentions.shape[2]) ** 0.5
if height > width:
feat_width = int(np.round(width / scale))
feat_height = attentions.shape[2] // feat_width
else:
feat_height = int(np.round(height / scale))
feat_width = attentions.shape[2] // feat_height
batch_size = attentions.shape[0]
groups = attentions.shape[1]
attentions = attentions.reshape(batch_size, groups, feat_height, feat_width)
attentions = nn.functional.interpolate(
attentions, size=(height, width), mode="bilinear", align_corners=align_corners
)
return attentions
def get_grouping_from_attentions(attentions, hw_shape):
"""
Args:
attentions (`tuple(torch.FloatTensor)`): tuple of attention maps returned by `GroupViTVisionTransformer`
hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
`torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
"""
attn_maps = []
with torch.no_grad():
prev_attn_masks = None
for attn_masks in attentions:
attn_masks = attn_masks.permute(0, 2, 1).contiguous()
if prev_attn_masks is None:
prev_attn_masks = attn_masks
else:
prev_attn_masks = prev_attn_masks @ attn_masks
cur_attn_map = resize_attention_map(prev_attn_masks.permute(0, 2, 1).contiguous(), *hw_shape)
attn_maps.append(cur_attn_map)
final_grouping = attn_maps[-1]
return final_grouping
class GroupViTCrossAttentionLayer(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.attn = GroupViTAttention(config)
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp = GroupViTMLP(config)
self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, query, key):
x = query
x = x + self.attn(query, encoder_hidden_states=key)[0]
x = x + self.mlp(self.norm2(x))
x = self.norm_post(x)
return x
class GroupViTAssignAttention(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.scale = config.hidden_size**-0.5
self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.proj = nn.Linear(config.hidden_size, config.hidden_size)
self.assign_eps = config.assign_eps
def get_attn(self, attn, gumbel=True, hard=True):
if gumbel and self.training:
attn = gumbel_softmax(attn, dim=-2, hard=hard)
else:
if hard:
attn = hard_softmax(attn, dim=-2)
else:
attn = nn.functional.softmax(attn, dim=-2)
return attn
def forward(self, query, key):
value = key
query = self.q_proj(query)
key = self.k_proj(key)
value = self.v_proj(value)
raw_attn = (query @ key.transpose(-2, -1)) * self.scale
attn = self.get_attn(raw_attn)
soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)
attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)
out = attn @ value
out = self.proj(out)
return out, soft_attn
class GroupViTTokenAssign(nn.Module):
def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
super().__init__()
self.num_output_group = num_output_group
self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
assign_mlp_ratio = (
config.assign_mlp_ratio
if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
else (config.assign_mlp_ratio, config.assign_mlp_ratio)
)
tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pre_assign_attn = GroupViTCrossAttentionLayer(config)
self.assign = GroupViTAssignAttention(config)
self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim, config.hidden_size)
def project_group_token(self, group_tokens):
"""
Args:
group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]
Returns:
projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
"""
projected_group_tokens = self.mlp_inter(group_tokens)
projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
return projected_group_tokens
def forward(self, image_tokens, group_tokens):
"""
Args:
image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
"""
group_tokens = self.norm_tokens(group_tokens)
image_tokens = self.norm_x(image_tokens)
projected_group_tokens = self.project_group_token(group_tokens)
projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
new_image_tokens += projected_group_tokens
new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
return new_image_tokens, attention
@dataclass
class GroupViTModelOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTVisionModel`].
"""
loss: Optional[torch.FloatTensor] = None
logits_per_image: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
segmentation_logits: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class GroupViTPatchEmbeddings(nn.Module):
"""
Image to Patch Embedding.
"""
def __init__(
self,
image_size: int = 224,
patch_size: Union[int, Tuple[int, int]] = 16,
num_channels: int = 3,
embed_dim: int = 768,
):
super().__init__()
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if not interpolate_pos_encoding:
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model"
f" ({self.image_size[0]}*{self.image_size[1]})."
)
x = self.projection(pixel_values).flatten(2).transpose(1, 2)
return x
class GroupViTVisionEmbeddings(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.patch_embeddings = GroupViTPatchEmbeddings(
image_size=config.image_size,
patch_size=config.patch_size,
num_channels=config.num_channels,
embed_dim=config.hidden_size,
)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches, config.hidden_size))
self.dropout = nn.Dropout(config.dropout)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.config = config
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
"""
npatch = embeddings.shape[1]
if npatch == self.position_embeddings.shape[1] and height == width:
return self.position_embeddings
patch_pos_embed = self.position_embeddings
num_original_pos_embed = patch_pos_embed.shape[1]
dim = embeddings.shape[-1]
feat_height = height // self.config.patch_size
feat_width = width // self.config.patch_size
feat_height, feat_width = feat_height + 0.1, feat_width + 0.1
original_height = original_width = math.sqrt(num_original_pos_embed)
reshaped_patch_pos_embed = patch_pos_embed.reshape(1, int(original_height), int(original_width), dim).permute(
0, 3, 1, 2
)
scale_factor = (feat_height / original_height, feat_width / original_width)
patch_pos_embed = nn.functional.interpolate(
reshaped_patch_pos_embed,
scale_factor=scale_factor,
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return patch_pos_embed
def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
embeddings = self.layernorm(embeddings)
batch_size, seq_len, _ = embeddings.size()
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class GroupViTTextEmbeddings(nn.Module):
def __init__(self, config: GroupViTTextConfig):
super().__init__()
embed_dim = config.hidden_size
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
class GroupViTStage(nn.Module):
"""这对应于GroupViT实现中的`GroupingLayer`类。"""
def __init__(
self,
config: GroupViTVisionConfig,
depth: int,
num_prev_group_token: int,
num_group_token: int,
num_output_group: int,
):
super().__init__()
self.depth = depth
self.num_group_token = num_group_token
if num_group_token > 0:
self.group_token = nn.Parameter(torch.zeros(1, num_group_token, config.hidden_size))
else:
self.group_token = None
self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(depth)])
if num_group_token > 0:
self.downsample = GroupViTTokenAssign(
config=config,
num_group_token=num_group_token,
num_output_group=num_output_group,
)
else:
self.downsample = None
if num_prev_group_token > 0 and num_group_token > 0:
self.group_projector = nn.Sequential(
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
GroupViTMixerMLP(config, num_prev_group_token, config.hidden_size // 2, num_group_token),
)
else:
self.group_projector = None
@property
def with_group_token(self):
return self.group_token is not None
def split_x(self, x):
if self.with_group_token:
return x[:, :-self.num_group_token], x[:, -self.num_group_token:]
else:
return x, None
def concat_x(self, x: torch.Tensor, group_token: Optional[torch.Tensor] = None) -> torch.Tensor:
if group_token is None:
return x
return torch.cat([x, group_token], dim=1)
def forward(
self,
hidden_states: torch.Tensor,
prev_group_token: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer, of shape `(batch, seq_len, embed_dim)`
prev_group_token (`torch.FloatTensor`, *optional*): group tokens from the previous stage, of shape `(batch, 1, embed_dim)`
output_attentions (`bool`, *optional*):
Whether or not to return the grouping (attention) tensors of the Grouping block.
"""
if self.with_group_token:
group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
if self.group_projector is not None:
group_token = group_token + self.group_projector(prev_group_token)
else:
group_token = None
x = hidden_states
cat_x = self.concat_x(x, group_token)
for layer in self.layers:
layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
cat_x = layer_out[0]
x, group_token = self.split_x(cat_x)
attention = None
if self.downsample is not None:
x, attention = self.downsample(x, group_token)
outputs = (x, group_token)
if output_attentions:
outputs = outputs + (attention,)
return outputs
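The stage's `concat_x`/`split_x` pair appends the learnable group tokens to the patch sequence before the transformer layers and peels them off again afterwards. A minimal shape sketch with hypothetical sizes (batch 2, 196 patch tokens, 64 group tokens, hidden size 8):
```
import torch

num_group_token = 64
x = torch.randn(2, 196, 8)                             # (batch, patch tokens, hidden)
group_token = torch.zeros(1, num_group_token, 8).expand(2, -1, -1)

cat_x = torch.cat([x, group_token], dim=1)             # concat_x -> (2, 260, 8)
patches = cat_x[:, :-num_group_token]                  # split_x  -> (2, 196, 8)
groups = cat_x[:, -num_group_token:]                   #             (2, 64, 8)
print(cat_x.shape, patches.shape, groups.shape)
```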
class GroupViTMLP(nn.Module):
def __init__(
self,
config: GroupViTVisionConfig,
hidden_size: Optional[int] = None,
intermediate_size: Optional[int] = None,
output_size: Optional[int] = None,
):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
hidden_size = hidden_size if hidden_size is not None else config.hidden_size
intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
output_size = output_size if output_size is not None else hidden_size
self.fc1 = nn.Linear(hidden_size, intermediate_size)
self.fc2 = nn.Linear(intermediate_size, output_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class GroupViTMixerMLP(GroupViTMLP):
def forward(self, x):
x = super().forward(x.transpose(1, 2))
return x.transpose(1, 2)
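`GroupViTMixerMLP` reuses `GroupViTMLP` but transposes the token and channel axes, so the linear layers mix information across tokens (mapping `num_prev_group_token` inputs to `num_group_token` outputs) rather than across channels. A rough stand-in with hypothetical sizes, not the config-driven module itself:
```
import torch
from torch import nn

num_prev, num_new, hidden = 64, 8, 16                  # hypothetical token counts and hidden size
mixer = nn.Sequential(nn.Linear(num_prev, 32), nn.GELU(), nn.Linear(32, num_new))

tokens = torch.randn(2, num_prev, hidden)              # (batch, tokens, channels)
mixed = mixer(tokens.transpose(1, 2)).transpose(1, 2)  # mix along the token axis
print(mixed.shape)                                     # torch.Size([2, 8, 16])
```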
class GroupViTAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
):
# For cross-attention (used in the grouping block), keys/values come from encoder_hidden_states.
is_cross_attention = encoder_hidden_states is not None
kv_states = encoder_hidden_states if is_cross_attention else hidden_states
bsz, tgt_len, embed_dim = hidden_states.size()
query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
key_states = self._shape(self.k_proj(kv_states), -1, bsz)
value_states = self._shape(self.v_proj(kv_states), -1, bsz)
# Scaled dot-product attention scores of shape (batch, num_heads, tgt_len, src_len).
attn_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale
# Masks are additive: masked positions hold very large negative values (see the encoder-layer docstring).
if causal_attention_mask is not None:
attn_scores = attn_scores + causal_attention_mask
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_probs = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = attn_probs if output_attentions else None
attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training)
attn_output = torch.matmul(attn_probs, value_states)
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
# Callers such as GroupViTEncoderLayer unpack (attn_output, attn_weights).
return attn_output, attn_weights
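A quick shape walk-through of the multi-head reshape performed by `_shape` and the scaled dot-product that follows, using hypothetical sizes (batch 2, sequence 5, `embed_dim` 16, 4 heads):
```
import torch

bsz, seq, embed_dim, num_heads = 2, 5, 16, 4
head_dim = embed_dim // num_heads

states = torch.randn(bsz, seq, embed_dim)
per_head = states.view(bsz, seq, num_heads, head_dim).transpose(1, 2)          # (2, 4, 5, 4)

scores = torch.matmul(per_head, per_head.transpose(-1, -2)) * head_dim**-0.5   # (2, 4, 5, 5)
probs = scores.softmax(dim=-1)
out = torch.matmul(probs, per_head).transpose(1, 2).reshape(bsz, seq, embed_dim)
print(per_head.shape, scores.shape, out.shape)
```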
class GroupViTEncoderLayer(nn.Module):
def __init__(self, config: GroupViTConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = GroupViTAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = GroupViTMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class GroupViTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GroupViTConfig
base_model_prefix = "groupvit"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
init_range = self.config.initializer_range
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=init_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
factor = self.config.initializer_factor
if isinstance(module, GroupViTTextEmbeddings):
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, GroupViTAttention):
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, GroupViTMLP):
in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
nn.init.normal_(module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
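For reference, the depth-scaled standard deviations used above shrink as the network gets deeper. A small arithmetic check with hypothetical values (`hidden_size`/`embed_dim` 384, 12 hidden layers, `initializer_factor` 1.0):
```
embed_dim, num_hidden_layers, factor = 384, 12, 1.0

in_proj_std = (embed_dim**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor
out_proj_std = (embed_dim**-0.5) * factor
fc_std = (2 * embed_dim) ** -0.5 * factor
print(round(in_proj_std, 5), round(out_proj_std, 5), round(fc_std, 5))  # 0.01042 0.05103 0.03608
```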
GROUPVIT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it. Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
GROUPVIT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class GroupViTVisionEncoder(nn.Module):
"""
Vision encoder module for GroupViT.
"""
def __init__(self, config: GroupViTVisionConfig) -> None:
super().__init__()
self.config = config
self.stages = nn.ModuleList(
[
GroupViTStage(
config=config,
depth=config.depths[i],
num_group_token=config.num_group_tokens[i],
num_output_group=config.num_output_groups[i],
num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
)
for i in range(len(config.depths))
]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
all_hidden_states = () if output_hidden_states else None
all_groupings = () if output_attentions else None
group_tokens = None
for i, stage in enumerate(self.stages):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = stage(hidden_states, group_tokens, output_attentions)
hidden_states = layer_outputs[0]
group_tokens = layer_outputs[1]
if output_attentions and layer_outputs[2] is not None:
all_groupings = all_groupings + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
)
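The encoder's hierarchy is driven entirely by `config.depths`, `config.num_group_tokens`, and `config.num_output_groups`; each stage receives the previous stage's output groups as `num_prev_group_token`. A small sketch that just prints this wiring for the default `GroupViTVisionConfig` (the printed numbers are whatever defaults your installed version defines):
```
from transformers import GroupViTVisionConfig

config = GroupViTVisionConfig()
for i in range(len(config.depths)):
    prev = config.num_output_groups[i - 1] if i > 0 else 0
    print(
        f"stage {i}: depth={config.depths[i]}, "
        f"group_tokens={config.num_group_tokens[i]}, "
        f"output_groups={config.num_output_groups[i]}, "
        f"prev_group_tokens={prev}"
    )
```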
class GroupViTTextEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
[`GroupViTEncoderLayer`].
Args:
config: GroupViTTextConfig
"""
def __init__(self, config: GroupViTTextConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the encoder.
Args:
inputs_embeds: Embedded input tokens.
attention_mask: Mask to avoid attention on padding tokens.
causal_attention_mask: Mask to apply causal masking in attention layers.
output_attentions: Whether to output attentions weights.
output_hidden_states: Whether to output hidden states.
return_dict: Whether to return a dictionary instead of a tuple.
Returns:
BaseModelOutput or tuple: the encoder's last hidden state, plus optional hidden states and attentions.
"""
pass
class GroupViTTextTransformer(nn.Module):
def __init__(self, config: GroupViTTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = GroupViTTextEmbeddings(config)
self.encoder = GroupViTTextEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.eos_token_id = config.eos_token_id
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the model.
Args:
input_ids: Input token IDs.
attention_mask: Mask to avoid attention on padding tokens.
position_ids: IDs indicating the position of each token in the sequence.
output_attentions: Whether to output attentions weights.
output_hidden_states: Whether to output hidden states.
return_dict: Whether to return a dictionary instead of a tuple.
Returns:
BaseModelOutputWithPooling: Output with pooled representation and optionally attentions and hidden states.
"""
pass
class GroupViTTextModel(GroupViTPreTrainedModel):
config_class = GroupViTTextConfig
def __init__(self, config: GroupViTTextConfig):
super().__init__(config)
self.text_model = GroupViTTextTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Defines the forward pass of the text model.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`: the model outputs, depending on `return_dict`.
Examples:
```
>>> from transformers import CLIPTokenizer, GroupViTTextModel
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```
"""
# Delegate to the text model's forward pass, forwarding all arguments.
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class GroupViTVisionTransformer(nn.Module):
def __init__(self, config: GroupViTVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = GroupViTVisionEmbeddings(config)
self.encoder = GroupViTVisionEncoder(config)
self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Forward pass of the vision transformer.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`, depending on `return_dict`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
hidden_states=hidden_states,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.layernorm(last_hidden_state)
pooled_output = last_hidden_state.mean(dim=1)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class GroupViTVisionModel(GroupViTPreTrainedModel):
config_class = GroupViTVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: GroupViTVisionConfig):
super().__init__(config)
self.vision_model = GroupViTVisionTransformer(config)
self.post_init()
def get_input_embeddings(self) -> GroupViTPatchEmbeddings:
return self.vision_model.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Forward pass of the vision model.
Returns:
[`BaseModelOutputWithPooling`] or `tuple`: the last hidden state and pooled output, plus optional hidden states and attentions.
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTVisionModel
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state  # last-layer hidden states
>>> pooled_output = outputs.pooler_output  # pooled output (mean over the final hidden states)
```
"""
# Delegate to the vision model's forward pass, forwarding all arguments.
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@add_start_docstrings(GROUPVIT_START_DOCSTRING)
class GroupViTModel(GroupViTPreTrainedModel):
config_class = GroupViTConfig
def __init__(self, config: GroupViTConfig):
super().__init__(config)
if not isinstance(config.text_config, GroupViTTextConfig):
raise ValueError(
"config.text_config is expected to be of type GroupViTTextConfig but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.vision_config, GroupViTVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
f" {type(config.vision_config)}."
)
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.projection_intermediate_dim = config.projection_intermediate_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
self.text_model = GroupViTTextTransformer(text_config)
self.vision_model = GroupViTVisionTransformer(vision_config)
self.visual_projection = nn.Sequential(
nn.Linear(self.vision_embed_dim, self.projection_intermediate_dim, bias=True),
nn.BatchNorm1d(self.projection_intermediate_dim),
nn.ReLU(inplace=True),
nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
)
self.text_projection = nn.Sequential(
nn.Linear(self.text_embed_dim, self.projection_intermediate_dim, bias=True),
nn.BatchNorm1d(self.projection_intermediate_dim),
nn.ReLU(inplace=True),
nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
)
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
self.post_init()
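Both projection heads follow the same `Linear -> BatchNorm1d -> ReLU -> Linear` pattern and operate on a 2-D pooled representation. A minimal sketch with hypothetical dimensions (hidden 384, intermediate 2048, projection 256):
```
import torch
from torch import nn

hidden, intermediate, projection = 384, 2048, 256      # hypothetical dims
head = nn.Sequential(
    nn.Linear(hidden, intermediate, bias=True),
    nn.BatchNorm1d(intermediate),                      # expects (batch, features), i.e. a pooled input
    nn.ReLU(inplace=True),
    nn.Linear(intermediate, projection, bias=True),
)
pooled = torch.randn(4, hidden)
print(head(pooled).shape)                              # torch.Size([4, 256])
```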
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`GroupViTTextModel`].
Examples:
```
>>> from transformers import CLIPTokenizer, GroupViTModel
>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = text_outputs[1]
text_features = self.text_projection(pooled_output)
return text_features
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
applying the projection layer to the pooled output of [`GroupViTVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTModel
>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = vision_outputs[1]
image_features = self.visual_projection(pooled_output)
return image_features
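Putting `get_text_features` and `get_image_features` together gives a CLIP-style zero-shot similarity score. A sketch using the checkpoint from the docstrings above; the explicit L2 normalization here is the usual contrastive comparison done with the standalone feature methods, while the model's own `forward` computes contrastive logits itself:
```
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPTokenizer, GroupViTModel

model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)
    image_features = model.get_image_features(**image_inputs)

# L2-normalize and compare (cosine similarity between the image and each prompt).
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
print(image_features @ text_features.T)
```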