Transformers Source Code Analysis (Part 82)
.\models\nllb_moe\modeling_nllb_moe.py
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
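# Usage sketch (illustrative, not part of modeling_nllb_moe.py): how `shift_tokens_right`
# builds decoder inputs from labels, replacing -100 with the pad token. Token ids are made up.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import shift_tokens_right

labels = torch.tensor([[5, 6, -100, -100]])          # -100 marks ignored label positions
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(decoder_input_ids)                             # tensor([[2, 5, 6, 1]])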
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
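# Worked example (illustrative, not part of the original file): with padding_idx = 1, real
# tokens get positions 2, 3, 4, ... while padding positions stay at the padding index.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import create_position_ids_from_input_ids

input_ids = torch.tensor([[7, 8, 9, 1, 1]])          # 1 is the pad token
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]])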
def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
if router_probs is None:
return 0
num_experts = router_probs.shape[-1]
if expert_indices.dtype != torch.int64:
expert_indices = expert_indices.to(torch.int64)
if len(expert_indices.shape) == 2:
expert_indices = expert_indices.unsqueeze(2)
expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts)
expert_mask = torch.max(expert_mask, axis=-2).values
expert_mask = expert_mask.to(torch.float32)
tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2)
return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2)
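# Illustrative sketch (not part of the original file): computing the auxiliary
# load-balancing loss on random router probabilities. Shapes are example values; inside
# the model this loss is scaled by `router_aux_loss_coef`.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import load_balancing_loss_func

batch_size, seq_len, num_experts = 2, 10, 4
router_probs = torch.softmax(torch.randn(batch_size, seq_len, num_experts), dim=-1)
expert_indices = router_probs.argmax(dim=-1)         # (batch_size, seq_len)
aux_loss = load_balancing_loss_func(router_probs, expert_indices)
print(aux_loss)                                      # scalar tensor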
class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
):
if input_ids is not None:
bsz, seq_len = input_ids.size()
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
else:
bsz, seq_len = inputs_embeds.size()[:-1]
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
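# Quick check (illustrative, not part of the original file): the module returns one
# sinusoidal vector per position, with positions derived from `input_ids` so that padding
# keeps a constant embedding.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeSinusoidalPositionalEmbedding

emb = NllbMoeSinusoidalPositionalEmbedding(num_positions=64, embedding_dim=16, padding_idx=1)
input_ids = torch.tensor([[5, 6, 7, 1]])             # 1 is the pad token
print(emb(input_ids).shape)                          # torch.Size([1, 4, 16])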
class NllbMoeTop2Router(nn.Module):
"""
Router using tokens choose top-2 experts assignment.
This router uses the same mechanism as in NLLB-MoE from the fairseq repository. Items are sorted by router_probs
and then routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee
that each token is processed by an expert**, or that each expert receives at least one token.
The router combining weights are also returned to make sure that the states that are not updated will be masked.
"""
def __init__(self, config: NllbMoeConfig):
super().__init__()
self.num_experts = config.num_experts
self.expert_capacity = config.expert_capacity
self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias)
self.router_ignore_padding_tokens = config.router_ignore_padding_tokens
self.dtype = getattr(torch, config.router_dtype)
self.second_expert_policy = config.second_expert_policy
self.normalize_router_prob_before_dropping = config.normalize_router_prob_before_dropping
self.batch_prioritized_routing = config.batch_prioritized_routing
self.moe_eval_capacity_token_fraction = config.moe_eval_capacity_token_fraction
def _cast_classifier(self):
r"""
`bitsandbytes` `Linear8bitLt` layers do not support manual casting. Therefore we need to check if they are an
instance of the `Linear8bitLt` class by checking special attributes.
"""
if not (hasattr(self.classifier, "SCB") or hasattr(self.classifier, "CB")):
self.classifier = self.classifier.to(self.dtype)
def normalize_router_probabilities(self, router_probs, top_1_mask, top_2_mask):
top_1_max_probs = (router_probs * top_1_mask).sum(dim=1)
top_2_max_probs = (router_probs * top_2_mask).sum(dim=1)
denom_s = torch.clamp(top_1_max_probs + top_2_max_probs, min=torch.finfo(router_probs.dtype).eps)
top_1_max_probs = top_1_max_probs / denom_s
top_2_max_probs = top_2_max_probs / denom_s
return top_1_max_probs, top_2_max_probs
def route_tokens(
self,
router_logits: torch.Tensor,
input_dtype: torch.dtype = torch.float32,
padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple:
# ... (body omitted in this excerpt: applies the capacity-constrained top-2 assignment
# and returns `(top_1_mask, router_probs)`)
...
def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.LongTensor] = None) -> Tuple:
r"""
The hidden states are reshaped to simplify the computation of the router probabilities (combining weights for
each experts.)
隐藏状态被重新整形,以简化路由概率的计算(结合每个专家的权重)。
Args:
hidden_states (`torch.Tensor`):
(batch_size, sequence_length, hidden_dim) from which router probabilities are computed.
用于计算路由概率的隐藏状态张量,形状为(batch_size, sequence_length, hidden_dim)。
Returns:
top_1_mask (`torch.Tensor` of shape (batch_size, sequence_length)):
Index tensor of shape [batch_size, sequence_length] corresponding to the expert selected for each token
using the top1 probabilities of the router.
形状为(batch_size, sequence_length)的索引张量,每个令牌对应的专家选择索引,使用路由器的top1概率。
router_probabilities (`torch.Tensor` of shape (batch_size, sequence_length, nump_experts)):
Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each
token and expert. Used for routing tokens to experts.
形状为(batch_size, sequence_length, num_experts)的张量,每个令牌和专家的概率值,用于将令牌路由到专家。
router_logits (`torch.Tensor` of shape (batch_size, sequence_length))):
Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits.
This is used later for computing router z-loss.
形状为(batch_size, sequence_length, num_experts)的原始路由器logits张量,用于后续计算路由器的z-loss。
"""
self.input_dtype = hidden_states.dtype
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.reshape((batch_size * sequence_length), hidden_dim)
hidden_states = hidden_states.to(self.dtype)
self._cast_classifier()
router_logits = self.classifier(hidden_states)
top_1_mask, router_probs = self.route_tokens(router_logits, self.input_dtype, padding_mask)
return top_1_mask, router_probs
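# Usage sketch (illustrative, not part of the original file): running the router on random
# hidden states. The small hyper-parameters are illustrative, not the released checkpoint's.
import torch
from transformers import NllbMoeConfig
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeTop2Router

config = NllbMoeConfig(d_model=32, num_experts=4, expert_capacity=8)
router = NllbMoeTop2Router(config)
hidden_states = torch.randn(2, 10, config.d_model)   # (batch, seq_len, hidden_dim)
top_1_mask, router_probs = router(hidden_states)
print(top_1_mask.shape, router_probs.shape)          # both (batch * seq_len, num_experts)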
class NllbMoeDenseActDense(nn.Module):
def __init__(self, config: NllbMoeConfig, ffn_dim: int):
super().__init__()
self.fc1 = nn.Linear(config.d_model, ffn_dim)
self.fc2 = nn.Linear(ffn_dim, config.d_model)
self.dropout = nn.Dropout(config.activation_dropout)
self.act = ACT2FN[config.activation_function]
def forward(self, hidden_states):
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.fc2.weight, torch.Tensor)
and hidden_states.dtype != self.fc2.weight.dtype
and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
):
hidden_states = hidden_states.to(self.fc2.weight.dtype)
hidden_states = self.fc2(hidden_states)
return hidden_states
class NllbMoeSparseMLP(nn.Module):
def __init__(self, config: NllbMoeConfig, ffn_dim: int, expert_class: nn.Module = NllbMoeDenseActDense):
super().__init__()
self.router = NllbMoeTop2Router(config)
self.moe_token_dropout = config.moe_token_dropout
self.token_dropout = nn.Dropout(self.moe_token_dropout)
self.num_experts = config.num_experts
self.experts = nn.ModuleDict()
for idx in range(self.num_experts):
self.experts[f"expert_{idx}"] = expert_class(config, ffn_dim)
def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = False):
r"""
The goal of this forward pass is to have the same number of operations as the equivalent `NllbMoeDenseActDense`
(mlp) layer. This means that all of the hidden states should be processed at most twice (since we are using a
top-2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
num_expert)` and corresponds to the boolean version of the `router_probs`. The inputs are masked using the
`router_mask`.
"""
batch_size, sequence_length, hidden_dim = hidden_states.shape
top_1_mask, router_probs = self.router(hidden_states, padding_mask)
router_mask = router_probs.bool()
hidden_states = hidden_states.reshape((batch_size * sequence_length), hidden_dim)
masked_hidden_states = torch.einsum("bm,be->ebm", hidden_states, router_mask)
for idx, expert in enumerate(self.experts.values()):
token_indices = router_mask[:, idx]
combining_weights = router_probs[token_indices, idx]
expert_output = expert(masked_hidden_states[idx, token_indices])
if self.moe_token_dropout > 0:
if self.training:
expert_output = self.token_dropout(expert_output)
else:
expert_output *= 1 - self.moe_token_dropout
masked_hidden_states[idx, token_indices] = torch.einsum("b,be->be", combining_weights, expert_output)
hidden_states = masked_hidden_states.sum(dim=0).reshape(batch_size, sequence_length, hidden_dim)
top_1_expert_index = torch.argmax(top_1_mask, dim=-1)
return hidden_states, (router_probs, top_1_expert_index)
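# Usage sketch (illustrative, not part of the original file): the sparse MoE feed-forward
# block keeps the input shape while routing each token to at most two experts. Config
# values below are illustrative.
import torch
from transformers import NllbMoeConfig
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeSparseMLP

config = NllbMoeConfig(d_model=32, num_experts=4, expert_capacity=8, encoder_ffn_dim=64)
sparse_mlp = NllbMoeSparseMLP(config, ffn_dim=config.encoder_ffn_dim)
hidden_states = torch.randn(2, 10, config.d_model)
output, (router_probs, top_1_expert_index) = sparse_mlp(hidden_states)
print(output.shape)                                  # torch.Size([2, 10, 32]), same as the input
print(top_1_expert_index.shape)                      # torch.Size([20]), one expert id per token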
class NllbMoeAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[NllbMoeConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# ... (multi-head attention computation omitted in this excerpt)
...
class NllbMoeEncoderLayer(nn.Module):
def __init__(self, config: NllbMoeConfig, is_sparse: bool = False):
super().__init__()
self.embed_dim = config.d_model
self.is_sparse = is_sparse
self.self_attn = NllbMoeAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if not self.is_sparse:
self.ffn = NllbMoeDenseActDense(config, ffn_dim=config.encoder_ffn_dim)
else:
self.ffn = NllbMoeSparseMLP(config, ffn_dim=config.encoder_ffn_dim)
self.ff_layer_norm = nn.LayerNorm(config.d_model)
self.ff_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
output_router_logits: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`):
input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`):
attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = self.attn_dropout(hidden_states)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ff_layer_norm(hidden_states)
if self.is_sparse:
hidden_states, router_states = self.ffn(hidden_states, attention_mask)
else:
hidden_states, router_states = self.ffn(hidden_states), None
hidden_states = self.ff_dropout(hidden_states)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
if output_router_logits:
outputs += (router_states,)
return outputs
class NllbMoeDecoderLayer(nn.Module):
def __init__(self, config: NllbMoeConfig, is_sparse: bool = False):
super().__init__()
self.embed_dim = config.d_model
self.is_sparse = is_sparse
self.self_attn = NllbMoeAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.cross_attention = NllbMoeAttention(
self.embed_dim, config.decoder_attention_heads, config.attention_dropout, is_decoder=True
)
self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
if not self.is_sparse:
self.ffn = NllbMoeDenseActDense(config, ffn_dim=config.decoder_ffn_dim)
else:
self.ffn = NllbMoeSparseMLP(config, ffn_dim=config.decoder_ffn_dim)
self.ff_layer_norm = nn.LayerNorm(config.d_model)
self.ff_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
output_router_logits: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class NllbMoePreTrainedModel(PreTrainedModel):
config_class = NllbMoeConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["NllbMoeEncoderLayer", "NllbMoeDecoderLayer"]
def _init_weights(self, module):
"""Initialize the weights"""
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
NLLB_MOE_START_DOCSTRING = r"""
Parameters:
config ([`NllbMoeConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
NLLB_MOE_GENERATION_EXAMPLE = r"""
Translation example:
```
>>> from transformers import AutoTokenizer, NllbMoeForConditionalGeneration
>>> model = NllbMoeForConditionalGeneration.from_pretrained("facebook/nllb-moe-54b")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b")
>>> text_to_translate = "Life is like a box of chocolates"
>>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
>>>
>>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("eng_Latn"))
>>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
```
"""
NLLB_MOE_INPUTS_DOCSTRING = r"""
"""
class NllbMoeEncoder(NllbMoePreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`NllbMoeEncoderLayer`].
Args:
config:
NllbMoeConfig
embed_tokens (nn.Embedding):
output embedding
"""
def __init__(self, config: NllbMoeConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = NllbMoeSinusoidalPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
self.padding_idx,
)
sparse_step = config.encoder_sparse_step
self.layers = nn.ModuleList()
for i in range(config.encoder_layers):
is_sparse = (i + 1) % sparse_step == 0 if sparse_step > 0 else False
self.layers.append(NllbMoeEncoderLayer(config, is_sparse))
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the NllbMoeEncoder.
Args:
input_ids (Optional[torch.Tensor]): Input token IDs.
attention_mask (Optional[torch.Tensor]): Attention mask.
head_mask (Optional[torch.Tensor]): Head mask for attention computation.
inputs_embeds (Optional[torch.Tensor]): Embedded input tokens.
output_attentions (Optional[bool]): Whether to output attentions.
output_hidden_states (Optional[bool]): Whether to output hidden states.
output_router_logits (Optional[bool]): Whether to output router logits.
return_dict (Optional[bool]): Whether to return a dictionary.
Returns:
Depending on `return_dict`, either a tuple or a dictionary with model outputs.
"""
# ... (full forward implementation omitted in this excerpt)
pass
class NllbMoeDecoder(NllbMoePreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`NllbMoeDecoderLayer`].
"""
Args:
config:
NllbMoeConfig
embed_tokens (nn.Embedding):
output embedding
"""
def __init__(self, config: NllbMoeConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = NllbMoeSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
sparse_step = config.decoder_sparse_step
self.layers = nn.ModuleList()
for i in range(config.decoder_layers):
is_sparse = (i + 1) % sparse_step == 0 if sparse_step > 0 else False
self.layers.append(NllbMoeDecoderLayer(config, is_sparse))
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt)
...
@add_start_docstrings(
"The bare NllbMoe Model outputting raw hidden-states without any specific head on top.",
NLLB_MOE_START_DOCSTRING,
)
class NllbMoeModel(NllbMoePreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: NllbMoeConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = NllbMoeEncoder(config, self.shared)
self.decoder = NllbMoeDecoder(config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(NLLB_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqMoEModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt: runs the encoder if needed, then the decoder, and
# returns a Seq2SeqMoEModelOutput or tuple)
...
@add_start_docstrings(
"The NllbMoe Model with a language modeling head. Can be used for summarization.", NLLB_MOE_START_DOCSTRING
)
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: NllbMoeConfig):
super().__init__(config)
self.model = NllbMoeModel(config)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.router_z_loss_coef = config.router_z_loss_coef
self.router_aux_loss_coef = config.router_aux_loss_coef
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(NLLB_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqMoEOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(NLLB_MOE_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt: runs the seq2seq model, computes the LM loss on
# `labels`, and adds the router z-loss and load-balancing loss scaled by the
# coefficients set in `__init__`)
...
def _unpack_router_logits(self, router_outputs):
total_router_logits = []
total_expert_indexes = []
for router_output in router_outputs:
if router_output is not None:
router_logits, expert_indexes = router_output
total_router_logits.append(router_logits)
total_expert_indexes.append(expert_indexes)
total_router_logits = torch.cat(total_router_logits, dim=1) if len(total_router_logits) > 0 else None
total_expert_indexes = torch.stack(total_expert_indexes, dim=1) if len(total_expert_indexes) > 0 else None
return total_router_logits, total_expert_indexes
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\nllb_moe\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_nllb_moe": [
"NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP",
"NllbMoeConfig",
]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nllb_moe"] = [
"NLLB_MOE_PRETRAINED_MODEL_ARCHIVE_LIST",
"NllbMoeForConditionalGeneration",
"NllbMoeModel",
"NllbMoePreTrainedModel",
"NllbMoeTop2Router",
"NllbMoeSparseMLP",
]
if TYPE_CHECKING:
from .configuration_nllb_moe import (
NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP,
NllbMoeConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nllb_moe import (
NLLB_MOE_PRETRAINED_MODEL_ARCHIVE_LIST,
NllbMoeForConditionalGeneration,
NllbMoeModel,
NllbMoePreTrainedModel,
NllbMoeSparseMLP,
NllbMoeTop2Router,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
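# Illustrative (not part of the original file): thanks to `_LazyModule`, the heavy PyTorch
# modeling code is only imported when one of these names is first accessed.
from transformers.models.nllb_moe import NllbMoeConfig                     # lightweight
from transformers.models.nllb_moe import NllbMoeForConditionalGeneration   # triggers the torch import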
.\models\nougat\convert_nougat_to_hf.py
"""Convert Nougat checkpoints using the original `nougat` library. URL:
https://github.com/facebookresearch/nougat/tree/main"""
import argparse
import torch
from huggingface_hub import hf_hub_download
from nougat import NougatModel
from nougat.dataset.rasterize import rasterize_paper
from nougat.utils.checkpoint import get_checkpoint
from PIL import Image
from transformers import (
DonutSwinConfig,
DonutSwinModel,
MBartConfig,
MBartForCausalLM,
NougatImageProcessor,
NougatProcessor,
NougatTokenizerFast,
VisionEncoderDecoderModel,
)
def get_configs(model):
original_config = model.config
encoder_config = DonutSwinConfig(
image_size=original_config.input_size,
patch_size=4,
depths=original_config.encoder_layer,
num_heads=[4, 8, 16, 32],
window_size=original_config.window_size,
embed_dim=128,
)
decoder_config = MBartConfig(
is_decoder=True,
is_encoder_decoder=False,
add_cross_attention=True,
decoder_layers=original_config.decoder_layer,
max_position_embeddings=original_config.max_position_embeddings,
vocab_size=len(
model.decoder.tokenizer
),
scale_embedding=True,
add_final_layer_norm=True,
tie_word_embeddings=False,
)
return encoder_config, decoder_config
def rename_key(name):
if "encoder.model" in name:
name = name.replace("encoder.model", "encoder")
if "decoder.model" in name:
name = name.replace("decoder.model", "decoder")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.norm")
if name.startswith("encoder"):
if "layers" in name:
name = "encoder." + name
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name and "mask" not in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if name == "encoder.norm.weight":
name = "encoder.layernorm.weight"
if name == "encoder.norm.bias":
name = "encoder.layernorm.bias"
return name
def convert_state_dict(orig_state_dict, model):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[3])
block_num = int(key_split[5])
dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size
if "weight" in key:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
] = val[:dim, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
] = val[dim : dim * 2, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
] = val[-dim:, :]
else:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
] = val[:dim]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
] = val[dim : dim * 2]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
] = val[-dim:]
elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]:
pass
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
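# Tiny illustration (not part of the original file) of the fused qkv split performed above,
# with made-up dimensions.
import torch

dim = 8                                    # stands in for one block's `all_head_size`
qkv_weight = torch.randn(3 * dim, dim)     # fused query/key/value projection weight
query, key, value = qkv_weight[:dim], qkv_weight[dim : dim * 2], qkv_weight[-dim:]
assert query.shape == key.shape == value.shape == (dim, dim)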
def convert_nougat_checkpoint(model_tag, pytorch_dump_folder_path=None, push_to_hub=False):
checkpoint_path = get_checkpoint(None, model_tag)
original_model = NougatModel.from_pretrained(checkpoint_path)
original_model.eval()
encoder_config, decoder_config = get_configs(original_model)
encoder = DonutSwinModel(encoder_config)
decoder = MBartForCausalLM(decoder_config)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
state_dict = original_model.state_dict()
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
filepath = hf_hub_download(repo_id="ysharma/nougat", filename="input/nougat.pdf", repo_type="space")
images = rasterize_paper(pdf=filepath, return_pil=True)
image = Image.open(images[0])
tokenizer_file = checkpoint_path / "tokenizer.json"
tokenizer = NougatTokenizerFast(tokenizer_file=str(tokenizer_file))
tokenizer.pad_token = "<pad>"
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
tokenizer.model_max_length = original_model.config.max_length
size = {"height": original_model.config.input_size[0], "width": original_model.config.input_size[1]}
image_processor = NougatImageProcessor(
do_align_long_axis=original_model.config.align_long_axis,
size=size,
)
processor = NougatProcessor(image_processor=image_processor, tokenizer=tokenizer)
pixel_values = processor(image, return_tensors="pt").pixel_values
original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0)
assert torch.allclose(original_pixel_values, pixel_values)
original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
patch_embeddings, _ = model.encoder.embeddings(pixel_values)
assert torch.allclose(original_patch_embed, patch_embeddings)
original_last_hidden_state = original_model.encoder(pixel_values)
last_hidden_state = model.encoder(pixel_values).last_hidden_state
assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)
original_embeddings = original_model.decoder.model.model.decoder.embed_tokens
embeddings = model.decoder.model.decoder.embed_tokens
assert torch.allclose(original_embeddings.weight, embeddings.weight, atol=1e-3)
prompt = "hello world"
decoder_input_ids = original_model.decoder.tokenizer(
prompt, add_special_tokens=False, return_tensors="pt"
).input_ids
decoder_attention_mask = torch.ones_like(decoder_input_ids)
original_logits = original_model(
image_tensors=pixel_values, decoder_input_ids=decoder_input_ids, attention_mask=decoder_attention_mask
).logits
logits = model(
pixel_values,
decoder_input_ids=decoder_input_ids[:, :-1],
decoder_attention_mask=decoder_attention_mask[:, :-1],
).logits
assert torch.allclose(original_logits, logits, atol=1e-3)
outputs = model.generate(
pixel_values,
min_length=1,
max_length=30,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
use_cache=True,
bad_words_ids=[
[tokenizer.unk_token_id],
],
return_dict_in_generate=True,
do_sample=False,
)
generated = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
if model_tag == "0.1.0-base":
expected_generation = "# Nougat: Neural Optical Understanding for Academic Documents\n\nLukas Blecher\n\nCorrespondence to: lblec"
elif model_tag == "0.1.0-small":
expected_generation = (
"# Nougat: Neural Optical Understanding for Academic Documents\n\nLukas Blecher\n\nCorrespondence to: lble"
)
else:
raise ValueError(f"Unexpected model tag: {model_tag}")
assert generated == expected_generation
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
tag_to_name = {"0.1.0-base": "nougat-base", "0.1.0-small": "nougat-small"}
model_name = tag_to_name[model_tag]
model.push_to_hub(f"facebook/{model_name}")
processor.push_to_hub(f"facebook/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_tag",
default="0.1.0-base",
required=False,
type=str,
choices=["0.1.0-base", "0.1.0-small"],
help="Tag of the original model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
required=False,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub.",
)
args = parser.parse_args()
convert_nougat_checkpoint(args.model_tag, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\nougat\image_processing_nougat.py
"""Image processor class for Nougat."""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
pad,
resize,
to_channel_dimension_format,
to_pil_image,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
from ...utils.import_utils import is_cv2_available, is_vision_available
logger = logging.get_logger(__name__)
if is_cv2_available():
pass
if is_vision_available():
import PIL
"""
Args:
do_crop_margin (`bool`, *optional*, defaults to `True`):
Whether to crop the image margins.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 896, "width": 672}`):
Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `False`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the images to the largest image size in the batch.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Image standard deviation.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_crop_margin: bool = True,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_thumbnail: bool = True,
do_align_long_axis: bool = False,
do_pad: bool = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
):
"""
Initialize the ImagePreprocessor object with optional parameters for image preprocessing.
Args:
do_crop_margin (bool, optional, default=True):
Whether to crop the image margins.
do_resize (bool, optional, default=True):
Whether to resize the image to the specified `size`.
size (Dict[str, int], optional, default={"height": 896, "width": 672}):
Size of the image after resizing.
resample (PILImageResampling, optional, default=PILImageResampling.BILINEAR):
Resampling filter to use if resizing the image.
do_thumbnail (bool, optional, default=True):
Whether to resize the image using thumbnail method.
do_align_long_axis (bool, optional, default=False):
Whether to align the long axis of the image with the long axis of `size`.
do_pad (bool, optional, default=True):
Whether to pad the images to the largest image size in the batch.
do_rescale (bool, optional, default=True):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (int or float, optional, default=1/255):
Scale factor to use if rescaling the image.
do_normalize (bool, optional, default=True):
Whether to normalize the image.
image_mean (float or List[float], optional, default=IMAGENET_DEFAULT_MEAN):
Mean to use if normalizing the image.
image_std (float or List[float], optional, default=IMAGENET_DEFAULT_STD):
Standard deviation of the image.
"""
super().__init__(**kwargs)
size = size if size is not None else {"height": 896, "width": 672}
size = get_size_dict(size)
self.do_crop_margin = do_crop_margin
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_thumbnail = do_thumbnail
self.do_align_long_axis = do_align_long_axis
self.do_pad = do_pad
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_crop_margin",
"do_resize",
"size",
"resample",
"do_thumbnail",
"do_align_long_axis",
"do_pad",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def crop_margin(
self,
image: np.ndarray,
gray_threshold: int = 200,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.array:
"""
Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the
threshold).
Args:
image (`np.array`):
The image to be cropped.
gray_threshold (`int`, *optional*, defaults to `200`):
Value below which pixels are considered to be gray.
data_format (`ChannelDimension`, *optional*):
The channel dimension format of the output image. If unset, will use the inferred format from the
input.
input_data_format (`ChannelDimension`, *optional*):
The channel dimension format of the input image. If unset, will use the inferred format from the input.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
image = to_pil_image(image, input_data_format=input_data_format)
data = np.array(image.convert("L")).astype(np.uint8)
max_val = data.max()
min_val = data.min()
if max_val == min_val:
image = np.array(image)
image = (
to_channel_dimension_format(image, data_format, input_data_format)
if data_format is not None
else image
)
return image
data = (data - min_val) / (max_val - min_val) * 255
gray = data < gray_threshold
coords = self.python_find_non_zero(gray)
x_min, y_min, width, height = self.python_bounding_rect(coords)
image = image.crop((x_min, y_min, x_min + width, y_min + height))
image = np.array(image).astype(np.uint8)
image = to_channel_dimension_format(image, input_data_format, ChannelDimension.LAST)
image = (
to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
)
return image
def align_long_axis(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Align the long axis of the image to the longest axis of the specified size.
Args:
image (`np.ndarray`):
The image to be aligned.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to align the long axis to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The aligned image.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = size["height"], size["width"]
if (output_width < output_height and input_width > input_height) or (
output_width > output_height and input_width < input_height
):
image = np.rot90(image, 3)
if data_format is not None:
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def pad_image(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad the image to the specified size at the top, bottom, left and right.
Args:
image (`np.ndarray`):
The image to be padded.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to pad the image to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
output_height, output_width = size["height"], size["width"]
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
delta_width = output_width - input_width
delta_height = output_height - input_height
pad_top = delta_height // 2
pad_left = delta_width // 2
pad_bottom = delta_height - pad_top
pad_right = delta_width - pad_left
padding = ((pad_top, pad_bottom), (pad_left, pad_right))
return pad(image, padding, data_format=data_format, input_data_format=input_data_format)
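# Usage sketch (illustrative, not part of the original file): padding a random
# channels-last image to the processor's default canvas of 896 x 672.
import numpy as np
from transformers import NougatImageProcessor

image_processor = NougatImageProcessor()
image = np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8)
padded = image_processor.pad_image(image, size={"height": 896, "width": 672})
print(padded.shape)                        # (896, 672, 3)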
def thumbnail(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
corresponding dimension of the specified size.
Args:
image (`np.ndarray`):
The image to be resized.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to resize the image to.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
The resampling filter to use.
data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = size["height"], size["width"]
height = min(input_height, output_height)
width = min(input_width, output_width)
if height == input_height and width == input_width:
return image
if input_height > input_width:
width = int(input_width * height / input_height)
elif input_width > input_height:
height = int(input_height * width / input_width)
return resize(
image,
size=(height, width),
resample=resample,
reducing_gap=2.0,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resizes `image` to `(height, width)` specified by `size` using the PIL library.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size)
shortest_edge = min(size["height"], size["width"])
output_size = get_resize_output_image_size(
image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
)
resized_image = resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return resized_image
def preprocess(
self,
images: ImageInput,
do_crop_margin: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
do_rescale: bool = None,
rescale_factor: Union[int, float] = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
# ... (body omitted in this excerpt: crop margin, align long axis, resize, thumbnail,
# pad, rescale and normalize, then return a `BatchFeature` of pixel values)
...
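# Usage sketch (illustrative, not part of the original file): the full preprocessing
# pipeline on a random image; with the default settings the output is always 896 x 672.
import numpy as np
from PIL import Image
from transformers import NougatImageProcessor

image_processor = NougatImageProcessor()
page = Image.fromarray(np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8))
pixel_values = image_processor(page, return_tensors="pt").pixel_values
print(pixel_values.shape)                  # torch.Size([1, 3, 896, 672])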
.\models\nougat\processing_nougat.py
"""
Processor class for Nougat.
"""
from typing import Dict, List, Optional, Union
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy
from ...processing_utils import ProcessorMixin
from ...utils import PaddingStrategy, TensorType
class NougatProcessor(ProcessorMixin):
r"""
Constructs a Nougat processor which wraps a Nougat image processor and a Nougat tokenizer into a single processor.
[`NougatProcessor`] offers all the functionalities of [`NougatImageProcessor`] and [`NougatTokenizerFast`]. See the
[`~NougatProcessor.__call__`] and [`~NougatProcessor.decode`] for more information.
Args:
image_processor ([`NougatImageProcessor`]):
An instance of [`NougatImageProcessor`]. The image processor is a required input.
tokenizer ([`NougatTokenizerFast`]):
An instance of [`NougatTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images=None,
text=None,
do_crop_margin: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: "PILImageResampling" = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
do_rescale: bool = None,
rescale_factor: Union[int, float] = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional["ChannelDimension"] = "channels_first",
input_data_format: Optional[Union[str, "ChannelDimension"]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
):
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(
images,
do_crop_margin=do_crop_margin,
do_resize=do_resize,
size=size,
resample=resample,
do_thumbnail=do_thumbnail,
do_align_long_axis=do_align_long_axis,
do_pad=do_pad,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
return_tensors=return_tensors,
data_format=data_format,
input_data_format=input_data_format,
)
if text is not None:
encodings = self.tokenizer(
text,
text_pair=text_pair,
text_target=text_target,
text_pair_target=text_pair_target,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def post_process_generation(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.post_process_generation`].
Please refer to the docstring of this method for more information.
"""
return self.tokenizer.post_process_generation(*args, **kwargs)
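# End-to-end sketch (illustrative, not part of the original file), assuming the converted
# "facebook/nougat-base" checkpoint; post-processing also needs the optional `nltk` and
# `python-Levenshtein` packages. The random image stands in for a real document page.
import numpy as np
from PIL import Image
from transformers import NougatProcessor, VisionEncoderDecoderModel

processor = NougatProcessor.from_pretrained("facebook/nougat-base")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
page = Image.fromarray(np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8))
pixel_values = processor(images=page, return_tensors="pt").pixel_values
outputs = model.generate(pixel_values, max_new_tokens=20)
sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
sequence = processor.post_process_generation(sequence, fix_markdown=False)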
.\models\nougat\tokenization_nougat_fast.py
"""
Fast tokenizer class for Nougat.
"""
import re
from functools import partial
from multiprocessing import Pool
from typing import List, Union
import numpy as np
from transformers.tokenization_utils_base import INIT_TOKENIZER_DOCSTRING
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import add_end_docstrings
from ...utils import is_levenshtein_available, is_nltk_available, logging, requires_backends
if is_levenshtein_available():
from Levenshtein import ratio
if is_nltk_available():
import nltk
logger = logging.get_logger(__name__)
INIT_TOKENIZER_DOCSTRING += """
tokenizer_object ([`tokenizers.Tokenizer`]):
A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
tokenizers](../fast_tokenizers) for more information.
tokenizer_file ([`str`]):
A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
tokenizers.
"""
PRETRAINED_VOCAB_FILES_MAP = {
"tokenizer_file": {
"facebook/nougat-base": "https://huggingface.co/facebook/nougat-base/tokenizer/blob/main/tokenizer.json",
},
}
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/nougat-base": 3584}
def markdown_compatible(text: str) -> str:
"""
Make text compatible with Markdown formatting.
This function applies various formatting adjustments to make the text compatible with Markdown.
Args:
text (`str`):
The input text to make Markdown-compatible.
Returns:
`str`: The Markdown-compatible text.
"""
text = re.sub(r"^\(([\d.]+[a-zA-Z]?)\) \\\[(.+?)\\\]$", r"\[\2 \\tag{\1}\]", text, flags=re.M)
text = re.sub(r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\)$", r"\[\1 \\tag{\2}\]", text, flags=re.M)
text = re.sub(
r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\) (\\\[.+?\\\])$",
r"\[\1 \\tag{\2}\] \3",
text,
flags=re.M,
)
text = text.replace(r"\. ", ". ")
text = text.replace(r"\bm{", r"\mathbf{").replace(r"{\\bm ", r"\mathbf{")
text = re.sub(r"\\mbox{ ?\\boldmath\$(.*?)\$}", r"\\mathbf{\1}", text)
text = re.sub(
r"((?:http|ftp|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
r"[\1](\1)",
text,
)
text = re.sub(r"```\s*(.+?)\s*```", r"```\n\1\n```", text, flags=re.S)
return text
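As a quick illustration of the first substitution above (an equation number written before a display equation is moved into a `\tag{}`), a sketch with a made-up input line:

```python
line = r"(1.2) \[E = mc^{2}\]"  # hypothetical model output line
print(markdown_compatible(line))
# expected: \[E = mc^{2} \tag{1.2}\]
```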
def normalize_list_like_lines(generation):
pattern = r"(?:^)(-|\*)?(?!-|\*) ?((?:\d|[ixv])+ )?.+? (-|\*) (((?:\d|[ixv])+)\.(\d|[ixv]) )?.*(?:$)"
for match in reversed(list(re.finditer(pattern, generation, flags=re.I | re.M))):
start, stop = match.span()
delim = match.group(3) + " "
splits = match.group(0).split(delim)
replacement = ""
if match.group(1) is not None:
splits = splits[1:]
delim1 = match.group(1) + " "
else:
delim1 = ""
continue
pre, post = generation[:start], generation[stop:]
for i, item in enumerate(splits):
level = 0
potential_numeral, _, rest = item.strip().partition(" ")
if not rest:
continue
if re.match(r"^[\dixv]+((?:\.[\dixv])?)+$", potential_numeral, flags=re.I | re.M):
level = potential_numeral.count(".")
replacement += (
("\n" if i > 0 else "") + ("\t" * level) + (delim if i > 0 or start == 0 else delim1) + item.strip()
)
if post == "":
post = "\n"
generation = pre + replacement + post
return generation
def find_next_punctuation(text: str, start_idx=0):
"""
Find the index of the next punctuation mark.
Args:
text (`str`):
String to examine
start_idx (`int`, *optional*)
Index where to start
"""
for i in range(start_idx, len(text)):
if text[i] in [".", "?", "!", "\n"]:
return i
return None
def truncate_repetitions(text: str, min_len: int = 30) -> str:
"""
Attempt to truncate repeating segments in the input string.
Args:
text (str): The input text to process.
min_len (int, optional): The minimum length of repeating segments to truncate.
Returns:
str: The processed text with repeated segments truncated.
"""
text_lower = text.lower()
text_length = len(text_lower)
if text_length < 2 * min_len:
return text
max_repetition_length = None
for repetition_length in range(min_len, int(text_length / 2)):
same = True
for i in range(0, repetition_length):
if text_lower[text_length - repetition_length - i - 1] != text_lower[text_length - i - 1]:
same = False
break
if same:
max_repetition_length = repetition_length
if max_repetition_length is None:
return text
lcs = text_lower[-max_repetition_length:]
substituted_text = text
substituted_text_lower = text_lower
while substituted_text_lower.endswith(lcs):
substituted_text = substituted_text[:-max_repetition_length]
substituted_text_lower = substituted_text_lower[:-max_repetition_length]
repeating_tail = text_lower[len(substituted_text_lower):]
substituted_text_lower_out = substituted_text_lower
while True:
sentence_end = find_next_punctuation(text_lower, len(substituted_text_lower_out))
sentence_start = find_next_punctuation(text_lower[::-1], len(substituted_text_lower_out))
if sentence_end and sentence_start:
sentence = text_lower[sentence_start:sentence_end]
substituted_text_lower_out = text_lower[:sentence_end + 1]
if sentence in repeating_tail:
break
else:
break
text_out = text[:len(substituted_text_lower_out)]
return text_out
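A rough illustration with a made-up degenerate generation that repeats one sentence verbatim; the repeated tail is cut off at the preceding sentence boundary:

```python
unit = "The model keeps repeating this sentence. "  # made-up degenerate output
degenerate = "Abstract: results follow. " + unit * 4
print(truncate_repetitions(degenerate))
# roughly: "Abstract: results follow. The model keeps repeating this sentence."
```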
def remove_numbers(lines):
def _clean(s):
return re.sub(r"(?:[\d_]|\*\*)", "", s).strip()
if isinstance(lines, str):
return _clean(lines)
out = []
for l in lines:
out.append(_clean(l))
return out
def get_slices(lines, clean_lines):
"""
Get slices of text based on specific criteria within the lines.
This function identifies and returns slices of text from the input lines based on certain conditions.
These conditions were chosen by the Nougat authors:
- The slice is less than 200 characters long.
- The slice is more than 3 characters long.
- The slice does not start with "[MISSING_PAGE".
- The slice is either the same as the next slice or the ratio of the two in terms of Levenshtein distance is
greater than 0.9.
Args:
lines (`List[str]`):
The list of lines containing the text.
clean_lines (`List[str]`):
A cleaned version of the text (without numbers).
Returns:
`List[tuple]`: A list of tuples representing the start and end indices of text slices.
"""
indices = np.zeros(len(lines))
for i in range(len(lines) - 1):
j = i + 1
while not clean_lines[j] and j < len(lines) - 1:
j += 1
if (
len(clean_lines[i]) < 200
and len(clean_lines[i]) > 3
and len(clean_lines[j]) < 200
and len(clean_lines[j]) > 3
and not clean_lines[i].startswith("[MISSING_PAGE")
and (clean_lines[i] == clean_lines[j] or ratio(clean_lines[i], clean_lines[j]) > 0.9)
):
indices[i:j] = 1
ids = np.where(indices)[0]
slices = []
if len(ids) == 0:
return slices
j0 = 0
for j, x in enumerate(np.diff(ids) > 3):
if x:
slices.append((ids[j0], ids[j] + 2))
j0 = j + 1
slices.append((ids[j0], ids[-1] + 2))
return [sli for sli in slices if sli[1] - sli[0] > 15]
def remove_slice_from_lines(lines, clean_text, slice) -> str:
"""
Remove a slice of text from the lines based on specific criteria.
This function identifies a slice of text within the lines and removes it based on certain conditions.
Args:
lines (list of str): The list of lines containing the text.
clean_text (list of str): A cleaned version of the text (without numbers).
slice (tuple): A tuple representing the start and end indices of the slice to be removed.
Returns:
str: The removed slice of text as a single string.
"""
base = clean_text[slice[0]]
section = list(slice)
check_start_flag = False
for line_idx in range(max(0, slice[0] - 1), max(0, slice[0] - 5), -1):
if not lines[line_idx]:
continue
if lines[line_idx] == "## References":
section[0] = line_idx
break
elif ratio(base, remove_numbers(lines[line_idx])) < 0.9:
section[0] = line_idx + 1
potential_ref = remove_numbers(lines[max(0, line_idx - 1)].partition("* [")[-1])
if len(potential_ref) >= 0.75 * len(base) and ratio(base, potential_ref) < 0.9:
section[0] = line_idx
check_start_flag = True
break
for line_idx in range(min(len(lines), slice[1]), min(len(lines), slice[1] + 5)):
if ratio(base, remove_numbers(lines[line_idx])) < 0.9:
section[1] = line_idx
break
if len(lines) <= section[1]:
section[1] = len(lines) - 1
to_delete = "\n".join(lines[section[0] : section[1] + 1])
itera, iterb = enumerate(lines[section[1] - 1]), enumerate(lines[section[1]])
while True:
try:
(ia, a) = next(itera)
while a.isnumeric():
(ia, a) = next(itera)
(ib, b) = next(iterb)
while b.isnumeric():
(ib, b) = next(iterb)
if a != b:
break
except StopIteration:
break
if check_start_flag and "* [" in to_delete:
to_delete = "* [" + to_delete.partition("* [")[-1]
try:
delta = len(lines[section[1]]) - ib - 1
if delta > 0:
to_delete = to_delete[:-delta]
except UnboundLocalError:
pass
return to_delete.strip()
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class NougatTokenizerFast(PreTrainedTokenizerFast):
"""
Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods. This class mainly adds Nougat-specific
methods for postprocessing the generated text.
Args:
vocab_file (`str`, *optional*):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`, *optional*):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether to clean up spaces after decoding; cleanup consists of removing potential artifacts like extra
spaces.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = None
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)
self.vocab_file = vocab_file
def remove_hallucinated_references(self, text: str) -> str:
"""
Remove hallucinated or missing references from the text.
This function identifies and removes references that are marked as missing or hallucinated from the input text.
Args:
text (`str`):
The input text containing references.
Returns:
`str`: The text with hallucinated references removed.
"""
lines = text.split("\n")
if len(lines) == 0:
return ""
clean_lines = remove_numbers(lines)
slices = get_slices(lines, clean_lines)
to_delete = []
for slice in slices:
to_delete.append(remove_slice_from_lines(lines, clean_lines, slice))
for to_delete in reversed(to_delete):
text = text.replace(to_delete, "\n\n[MISSING_PAGE_POST]\n\n")
text = re.sub(
r"## References\n+\[MISSING_PAGE_POST(:\d+)?\]",
"\n\n[MISSING_PAGE_POST\\1]",
text,
)
return text
def correct_tables(self, generation: str) -> str:
"""
Takes a generated string and fixes tables/tabulars to make them match the markdown format needed.
Args:
generation (str): The generated text to be postprocessed.
Returns:
str: The postprocessed text.
Example:
```
correct_tables("\\begin{table} \\begin{tabular}{l l} & \\ \\end{tabular} \\end{table}")
"\\begin{table}\n\\begin{tabular}{l l} & \\ \\end{tabular}\n\\end{table}"
```
"""
for l in generation.split("\n"):
if l.count("\\begin{tabular}") > 15 or l.count("\\multicolumn") > 60 or l.count("&") > 400:
generation = generation.replace(l, "")
generation = generation.replace("\\begin{table} \\begin{tabular}", "\\begin{table}\n\\begin{tabular}")
generation = generation.replace("\\end{tabular} \\end{table}", "\\end{tabular}\n\\end{table}")
generation = generation.replace("\\end{table} Tab", "\\end{table}\nTab")
generation = re.sub(r"(^.+)\\begin{tab", r"\1\n\\begin{tab", generation, flags=re.M)
generation = generation.replace(r"\begin{tabular}{l l} & \\ \end{tabular}", "")
generation = generation.replace("\\begin{tabular}{}\n\n\\end{tabular}", "")
return generation
def post_process_generation(self, generation: Union[str, List[str]], fix_markdown: bool = True, num_workers: int = None) -> Union[str, List[str]]:
"""
Postprocess a generated text or a list of generated texts.
This function can be used to perform postprocessing on generated text, such as fixing Markdown formatting.
Postprocessing is quite slow so it is recommended to use multiprocessing to speed up the process.
Args:
generation (Union[str, List[str]]):
The generated text or a list of generated texts.
fix_markdown (`bool`, *optional*, defaults to `True`):
Whether to perform Markdown formatting fixes.
num_workers (`int`, *optional*):
Optional number of workers to pass to leverage multiprocessing (postprocessing several texts in
parallel).
Returns:
Union[str, List[str]]: The postprocessed text or list of postprocessed texts.
"""
requires_backends(self, ["nltk", "levenshtein"])
if isinstance(generation, list):
if num_workers is not None and isinstance(num_workers, int):
with Pool(num_workers) as p:
return p.map(partial(self.post_process_single, fix_markdown=fix_markdown), generation)
else:
return [self.post_process_single(s, fix_markdown=fix_markdown) for s in generation]
else:
return self.post_process_single(generation, fix_markdown=fix_markdown)
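A typical decoding pipeline, sketched under the assumption that `generated_ids` comes from the corresponding vision encoder-decoder model's `generate` call and that the optional `nltk` and `python-Levenshtein` backends are installed:

```python
from transformers import NougatTokenizerFast

tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base")
# generated_ids: LongTensor of shape (batch, seq_len) produced by model.generate (assumed)
raw_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
clean_texts = tokenizer.post_process_generation(raw_texts, fix_markdown=True, num_workers=2)
```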
.\models\nougat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_vision_available
_import_structure = {
"processing_nougat": ["NougatProcessor"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nougat_fast"] = ["NougatTokenizerFast"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_nougat"] = ["NougatImageProcessor"]
if TYPE_CHECKING:
from .processing_nougat import NougatProcessor
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nougat_fast import NougatTokenizerFast
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_nougat import NougatImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\nystromformer\configuration_nystromformer.py
""" Nystromformer模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/nystromformer-512": "https://huggingface.co/uw-madison/nystromformer-512/resolve/main/config.json",
}
class NystromformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NystromformerModel`]. It is used to instantiate
a Nystromformer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a configuration similar to that of the Nystromformer
[uw-madison/nystromformer-512](https://huggingface.co/uw-madison/nystromformer-512) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30000):
Vocabulary size of the Nystromformer model.
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
hidden_act (`str`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function in the encoder and pooler.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 510):
The maximum sequence length that this model might ever be used with.
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`NystromformerModel`].
segment_means_seq_len (`int`, *optional*, defaults to 64):
Sequence length used in segment-means.
num_landmarks (`int`, *optional*, defaults to 64):
Number of landmark (or Nystrom) points used for the Nystrom approximation of the softmax self-attention matrix.
conv_kernel_size (`int`, *optional*, defaults to 65):
Kernel size of the depthwise convolution used in the Nystrom approximation.
inv_coeff_init_option (`bool`, *optional*, defaults to `False`):
Whether to use an exact coefficient computation for the initial values of the iterative method for the
Moore-Penrose inverse of a matrix.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
"""
# Set the model type to "nystromformer"
model_type = "nystromformer"
# Initialization method for a NystromformerConfig instance
def __init__(
self,
vocab_size=30000,  # vocabulary size, defaults to 30000
hidden_size=768,  # hidden size, defaults to 768
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
intermediate_size=3072,  # intermediate (feed-forward) size, defaults to 3072
hidden_act="gelu_new",  # hidden activation function, defaults to "gelu_new"
hidden_dropout_prob=0.1,  # hidden-layer dropout probability, defaults to 0.1
attention_probs_dropout_prob=0.1,  # attention-probability dropout, defaults to 0.1
max_position_embeddings=510,  # maximum position embedding length, defaults to 510
type_vocab_size=2,  # token type vocabulary size, defaults to 2
segment_means_seq_len=64,  # segment-means sequence length, defaults to 64
num_landmarks=64,  # number of landmarks, defaults to 64
conv_kernel_size=65,  # convolution kernel size, defaults to 65
inv_coeff_init_option=False,  # inverse-coefficient init option, defaults to False
initializer_range=0.02,  # initializer range, defaults to 0.02
layer_norm_eps=1e-5,  # layer norm epsilon, defaults to 1e-5
pad_token_id=1,  # padding token id, defaults to 1
bos_token_id=0,  # beginning-of-sequence token id, defaults to 0
eos_token_id=2,  # end-of-sequence token id, defaults to 2
**kwargs,  # additional keyword arguments
):
# Call the parent initializer with pad_token_id, bos_token_id, eos_token_id and the remaining kwargs
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store the passed arguments as attributes of the object
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.segment_means_seq_len = segment_means_seq_len
self.num_landmarks = num_landmarks
self.conv_kernel_size = conv_kernel_size
self.inv_coeff_init_option = inv_coeff_init_option
self.layer_norm_eps = layer_norm_eps
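A minimal instantiation sketch for the configuration and the bare model (random weights), following the usual configuration/model pattern in the library:

```python
from transformers import NystromformerConfig, NystromformerModel

# Initializing a configuration with the default (nystromformer-512 style) values
configuration = NystromformerConfig()
# Initializing a model from that configuration
model = NystromformerModel(configuration)
# Accessing the model configuration
configuration = model.config
```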
.\models\nystromformer\convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
"""从原始存储库转换 Nystromformer 检查点"""
import argparse
import torch
from transformers import NystromformerConfig, NystromformerForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff1" in orig_key:
orig_key = orig_key.replace("ff1", "intermediate.dense")
if "ff2" in orig_key:
orig_key = orig_key.replace("ff2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "cls" not in orig_key:
orig_key = "nystromformer." + orig_key
return orig_key
def convert_checkpoint_helper(config, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key) or ("conv.bias" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["nystromformer.embeddings.position_ids"] = (
torch.arange(config.max_position_embeddings).expand((1, -1)) + 2
)
return orig_state_dict
def convert_nystromformer_checkpoint(checkpoint_path, nystromformer_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = NystromformerConfig.from_json_file(nystromformer_config_file)
model = NystromformerForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config, orig_state_dict)
model.load_state_dict(new_state_dict)
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to Nystromformer pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for Nystromformer model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_nystromformer_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
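The script is meant to be run from the command line with the three flags defined above; equivalently, the helper can be called directly. A sketch with placeholder paths:

```python
# Direct call with placeholder paths (equivalent to the CLI flags above)
convert_nystromformer_checkpoint(
    checkpoint_path="/path/to/original_checkpoint.p",   # placeholder
    nystromformer_config_file="/path/to/config.json",   # placeholder
    pytorch_dump_path="/path/to/converted_model",       # placeholder
)
```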
.\models\nystromformer\modeling_nystromformer.py
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_nystromformer import NystromformerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uw-madison/nystromformer-512"
_CONFIG_FOR_DOC = "NystromformerConfig"
NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uw-madison/nystromformer-512",
]
class NystromformerEmbeddings(nn.Module):
"""构建来自单词、位置和标记类型嵌入的嵌入层。"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
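Note the `+ 2` offset on the `position_ids` buffer above; it matches the `config.max_position_embeddings + 2` rows allocated for `position_embeddings` and the `+ 2` applied in the conversion script earlier. A quick sketch of the buffer contents, assuming the default `max_position_embeddings=510`:

```python
import torch

max_position_embeddings = 510  # assumed default
position_ids = torch.arange(max_position_embeddings).expand((1, -1)) + 2
print(position_ids[0, :5])  # tensor([2, 3, 4, 5, 6]); rows 0 and 1 of the table are effectively reserved
```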
class NystromformerSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.num_landmarks = config.num_landmarks
self.seq_len = config.segment_means_seq_len
self.conv_kernel_size = config.conv_kernel_size
if config.inv_coeff_init_option:
self.init_option = config["inv_init_coeff_option"]
else:
self.init_option = "original"
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.conv_kernel_size is not None:
self.conv = nn.Conv2d(
in_channels=self.num_attention_heads,
out_channels=self.num_attention_heads,
kernel_size=(self.conv_kernel_size, 1),
padding=(self.conv_kernel_size // 2, 0),
bias=False,
groups=self.num_attention_heads,
)
def iterative_inv(self, mat, n_iter=6):
identity = torch.eye(mat.size(-1), device=mat.device)
key = mat
if self.init_option == "original":
value = 1 / torch.max(torch.sum(key, dim=-2)) * key.transpose(-1, -2)
else:
value = 1 / torch.max(torch.sum(key, dim=-2), dim=-1).values[:, :, None, None] * key.transpose(-1, -2)
for _ in range(n_iter):
key_value = torch.matmul(key, value)
value = torch.matmul(
0.25 * value,
13 * identity
- torch.matmul(key_value, 15 * identity - torch.matmul(key_value, 7 * identity - key_value)),
)
return value
def transpose_for_scores(self, layer):
new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
layer = layer.view(*new_layer_shape)
return layer.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
query_layer = query_layer / math.sqrt(math.sqrt(self.attention_head_size))
key_layer = key_layer / math.sqrt(math.sqrt(self.attention_head_size))
if self.num_landmarks == self.seq_len:
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
context_layer = torch.matmul(attention_probs, value_layer)
else:
q_landmarks = query_layer.reshape(
-1,
self.num_attention_heads,
self.num_landmarks,
self.seq_len // self.num_landmarks,
self.attention_head_size,
).mean(dim=-2)
k_landmarks = key_layer.reshape(
-1,
self.num_attention_heads,
self.num_landmarks,
self.seq_len // self.num_landmarks,
self.attention_head_size,
).mean(dim=-2)
kernel_1 = torch.nn.functional.softmax(torch.matmul(query_layer, k_landmarks.transpose(-1, -2)), dim=-1)
kernel_2 = torch.nn.functional.softmax(torch.matmul(q_landmarks, k_landmarks.transpose(-1, -2)), dim=-1)
attention_scores = torch.matmul(q_landmarks, key_layer.transpose(-1, -2))
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
kernel_3 = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = torch.matmul(kernel_1, self.iterative_inv(kernel_2))
new_value_layer = torch.matmul(kernel_3, value_layer)
context_layer = torch.matmul(attention_probs, new_value_layer)
if self.conv_kernel_size is not None:
context_layer += self.conv(value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
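The landmark branch above is the Nyström approximation: softmax(Q K^T) is approximated by kernel_1 @ pinv(kernel_2) @ kernel_3, where the landmarks are segment means of the queries and keys and `iterative_inv` plays the role of the pseudo-inverse. A standalone toy sketch of the same idea (single head, no batch, `torch.linalg.pinv` instead of the iterative inverse; all sizes are made up):

```python
import torch

seq_len, num_landmarks, head_size = 64, 8, 16
q = torch.randn(seq_len, head_size) / head_size ** 0.25
k = torch.randn(seq_len, head_size) / head_size ** 0.25

# Landmarks are segment means, as in the reshape(...).mean(dim=-2) calls above
q_land = q.reshape(num_landmarks, seq_len // num_landmarks, head_size).mean(dim=-2)
k_land = k.reshape(num_landmarks, seq_len // num_landmarks, head_size).mean(dim=-2)

kernel_1 = torch.softmax(q @ k_land.T, dim=-1)       # (seq_len, num_landmarks)
kernel_2 = torch.softmax(q_land @ k_land.T, dim=-1)  # (num_landmarks, num_landmarks)
kernel_3 = torch.softmax(q_land @ k.T, dim=-1)       # (num_landmarks, seq_len)

approx = kernel_1 @ torch.linalg.pinv(kernel_2) @ kernel_3
exact = torch.softmax(q @ k.T, dim=-1)
print((approx - exact).abs().mean())  # compare the approximate and exact attention maps
```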
class NystromformerSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NystromformerAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = NystromformerSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = NystromformerSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_outputs = self.self(hidden_states, attention_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class NystromformerIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class NystromformerOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NystromformerLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = NystromformerAttention(config)
self.add_cross_attention = config.add_cross_attention
self.intermediate = NystromformerIntermediate(config)
self.output = NystromformerOutput(config)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class NystromformerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([NystromformerLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class NystromformerPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class NystromformerLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = NystromformerPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class NystromformerOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NystromformerLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class NystromformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = NystromformerConfig
base_model_prefix = "nystromformer"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
NYSTROMFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`NystromformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
NYSTROMFORMER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using `AutoTokenizer`. See `PreTrainedTokenizer.encode` and
`PreTrainedTokenizer.__call__` for details.
[What are input IDs?](../glossary
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a `ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Nyströmformer Model transformer outputting raw hidden-states without any specific head on top.",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerModel(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
# Initialize the Nystromformer embeddings and encoder based on the given configuration
self.embeddings = NystromformerEmbeddings(config)
self.encoder = NystromformerEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Retrieve the word embeddings from the Nystromformer embeddings
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Set new word embeddings for the Nystromformer embeddings
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
# Prune heads in each layer of the encoder
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings(
"Nyströmformer Model with a `language modeling` head on top.",
NYSTROMFORMER_START_DOCSTRING
)
class NystromformerForMaskedLM(NystromformerPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
# Initialize the Nystromformer model and MLM head based on the provided configuration
self.nystromformer = NystromformerModel(config)
self.cls = NystromformerOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Retrieve the output embeddings from the MLM head
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
# Set new output embeddings for the MLM head
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token id sequence, may be None
attention_mask: Optional[torch.FloatTensor] = None,  # attention mask marking which positions are padding
token_type_ids: Optional[torch.LongTensor] = None,  # token type ids, e.g. to distinguish sentence A from sentence B
position_ids: Optional[torch.LongTensor] = None,  # position ids specifying the position of each token
head_mask: Optional[torch.FloatTensor] = None,  # head mask to selectively disable attention heads
inputs_embeds: Optional[torch.FloatTensor] = None,  # precomputed embeddings, can be passed instead of token ids
labels: Optional[torch.LongTensor] = None,  # labels for computing the MLM loss, marking the masked positions
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-like ModelOutput
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # decide whether to return a ModelOutput dict
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]  # sequence output of the base model
prediction_scores = self.cls(sequence_output)  # MLM head projects hidden states to vocabulary logits
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()  # cross-entropy loss for masked language modeling
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))  # compute the MLM loss
if not return_dict:
output = (prediction_scores,) + outputs[1:]  # assemble a plain tuple when not returning a dict
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output  # prepend the loss when available
return MaskedLMOutput(
loss=masked_lm_loss,  # MLM loss
logits=prediction_scores,  # prediction logits
hidden_states=outputs.hidden_states,  # hidden states
attentions=outputs.attentions,  # attention weights
)
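A small fill-mask sketch, assuming the `uw-madison/nystromformer-512` checkpoint, its associated tokenizer, and a made-up example sentence:

```python
import torch
from transformers import AutoTokenizer, NystromformerForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("uw-madison/nystromformer-512")
model = NystromformerForMaskedLM.from_pretrained("uw-madison/nystromformer-512")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the masked token and its most likely replacement
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_index].argmax(dim=-1)))
```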
# Head module for sequence-level classification tasks
class NystromformerClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# Linear layer mapping from config.hidden_size to config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Dropout layer that randomly zeroes elements with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Output projection mapping from config.hidden_size to config.num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
# Take the hidden state at the first position (corresponding to the <CLS> token)
x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
# Apply dropout
x = self.dropout(x)
# Linear transformation
x = self.dense(x)
# Non-linear activation specified in the config
x = ACT2FN[self.config.hidden_act](x)
# Apply dropout again
x = self.dropout(x)
# Final linear projection to the label space
x = self.out_proj(x)
return x
# Nyströmformer model with a sequence classification/regression head on top (a linear layer on the pooled output), e.g. for GLUE tasks
@add_start_docstrings(
"""
Nyströmformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForSequenceClassification(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels for the model
self.num_labels = config.num_labels
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Classification head
self.classifier = NystromformerClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
定义了一个函数签名,表明该函数接受一些输入,并返回一个包含torch.Tensor或SequenceClassifierOutput的元组。
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
用于计算序列分类/回归损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
如果 `config.num_labels == 1`,则计算回归损失(均方损失);如果 `config.num_labels > 1`,则计算分类损失(交叉熵)。
"""
# Determine return_dict: fall back to self.config.use_return_dict when None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward pass through the Nyströmformer backbone
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output of the base model
sequence_output = outputs[0]
# Pass the sequence output through the classification head to get logits
logits = self.classifier(sequence_output)
# Loss defaults to None
loss = None
# Compute the loss when labels are provided
if labels is not None:
# If problem_type is not set, infer it from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Compute the appropriate loss for the inferred problem_type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# When return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# When return_dict is True, return a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Nyströmformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output
and a softmax) e.g. for RocStories/SWAG tasks.
"""
# 使用 Nyströmformer 模型,并在其顶部添加一个多选分类头部(即在池化输出之上的线性层和 softmax),例如用于 RocStories/SWAG 任务。
class NystromformerForMultipleChoice(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Linear layer used as a pre-classifier
self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
# Final classification layer producing one score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass: takes the usual model inputs and returns multiple-choice classification outputs
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# the docstring below documents these parameters and the return value
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Fall back to config.use_return_dict when return_dict is None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# num_choices is the size of the second dimension of input_ids (or inputs_embeds)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten input_ids to (batch_size * num_choices, seq_len)
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
# Flatten attention_mask to (batch_size * num_choices, seq_len)
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
# Flatten token_type_ids to (batch_size * num_choices, seq_len)
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
# Flatten position_ids to (batch_size * num_choices, seq_len)
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
# Flatten inputs_embeds to (batch_size * num_choices, seq_len, dim)
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Forward pass through the Nyströmformer backbone
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the backbone
hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
# Pooled output: keep only the representation of the first token of each sequence
pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
# Linear transformation via the pre-classifier
pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
# ReLU activation
pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
# Final logits from the classifier
logits = self.classifier(pooled_output)
# Reshape logits to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
# Compute the loss
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# When return_dict is False, return a flat output tuple
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# When return_dict is True, return a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
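The reshaping above is the standard multiple-choice bookkeeping: choices are flattened into the batch dimension for the backbone and the per-choice scores are regrouped afterwards. A tiny shape sketch with made-up sizes:

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 16): one row per (example, choice)
logits = torch.randn(flat_input_ids.size(0), 1)           # one score per flattened choice
reshaped_logits = logits.view(-1, num_choices)            # (2, 4): scores regrouped per example
print(reshaped_logits.shape)
```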
@add_start_docstrings(
"""
Nyströmformer Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForTokenClassification(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels from the config
self.num_labels = config.num_labels
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Dropout layer
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear classifier mapping hidden states to the label space
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Fall back to config.use_return_dict when return_dict is None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the inputs through the Nystromformer backbone and collect its outputs
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output (last hidden states) of the base model
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Pass the dropout output to the classifier to get per-token logits
logits = self.classifier(sequence_output)
# Loss defaults to None
loss = None
# Compute a cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten logits and labels before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# When not returning a dict, assemble a plain tuple
if not return_dict:
output = (logits,) + outputs[1:]  # include any additional outputs
return ((loss,) + output) if loss is not None else output
# Otherwise wrap the outputs in a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Nyströmformer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForQuestionAnswering(NystromformerPreTrainedModel):
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Two classification labels: start position and end position
config.num_labels = 2
self.num_labels = config.num_labels
# Nyströmformer backbone and the question-answering output layer
self.nystromformer = NystromformerModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
# Decide whether to return a dict based on `return_dict`, falling back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Nystromformer model
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the Nystromformer outputs
sequence_output = outputs[0]
# Compute the question-answering logits from the sequence output
logits = self.qa_outputs(sequence_output)
# Split the logits along the last dimension into start_logits and end_logits
start_logits, end_logits = logits.split(1, dim=-1)
# Drop the trailing dimension so start_logits and end_logits have shape (batch_size, sequence_length)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
# If start_positions and end_positions are given, compute the loss
if start_positions is not None and end_positions is not None:
# On multi-GPU setups an extra dimension may be added; squeeze it away here
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Ignore start/end positions that fall outside the model input
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores targets equal to ignored_index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Total loss is the average of the start and end losses
total_loss = (start_loss + end_loss) / 2
# If a dict-style output is not requested, return the logits (and loss, if any) as a tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
# Otherwise build and return a QuestionAnsweringModelOutput
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
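The QA head above is a single linear layer that produces two logits per token; splitting and squeezing yields start and end scores, and the training loss averages two cross entropies. A small sketch with dummy tensors (shapes and positions are invented for illustration):

```
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

batch_size, seq_len, hidden_size = 2, 10, 16
sequence_output = torch.randn(batch_size, seq_len, hidden_size)

qa_outputs = nn.Linear(hidden_size, 2)
logits = qa_outputs(sequence_output)          # (batch, seq_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)       # (batch, seq_len)
end_logits = end_logits.squeeze(-1)

# Inference: most likely span boundaries per example
start_idx, end_idx = start_logits.argmax(-1), end_logits.argmax(-1)

# Training: average of the two cross-entropy losses
start_positions = torch.tensor([1, 3])
end_positions = torch.tensor([4, 6])
loss_fct = CrossEntropyLoss(ignore_index=seq_len)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(start_idx, end_idx, total_loss.item())
```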
.\models\nystromformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_nystromformer": ["NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "NystromformerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nystromformer"] = [
"NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"NystromformerForMaskedLM",
"NystromformerForMultipleChoice",
"NystromformerForQuestionAnswering",
"NystromformerForSequenceClassification",
"NystromformerForTokenClassification",
"NystromformerLayer",
"NystromformerModel",
"NystromformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nystromformer import NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, NystromformerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nystromformer import (
NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
NystromformerForMaskedLM,
NystromformerForMultipleChoice,
NystromformerForQuestionAnswering,
NystromformerForSequenceClassification,
NystromformerForTokenClassification,
NystromformerLayer,
NystromformerModel,
NystromformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
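`_LazyModule` replaces the package module in `sys.modules` so that submodules such as `modeling_nystromformer` are only imported when one of their exported names is first accessed. A toy version of that idea (not the real `_LazyModule`, just a PEP 562 `__getattr__` sketch for a hypothetical package) could look like:

```
# lazy_pkg/__init__.py -- hypothetical toy package illustrating lazy loading
import importlib

_import_structure = {"heavy_module": ["HeavyClass"]}
_name_to_module = {name: mod for mod, names in _import_structure.items() for name in names}

def __getattr__(name):
    # Import the submodule only when one of its exported names is requested
    if name in _name_to_module:
        module = importlib.import_module(f".{_name_to_module[name]}", __name__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```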
.\models\oneformer\configuration_oneformer.py
"""OneFormer模型配置"""
from typing import Dict, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/oneformer_ade20k_swin_tiny": (
"https://huggingface.co/shi-labs/oneformer_ade20k_swin_tiny/blob/main/config.json"
),
}
class OneFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OneFormerModel`]. It is used to instantiate a OneFormer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the OneFormer [shi-labs/oneformer_ade20k_swin_tiny] architecture,
which was trained on the ADE20k-150 dataset.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import OneFormerConfig, OneFormerModel
>>> # Initializing a OneFormer shi-labs/oneformer_ade20k_swin_tiny configuration
>>> configuration = OneFormerConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = OneFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "oneformer"
attribute_map = {"hidden_size": "hidden_dim"}
def __init__(
self,
backbone_config: Optional[Dict] = None,
backbone: Optional[str] = None,
use_pretrained_backbone: bool = False,
use_timm_backbone: bool = False,
backbone_kwargs: Optional[Dict] = None,
ignore_value: int = 255,
num_queries: int = 150,
no_object_weight: int = 0.1,
class_weight: float = 2.0,
mask_weight: float = 5.0,
dice_weight: float = 5.0,
contrastive_weight: float = 0.5,
contrastive_temperature: float = 0.07,
train_num_points: int = 12544,
oversample_ratio: float = 3.0,
importance_sample_ratio: float = 0.75,
init_std: float = 0.02,
init_xavier_std: float = 1.0,
layer_norm_eps: float = 1e-05,
is_training: bool = False,
use_auxiliary_loss: bool = True,
output_auxiliary_logits: bool = True,
strides: Optional[list] = [4, 8, 16, 32],
task_seq_len: int = 77,
text_encoder_width: int = 256,
text_encoder_context_length: int = 77,
text_encoder_num_layers: int = 6,
text_encoder_vocab_size: int = 49408,
text_encoder_proj_layers: int = 2,
text_encoder_n_ctx: int = 16,
conv_dim: int = 256,
mask_dim: int = 256,
hidden_dim: int = 256,
encoder_feedforward_dim: int = 1024,
norm: str = "GN",
encoder_layers: int = 6,
decoder_layers: int = 10,
use_task_norm: bool = True,
num_attention_heads: int = 8,
dropout: float = 0.1,
dim_feedforward: int = 2048,
pre_norm: bool = False,
enforce_input_proj: bool = False,
query_dec_layers: int = 2,
common_stride: int = 4,
**kwargs,
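Note the `attribute_map = {"hidden_size": "hidden_dim"}` defined above: reading `config.hidden_size` transparently returns the `hidden_dim` value. A short usage sketch (the overridden parameter values are arbitrary):

```
from transformers import OneFormerConfig

config = OneFormerConfig(num_queries=100, hidden_dim=256, decoder_layers=10)
print(config.hidden_size)  # 256 -- mapped to `hidden_dim` via `attribute_map`
print(config.num_queries)  # 100
```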
.\models\oneformer\convert_to_hf_oneformer.py
"""Convert OneFormer checkpoints from the original repository. URL: https://github.com/SHI-Labs/OneFormer"""
import os
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from pathlib import Path
from pprint import pformat
from typing import Any, Dict, Iterator, List, Set, Tuple
import requests
import torch
import torchvision.transforms as T
from PIL import Image
from torch import Tensor, nn
try:
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.projects.deeplab import add_deeplab_config
except ImportError:
pass
from transformers import CLIPTokenizer, DinatConfig, SwinConfig
from transformers.models.oneformer.image_processing_oneformer import OneFormerImageProcessor
from transformers.models.oneformer.modeling_oneformer import (
OneFormerConfig,
OneFormerForUniversalSegmentation,
OneFormerForUniversalSegmentationOutput,
OneFormerModel,
OneFormerModelOutput,
)
from transformers.models.oneformer.processing_oneformer import OneFormerProcessor
from transformers.utils import logging
StateDict = Dict[str, Tensor]
logging.set_verbosity_info()
logger = logging.get_logger()
torch.manual_seed(0)
class TrackedStateDict:
def __init__(self, to_track: Dict):
"""This class "tracks" a python dictionary by keeping track of which item is accessed.
Args:
to_track (Dict): The dictionary we wish to track
"""
self.to_track = to_track
self._seen: Set[str] = set()
def __getitem__(self, key: str) -> Any:
return self.to_track[key]
def __setitem__(self, key: str, item: Any):
self._seen.add(key)
self.to_track[key] = item
def diff(self) -> List[str]:
"""This method returns a set difference between the keys in the tracked state dict and the one we have access so far.
This is an effective method to check if we have update all the keys
Returns:
List[str]: List of keys not yet updated
"""
return set(self.to_track.keys()) - self._seen
def copy(self) -> Dict:
return self.to_track.copy()
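Using the `TrackedStateDict` above, the converter can tell which destination keys were never written: only `__setitem__` marks a key as seen, so `diff()` returns every key that still needs to be copied. For example, with a made-up three-parameter state dict:

```
import torch

dst = TrackedStateDict({"a.weight": torch.zeros(2), "a.bias": torch.zeros(2), "b.weight": torch.zeros(2)})
dst["a.weight"] = torch.ones(2)  # marks "a.weight" as seen
dst["a.bias"] = torch.ones(2)    # marks "a.bias" as seen
print(dst.diff())                # {"b.weight"} -- the key not yet updated
```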
def prepare_img():
url = "https://praeclarumjj3.github.io/files/coco.jpeg"
img_data = requests.get(url, stream=True).raw
im = Image.open(img_data)
return im
@dataclass
class Args:
"""Fake command line arguments needed by oneformer/detectron2 implementation"""
config_file: str
def setup_cfg(args: Args):
cfg = get_cfg()
add_deeplab_config(cfg)
add_common_config(cfg)
add_oneformer_config(cfg)
add_swin_config(cfg)
add_dinat_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.freeze()
return cfg
class OriginalOneFormerConfigToOursConverter:
class OriginalOneFormerConfigToProcessorConverter:
def __call__(self, original_config: object, model_repo: str) -> OneFormerProcessor:
model = original_config.MODEL
model_input = original_config.INPUT
dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST_PANOPTIC[0])
if "ade20k" in model_repo:
class_info_file = "ade20k_panoptic.json"
elif "coco" in model_repo:
class_info_file = "coco_panoptic.json"
elif "cityscapes" in model_repo:
class_info_file = "cityscapes_panoptic.json"
else:
raise ValueError("Invalid Dataset!")
image_processor = OneFormerImageProcessor(
image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(),
image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(),
size=model_input.MIN_SIZE_TEST,
max_size=model_input.MAX_SIZE_TEST,
num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
ignore_index=dataset_catalog.ignore_label,
class_info_file=class_info_file,
)
tokenizer = CLIPTokenizer.from_pretrained(model_repo)
return OneFormerProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
task_seq_length=original_config.INPUT.TASK_SEQ_LEN,
max_seq_length=original_config.INPUT.MAX_SEQ_LEN,
)
class OriginalOneFormerCheckpointToOursConverter:
def __init__(self, original_model: nn.Module, config: OneFormerConfig):
self.original_model = original_model
self.config = config
def pop_all(self, renamed_keys: List[Tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict):
for src_key, dst_key in renamed_keys:
dst_state_dict[dst_key] = src_state_dict.pop(src_key)
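`pop_all` is the converter's basic move: given `(src_key, dst_key)` pairs, it pulls tensors out of the original state dict and writes them into the destination under the new name, so anything left in `src_state_dict` at the end was never mapped. A standalone sketch of the same logic on toy dicts (key names are illustrative):

```
import torch

src_state_dict = {"task_mlp.layers.0.weight": torch.randn(4, 4), "unused.bias": torch.randn(4)}
dst_state_dict = {}
renamed_keys = [("task_mlp.layers.0.weight", "task_encoder.task_mlp.layers.0.0.weight")]

for src_key, dst_key in renamed_keys:
    dst_state_dict[dst_key] = src_state_dict.pop(src_key)

print(list(dst_state_dict))  # the renamed key now lives in the destination dict
print(list(src_state_dict))  # ["unused.bias"] -- keys that were never copied
```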
def replace_keys_qkv_transformer_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "transformer_module.decoder.layers"
src_prefix: str = "sem_seg_head.predictor"
for i in range(self.config.decoder_layers - 1):
in_proj_weight = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_weight"
)
in_proj_bias = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_bias"
)
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.q_proj.bias"] = in_proj_bias[:256]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.k_proj.bias"] = in_proj_bias[256:512]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.v_proj.bias"] = in_proj_bias[-256:]
def replace_task_mlp(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "task_encoder"
src_prefix: str = "task_mlp"
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
renamed_keys = []
for i in range(2):
renamed_keys.extend(
rename_keys_for_weight_bias(f"{src_prefix}.layers.{i}", f"{dst_prefix}.task_mlp.layers.{i}.0")
)
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def replace_text_projector(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "text_mapper.text_projector"
src_prefix: str = "text_projector"
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
renamed_keys = []
for i in range(self.config.text_encoder_config["text_encoder_proj_layers"]):
renamed_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.layers.{i}", f"{dst_prefix}.{i}.0"))
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def replace_text_mapper(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "text_mapper.text_encoder"
src_prefix: str = "text_encoder"
self.replace_text_projector(dst_state_dict, src_state_dict)
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
def rename_keys_for_attn(src_prefix: str, dst_prefix: str):
attn_keys = [
(f"{src_prefix}.in_proj_bias", f"{dst_prefix}.in_proj_bias"),
(f"{src_prefix}.in_proj_weight", f"{dst_prefix}.in_proj_weight"),
]
attn_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.out_proj", f"{dst_prefix}.out_proj"))
return attn_keys
def rename_keys_for_layer(src_prefix: str, dst_prefix: str):
resblock_keys = []
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.mlp.c_fc", f"{dst_prefix}.mlp.fc1"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.mlp.c_proj", f"{dst_prefix}.mlp.fc2"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_1", f"{dst_prefix}.layer_norm1"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_2", f"{dst_prefix}.layer_norm2"))
resblock_keys.extend(rename_keys_for_attn(f"{src_prefix}.attn", f"{dst_prefix}.self_attn"))
return resblock_keys
renamed_keys = [
("prompt_ctx.weight", "text_mapper.prompt_ctx.weight"),
]
renamed_keys.extend(
[
(f"{src_prefix}.positional_embedding", f"{dst_prefix}.positional_embedding"),
(f"{src_prefix}.token_embedding.weight", f"{dst_prefix}.token_embedding.weight"),
]
)
renamed_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_final", f"{dst_prefix}.ln_final"))
for i in range(self.config.text_encoder_config["text_encoder_num_layers"]):
renamed_keys.extend(
rename_keys_for_layer(
f"{src_prefix}.transformer.resblocks.{i}", f"{dst_prefix}.transformer.layers.{i}"
)
)
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def convert(self, oneformer: OneFormerModel, is_swin: bool) -> OneFormerModel:
dst_state_dict = TrackedStateDict(oneformer.state_dict())
src_state_dict = self.original_model.state_dict()
self.replace_pixel_module(dst_state_dict, src_state_dict, is_swin)
self.replace_transformer_module(dst_state_dict, src_state_dict)
self.replace_task_mlp(dst_state_dict, src_state_dict)
if self.config.is_training:
self.replace_text_mapper(dst_state_dict, src_state_dict)
logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}")
logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
logger.info("🙌 Done")
oneformer.load_state_dict(dst_state_dict)
return oneformer
@staticmethod
def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]:
checkpoints: List[Path] = checkpoints_dir.glob("**/*.pth")
for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
config: Path = config_dir / f"{checkpoint.stem}.yaml"
yield config, checkpoint
def post_process_sem_seg_output(outputs: OneFormerForUniversalSegmentationOutput, target_size: Tuple[int, int]):
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits
if target_size is not None:
masks_queries_logits = torch.nn.functional.interpolate(
masks_queries_logits,
size=target_size,
mode="bilinear",
align_corners=False,
)
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
masks_probs = masks_queries_logits.sigmoid()
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
return segmentation
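The post-processing above combines per-query class probabilities (`bqc`) with per-query mask probabilities (`bqhw`) into a dense per-class map (`bchw`); a per-pixel argmax over the class dimension then yields the semantic segmentation. With random tensors of made-up sizes:

```
import torch

batch, queries, classes, h, w = 1, 150, 20, 64, 64
masks_classes = torch.rand(batch, queries, classes).softmax(-1)
masks_probs = torch.rand(batch, queries, h, w).sigmoid()

segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
semantic_map = segmentation.argmax(dim=1)  # (batch, h, w) class index per pixel
print(segmentation.shape, semantic_map.shape)
```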
def test(
original_model,
our_model: OneFormerForUniversalSegmentation,
processor: OneFormerProcessor,
model_repo: str,
):
def _preprocess_text(text_list=None, max_length=77):
if text_list is None:
raise ValueError("tokens cannot be None.")
tokens = tokenizer(text_list, padding="max_length", max_length=max_length, truncation=True)
attention_masks, input_ids = tokens["attention_mask"], tokens["input_ids"]
token_inputs = []
for attn_mask, input_id in zip(attention_masks, input_ids):
token = torch.tensor(attn_mask) * torch.tensor(input_id)
token_inputs.append(token.unsqueeze(0))
token_inputs = torch.cat(token_inputs, dim=0)
return token_inputs
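`_preprocess_text` relies on the fact that multiplying the token ids by the attention mask zeroes out every padding position, producing the kind of input the original OneFormer text encoder expects. A tiny example (the ids are invented):

```
import torch

input_ids = torch.tensor([[49406, 320, 1125, 49407, 49407, 49407]])
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])

token_inputs = attention_mask * input_ids  # -> [[49406, 320, 1125, 49407, 0, 0]]
print(token_inputs)
```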
with torch.no_grad():
tokenizer = CLIPTokenizer.from_pretrained(model_repo)
original_model = original_model.eval()
our_model = our_model.eval()
im = prepare_img()
tr = T.Compose(
[
T.Resize((640, 640)),
T.ToTensor(),
T.Normalize(
mean=torch.tensor([123.675, 116.280, 103.530]) / 255.0,
std=torch.tensor([58.395, 57.120, 57.375]) / 255.0,
),
],
)
x = tr(im).unsqueeze(0)
task_input = ["the task is semantic"]
task_token = _preprocess_text(task_input, max_length=processor.task_seq_length)
original_model_backbone_features = original_model.backbone(x.clone())
our_model_output: OneFormerModelOutput = our_model.model(x.clone(), task_token, output_hidden_states=True)
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-3
), "The backbone features are not the same."
mask_features, _, multi_scale_features, _, _ = original_model.sem_seg_head.pixel_decoder.forward_features(
original_model_backbone_features
)
original_pixel_decoder_features = []
original_pixel_decoder_features.append(mask_features)
for i in range(len(multi_scale_features)):
original_pixel_decoder_features.append(multi_scale_features[i])
for original_model_feature, our_model_feature in zip(
original_pixel_decoder_features, our_model_output.pixel_decoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-4
), "The pixel decoder feature are not the same"
tr_complete = T.Compose(
[
T.Resize((640, 640)),
T.ToTensor(),
],
)
y = (tr_complete(im) * 255.0).to(torch.int).float()
original_model_out = original_model([{"image": y.clone(), "task": "The task is semantic"}])
original_segmentation = original_model_out[0]["sem_seg"]
our_model_out: OneFormerForUniversalSegmentationOutput = our_model(
x.clone(), task_token, output_hidden_states=True
)
our_segmentation = post_process_sem_seg_output(our_model_out, target_size=(640, 640))[0]
assert torch.allclose(
original_segmentation, our_segmentation, atol=1e-3
), "The segmentation image is not the same."
logger.info("✅ Test passed!")
def get_name(checkpoint_file: Path):
model_name_raw: str = checkpoint_file.stem
backbone = "swin" if "swin" in model_name_raw else "dinat"
dataset = ""
if "coco" in model_name_raw:
dataset = "coco"
elif "ade20k" in model_name_raw:
dataset = "ade20k"
elif "cityscapes" in model_name_raw:
dataset = "cityscapes"
else:
raise ValueError(
f"{model_name_raw} must be wrong since we didn't find 'coco' or 'ade20k' or 'cityscapes' in it "
)
backbone_types = ["tiny", "large"]
backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0]
model_name = f"oneformer_{dataset}_{backbone}_{backbone_type}"
return model_name
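`get_name` derives the Hub model name purely from the checkpoint filename, so the `.pth` file must mention the backbone (`swin`/`dinat`), the dataset, and a backbone size (`tiny` or `large`). For a hypothetical file name:

```
from pathlib import Path

print(get_name(Path("oneformer_ade20k_swin_tiny.pth")))  # -> "oneformer_ade20k_swin_tiny"
```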
if __name__ == "__main__":
parser = ArgumentParser(
description=(
"Command line to convert the original oneformer models (with swin backbone) to transformers"
" implementation."
)
)
parser.add_argument(
"--checkpoints_dir",
type=Path,
help=(
"A directory containing the model's checkpoints. The directory has to have the following structure:"
" <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.pth; where <CONFIG_NAME> name must follow the"
" following nomenclature: oneformer_<DATASET_NAME>_<BACKBONE>_<BACKBONE_TYPE>"
),
)
parser.add_argument(
"--configs_dir",
type=Path,
help=(
"A directory containing the model's configs, see detectron2 doc. The directory has to have the following"
" structure: <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.yaml; where <CONFIG_NAME> name must follow the"
" following nomenclature: oneformer_<DATASET_NAME>_<BACKBONE>_<BACKBONE_TYPE>"
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
type=Path,
help="Path to the folder to output PyTorch models.",
)
parser.add_argument(
"--oneformer_dir",
required=True,
type=Path,
help=(
"A path to OneFormer's original implementation directory. You can download from here: "
"https://github.com/SHI-Labs/OneFormer"
),
)
args = parser.parse_args()
checkpoints_dir: Path = args.checkpoints_dir
config_dir: Path = args.configs_dir
save_directory: Path = args.pytorch_dump_folder_path
oneformer_dir: Path = args.oneformer_dir
if not save_directory.exists():
save_directory.mkdir(parents=True)
for config_file, checkpoint_file in OriginalOneFormerCheckpointToOursConverter.using_dirs(
checkpoints_dir, config_dir
):
processor = OriginalOneFormerConfigToProcessorConverter()(
setup_cfg(Args(config_file=config_file)), os.path.join("shi-labs", config_file.stem)
)
original_config = setup_cfg(Args(config_file=config_file))
oneformer_kwargs = OriginalOneFormer.from_config(original_config)
original_model = OriginalOneFormer(**oneformer_kwargs).eval()
DetectionCheckpointer(original_model).load(str(checkpoint_file))
is_swin = "swin" in config_file.stem
config: OneFormerConfig = OriginalOneFormerConfigToOursConverter()(original_config, is_swin)
oneformer = OneFormerModel(config=config).eval()
converter = OriginalOneFormerCheckpointToOursConverter(original_model, config)
oneformer = converter.convert(oneformer, is_swin)
oneformer_for_universal_segmentation = OneFormerForUniversalSegmentation(config=config).eval()
oneformer_for_universal_segmentation.model = oneformer
test(
original_model,
oneformer_for_universal_segmentation,
processor,
os.path.join("shi-labs", config_file.stem),
)
model_name = get_name(checkpoint_file)
logger.info(f"🪄 Saving {model_name}")
processor.save_pretrained(save_directory / model_name)
oneformer_for_universal_segmentation.save_pretrained(save_directory / model_name)
processor.push_to_hub(
repo_id=os.path.join("shi-labs", config_file.stem),
commit_message="Add configs",
use_temp_dir=True,
)
oneformer_for_universal_segmentation.push_to_hub(
repo_id=os.path.join("shi-labs", config_file.stem),
commit_message="Add model",
use_temp_dir=True,
)
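Putting it together, the script is driven entirely by the four arguments defined above; a hypothetical invocation (all paths are placeholders) might look like:

```
python convert_to_hf_oneformer.py \
    --checkpoints_dir ./checkpoints \
    --configs_dir ./configs \
    --pytorch_dump_folder_path ./converted_models \
    --oneformer_dir ./OneFormer
```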