Transformers Source Code Analysis (110)
.\models\t5\modeling_tf_t5.py
""" TF 2.0 T5 model."""
from __future__ import annotations
import copy
import itertools
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from tensorflow.compiler.tf2xla.python.xla import dynamic_slice
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_t5 import T5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "T5Config"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google-t5/t5-small",
"google-t5/t5-base",
"google-t5/t5-large",
"google-t5/t5-3b",
"google-t5/t5-11b",
]
class TFT5LayerNorm(keras.layers.Layer):
def __init__(self, hidden_size, epsilon=1e-6, **kwargs):
"""
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
"""
super().__init__(**kwargs)
self.variance_epsilon = epsilon
self.hidden_size = hidden_size
def build(self, input_shape):
"""Build shared word embedding layer"""
self.weight = self.add_weight("weight", shape=(self.hidden_size,), initializer="ones")
super().build(input_shape)
def call(self, hidden_states):
variance = tf.math.reduce_mean(tf.math.square(hidden_states), axis=-1, keepdims=True)
hidden_states = hidden_states * tf.math.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states
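# Illustrative aside (not part of the original file): TFT5LayerNorm is an RMS norm: it only
# rescales by the root mean square of the activations, with no mean subtraction and no bias.
# A minimal NumPy sketch of the same computation, assuming the learned weight is all ones:
import numpy as np

x = np.array([[1.0, 2.0, 3.0]])
variance = (x**2).mean(axis=-1, keepdims=True)   # mean of squares, not the centered variance
print(x / np.sqrt(variance + 1e-6))              # what TFT5LayerNorm would return here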
class TFT5DenseActDense(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
wi_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (config.d_model**-0.5)
)
wo_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5)
)
self.wi = keras.layers.Dense(
config.d_ff, use_bias=False, name="wi", kernel_initializer=wi_initializer
)
self.wo = keras.layers.Dense(
config.d_model, use_bias=False, name="wo", kernel_initializer=wo_initializer
)
self.dropout = keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_states = self.wi(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi", None) is not None:
with tf.name_scope(self.wi.name):
self.wi.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5DenseGatedActDense(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
wi_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (config.d_model**-0.5)
)
wo_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5)
)
self.wi_0 = keras.layers.Dense(
config.d_ff, use_bias=False, name="wi_0", kernel_initializer=wi_initializer
)
self.wi_1 = keras.layers.Dense(
config.d_ff, use_bias=False, name="wi_1", kernel_initializer=wi_initializer
)
self.wo = keras.layers.Dense(
config.d_model, use_bias=False, name="wo", kernel_initializer=wo_initializer
)
self.dropout = keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi_0", None) is not None:
with tf.name_scope(self.wi_0.name):
self.wi_0.build([None, None, self.config.d_model])
if getattr(self, "wi_1", None) is not None:
with tf.name_scope(self.wi_1.name):
self.wi_1.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5LayerFF(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.is_gated_act:
self.DenseReluDense = TFT5DenseGatedActDense(config, name="DenseReluDense")
else:
self.DenseReluDense = TFT5DenseActDense(config, name="DenseReluDense")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, training=False):
normed_hidden_states = self.layer_norm(hidden_states)
dense_output = self.DenseReluDense(normed_hidden_states, training=training)
hidden_states = hidden_states + self.dropout(dense_output, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
if getattr(self, "DenseReluDense", None) is not None:
with tf.name_scope(self.DenseReluDense.name):
self.DenseReluDense.build(None)
class TFT5Attention(keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs)
self.layer_id = next(TFT5Attention.NEW_ID)
self.is_decoder = config.is_decoder
self.use_cache = config.use_cache
self.has_relative_attention_bias = has_relative_attention_bias
self.output_attentions = config.output_attentions
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.inner_dim = self.n_heads * self.key_value_proj_dim
q_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * ((self.inner_dim * self.key_value_proj_dim) ** -0.5)
)
k_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5)
)
v_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5)
)
o_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5)
)
self.relative_attention_bias_initializer = keras.initializers.RandomNormal(
mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5)
)
self.q = keras.layers.Dense(
self.inner_dim, use_bias=False, name="q", kernel_initializer=q_initializer
)
self.k = keras.layers.Dense(
self.inner_dim, use_bias=False, name="k", kernel_initializer=k_initializer
)
self.v = keras.layers.Dense(
self.inner_dim, use_bias=False, name="v", kernel_initializer=v_initializer
)
self.o = keras.layers.Dense(
self.d_model, use_bias=False, name="o", kernel_initializer=o_initializer
)
self.dropout = keras.layers.Dropout(config.dropout_rate)
self.pruned_heads = set()
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.has_relative_attention_bias:
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
name="embeddings",
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=self.relative_attention_bias_initializer,
)
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.d_model])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.d_model])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.d_model])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.inner_dim])
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - the relative positions between memory and query
bidirectional: a boolean - whether the attention is bidirectional or not
num_buckets: an integer - number of buckets to map relative positions into
max_distance: an integer - maximum distance to consider for bucketing
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (
tf.cast(tf.math.greater(relative_position, 0), dtype=relative_position.dtype) * num_buckets
)
relative_position = tf.math.abs(relative_position)
else:
relative_position = -tf.math.minimum(relative_position, 0)
max_exact = num_buckets // 2
is_small = tf.math.less(relative_position, max_exact)
relative_position_if_large = max_exact + tf.cast(
tf.math.log(tf.cast(relative_position, tf.float32) / tf.cast(max_exact, tf.float32))
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact),
dtype=relative_position.dtype,
)
relative_position_if_large = tf.math.minimum(relative_position_if_large, num_buckets - 1)
relative_buckets += tf.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
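# Illustrative aside (not part of the original file): since `_relative_position_bucket` is a
# staticmethod it can be probed directly (assuming TensorFlow and transformers are installed)
# to see how signed distances collapse into a fixed set of buckets; the exact bucket ids
# depend on num_buckets and max_distance:
import tensorflow as tf
from transformers.models.t5.modeling_tf_t5 import TFT5Attention

rel_pos = tf.constant([[-64, -8, -1, 0, 1, 8, 64]], dtype=tf.int32)
print(TFT5Attention._relative_position_bucket(rel_pos, bidirectional=True, num_buckets=32, max_distance=128))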
def compute_bias(self, query_length, key_length):
"""Compute binned relative position bias"""
context_position = tf.range(query_length)[:, None]
memory_position = tf.range(key_length)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = tf.gather(
self.relative_attention_bias, relative_position_bucket
)
values = tf.expand_dims(
tf.transpose(values, [2, 0, 1]), axis=0
)
return values
def call(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
training=False,
        output_attentions=False,
    ):
class TFT5LayerSelfAttention(keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs)
self.SelfAttention = TFT5Attention(
config,
has_relative_attention_bias=has_relative_attention_bias,
name="SelfAttention",
)
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout_rate)
def call(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
training=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
training=training,
)
hidden_states = hidden_states + self.dropout(attention_output[0], training=training)
outputs = (hidden_states,) + attention_output[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "SelfAttention", None) is not None:
with tf.name_scope(self.SelfAttention.name):
self.SelfAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5LayerCrossAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.EncDecAttention = TFT5Attention(
config,
has_relative_attention_bias=False,
name="EncDecAttention",
)
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout_rate)
def call(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
query_length=None,
use_cache=False,
output_attentions=False,
training=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(
normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
query_length=query_length,
use_cache=use_cache,
output_attentions=output_attentions,
training=training,
)
hidden_states = hidden_states + self.dropout(attention_output[0], training=training)
outputs = (hidden_states,) + attention_output[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "EncDecAttention", None) is not None:
with tf.name_scope(self.EncDecAttention.name):
self.EncDecAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5Block(keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs)
self.is_decoder = config.is_decoder
self.layer = []
self.layer.append(
TFT5LayerSelfAttention(
config,
has_relative_attention_bias=has_relative_attention_bias,
name="layer_._0",
)
)
if self.is_decoder:
self.layer.append(
TFT5LayerCrossAttention(
config,
name="layer_._1",
)
)
self.layer.append(TFT5LayerFF(config, name=f"layer_._{len(self.layer)}"))
def call(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
encoder_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
training=False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
for layer_module in self.layer:
if hasattr(layer_module, "name"):
with tf.name_scope(layer_module.name):
layer_module.build(None)
@keras_serializable
class TFT5MainLayer(keras.layers.Layer):
config_class = T5Config
def __init__(self, config, embed_tokens=None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.use_cache = config.use_cache
self.embed_tokens = embed_tokens
self.is_decoder = config.is_decoder
self.config = config
self.num_hidden_layers = config.num_layers
self.block = [
TFT5Block(config, has_relative_attention_bias=bool(i == 0), name=f"block_._{i}")
for i in range(config.num_layers)
]
self.final_layer_norm = TFT5LayerNorm(
config.d_model, epsilon=config.layer_norm_epsilon, name="final_layer_norm"
)
self.dropout = keras.layers.Dropout(config.dropout_rate)
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def call(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=None,
head_mask=None,
encoder_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
    ):
    def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build(None)
if getattr(self, "block", None) is not None:
for layer in self.block:
with tf.name_scope(layer.name):
layer.build(None)
class TFT5PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = T5Config
base_model_prefix = "transformer"
_keys_to_ignore_on_load_unexpected = [
r"decoder\Wblock[\W_0]+layer[\W_1]+EncDecAttention\Wrelative_attention_bias"
]
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
if hasattr(self, "decoder"):
self.decoder.embed_tokens = self.shared
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
assert decoder_start_token_id is not None, (
"self.model.config.decoder_start_token_id has to be defined. In TF T5 it is usually set to the"
" pad_token_id. See T5 docs for more information"
)
start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
start_tokens = tf.cast(start_tokens, input_ids.dtype)
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype),
shifted_input_ids,
)
assert_gte0 = tf.debugging.assert_greater_equal(
shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype)
)
with tf.control_dependencies([assert_gte0]):
shifted_input_ids = tf.identity(shifted_input_ids)
return shifted_input_ids
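# Illustrative aside (not part of the original file): `_shift_right` turns labels into
# decoder inputs by prepending `decoder_start_token_id`, dropping the last label, and
# replacing -100 (the ignore index) with `pad_token_id`. A standalone sketch with toy ids,
# assuming decoder_start_token_id = 0 and pad_token_id = 0 as in the T5 configs:
import tensorflow as tf

labels = tf.constant([[37, 423, 215, 1, -100, -100]])
start = tf.zeros((tf.shape(labels)[0], 1), dtype=labels.dtype)        # decoder_start_token_id
shifted = tf.concat([start, labels[:, :-1]], axis=-1)
shifted = tf.where(shifted == -100, tf.zeros_like(shifted), shifted)  # pad_token_id
print(shifted)  # [[0, 37, 423, 215, 1, 0]]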
T5_START_DOCSTRING = r"""
The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
    Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
    text-to-text denoising generative setting.
    # Docstring describing the basics and usage notes of the model class.
    # The model inherits from TFPreTrainedModel; check the superclass documentation for the generic methods the
    # library implements for all its models, such as downloading or saving a model, resizing the input embeddings,
    # pruning heads, etc.
    # The model is also a keras.Model subclass; use it as a regular TF 2.0 Keras model and refer to the TF 2.0
    # documentation for all matters related to general usage and behavior.
    # Tip section:
    # TensorFlow models and layers in transformers accept two formats as input:
    # 1. all inputs as keyword arguments (like PyTorch models);
    # 2. all inputs as a list, tuple or dict in the first positional argument.
    # The second format is supported because Keras methods prefer it when passing inputs to models and layers.
    # Because of this, when using methods like model.fit(), things should "just work": just pass your inputs and
    # labels in any format that model.fit() supports!
    # However, if you want to use the second format outside of Keras methods (e.g. when creating your own layers
    # or models with the Keras Functional API), there are three possibilities you can use to gather all the input
    # Tensors in the first positional argument.
    # Parameters section:
    # config accepts a T5Config instance with all the parameters of the model.
    # Initializing with a config file does not load the weights associated with the model, only the configuration.
    # Check out the ~PreTrainedModel.from_pretrained method to load the model weights.
"""
T5_INPUTS_DOCSTRING = r"""
"""
T5_ENCODER_INPUTS_DOCSTRING = r"""
Args:
inputs (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.
            To know more on how to prepare `inputs` for pretraining take a look at [T5 Training](./t5#training).
        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""
_HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = tf.ones((num_layers, num_heads))`.
"""
@add_start_docstrings(
"The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
T5_START_DOCSTRING,
)
class TFT5Model(TFT5PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=keras.initializers.TruncatedNormal(self.config.initializer_factor),
name="shared",
)
self.shared.load_weight_prefix = "shared"
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
self.encoder = TFT5MainLayer(encoder_config, self.shared, name="encoder")
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.num_layers = config.num_decoder_layers
self.decoder = TFT5MainLayer(decoder_config, self.shared, name="decoder")
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@unpack_inputs
@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model_dim = config.d_model
self.shared = keras.layers.Embedding(
config.vocab_size,
config.d_model,
name="shared",
embeddings_initializer=get_initializer(self.config.initializer_factor),
)
self.shared.load_weight_prefix = "shared"
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
self.encoder = TFT5MainLayer(encoder_config, self.shared, name="encoder")
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.num_layers = config.num_decoder_layers
self.decoder = TFT5MainLayer(decoder_config, self.shared, name="decoder")
if not config.tie_word_embeddings:
lm_head_initializer = keras.initializers.RandomNormal(mean=0, stddev=config.initializer_factor)
self.lm_head = keras.layers.Dense(
config.vocab_size, use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer
)
self.config = config
def get_output_embeddings(self):
if self.config.tie_word_embeddings:
return self.get_input_embeddings()
else:
return tf.transpose(self.lm_head.kernel)
def set_output_embeddings(self, value):
if self.config.tie_word_embeddings:
self.set_input_embeddings(value)
else:
lm_head_initializer = keras.initializers.RandomNormal(mean=0, stddev=self.config.initializer_factor)
self.lm_head = keras.layers.Dense(
shape_list(value)[0], use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer
)
transposed_value = tf.transpose(value)
self.lm_head.kernel = transposed_value
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@unpack_inputs
@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
def serving_output(self, output):
pkv = tf.convert_to_tensor(output.past_key_values[1:]) if self.config.use_cache else None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
return TFSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {
"input_ids": None,
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return self._shift_right(labels)
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
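# Illustrative aside (not part of the original file): `prepare_inputs_for_generation` above
# is what lets `generate()` feed only the last decoder token once `past_key_values` is set.
# Typical end-to-end usage, assuming the google-t5/t5-small checkpoint is available:
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="tf")
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))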
@add_start_docstrings(
"The bare T5 Model transformer outputting encoder's raw hidden-stateswithout any specific head on top.",
T5_START_DOCSTRING,
)
class TFT5EncoderModel(TFT5PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.shared = keras.layers.Embedding(
config.vocab_size,
config.d_model,
name="shared",
embeddings_initializer=get_initializer(self.config.initializer_factor),
)
self.shared.load_weight_prefix = "shared"
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
self.encoder = TFT5MainLayer(encoder_config, self.shared, name="encoder")
def get_encoder(self):
return self.encoder
@unpack_inputs
@add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
r"""
Runs the T5 encoder model on inputs.
Returns:
TFBaseModelOutput or Tuple: Depending on `return_dict`, returns either a dictionary or a tuple of model outputs.
Examples:
```
>>> from transformers import AutoTokenizer, TFT5EncoderModel
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = TFT5EncoderModel.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer(
... "Studies have been shown that owning a dog is good for you", return_tensors="tf"
... ).input_ids # Batch size 1
>>> outputs = model(input_ids)
```
"""
encoder_outputs = self.encoder(
input_ids,
attention_mask=attention_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
past_key_values=None,
use_cache=False,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return encoder_outputs
return TFBaseModelOutput(
last_hidden_state=encoder_outputs.last_hidden_state,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
.\models\t5\tokenization_t5.py
""" Tokenization class for model T5."""
import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
if TYPE_CHECKING:
from ...tokenization_utils_base import TextInput
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/spiece.model",
"google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model",
"google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/spiece.model",
"google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/spiece.model",
"google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google-t5/t5-small": 512,
"google-t5/t5-base": 512,
"google-t5/t5-large": 512,
"google-t5/t5-3b": 512,
"google-t5/t5-11b": 512,
}
SPIECE_UNDERLINE = "▁"
class T5Tokenizer(PreTrainedTokenizer):
"""
Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
legacy=None,
add_prefix_space=True,
**kwargs,
):
pass
    def get_spm_processor(self, from_slow=False):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy or from_slow:
tokenizer.Load(self.vocab_file)
return tokenizer
with open(self.vocab_file, "rb") as f:
sp_model = f.read()
model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
model = model_pb2.ModelProto.FromString(sp_model)
normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec.add_dummy_prefix = False
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
def get_vocab(self):
"""
        Build the vocabulary dictionary, mapping each token to its id.
"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer's `prepare_for_model` method.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def get_sentinel_tokens(self):
"""
        Return the list of sentinel tokens (the `<extra_id_*>` tokens) from the additional special tokens.
"""
return list(
set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
)
def get_sentinel_token_ids(self):
"""
        Return the token ids of the sentinel tokens.
"""
return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
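# Illustrative aside (not part of the original file): the sentinel tokens retrieved above
# (<extra_id_0> ... <extra_id_99> by default) are how T5 marks corrupted spans, e.g. in a
# span-corruption input like "The <extra_id_0> walks in <extra_id_1> park". Assuming a
# sentencepiece-backed checkpoint is available:
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("google-t5/t5-small")
print(len(tok.get_sentinel_tokens()))       # 100 with the default extra_ids=100
print(tok.get_sentinel_token_ids()[:5])     # the corresponding vocabulary ids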
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""
        Do not add eos again if the user has already added it.
"""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从序列中创建令牌类型 ID 列表。
"""
def create_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs representing the second sequence for sequence pairs.
Returns:
`List[int]`: List of zeros representing the mask.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens.
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs representing the second sequence for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
"""
Converts a string to a list of tokens. Adds a prefix token if `self.legacy` is False and the first token is not special.
"""
if self.legacy or len(text) == 0:
return super().tokenize(text, **kwargs)
text = text.replace(SPIECE_UNDERLINE, " ")
if self.add_prefix_space:
text = SPIECE_UNDERLINE + text
tokens = super().tokenize(text, **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
@property
def unk_token_length(self):
"""
Calculate the length of the unknown token.
"""
return len(self.sp_model.encode(str(self.unk_token)))
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
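# Illustrative aside (not part of the original file): the `unk_token` prefix trick above
# compensates for `add_dummy_prefix` being disabled when `legacy=False` (see
# `get_spm_processor`), while `legacy=True` keeps the original sentencepiece normalizer.
# Outputs are not asserted here since they depend on the sentencepiece model:
from transformers import T5Tokenizer

old_tok = T5Tokenizer.from_pretrained("google-t5/t5-small", legacy=True)
new_tok = T5Tokenizer.from_pretrained("google-t5/t5-small", legacy=False)
print(old_tok.tokenize("Hello <extra_id_0>."))
print(new_tok.tokenize("Hello <extra_id_0>."))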
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
tokens[0] = tokens[0][1:]
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
.\models\t5\tokenization_t5_fast.py
""" T5 模型的分词类 """
import os
import re
import warnings
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_t5 import T5Tokenizer
else:
T5Tokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/spiece.model",
"google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model",
"google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/spiece.model",
"google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/spiece.model",
"google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/spiece.model",
},
"tokenizer_file": {
"google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer.json",
"google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/tokenizer.json",
"google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/tokenizer.json",
"google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/tokenizer.json",
"google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google-t5/t5-small": 512,
"google-t5/t5-base": 512,
"google-t5/t5-large": 512,
"google-t5/t5-3b": 512,
"google-t5/t5-11b": 512,
}
class T5TokenizerFast(PreTrainedTokenizerFast):
"""
    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = T5Tokenizer
prefix_tokens: List[int] = []
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
add_prefix_space=None,
        **kwargs,
    ):
if additional_special_tokens is not None:
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
if len(extra_tokens) < 1:
additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens
if add_prefix_space is not None:
logger.warning_once(
"You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
)
kwargs["from_slow"] = True
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
if pretrained_model_name_or_path in T5TokenizerFast.max_model_input_sizes:
deprecated_max_model_length = T5TokenizerFast.max_model_input_sizes[pretrained_model_name_or_path]
if init_max_model_length is not None and init_max_model_length != max_model_length:
return init_max_model_length
elif init_max_model_length is None:
warnings.warn(
"This tokenizer was incorrectly instantiated with a model max length of"
f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
" behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
" `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
f" {pretrained_model_name_or_path} automatically truncating your input to"
f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
" `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
" instantiate this tokenizer with `model_max_length` set to your preferred value.",
FutureWarning,
)
return max_model_length
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
logger.info(f"Copy vocab file to {out_vocab_file}")
return (out_vocab_file,)
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
token_ids_0 = token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0
else:
token_ids_1 = token_ids_1 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros as T5 does not use token type ids.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
else:
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
def get_sentinel_tokens(self):
"""
Get sentinel tokens from the additional special tokens list based on a regex pattern matching.
Returns:
List[str]: List of sentinel tokens.
"""
return list(
set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
)
def get_sentinel_token_ids(self):
"""
Convert sentinel tokens to their corresponding token IDs.
Returns:
List[int]: List of token IDs of sentinel tokens.
"""
return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
.\models\t5\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config", "T5OnnxConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_t5"] = ["T5Tokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_t5_fast"] = ["T5TokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_t5"] = [
"T5_PRETRAINED_MODEL_ARCHIVE_LIST",
"T5EncoderModel",
"T5ForConditionalGeneration",
"T5Model",
"T5PreTrainedModel",
"load_tf_weights_in_t5",
"T5ForQuestionAnswering",
"T5ForSequenceClassification",
"T5ForTokenClassification",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_t5"] = [
"TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFT5EncoderModel",
"TFT5ForConditionalGeneration",
"TFT5Model",
"TFT5PreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_t5"] = [
"FlaxT5EncoderModel",
"FlaxT5ForConditionalGeneration",
"FlaxT5Model",
"FlaxT5PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config, T5OnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_t5 import T5Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_t5_fast import T5TokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_t5 import (
T5_PRETRAINED_MODEL_ARCHIVE_LIST,
T5EncoderModel,
T5ForConditionalGeneration,
T5ForQuestionAnswering,
T5ForSequenceClassification,
T5ForTokenClassification,
T5Model,
T5PreTrainedModel,
load_tf_weights_in_t5,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_t5 import (
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST,
TFT5EncoderModel,
TFT5ForConditionalGeneration,
TFT5Model,
TFT5PreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_t5 import (
FlaxT5EncoderModel,
FlaxT5ForConditionalGeneration,
FlaxT5Model,
FlaxT5PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
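# Illustrative aside (not part of the original file): registering the module as a
# `_LazyModule` means each optional backend is only imported when one of its symbols is
# first accessed, so a torch-only environment never touches the TF or Flax code paths:
from transformers.models import t5

print(type(t5))              # the lazy module wrapper
tok_cls = t5.T5Tokenizer     # this access triggers the sentencepiece-backed import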
.\models\table_transformer\configuration_table_transformer.py
""" Table Transformer 模型配置"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/table-transformer-detection": (
"https://huggingface.co/microsoft/table-transformer-detection/resolve/main/config.json"
),
}
class TableTransformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TableTransformerModel`]. It is used to
instantiate a Table Transformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Table Transformer
[microsoft/table-transformer-detection](https://huggingface.co/microsoft/table-transformer-detection) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import TableTransformerModel, TableTransformerConfig
>>> # Initializing a Table Transformer microsoft/table-transformer-detection style configuration
>>> configuration = TableTransformerConfig()
>>> # Initializing a model from the microsoft/table-transformer-detection style configuration
>>> model = TableTransformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "table-transformer"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
use_timm_backbone=True,
backbone_config=None,
num_channels=3,
num_queries=100,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=2048,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
use_pretrained_backbone=True,
backbone_kwargs=None,
dilation=False,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
**kwargs,
):
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
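# Illustrative aside (not part of the original file): `attribute_map` plus the two
# properties above expose the generic config names on top of the DETR-style ones:
from transformers import TableTransformerConfig

config = TableTransformerConfig()
assert config.hidden_size == config.d_model == 256
assert config.num_attention_heads == config.encoder_attention_heads == 8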
class TableTransformerOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("pixel_mask", {0: "batch"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
@property
def default_onnx_opset(self) -> int:
return 12
.\models\table_transformer\convert_table_transformer_to_hf.py
"""使用 timm-backbone 转换 Table Transformer 检查点。
URL: https://github.com/microsoft/table-transformer
"""
import argparse
from collections import OrderedDict
from pathlib import Path
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms import functional as F
from transformers import DetrImageProcessor, TableTransformerConfig, TableTransformerForObjectDetection
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
rename_keys = []
for i in range(6):
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
f"decoder.layers.{i}.encoder_attn.out_proj.weight",
)
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
f"decoder.layers.{i}.encoder_attn.out_proj.bias",
)
)
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
)
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.encoder.norm.weight", "encoder.layernorm.weight"),
("transformer.encoder.norm.bias", "encoder.layernorm.bias"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
]
)
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def rename_backbone_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if "backbone.0.body" in key:
new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
new_state_dict[new_key] = value
else:
new_state_dict[key] = value
return new_state_dict
def read_in_q_k_v(state_dict):
prefix = ""
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight")
in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def resize(image, checkpoint_url):
width, height = image.size
current_max_size = max(width, height)
target_max_size = 800 if "detection" in checkpoint_url else 1000
scale = target_max_size / current_max_size
resized_image = image.resize((int(round(scale * width)), int(round(scale * height))))
return resized_image
def normalize(image):
image = F.to_tensor(image)
image = F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
return image
@torch.no_grad()
def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
logger.info("Converting model...")
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
state_dict = rename_backbone_keys(state_dict)
read_in_q_k_v(state_dict)
prefix = "model."
for key in state_dict.copy().keys():
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
config = TableTransformerConfig(
backbone="resnet18",
mask_loss_coefficient=1,
dice_loss_coefficient=1,
ce_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.4,
class_cost=1,
bbox_cost=5,
giou_cost=2,
)
if "detection" in checkpoint_url:
config.num_queries = 15
config.num_labels = 2
id2label = {0: "table", 1: "table rotated"}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
else:
config.num_queries = 125
config.num_labels = 6
id2label = {
0: "table",
1: "table column",
2: "table row",
3: "table column header",
4: "table projected row header",
5: "table spanning cell",
}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
image_processor = DetrImageProcessor(
format="coco_detection", max_size=800 if "detection" in checkpoint_url else 1000
)
model = TableTransformerForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
filename = "example_pdf.png" if "detection" in checkpoint_url else "example_table.png"
file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename=filename)
image = Image.open(file_path).convert("RGB")
pixel_values = normalize(resize(image, checkpoint_url)).unsqueeze(0)
outputs = model(pixel_values)
if "detection" in checkpoint_url:
expected_shape = (1, 15, 3)
expected_logits = torch.tensor(
[[-6.7897, -16.9985, 6.7937], [-8.0186, -22.2192, 6.9677], [-7.3117, -21.0708, 7.4055]]
)
expected_boxes = torch.tensor([[0.4867, 0.1767, 0.6732], [0.6718, 0.4479, 0.3830], [0.4716, 0.1760, 0.6364]])
else:
expected_shape = (1, 125, 7)
expected_logits = torch.tensor(
[[-18.1430, -8.3214, 4.8274], [-18.4685, -7.1361, -4.2667], [-26.3693, -9.3429, -4.9962]]
)
expected_boxes = torch.tensor([[0.4983, 0.5595, 0.9440], [0.4916, 0.6315, 0.5954], [0.6108, 0.8637, 0.1135]])
assert outputs.logits.shape == expected_shape
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
logger.info("Pushing model to the hub...")
model_name = (
"microsoft/table-transformer-detection"
if "detection" in checkpoint_url
else "microsoft/table-transformer-structure-recognition"
)
model.push_to_hub(model_name)
image_processor.push_to_hub(model_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth",
type=str,
choices=[
"https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth",
"https://pubtables1m.blob.core.windows.net/model/pubtables1m_structure_detr_r18.pth",
],
help="URL of the Table Transformer checkpoint you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_table_transformer_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
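# After conversion (or with the checkpoints already published on the Hub), the model can be used
# like any other object-detection model; a minimal inference sketch, with the local image path
# ("page.png") assumed purely for illustration.
import torch
from PIL import Image
from transformers import AutoImageProcessor, TableTransformerForObjectDetection

processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
image = Image.open("page.png").convert("RGB")  # path assumed for illustration
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# Convert raw outputs to (score, label, box) triples above a confidence threshold
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, threshold=0.7, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())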
.\models\table_transformer\convert_table_transformer_to_hf_no_timm.py
"""Convert Table Transformer checkpoints with native (Transformers) backbone.
URL: https://github.com/microsoft/table-transformer
"""
import argparse
from pathlib import Path
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms import functional as F
from transformers import DetrImageProcessor, ResNetConfig, TableTransformerConfig, TableTransformerForObjectDetection
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var"))
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
("transformer.encoder.norm.weight", "encoder.layernorm.weight"),
("transformer.encoder.norm.bias", "encoder.layernorm.bias"),
]
)
return rename_keys
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
)
in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def resize(image, checkpoint_url):
width, height = image.size
current_max_size = max(width, height)
target_max_size = 800 if "detection" in checkpoint_url else 1000
scale = target_max_size / current_max_size
resized_image = image.resize((int(round(scale * width)), int(round(scale * height))))
return resized_image
def normalize(image):
image = F.to_tensor(image)
image = F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
return image
@torch.no_grad()
def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
logger.info("Converting model...")
backbone_config = ResNetConfig.from_pretrained(
"microsoft/resnet-18", out_features=["stage1", "stage2", "stage3", "stage4"]
)
config = TableTransformerConfig(
backbone_config=backbone_config,
use_timm_backbone=False,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
ce_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.4,
class_cost=1,
bbox_cost=5,
giou_cost=2,
)
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
for src, dest in create_rename_keys(config):
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict)
prefix = "model."
for key in state_dict.copy().keys():
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
if "detection" in checkpoint_url:
config.num_queries = 15
config.num_labels = 2
id2label = {0: "table", 1: "table rotated"}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
else:
config.num_queries = 125
config.num_labels = 6
id2label = {
0: "table",
1: "table column",
2: "table row",
3: "table column header",
4: "table projected row header",
5: "table spanning cell",
}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
image_processor = DetrImageProcessor(format="coco_detection", size={"longest_edge": 800})
model = TableTransformerForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
filename = "example_pdf.png" if "detection" in checkpoint_url else "example_table.png"
file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename=filename)
image = Image.open(file_path).convert("RGB")
pixel_values = normalize(resize(image, checkpoint_url)).unsqueeze(0)
outputs = model(pixel_values)
if "detection" in checkpoint_url:
expected_shape = (1, 15, 3)
expected_logits = torch.tensor(
[[-6.7897, -16.9985, 6.7937], [-8.0186, -22.2192, 6.9677], [-7.3117, -21.0708, 7.4055]]
)
expected_boxes = torch.tensor([[0.4867, 0.1767, 0.6732], [0.6718, 0.4479, 0.3830], [0.4716, 0.1760, 0.6364]])
else:
expected_shape = (1, 125, 7)
expected_logits = torch.tensor(
[[-18.1430, -8.3214, 4.8274], [-18.4685, -7.1361, -4.2667], [-26.3693, -9.3429, -4.9962]]
)
expected_boxes = torch.tensor([[0.4983, 0.5595, 0.9440], [0.4916, 0.6315, 0.5954], [0.6108, 0.8637, 0.1135]])
assert outputs.logits.shape == expected_shape
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
logger.info("Pushing model to the hub...")
model_name = (
"microsoft/table-transformer-detection"
if "detection" in checkpoint_url
else "microsoft/table-transformer-structure-recognition"
)
model.push_to_hub(model_name, revision="no_timm")
image_processor.push_to_hub(model_name, revision="no_timm")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth",
type=str,
choices=[
"https://pubtables1m.blob.core.windows.net/model/pubtables1m_detection_detr_r18.pth",
"https://pubtables1m.blob.core.windows.net/model/pubtables1m_structure_detr_r18.pth",
],
help="URL of the Table Transformer checkpoint you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_table_transformer_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\table_transformer\modeling_table_transformer.py
""" PyTorch Table Transformer model."""
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithCrossAttentions,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_timm_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import load_backbone
from .configuration_table_transformer import TableTransformerConfig
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TableTransformerConfig"
_CHECKPOINT_FOR_DOC = "microsoft/table-transformer-detection"
TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/table-transformer-detection",
]
@dataclass
class TableTransformerDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
    Base class for outputs of the TABLE_TRANSFORMER decoder. This class adds one attribute to
    BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
    of each decoder layer, each of them gone through a layernorm. This is useful when training the model with
    auxiliary decoding losses.
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` containing the hidden states of the model at the output of each layer, of
            shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of attention weight tensors of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
            These are the attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of attention weight tensors of the decoder's cross-attention layers, of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`. These are the attention weights after the
            attention softmax, used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class TableTransformerModelOutput(Seq2SeqModelOutput):
"""
Base class for outputs of the TABLE_TRANSFORMER encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class TableTransformerObjectDetectionOutput(ModelOutput):
"""
Output type of [`TableTransformerForObjectDetection`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
class TableTransformerFrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
torchvision.models.resnet[18,34,50,101] produce nans.
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
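# The frozen module is just an affine rescaling with stored statistics, i.e. equivalent to
# nn.BatchNorm2d in eval mode (both use eps=1e-5); a small sketch using the class defined above.
import torch
from torch import nn

bn = nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)
frozen = TableTransformerFrozenBatchNorm2d(8)
frozen.load_state_dict(bn.state_dict(), strict=False)  # num_batches_tracked is dropped on load
x = torch.randn(2, 8, 4, 4)
torch.testing.assert_close(frozen(x), bn(x), rtol=1e-4, atol=1e-4)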
def replace_batch_norm(model):
"""
    Recursively replaces all `torch.nn.BatchNorm2d` layers with `TableTransformerFrozenBatchNorm2d`.
    Args:
        model (torch.nn.Module):
            input model
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = TableTransformerFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
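# A quick sketch of the replacement on a toy module (not the real backbone), showing that every
# BatchNorm2d is swapped in place for the frozen variant defined above.
import torch
from torch import nn

toy = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1),
    nn.BatchNorm2d(8),
    nn.Sequential(nn.Conv2d(8, 8, 3, padding=1), nn.BatchNorm2d(8)),
)
replace_batch_norm(toy)
print(any(isinstance(m, nn.BatchNorm2d) for m in toy.modules()))                     # False
print(sum(isinstance(m, TableTransformerFrozenBatchNorm2d) for m in toy.modules()))  # 2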
class TableTransformerConvEncoder(nn.Module):
"""
    Convolutional backbone, using either the AutoBackbone API or one of timm's models.
    nn.BatchNorm2d layers are replaced by TableTransformerFrozenBatchNorm2d as defined above.
"""
def __init__(self, config):
super().__init__()
self.config = config
if config.use_timm_backbone:
requires_backends(self, ["timm"])
kwargs = {}
if config.dilation:
kwargs["output_stride"] = 16
backbone = create_model(
config.backbone,
pretrained=config.use_pretrained_backbone,
features_only=True,
out_indices=(1, 2, 3, 4),
in_chans=config.num_channels,
**kwargs,
)
else:
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = (
self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
)
backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
if "resnet" in backbone_model_type:
for name, parameter in self.model.named_parameters():
if config.use_timm_backbone:
if "layer2" not in name and "layer3" not in name and "layer4" not in name:
parameter.requires_grad_(False)
else:
if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
parameter.requires_grad_(False)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
out = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
out.append((feature_map, mask))
return out
class TableTransformerConvModel(nn.Module):
"""
This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
"""
def __init__(self, conv_encoder, position_embedding):
super().__init__()
self.conv_encoder = conv_encoder
self.position_embedding = position_embedding
def forward(self, pixel_values, pixel_mask):
out = self.conv_encoder(pixel_values, pixel_mask)
pos = []
for feature_map, mask in out:
pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
return out, pos
class TableTransformerSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
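# Shape check for the sine embedding above: with embedding_dim = d_model // 2, the returned
# tensor has d_model channels (half encoding the y position, half the x position).
import torch

pos_embed = TableTransformerSinePositionEmbedding(embedding_dim=128, normalize=True)
feature_map = torch.randn(2, 256, 25, 34)             # (batch, channels, height, width)
pixel_mask = torch.ones(2, 25, 34, dtype=torch.long)  # 1 = real pixel, 0 = padding
pos = pos_embed(feature_map, pixel_mask)
print(pos.shape)  # torch.Size([2, 256, 25, 34])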
class TableTransformerLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
position_embedding = TableTransformerSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = TableTransformerLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
class TableTransformerAttention(nn.Module):
"""
    Multi-headed attention from the 'Attention Is All You Need' paper.
    Here, we add position embeddings to the queries and keys (as explained in the TABLE_TRANSFORMER paper).
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
f"{num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor], **kwargs):
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
return tensor if object_queries is None else tensor + object_queries
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
key_value_states: Optional[torch.Tensor] = None,
spatial_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
        **kwargs,
    ):
        pass
class TableTransformerEncoderLayer(nn.Module):
def __init__(self, config: TableTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TableTransformerAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
object_queries: torch.Tensor = None,
output_attentions: bool = False,
):
"""
Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            object_queries (`torch.FloatTensor`, *optional*): object queries, to be added to hidden_states.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
object_queries=object_queries,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class TableTransformerDecoderLayer(nn.Module):
def __init__(self, config: TableTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TableTransformerAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = TableTransformerAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
query_position_embeddings: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
):
pass
class TableTransformerClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class TableTransformerPreTrainedModel(PreTrainedModel):
config_class = TableTransformerConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [
r"TableTransformerConvEncoder",
r"TableTransformerEncoderLayer",
r"TableTransformerDecoderLayer",
]
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, TableTransformerLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
TABLE_TRANSFORMER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.
            Pixel values can be obtained using [`DetrImageProcessor`]. See [`DetrImageProcessor.__call__`] for details.
        pixel_mask (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states
            at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"""
The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
""",
TABLE_TRANSFORMER_START_DOCSTRING,
)
"""
# 从TableTransformerPreTrainedModel类继承,并重写__init__方法,初始化TableTransformerModel对象
class TableTransformerModel(TableTransformerPreTrainedModel):
    # Copied from transformers.models.detr.modeling_detr.DetrModel.__init__ with Detr->TableTransformer
def __init__(self, config: TableTransformerConfig):
        # Call the parent constructor
        super().__init__(config)
        # Create the backbone and positional encoding
        backbone = TableTransformerConvEncoder(config)
        object_queries = build_position_encoding(config)
        self.backbone = TableTransformerConvModel(backbone, object_queries)
        # Project the backbone output to config.d_model channels with a 1x1 convolution
        self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
        # Query position embeddings (config.num_queries embeddings of size config.d_model)
        self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
        # Create the encoder and decoder
        self.encoder = TableTransformerEncoder(config)
        self.decoder = TableTransformerDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()
    # Return the encoder
    def get_encoder(self):
        return self.encoder
    # Return the decoder
    def get_decoder(self):
        return self.decoder
    # Freeze the backbone parameters so they are not updated during backpropagation
    def freeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)
    # Unfreeze the backbone parameters so they are updated during backpropagation
    def unfreeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(True)
    # Forward pass of the model, accepting several optional inputs
@add_start_docstrings_to_model_forward(TABLE_TRANSFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TableTransformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
        Table Transformer Model (consisting of a backbone and an encoder-decoder Transformer), with an object
        detection head on top, for tasks such as COCO detection.
        """
        # Implements the forward pass; the concrete logic is defined by the specific Table Transformer model
pass
class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
    def __init__(self, config: TableTransformerConfig):
        # Call the parent constructor
        super().__init__(config)
        # Create the Table Transformer model
        self.model = TableTransformerModel(config)
        # Object detection head: class classifier
        self.class_labels_classifier = nn.Linear(
            config.d_model, config.num_labels + 1
        )  # we add one extra class for the "no object" category
        # Bounding box predictor (a small MLP head)
        self.bbox_predictor = TableTransformerMLPPredictionHead(
            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
        )
        # Initialize weights and apply final processing
        self.post_init()
@torch.jit.unused
    # Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection._set_aux_loss
def _set_aux_loss(self, outputs_class, outputs_coord):
        # This is a workaround to make torchscript happy, as torchscript doesn't support dictionaries with
        # non-homogeneous values, such as a dict having both a Tensor and a list.
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(TABLE_TRANSFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TableTransformerObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[Dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
# Copied from transformers.models.detr.modeling_detr.dice_loss
# Compute the DICE loss, similar to a generalized IoU for masks
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
    # Apply a sigmoid to the predictions so they lie in (0, 1)
    inputs = inputs.sigmoid()
    # Flatten the inputs to 2D (batch_size, -1)
    inputs = inputs.flatten(1)
    # Numerator of the DICE loss
    numerator = 2 * (inputs * targets).sum(1)
    # Denominator of the DICE loss
    denominator = inputs.sum(-1) + targets.sum(-1)
    # Final DICE loss
    loss = 1 - (numerator + 1) / (denominator + 1)
    # Sum over boxes and divide by the number of boxes
    return loss.sum() / num_boxes
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
# Compute the sigmoid focal loss used for dense detection
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
    # Apply a sigmoid to the inputs
    prob = inputs.sigmoid()
    # Binary cross-entropy loss
    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # Compute p_t, used to modulate the loss
    p_t = prob * targets + (1 - prob) * (1 - targets)
    # Final loss with the modulating factor gamma
    loss = ce_loss * ((1 - p_t) ** gamma)
    # If alpha >= 0, apply the balancing factor alpha_t
    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    # Average per box, then sum and divide by the number of boxes
    return loss.mean(1).sum() / num_boxes
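# Both loss helpers above operate on raw logits; a toy sanity check with two "boxes" whose masks
# are flattened to four pixels each (values made up for the example).
import torch

logits = torch.tensor([[4.0, 4.0, -4.0, -4.0],
                       [4.0, -4.0, 4.0, -4.0]])   # predictions for 2 masks
targets = torch.tensor([[1.0, 1.0, 0.0, 0.0],
                        [1.0, 1.0, 0.0, 0.0]])    # binary ground-truth masks
num_boxes = 2
print(dice_loss(logits, targets, num_boxes))           # small: the first mask is a near-perfect match
print(sigmoid_focal_loss(logits, targets, num_boxes))  # confident, correct pixels are down-weighted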
# Copied from transformers.models.detr.modeling_detr.DetrLoss with Detr->TableTransformer,detr->table_transformer
# Computes the losses for TableTransformerForObjectDetection/TableTransformerForSegmentation
class TableTransformerLoss(nn.Module):
"""
This class computes the losses for TableTransformerForObjectDetection/TableTransformerForSegmentation. The process happens in two steps: 1)
we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and box).
A note on the `num_classes` argument (copied from original repo in table_transformer.py): "the naming of the `num_classes`
parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
    the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes`
    to be 91."
    """
def __init__(self):
super().__init__()
def forward(self):
pass
"""
Module for computing losses in table transformer model.
This module defines methods for calculating classification and cardinality errors for
object detection tasks using a transformer-based approach.
Args:
matcher (`TableTransformerHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
eos_coef (`float`):
Relative classification weight applied to the no-object category.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(self, matcher, num_classes, eos_coef, losses):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.eos_coef = eos_coef
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
[nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
masks = [t["masks"] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
def _get_source_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
indices = self.matcher(outputs_without_aux, targets)
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
continue
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class TableTransformerMLPPredictionHead(nn.Module):
"""
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a box relative to the image.
    Copied from https://github.com/facebookresearch/table_transformer/blob/master/models/table_transformer.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
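# As used by the detection head, the box predictor maps each decoder query to 4 coordinates; a minimal
# shape check with the class defined above (the model applies a sigmoid on top to obtain normalized
# (center_x, center_y, width, height) boxes).
import torch

bbox_head = TableTransformerMLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_queries = torch.randn(1, 15, 256)  # (batch, num_queries, d_model)
boxes = bbox_head(decoder_queries).sigmoid()
print(boxes.shape)  # torch.Size([1, 15, 4])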
class TableTransformerHungarianMatcher(nn.Module):
"""
    This class computes an assignment between the targets and the predictions of the network.
    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    Args:
        class_cost:
            The relative weight of the classification error in the matching cost.
        bbox_cost:
            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
        giou_cost:
            The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
out_prob = outputs["logits"].flatten(0, 1).softmax(-1)
out_bbox = outputs["pred_boxes"].flatten(0, 1)
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
class_cost = -out_prob[:, target_ids]
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
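# The core of the matcher is scipy's linear_sum_assignment over the weighted cost matrix; a toy
# illustration of the resulting one-to-one assignment (the costs below are made up for the example).
import torch
from scipy.optimize import linear_sum_assignment

# 3 predicted queries vs. 2 ground-truth boxes: lower cost = better match
cost_matrix = torch.tensor([[0.9, 0.1],
                            [0.2, 0.8],
                            [0.5, 0.6]])
pred_idx, target_idx = linear_sum_assignment(cost_matrix.numpy())
print(pred_idx, target_idx)  # [0 1] [1 0]: query 0 -> box 1, query 1 -> box 0, query 2 stays unmatched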
def _upcast(t: Tensor) -> Tensor:
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
    Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
            `0 <= x1 < x2` and `0 <= y1 < y2`.
    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (right_bottom - left_top).clamp(min=0)
inter = width_height[:, :, 0] * width_height[:, :, 1]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0)
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
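# Unlike plain IoU, the generalized IoU above stays informative for non-overlapping boxes (it goes
# negative as the boxes move apart); a small numeric check with the helpers defined above.
import torch

boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[0.0, 0.0, 2.0, 2.0],    # identical box
                       [3.0, 0.0, 5.0, 2.0]])   # disjoint box
iou, _ = box_iou(boxes1, boxes2)
print(iou)                                  # tensor([[1., 0.]])
print(generalized_box_iou(boxes1, boxes2))  # tensor([[ 1.0000, -0.2000]]): enclosing-area penalty kicks in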
def _max_by_axis(the_list):
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
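A toy sketch of how variable-sized images are batched by the function above: they are zero-padded to a common shape and the mask marks padded positions with True (assuming the function is importable from the same module):
```
import torch
from transformers.models.table_transformer.modeling_table_transformer import nested_tensor_from_tensor_list

images = [torch.ones(3, 2, 2), torch.ones(3, 3, 4)]  # two CHW images of different sizes
batch = nested_tensor_from_tensor_list(images)
tensors, mask = batch.decompose()
print(tensors.shape)  # torch.Size([2, 3, 3, 4]) -> padded to the largest height/width
print(mask[0])        # False on the top-left 2x2 region (real pixels of image 0), True on the padding
```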
.\models\table_transformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_table_transformer": [
"TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TableTransformerConfig",
"TableTransformerOnnxConfig",
]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_table_transformer"] = [
"TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TableTransformerForObjectDetection",
"TableTransformerModel",
"TableTransformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_table_transformer import (
TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
TableTransformerConfig,
TableTransformerOnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_table_transformer import (
TABLE_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TableTransformerForObjectDetection,
TableTransformerModel,
TableTransformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
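The `_LazyModule` indirection defers the torch-dependent import until a modeling attribute is first accessed. A small usage sketch (requires torch for the modeling class):
```
import transformers

config = transformers.TableTransformerConfig()                 # resolving this attribute imports only the configuration module
model_cls = transformers.TableTransformerForObjectDetection    # first access here lazily imports modeling_table_transformer
model = model_cls(config)
```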
.\models\tapas\configuration_tapas.py
"""
TAPAS configuration. Based on the BERT configuration with added parameters.
Hyperparameters are taken from run_task_main.py and hparam_utils.py of the original implementation. URLS:
- https://github.com/google-research/tapas/blob/master/tapas/run_task_main.py
- https://github.com/google-research/tapas/blob/master/tapas/utils/hparam_utils.py
"""
from ...configuration_utils import PretrainedConfig
TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/tapas-base-finetuned-sqa": (
"https://huggingface.co/google/tapas-base-finetuned-sqa/resolve/main/config.json"
),
"google/tapas-base-finetuned-wtq": (
"https://huggingface.co/google/tapas-base-finetuned-wtq/resolve/main/config.json"
),
"google/tapas-base-finetuned-wikisql-supervised": (
"https://huggingface.co/google/tapas-base-finetuned-wikisql-supervised/resolve/main/config.json"
),
"google/tapas-base-finetuned-tabfact": (
"https://huggingface.co/google/tapas-base-finetuned-tabfact/resolve/main/config.json"
),
}
class TapasConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TapasModel`]. It is used to instantiate a TAPAS
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the TAPAS
[google/tapas-base-finetuned-sqa](https://huggingface.co/google/tapas-base-finetuned-sqa) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Hyperparameters additional to BERT are taken from run_task_main.py and hparam_utils.py of the original
implementation. Original implementation available at https://github.com/google-research/tapas/tree/master.
Example:
```
>>> from transformers import TapasModel, TapasConfig
>>> # Initializing a default (SQA) Tapas configuration
>>> configuration = TapasConfig()
>>> # Initializing a model from the configuration
>>> model = TapasModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "tapas"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=1024,
type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10],
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
positive_label_weight=10.0,
num_aggregation_labels=0,
aggregation_loss_weight=1.0,
use_answer_as_supervision=None,
answer_loss_importance=1.0,
use_normalized_answer_loss=False,
huber_loss_delta=None,
temperature=1.0,
aggregation_temperature=1.0,
use_gumbel_for_cells=False,
use_gumbel_for_aggregation=False,
average_approximation_function="ratio",
cell_selection_preference=None,
answer_loss_cutoff=None,
max_num_rows=64,
max_num_columns=32,
average_logits_per_cell=False,
select_one_column=True,
allow_empty_column_selection=False,
init_cell_selection_weights_to_zero=False,
reset_position_index_per_cell=True,
disable_per_token_loss=False,
aggregation_labels=None,
no_aggregation_label_index=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_sizes = type_vocab_sizes
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.positive_label_weight = positive_label_weight
self.num_aggregation_labels = num_aggregation_labels
self.aggregation_loss_weight = aggregation_loss_weight
self.use_answer_as_supervision = use_answer_as_supervision
self.answer_loss_importance = answer_loss_importance
self.use_normalized_answer_loss = use_normalized_answer_loss
self.huber_loss_delta = huber_loss_delta
self.temperature = temperature
self.aggregation_temperature = aggregation_temperature
self.use_gumbel_for_cells = use_gumbel_for_cells
self.use_gumbel_for_aggregation = use_gumbel_for_aggregation
self.average_approximation_function = average_approximation_function
self.cell_selection_preference = cell_selection_preference
self.answer_loss_cutoff = answer_loss_cutoff
self.max_num_rows = max_num_rows
self.max_num_columns = max_num_columns
self.average_logits_per_cell = average_logits_per_cell
self.select_one_column = select_one_column
self.allow_empty_column_selection = allow_empty_column_selection
self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero
self.reset_position_index_per_cell = reset_position_index_per_cell
self.disable_per_token_loss = disable_per_token_loss
self.aggregation_labels = aggregation_labels
self.no_aggregation_label_index = no_aggregation_label_index
if isinstance(self.aggregation_labels, dict):
self.aggregation_labels = {int(k): v for k, v in aggregation_labels.items()}
.\models\tapas\convert_tapas_original_tf_checkpoint_to_pytorch.py
import argparse
from transformers import (
TapasConfig,
TapasForMaskedLM,
TapasForQuestionAnswering,
TapasForSequenceClassification,
TapasModel,
TapasTokenizer,
load_tf_weights_in_tapas,
)
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(
task, reset_position_index_per_cell, tf_checkpoint_path, tapas_config_file, pytorch_dump_path
):
config = TapasConfig.from_json_file(tapas_config_file)
config.reset_position_index_per_cell = reset_position_index_per_cell
if task == "SQA":
model = TapasForQuestionAnswering(config=config)
elif task == "WTQ":
config.num_aggregation_labels = 4
config.use_answer_as_supervision = True
config.answer_loss_cutoff = 0.664694
config.cell_selection_preference = 0.207951
config.huber_loss_delta = 0.121194
config.init_cell_selection_weights_to_zero = True
config.select_one_column = True
config.allow_empty_column_selection = False
config.temperature = 0.0352513
model = TapasForQuestionAnswering(config=config)
elif task == "WIKISQL_SUPERVISED":
config.num_aggregation_labels = 4
config.use_answer_as_supervision = False
config.answer_loss_cutoff = 36.4519
config.cell_selection_preference = 0.903421
config.huber_loss_delta = 222.088
config.init_cell_selection_weights_to_zero = True
config.select_one_column = True
config.allow_empty_column_selection = True
config.temperature = 0.763141
model = TapasForQuestionAnswering(config=config)
elif task == "TABFACT":
model = TapasForSequenceClassification(config=config)
elif task == "MLM":
model = TapasForMaskedLM(config=config)
elif task == "INTERMEDIATE_PRETRAINING":
model = TapasModel(config=config)
else:
raise ValueError(f"Task {task} not supported.")
print(f"Building PyTorch model from configuration: {config}")
load_tf_weights_in_tapas(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
print(f"Save tokenizer files to {pytorch_dump_path}")
tokenizer = TapasTokenizer(vocab_file=tf_checkpoint_path[:-10] + "vocab.txt", model_max_length=512)
tokenizer.save_pretrained(pytorch_dump_path)
print("Used relative position embeddings:", model.config.reset_position_index_per_cell)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--task", default="SQA", type=str, help="Model task for which to convert a checkpoint. Defaults to SQA."
)
parser.add_argument(
"--reset_position_index_per_cell",
default=False,
action="store_true",
help="Whether to use relative position embeddings or not. Defaults to True.",
)
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--tapas_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained TAPAS model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.task,
args.reset_position_index_per_cell,
args.tf_checkpoint_path,
args.tapas_config_file,
args.pytorch_dump_path,
)
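A hypothetical programmatic invocation of the conversion entry point above (all paths are placeholders; note the vocab file is expected next to the checkpoint because the tokenizer path is derived from `tf_checkpoint_path[:-10]`):
```
convert_tf_checkpoint_to_pytorch(
    task="WTQ",
    reset_position_index_per_cell=True,
    tf_checkpoint_path="/path/to/tapas_wtq/model.ckpt",
    tapas_config_file="/path/to/tapas_wtq/bert_config.json",
    pytorch_dump_path="/path/to/tapas_wtq_pytorch",
)
```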
.\models\tapas\modeling_tapas.py
"""PyTorch TAPAS model."""
import enum
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
is_torch_greater_or_equal_than_1_12,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_tapas import TapasConfig
logger = logging.get_logger(__name__)
if not is_torch_greater_or_equal_than_1_12:
logger.warning(
f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use "
"TapasModel. Please upgrade torch."
)
_CONFIG_FOR_DOC = "TapasConfig"
_CHECKPOINT_FOR_DOC = "google/tapas-base"
TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/tapas-large",
"google/tapas-large-finetuned-sqa",
"google/tapas-large-finetuned-wtq",
"google/tapas-large-finetuned-wikisql-supervised",
"google/tapas-large-finetuned-tabfact",
"google/tapas-base",
"google/tapas-base-finetuned-sqa",
"google/tapas-base-finetuned-wtq",
"google/tapas-base-finetuned-wikisql-supervised",
"google/tapas-base-finetuned-tabfact",
"google/tapas-small",
"google/tapas-small-finetuned-sqa",
"google/tapas-small-finetuned-wtq",
"google/tapas-small-finetuned-wikisql-supervised",
"google/tapas-small-finetuned-tabfact",
"google/tapas-mini",
"google/tapas-mini-finetuned-sqa",
"google/tapas-mini-finetuned-wtq",
"google/tapas-mini-finetuned-wikisql-supervised",
"google/tapas-mini-finetuned-tabfact",
"google/tapas-tiny",
"google/tapas-tiny-finetuned-sqa",
"google/tapas-tiny-finetuned-wtq",
"google/tapas-tiny-finetuned-wikisql-supervised",
"google/tapas-tiny-finetuned-tabfact",
]
EPSILON_ZERO_DIVISION = 1e-10
CLOSE_ENOUGH_TO_LOG_ZERO = -10000.0
@dataclass
class TableQuestionAnsweringOutput(ModelOutput):
"""
Output type of [`TapasForQuestionAnswering`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` (and possibly `answer`, `aggregation_labels`, `numeric_values` and `numeric_values_scale` are provided)):
Total loss as the sum of the hierarchical cell selection log-likelihood loss and (optionally) the semi-supervised regression loss and (optionally) the supervised loss for aggregations.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the cell selection head, for every token.
logits_aggregation (`torch.FloatTensor`, *optional*, of shape `(batch_size, num_aggregation_labels)`):
Prediction scores of the aggregation head, for every aggregation operator.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Attention weights from the self-attention heads, used to compute the weighted average.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
logits_aggregation: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def load_tf_weights_in_tapas(model, config, tf_checkpoint_path):
"""
Load TF checkpoints into a PyTorch model. This is an adaptation from load_tf_weights_in_bert:
- add cell selection and aggregation heads
- take into account the additional token type embedding layers
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
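# (the long per-variable loop that maps each TF variable name onto the corresponding PyTorch
# parameter and copies the array into it is omitted in this excerpt)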
return model
class TapasEmbeddings(nn.Module):
"""
Construct the embeddings from word, position and token_type embeddings. Same as BertEmbeddings but with a number of
additional token type embeddings to encode tabular structure.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
for i, type_vocab_sizes in enumerate(config.type_vocab_sizes):
name = f"token_type_embeddings_{i}"
setattr(self, name, nn.Embedding(type_vocab_sizes, config.hidden_size))
self.number_of_token_type_embeddings = len(config.type_vocab_sizes)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.config = config
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
device = input_ids.device if input_ids is not None else inputs_embeds.device
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand(input_shape)
if self.config.reset_position_index_per_cell:
col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1)
row_index = IndexMap(token_type_ids[:, :, 2], self.config.type_vocab_sizes[2], batch_dims=1)
full_index = ProductIndexMap(col_index, row_index)
first_position_per_segment = reduce_min(position_ids, full_index)[0]
first_position = gather(first_position_per_segment, full_index)
position = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0)
position_ids = torch.min(
torch.as_tensor(self.config.max_position_embeddings - 1, device=device), position - first_position
)
if token_type_ids is None:
token_type_ids = torch.zeros(
(input_shape + self.number_of_token_type_embeddings), dtype=torch.long, device=device
)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = inputs_embeds + position_embeddings
for i in range(self.number_of_token_type_embeddings):
name = f"token_type_embeddings_{i}"
embeddings += getattr(self, name)(token_type_ids[:, :, i])
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
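The `reset_position_index_per_cell` branch above restarts the position index at the first token of every table cell. A minimal sketch of that computation, assuming the segmented-tensor helpers defined later in this file (`IndexMap`, `ProductIndexMap`, `reduce_min`, `gather`) are importable from `transformers.models.tapas.modeling_tapas`:
```
import torch
from transformers.models.tapas.modeling_tapas import IndexMap, ProductIndexMap, gather, reduce_min

# token_type_ids[:, :, 1] are column ids and token_type_ids[:, :, 2] are row ids (0 = question/padding)
col_ids = torch.tensor([[0, 1, 1, 2, 2, 2]])
row_ids = torch.tensor([[0, 1, 1, 1, 1, 1]])
position = torch.arange(col_ids.shape[1]).unsqueeze(0)         # absolute positions 0..5

col_index = IndexMap(col_ids, num_segments=3, batch_dims=1)
row_index = IndexMap(row_ids, num_segments=2, batch_dims=1)
full_index = ProductIndexMap(col_index, row_index)             # one segment per (column, row) cell

first_position_per_cell = reduce_min(position, full_index)[0]  # first absolute position inside each cell
first_position = gather(first_position_per_cell, full_index)   # broadcast back to token level
print(position - first_position)                                # tensor([[0, 0, 1, 0, 1, 2]])
```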
class TapasSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_value is not None:
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
if self.is_decoder:
past_key_value = (key_layer, value_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
if self.is_decoder:
outputs = outputs + (past_key_value,)
return outputs
class TapasSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class TapasAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = TapasSelfAttention(config)
self.output = TapasSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class TapasIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class TapasOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class TapasLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = TapasAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = TapasAttention(config)
self.intermediate = TapasIntermediate(config)
self.output = TapasOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class TapasEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([TapasLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_values,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_values,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class TapasPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class TapasPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class TapasLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = TapasPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class TapasOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = TapasLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class TapasPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TapasConfig
base_model_prefix = "tapas"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
TAPAS_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`TapasConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TAPAS_INPUTS_DOCSTRING = r"""
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0}, 7)`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare Tapas Model transformer outputting raw hidden-states without any specific head on top.",
TAPAS_START_DOCSTRING,
)
class TapasModel(TapasPreTrainedModel):
"""
This class defines the Tapas Model, which extends TapasPreTrainedModel.
It can function as an encoder (self-attention only) or a decoder, incorporating cross-attention layers between
self-attention layers, following the architecture in the paper "Attention is All You Need" by Ashish Vaswani et al.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
# Initialize TapasEmbeddings and TapasEncoder based on provided configuration
self.embeddings = TapasEmbeddings(config)
self.encoder = TapasEncoder(config)
# Optionally initialize TapasPooler for pooling layer
self.pooler = TapasPooler(config) if add_pooling_layer else None
# Perform any additional initialization tasks
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterates over specified layers and prunes heads accordingly
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Initializer (of TapasForMaskedLM): builds the base model and the MLM head, then initializes the weights
def __init__(self, config):
# Call the parent class initializer with the config
super().__init__(config)
# Create a TapasModel without a pooling layer
self.tapas = TapasModel(config, add_pooling_layer=False)
# Create the TapasOnlyMLMHead
self.cls = TapasOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embeddings of the MLM head
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Set the output embeddings of the MLM head to the new embeddings
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# Forward pass: accepts the usual TAPAS inputs plus optional MLM labels and returns a MaskedLMOutput
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Returns:
Depending on `return_dict`:
- If `return_dict` is `False`, returns a tuple with `prediction_scores` followed by additional outputs.
- If `return_dict` is `True`, returns a `MaskedLMOutput` object containing `loss`, `logits`, `hidden_states`, and `attentions`.
Examples:
```
>>> from transformers import AutoTokenizer, TapasForMaskedLM
>>> import pandas as pd
>>> tokenizer = AutoTokenizer.from_pretrained("google/tapas-base")
>>> model = TapasForMaskedLM.from_pretrained("google/tapas-base")
>>> data = {
... "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
... "Age": ["56", "45", "59"],
... "Number of movies": ["87", "53", "69"],
... }
>>> table = pd.DataFrame.from_dict(data)
>>> inputs = tokenizer(
... table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="pt"
... )
>>> labels = tokenizer(
... table=table, queries="How many movies has George Clooney played in?", return_tensors="pt"
... )["input_ids"]
>>> outputs = model(**inputs, labels=labels)
>>> logits = outputs.logits
```
Determines the return type based on `return_dict`. If `labels` are provided, computes the masked language modeling loss using `CrossEntropyLoss`.
Returns either a tuple or a `MaskedLMOutput` object depending on `return_dict`.
```
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
```
Passes input arguments to the Tapas model and retrieves the outputs, including sequence output and prediction scores.
```
outputs = self.tapas(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
```
Retrieves the sequence output from the Tapas model's outputs and computes prediction scores using a classifier layer.
```
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
```
If `labels` are provided, calculates the masked language modeling loss using `CrossEntropyLoss`.
```
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
```
Constructs the output based on whether `return_dict` is `False`, returning a tuple of outputs or including `masked_lm_loss` in a `MaskedLMOutput` object.
```
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
```
"""
Tapas Model with a cell selection head and optional aggregation head on top for question-answering tasks on tables
(linear layers on top of the hidden-states output to compute `logits` and optional `logits_aggregation`), e.g. for
SQA, WTQ or WikiSQL-supervised tasks.
"""
# The docstring below, together with TAPAS_START_DOCSTRING, documents the TapasForQuestionAnswering class
@add_start_docstrings(
"""
Tapas Model with a cell selection head and optional aggregation head on top for question-answering tasks on tables
(linear layers on top of the hidden-states output to compute `logits` and optional `logits_aggregation`), e.g. for
SQA, WTQ or WikiSQL-supervised tasks.
""",
TAPAS_START_DOCSTRING,
)
class TapasForQuestionAnswering(TapasPreTrainedModel):
def __init__(self, config: TapasConfig):
super().__init__(config)
# base model
self.tapas = TapasModel(config)
# dropout (only used when training)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# cell selection heads
if config.init_cell_selection_weights_to_zero:
# init_cell_selection_weights_to_zero: Whether the initial weights should be
# set to 0. This ensures that all tokens have the same prior probability.
self.output_weights = nn.Parameter(torch.zeros(config.hidden_size))
self.column_output_weights = nn.Parameter(torch.zeros(config.hidden_size))
else:
self.output_weights = nn.Parameter(torch.empty(config.hidden_size))
nn.init.normal_(
self.output_weights, std=config.initializer_range
) # here, a truncated normal is used in the original implementation
self.column_output_weights = nn.Parameter(torch.empty(config.hidden_size))
nn.init.normal_(
self.column_output_weights, std=config.initializer_range
) # here, a truncated normal is used in the original implementation
self.output_bias = nn.Parameter(torch.zeros([]))
self.column_output_bias = nn.Parameter(torch.zeros([]))
# aggregation head
if config.num_aggregation_labels > 0:
self.aggregation_classifier = nn.Linear(config.hidden_size, config.num_aggregation_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TableQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
# Forward pass of the question-answering model
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token IDs
attention_mask: Optional[torch.FloatTensor] = None,  # attention mask
token_type_ids: Optional[torch.LongTensor] = None,  # token type IDs encoding the table structure
position_ids: Optional[torch.LongTensor] = None,  # position IDs
head_mask: Optional[torch.FloatTensor] = None,  # head mask
inputs_embeds: Optional[torch.FloatTensor] = None,  # pre-computed input embeddings
table_mask: Optional[torch.LongTensor] = None,  # mask of the table tokens
labels: Optional[torch.LongTensor] = None,  # per-token answer (cell selection) labels
aggregation_labels: Optional[torch.LongTensor] = None,  # aggregation operator labels
float_answer: Optional[torch.FloatTensor] = None,  # float answer used for weak supervision
numeric_values: Optional[torch.FloatTensor] = None,  # numeric values of the tokens
numeric_values_scale: Optional[torch.FloatTensor] = None,  # scale of the numeric values
output_attentions: Optional[bool] = None,  # whether to return attentions
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput dict
"""
Tapas Model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for table
entailment tasks, such as TabFact (Chen et al., 2020).
"""
@add_start_docstrings(
TAPAS_START_DOCSTRING,
)
class TapasForSequenceClassification(TapasPreTrainedModel):
def __init__(self, config):
"""
Initializes TapasForSequenceClassification model.
Args:
config (TapasConfig): Configuration object specifying the model architecture and parameters.
"""
super().__init__(config)
self.num_labels = config.num_labels
# Initialize Tapas model
self.tapas = TapasModel(config)
# Dropout layer
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear layer for classification
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the TapasForSequenceClassification model.
Args:
input_ids (torch.LongTensor, optional): Input IDs of the sequence.
attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on padding token indices.
token_type_ids (torch.LongTensor, optional): Segment token indices.
position_ids (torch.LongTensor, optional): Indices of positions of each input sequence tokens in the model.
head_mask (torch.FloatTensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.FloatTensor, optional): Embedded representations of input sequences.
labels (torch.LongTensor, optional): Labels for computing the sequence classification loss.
output_attentions (bool, optional): Whether to return attentions weights.
output_hidden_states (bool, optional): Whether to return hidden states.
return_dict (bool, optional): Whether to return a dictionary as output.
Returns:
SequenceClassifierOutput: Output of the sequence classification, including loss, logits, and optional hidden states and attentions.
"""
""" TAPAS utilities."""
class AverageApproximationFunction(str, enum.Enum):
"""
Enum defining average approximation functions.
Includes:
- RATIO: ratio approximation
- FIRST_ORDER: first order approximation
- SECOND_ORDER: second order approximation
"""
RATIO = "ratio"
FIRST_ORDER = "first_order"
SECOND_ORDER = "second_order"
# Beginning of everything related to segmented tensors
class IndexMap(object):
"""
Index grouping entries within a tensor.
Attributes:
indices (torch.LongTensor): Tensor containing the indices.
num_segments (torch.LongTensor): Scalar tensor specifying the number of segments.
batch_dims (int): Number of batch dimensions.
"""
def __init__(self, indices, num_segments, batch_dims=0):
"""
Creates an IndexMap instance.
Args:
indices (torch.LongTensor): Tensor containing the indices.
num_segments (torch.LongTensor): Scalar tensor specifying the number of segments.
batch_dims (int, optional): Number of batch dimensions. Defaults to 0.
"""
self.indices = torch.as_tensor(indices)
self.num_segments = torch.as_tensor(num_segments, device=indices.device)
self.batch_dims = batch_dims
def batch_shape(self):
"""
Returns the batch shape of the indices tensor.
Returns:
torch.Size: Size object representing the shape of the indices tensor up to batch dimensions.
"""
return self.indices.size()[: self.batch_dims]
class ProductIndexMap(IndexMap):
"""
Index map representing the product of two indices.
"""
def __init__(self, outer_index, inner_index):
"""
Combines indices i and j into pairs (i, j). The result is an index where each segment (i, j) is the
intersection of segments i and j. For example if the inputs represent table cells indexed by respectively rows
and columns the output will be a table indexed by (row, column) pairs, i.e. by cell. The implementation
combines indices {0, .., n - 1} and {0, .., m - 1} into {0, .., nm - 1}. The output has *num_segments* equal to
*outer_index.num_segments* * *inner_index.num_segments*
Args:
outer_index (`IndexMap`):
IndexMap.
inner_index (`IndexMap`):
IndexMap, must have the same shape as *outer_index*.
"""
# Check that both index maps have the same number of batch dimensions
if outer_index.batch_dims != inner_index.batch_dims:
raise ValueError("outer_index.batch_dims and inner_index.batch_dims must be the same.")
# Initialize the parent IndexMap with the combined (outer, inner) indices
super().__init__(
indices=(inner_index.indices + outer_index.indices * inner_index.num_segments),
num_segments=inner_index.num_segments * outer_index.num_segments,
batch_dims=inner_index.batch_dims,
)
# Store the outer and inner index maps
self.outer_index = outer_index
self.inner_index = inner_index
def project_outer(self, index):
"""Projects an index with the same index set onto the outer components."""
# Map the combined indices back onto the outer components
indices = torch.div(index.indices, self.inner_index.num_segments, rounding_mode="floor").type(torch.long)
return IndexMap(indices=indices, num_segments=self.outer_index.num_segments, batch_dims=index.batch_dims)
def project_inner(self, index):
"""Projects an index with the same index set onto the inner components."""
# Map the combined indices back onto the inner components
return IndexMap(
indices=torch.fmod(index.indices, self.inner_index.num_segments)
.type(torch.float)
.floor()
.type(torch.long),
num_segments=self.inner_index.num_segments,
batch_dims=index.batch_dims,
)
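A small sketch of how `ProductIndexMap` combines and splits indices (assuming these classes are importable from `transformers.models.tapas.modeling_tapas`):
```
import torch
from transformers.models.tapas.modeling_tapas import IndexMap, ProductIndexMap

outer = IndexMap(torch.tensor([[0, 0, 1, 1]]), num_segments=2, batch_dims=1)  # e.g. column ids
inner = IndexMap(torch.tensor([[0, 1, 0, 1]]), num_segments=3, batch_dims=1)  # e.g. row ids
cell = ProductIndexMap(outer, inner)

print(cell.indices)                      # tensor([[0, 1, 3, 4]]) = inner + outer * inner.num_segments
print(cell.num_segments)                 # tensor(6) = 2 * 3
print(cell.project_outer(cell).indices)  # tensor([[0, 0, 1, 1]]) recovers the outer (column) ids
print(cell.project_inner(cell).indices)  # tensor([[0, 1, 0, 1]]) recovers the inner (row) ids
```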
# Flatten a batched index map to a 1-D index map, relabelling the segments so that batch elements stay distinct
def flatten(index, name="segmented_flatten"):
"""
Flattens a batched index map (typically of shape batch_size, seq_length) to a 1d index map. This operation
relabels the segments to keep batch elements distinct: the k-th batch element has its indices shifted by
*num_segments* * (k - 1).
Args:
    index (`IndexMap`):
        IndexMap to flatten.
    name (`str`, *optional*, defaults to 'segmented_flatten'):
        Name for the operation. Currently not used
Returns:
    (`IndexMap`): The flattened IndexMap.
"""
# Compute the batch size as a scalar tensor
batch_size = torch.prod(torch.tensor(list(index.batch_shape())))
# Create the offsets as a 1-D tensor of length batch_size, multiplied element-wise by the number of segments
# (to offset the different elements in the batch), e.g. if the batch size is 2: [0, 64]
offset = torch.arange(start=0, end=batch_size, device=index.num_segments.device) * index.num_segments
offset = offset.view(index.batch_shape())
# Unsqueeze the offsets once per remaining index dimension (typically range(1, 2))
for _ in range(index.batch_dims, len(index.indices.size())):
    offset = offset.unsqueeze(-1)
# The final indices are the offsets plus the original indices
indices = offset + index.indices
return IndexMap(indices=indices.view(-1), num_segments=index.num_segments * batch_size, batch_dims=0)
# Construct an index map whose values are the consecutive integers range(num_segments), used to number the segments
def range_index_map(batch_shape, num_segments, name="range_index_map"):
"""
Constructs an index map equal to range(num_segments).
Args:
    batch_shape (`torch.Size`):
        Batch shape
    num_segments (`int`):
        Number of segments
    name (`str`, *optional*, defaults to 'range_index_map'):
        Name for the operation. Currently not used
Returns:
    (`IndexMap`): IndexMap of shape batch_shape with elements equal to range(num_segments).
"""
# Convert batch_shape to a rank-1 long tensor containing batch_shape (e.g. [2])
batch_shape = torch.as_tensor(
batch_shape, dtype=torch.long
)  # create a rank 1 tensor vector containing batch_shape (e.g. [2])
assert len(batch_shape.size()) == 1  # batch_shape must be a rank-1 tensor
# Convert num_segments to a rank-0 (scalar) tensor (e.g. 64)
num_segments = torch.as_tensor(num_segments)  # create a rank 0 tensor (scalar) containing num_segments (e.g. 64)
assert len(num_segments.size()) == 0  # num_segments must be a scalar
# Create a tensor with values 0 .. num_segments - 1 on the same device as num_segments
indices = torch.arange(
start=0, end=num_segments, device=num_segments.device
)  # create a rank 1 vector with num_segments elements
# Build the target shape [1, ..., 1, num_segments] (one 1 per batch dimension)
new_tensor = torch.cat(
[torch.ones_like(batch_shape, dtype=torch.long, device=num_segments.device), num_segments.unsqueeze(dim=0)],
dim=0,
)
# new_tensor is just a vector of [1 64] for example (assuming only 1 batch dimension)
# Convert new_tensor to a list of Python ints to get the new shape
new_shape = [int(x) for x in new_tensor.tolist()]
# Reshape indices to new_shape
indices = indices.view(new_shape)
# Tile the indices over the batch dimensions: repeat them [*batch_shape, 1] times
multiples = torch.cat([batch_shape, torch.as_tensor([1])], dim=0)
indices = indices.repeat(multiples.tolist())
# equivalent (in Numpy:)
# indices = torch.as_tensor(np.tile(indices.numpy(), multiples.tolist()))
# Return an IndexMap with the tiled indices, num_segments, and batch_dims equal to the number of batch dimensions
return IndexMap(indices=indices, num_segments=num_segments, batch_dims=list(batch_shape.size())[0])
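For example, assuming `range_index_map` is importable from `transformers.models.tapas.modeling_tapas`, a batch of size 2 with 4 segments yields:
```
import torch
from transformers.models.tapas.modeling_tapas import range_index_map

index = range_index_map(batch_shape=torch.Size([2]), num_segments=4)
print(index.indices)       # tensor([[0, 1, 2, 3], [0, 1, 2, 3]]) -> every batch element gets range(4)
print(index.num_segments)  # tensor(4)
print(index.batch_dims)    # 1
```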
# Apply a segment-wise reduction to the input tensor.
def _segment_reduce(values, index, segment_reduce_fn, name):
"""
Applies a segment reduction segment-wise.
Args:
values (`torch.Tensor`):
Tensor with segment values.
index (`IndexMap`):
IndexMap.
segment_reduce_fn (`str`):
Name for the reduce operation. One of "sum", "mean", "max" or "min".
name (`str`):
Name for the operation. Currently not used.
Returns:
output_values (`torch.Tensor`): Tensor containing the reduced values for every segment.
output_index (`IndexMap`): IndexMap of shape batch_shape with elements equal to range(num_segments).
"""
# Flatten the batch dimensions, as segments ops (scatter) do not support batching.
# However if `values` has extra dimensions to the right keep them
# unflattened. Segmented ops support vector-valued operations.
flat_index = flatten(index)
vector_shape = values.size()[len(index.indices.size()) :] # torch.Size object
flattened_shape = torch.cat(
[torch.as_tensor([-1], dtype=torch.long), torch.as_tensor(vector_shape, dtype=torch.long)], dim=0
)
# Reshape `values` to the flattened shape
flat_values = values.reshape(flattened_shape.tolist())
# Create a zero-filled output tensor
out = torch.zeros(int(flat_index.num_segments), dtype=torch.float, device=flat_values.device)
# Perform the segment-wise reduction along dim 0 with the given reduction `segment_reduce_fn`
segment_means = out.scatter_reduce(
dim=0, index=flat_index.indices.long(), src=flat_values.float(), reduce=segment_reduce_fn, include_self=False
)
# Unflatten the values.
new_shape = torch.cat(
[
torch.as_tensor(index.batch_shape(), dtype=torch.long),
torch.as_tensor([index.num_segments], dtype=torch.long),
torch.as_tensor(vector_shape, dtype=torch.long),
],
dim=0,
)
# Clone segment_means and reshape it to match the original `values` shape
output_values = segment_means.clone().view(new_shape.tolist()).to(values.dtype)
# Create and return the output index
output_index = range_index_map(index.batch_shape(), index.num_segments)
return output_values, output_index
def reduce_sum(values, index, name="segmented_reduce_sum"):
"""
Sums a tensor over its segments.
Outputs 0 for empty segments.
This operations computes the sum over segments, with support for:
- Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices.
- Vectorization using the last dimension [V1, V2, ...]. If they are present, the output will be a sum of
vectors rather than scalars. Only the middle dimensions [I1, ..., Ik] are reduced by the operation.
Args:
values (`torch.Tensor` of shape [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..]):
Tensor containing the values of which the sum must be taken segment-wise.
index (`IndexMap`, indices are of shape [B1, B2, ..., Bn, I1, .., Ik].):
Index defining the segments.
name (`str`, *optional*, defaults to 'segmented_reduce_sum'):
Name for the operation. Currently not used
"""
# Delegate to _segment_reduce to sum `values` segment-wise over `index`. The result has two parts:
# - output_values: tensor of shape [B1, B2, ..., Bn, num_segments, V1, V2, ..] containing the summed values
# - output_index: IndexMap of shape [B1, B2, ..., Bn, num_segments] numbering the segments
return _segment_reduce(values, index, "sum", name)
# Averages a tensor over its segments; outputs 0 for empty segments
def reduce_mean(values, index, name="segmented_reduce_mean"):
# Delegate to _segment_reduce with the "mean" reduction
return _segment_reduce(values, index, "mean", name)
# Computes the maximum of a tensor over its segments
def reduce_max(values, index, name="segmented_reduce_max"):
# Delegate to _segment_reduce with the "amax" reduction
return _segment_reduce(values, index, "amax", name)
# Computes the minimum of a tensor over its segments
def reduce_min(values, index, name="segmented_reduce_min"):
"""
Computes the minimum over segments, with support for:
- Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices.
- Vectorization using the last dimension [V1, V2, ...]. If they are present, the output will be an element-wise
  minimum of vectors rather than scalars.
Only the middle dimensions [I1, ..., Ik] are reduced by the operation.
Args:
    values (`torch.Tensor` of shape [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..]):
        Tensor containing the values of which the minimum must be taken segment-wise.
    index (`IndexMap`, indices are of shape [B1, B2, ..., Bn, I1, .., Ik].):
        Index defining the segments.
    name (`str`, *optional*, defaults to 'segmented_reduce_min'):
        Name for the operation. Currently not used
Returns:
    output_values (`torch.Tensor` of shape [B1, B2, ..., Bn, num_segments, V1, V2, ..]):
        Tensor containing the output values.
    output_index (`IndexMap`):
        IndexMap of shape [B1, B2, ..., Bn, num_segments].
"""
return _segment_reduce(values, index, "amin", name)
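A toy run of the segment reductions above (assuming `IndexMap` and the `reduce_*` helpers are importable from `transformers.models.tapas.modeling_tapas`):
```
import torch
from transformers.models.tapas.modeling_tapas import IndexMap, reduce_mean, reduce_min, reduce_sum

values = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
index = IndexMap(torch.tensor([[0, 0, 1, 1]]), num_segments=2, batch_dims=1)

sums, _ = reduce_sum(values, index)    # tensor([[3., 7.]])
means, _ = reduce_mean(values, index)  # tensor([[1.5000, 3.5000]])
mins, _ = reduce_min(values, index)    # tensor([[1., 3.]])
print(sums, means, mins)
```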
# Compute the column logits
def compute_column_logits(
sequence_output, column_output_weights, column_output_bias, cell_index, cell_mask, allow_empty_column_selection
):
"""
Computes the column logits.
Args:
sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the model.
column_output_weights (`torch.FloatTensor` of shape `(hidden_size)`):
Weights of the linear layer for column selection.
column_output_bias (`torch.FloatTensor` of shape `()`):
Bias of the linear layer for column selection.
cell_index (`ProductIndexMap`):
Index that groups tokens into cells.
cell_mask (`torch.FloatTensor` of shape `(batch_size, max_num_rows * max_num_cols)`):
Mask for cells that exist in the table (i.e. that are not padding).
allow_empty_column_selection (`bool`):
Whether to allow not to select any column
Returns:
column_logits (`torch.FloatTensor`of shape `(batch_size, max_num_cols)`):
Tensor containing the column logits for every example in the batch.
"""
# First, compute the token logits (batch_size, seq_len) - without temperature
token_logits = torch.einsum("bsj,j->bs", sequence_output, column_output_weights) + column_output_bias
# Next, average the logits per cell (batch_size, max_num_cols*max_num_rows)
cell_logits, cell_logits_index = reduce_mean(token_logits, cell_index)
# Finally, average the logits per column (batch_size, max_num_cols)
column_index = cell_index.project_inner(cell_logits_index)
column_logits, out_index = reduce_sum(cell_logits * cell_mask, column_index)
# Count the number of cells per column
cell_count, _ = reduce_sum(cell_mask, column_index)
column_logits /= cell_count + EPSILON_ZERO_DIVISION
# Mask out columns that do not appear in the example
is_padding = torch.logical_and(cell_count < 0.5, ~torch.eq(out_index.indices, 0))
column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * torch.as_tensor(
is_padding, dtype=torch.float32, device=is_padding.device
)
# If empty column selection is not allowed, push the logit of the special empty column (index 0) close to log-zero
if not allow_empty_column_selection:
column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * torch.as_tensor(
torch.eq(out_index.indices, 0), dtype=torch.float32, device=out_index.indices.device
)
return column_logits
Args:
token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Tensor containing the logits per token.
column_logits (`torch.FloatTensor` of shape `(batch_size, max_num_cols)`):
Tensor containing the logits per column.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Labels per token.
cell_index (`ProductIndexMap`):
Index that groups tokens into cells.
col_index (`IndexMap`):
Index that groups tokens into columns.
cell_mask (`torch.FloatTensor` of shape `(batch_size, max_num_rows * max_num_cols)`):
Mask for cells that exist in the table (i.e. that are not padding).
Returns:
selection_loss_per_example (`torch.FloatTensor` of shape `(batch_size,)`):
Loss for each example.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
New logits which are only allowed to select cells in a single column. Logits outside of the most likely
column according to *column_logits* will be set to a very low value (such that the probabilities are 0).
"""
# Part 1: column loss
# First, determine which column should be selected. We use the column that has the maximum number of selected cells.
labels_per_column, _ = reduce_sum(torch.as_tensor(labels, dtype=torch.float32, device=labels.device), col_index)
# labels_per_column is of shape (batch_size, max_num_cols).
# It contains the number of labels for every column, for every example.
column_label = torch.argmax(labels_per_column, dim=-1)  # shape (batch_size,)
# Check if there are no selected cells in the column. In that case the model should predict the special column id 0, which means "select nothing".
no_cell_selected = torch.eq(
torch.max(labels_per_column, dim=-1)[0], 0
)  # no_cell_selected is of shape (batch_size,) and is True when no cell is selected
# If an example in the batch has no selected cells (i.e. no label set to 1 for that example), set its column_label to 0.
column_label = torch.where(
no_cell_selected.view(column_label.size()), torch.zeros_like(column_label), column_label
)
column_dist = torch.distributions.Categorical(logits=column_logits)  # shape (batch_size, max_num_cols)
column_loss_per_example = -column_dist.log_prob(column_label)
# Part 2: cell loss
# Reduce the labels and logits from per-token to per-cell.
# logits_per_cell: shape (batch_size, max_num_rows*max_num_cols), i.e. (batch_size, 64*32)
logits_per_cell, _ = reduce_mean(token_logits, cell_index)
# labels_per_cell: shape (batch_size, 64*32), indicating whether each cell should be selected (1) or not (0)
labels_per_cell, labels_index = reduce_max(
torch.as_tensor(labels, dtype=torch.long, device=labels.device), cell_index
)
# Mask for the selected column.
# column_id_for_cells: shape (batch_size, 64*32), indicating to which column each cell belongs.
# Project labels_index onto its inner (column) components using cell_index
column_id_for_cells = cell_index.project_inner(labels_index).indices
# column_mask: shape (batch_size, 64*32), equal to 1 if the cell belongs to the column to be selected
column_mask = torch.as_tensor(
torch.eq(column_id_for_cells, torch.unsqueeze(column_label, dim=-1)),
dtype=torch.float32,
device=cell_mask.device,
)
# Build a Bernoulli distribution over the cells; logits_per_cell has shape (batch_size, 64*32)
cell_dist = torch.distributions.Bernoulli(logits=logits_per_cell)
# Compute the log-likelihood per cell, only for the selected column
cell_log_prob = cell_dist.log_prob(labels_per_cell.type(torch.float32))  # shape (batch_size, 64*32)
# 计算单元格损失,乘以列掩码和单元格掩码
cell_loss = -torch.sum(cell_log_prob * column_mask * cell_mask, dim=1)
# 将损失标准化为列中的单元格数目
cell_loss /= torch.sum(column_mask * cell_mask, dim=1) + EPSILON_ZERO_DIVISION
# 将列损失初始化为每个示例的选择损失
selection_loss_per_example = column_loss_per_example
# 如果没有选择单元格,则设置损失为零,否则使用计算的 `cell_loss`
selection_loss_per_example += torch.where(
no_cell_selected.view(selection_loss_per_example.size()),
torch.zeros_like(selection_loss_per_example),
cell_loss,
)
# 通过对 `column_logits` 的最大值获取模型选择的列 ID
selected_column_id = torch.as_tensor(
torch.argmax(column_logits, dim=-1), dtype=torch.long, device=column_logits.device
) # 形状为 (batch_size,)
# 创建一个形状为 (batch_size, 64*32) 的 `selected_column_mask`,
# 如果单元格属于模型选择的列,则该值等于1
selected_column_mask = torch.as_tensor(
torch.eq(column_id_for_cells, torch.unsqueeze(selected_column_id, dim=-1)),
dtype=torch.float32,
device=selected_column_id.device,
)
# 不选择特殊列 ID 为 0 的单元格
selected_column_mask = torch.where(
torch.eq(column_id_for_cells, 0).view(selected_column_mask.size()),
torch.zeros_like(selected_column_mask),
selected_column_mask,
)
# 调整 `logits_per_cell`,确保在模型选择的列之外将概率设为0
new_logits_per_cell = logits_per_cell + CLOSE_ENOUGH_TO_LOG_ZERO * (1.0 - cell_mask * selected_column_mask)
# 使用 `cell_index` 收集新的 `new_logits_per_cell`,形状由 `cell_index` 决定
logits = gather(new_logits_per_cell, cell_index)
# 返回选择损失和调整后的 `logits`
return selection_loss_per_example, logits
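As a summary (added for clarity, not part of the original source), the per-example loss assembled above is a hierarchical negative log-likelihood: a categorical term for the supervised column plus, whenever at least one cell is labelled, a normalized Bernoulli term over the cells of that column:

$$\mathcal{L}_b = -\log p_{\text{col}}(c^{*}_{b}) \;-\; \mathbb{1}\!\left[\textstyle\sum_t y_{b,t} > 0\right] \cdot \frac{\sum_{j \in \text{col } c^{*}_{b}} m_{b,j}\,\log p_{\text{cell}}(y_{b,j})}{\sum_{j \in \text{col } c^{*}_{b}} m_{b,j} + \epsilon}$$

where c* is the column with the most labelled cells, y are the per-cell labels and m is `cell_mask`.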
"""
Computes logits per token
Args:
sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the model.
temperature (`float`):
Temperature for the Bernoulli distribution.
output_weights (`torch.FloatTensor` of shape `(hidden_size,)`):
Weights of the linear layer for cell selection.
output_bias (`torch.FloatTensor` of shape `()`):
Bias of the linear layer for cell selection
Returns:
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Logits per token.
"""
# Compute the logits per token: project the hidden states onto the output weights, add the bias and divide by the temperature
logits = (torch.einsum("bsj,j->bs", sequence_output, output_weights) + output_bias) / temperature
return logits
def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, labels, aggregation_classifier):
"""
Finds examples where the model should select cells with no aggregation.
Returns a mask that determines for which examples the model should select answers directly from the table, without
any aggregation function. If the answer is a piece of text the case is unambiguous as aggregation functions only
apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation
case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the
aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold
for this is a hyperparameter *cell_selection_preference*
Args:
answer (`torch.FloatTensor` of shape `(batch_size, )`):
Answer for every example in the batch. Nan if there is no scalar answer.
pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Output of the pooler (BertPooler) on top of the encoder layer.
cell_selection_preference (`float`):
Preference for cell selection in ambiguous cases.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Labels per token.
aggregation_classifier (`torch.nn.Linear`): Aggregation head
Returns:
aggregate_mask (`torch.FloatTensor` of shape `(batch_size,)`): A mask set to 1 for examples that should use
aggregation functions.
"""
# Initial aggregate mask: an example requires aggregation if its answer is a (non-NaN) number
aggregate_mask_init = torch.logical_not(torch.isnan(answer)).type(torch.FloatTensor).to(answer.device)
# Compute the aggregation logits with the aggregation classifier head
logits_aggregation = aggregation_classifier(pooled_output)
# Build a categorical distribution over the aggregation operators
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
# Total probability mass of all aggregation operators except "no aggregation" (index 0)
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Cell selection examples according to the current model
is_pred_cell_selection = aggregation_ops_total_mass <= cell_selection_preference
# Examples with non-empty cell selection supervision
is_cell_supervision_available = torch.sum(labels, dim=1) > 0
# torch.where is not equivalent to tf.where (in tensorflow 1),
# hence the added .view on the condition to match the shape of the first tensor
aggregate_mask = torch.where(
torch.logical_and(is_pred_cell_selection, is_cell_supervision_available).view(aggregate_mask_init.size()),
torch.zeros_like(aggregate_mask_init, dtype=torch.float32),
aggregate_mask_init,
)
# Detach the mask so that no gradients flow through it
aggregate_mask = aggregate_mask.detach()
# Return the aggregate mask
return aggregate_mask
# Computes the aggregation loss when the aggregation type is known
def _calculate_aggregation_loss_known(
logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels
):
"""
Calculates the aggregation loss when its type is known during training.
In the weakly supervised setting, the only known information is that for cell selection examples, "no aggregation"
should be predicted. For other examples (those that require aggregation), no loss is accumulated. In the setting
where the aggregation type is always known, standard cross entropy loss is accumulated for all examples.
Args:
logits_aggregation (`torch.FloatTensor` of shape `(batch_size, num_aggregation_labels)`):
Logits per aggregation operation.
aggregate_mask (`torch.FloatTensor` of shape `(batch_size, )`):
A mask set to 1 for examples that should use aggregation functions.
aggregation_labels (`torch.LongTensor` of shape `(batch_size, )`):
Aggregation function id for every example.
use_answer_as_supervision (`bool`, *optional*):
Whether to use the answer as the only supervision for aggregation examples.
num_aggregation_labels (`int`, *optional*, defaults to 0):
The number of aggregation operators to predict.
Returns:
aggregation_loss_known (`torch.FloatTensor` of shape `(batch_size,)`): Aggregation loss (when its type is known
during training) per example.
"""
if use_answer_as_supervision:
# Prepare "no aggregation" targets for cell selection examples.
target_aggregation = torch.zeros_like(aggregate_mask, dtype=torch.long)
else:
# Use aggregation supervision as the target.
target_aggregation = aggregation_labels
one_hot_labels = nn.functional.one_hot(target_aggregation, num_classes=num_aggregation_labels).type(torch.float32)
log_probs = nn.functional.log_softmax(logits_aggregation, dim=-1)
# torch.FloatTensor[batch_size]
per_example_aggregation_intermediate = -torch.sum(one_hot_labels * log_probs, dim=-1)
if use_answer_as_supervision:
# Accumulate loss only for examples requiring cell selection (no aggregation).
return per_example_aggregation_intermediate * (1 - aggregate_mask)
else:
return per_example_aggregation_intermediate
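The one-hot/log-softmax product above is the usual per-example cross entropy; a minimal equivalence check (added here for illustration, not in the original source) could look like this:

import torch
from torch import nn

logits = torch.randn(2, 4)                 # (batch_size, num_aggregation_labels)
targets = torch.tensor([0, 3])

one_hot = nn.functional.one_hot(targets, num_classes=4).type(torch.float32)
manual = -torch.sum(one_hot * nn.functional.log_softmax(logits, dim=-1), dim=-1)
builtin = nn.functional.cross_entropy(logits, targets, reduction="none")

assert torch.allclose(manual, builtin)     # both give the per-example cross entropy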
# Computes the aggregation loss in the case of answer supervision
def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask):
"""
Calculates the aggregation loss in the case of answer supervision.
Args:
logits_aggregation (`torch.FloatTensor` of shape `(batch_size, num_aggregation_labels)`):
Logits per aggregation operation.
aggregate_mask (`torch.FloatTensor` of shape `(batch_size, )`):
A mask set to 1 for examples that should use aggregation functions.
Returns:
aggregation_loss_unknown (`torch.FloatTensor` of shape `(batch_size,)`): Aggregation loss (in the case of answer
supervision) per example.
"""
# Build a categorical distribution over the aggregation logits
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
# Total probability mass of all aggregation operators except the first ("no aggregation") index
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Predict some aggregation operator for answers that require aggregation.
# This increases the probability of all aggregation functions, in the same way as maximum marginal
# likelihood (MML), but without considering whether the function actually gives the correct answer.
# Return the negative log-likelihood weighted by the aggregate mask
return -torch.log(aggregation_ops_total_mass) * aggregate_mask
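In formula form (a reading aid, not in the original source), with p_{b,i} the softmax probability of aggregation operator i for example b and m_b the aggregate mask, this function returns

$$\mathcal{L}^{\text{agg-unknown}}_b = -\,m_b \,\log\!\sum_{i \geq 1} p_{b,i}$$

so the loss only pushes probability mass away from the "no aggregation" operator, without committing to a specific one.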
# Computes the aggregation loss per example
def _calculate_aggregation_loss(
logits_aggregation,
aggregate_mask,
aggregation_labels,
use_answer_as_supervision,
num_aggregation_labels,
aggregation_loss_weight,
):
"""
Calculates the aggregation loss per example.
Args:
logits_aggregation (`torch.FloatTensor` of shape `(batch_size, num_aggregation_labels)`):
Logits per aggregation operation.
aggregate_mask (`torch.FloatTensor` of shape `(batch_size, )`):
A mask set to 1 for examples that should use aggregation functions.
aggregation_labels (`torch.LongTensor` of shape `(batch_size, )`):
Aggregation function id for every example.
use_answer_as_supervision (`bool`, *optional*):
Whether to use the answer as the only supervision for aggregation examples.
num_aggregation_labels (`int`, *optional*, defaults to 0):
The number of aggregation operators to predict.
aggregation_loss_weight (`float`, *optional*, defaults to 1.0):
Importance weight for the aggregation loss.
Returns:
aggregation_loss (`torch.FloatTensor` of shape `(batch_size,)`): Aggregation loss per example.
"""
# Loss for the cases where the aggregation type is known
per_example_aggregation_loss = _calculate_aggregation_loss_known(
logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels
)
if use_answer_as_supervision:
# Add the aggregation loss for numeric answers that need aggregation
per_example_aggregation_loss += _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask)
return aggregation_loss_weight * per_example_aggregation_loss
# Calculates the expected result given cell and aggregation probabilities
def _calculate_expected_result(
dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config
):
"""
Calculates the expected result given cell and aggregation probabilities.
Args:
dist_per_cell (`torch.distributions.Bernoulli`):
Cell selection distribution for each cell.
numeric_values (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Numeric values of every token. NaN for tokens which are not numeric values.
numeric_values_scale (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Scale of the numeric values of every token.
input_mask_float (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Mask for the table, without question tokens and table headers.
logits_aggregation (`torch.FloatTensor` of shape `(batch_size, num_aggregation_labels)`):
Logits per aggregation operation.
config ([`TapasConfig`]):
Model configuration class with all the hyperparameters of the model.
Returns:
expected_result (`torch.FloatTensor` of shape `(batch_size,)`): The expected result per example.
"""
# If the config uses a Gumbel distribution for the cells, sample scaled cell probabilities
# from a RelaxedBernoulli distribution
if config.use_gumbel_for_cells:
gumbel_dist = torch.distributions.RelaxedBernoulli(
# The token logits were already divided by the temperature and used to compute the cell
# selection errors, so we need to multiply by the temperature again here
temperature=config.temperature,
logits=dist_per_cell.logits * config.temperature,
)
# Sample a (relaxed) selection probability for every cell from the Gumbel distribution
scaled_probability_per_cell = gumbel_dist.sample()
else:
# Otherwise, use the Bernoulli probabilities of each cell directly
scaled_probability_per_cell = dist_per_cell.probs
# <float32>[batch_size, seq_length] - rescale each cell probability by the numeric value scale and the input mask
scaled_probability_per_cell = (scaled_probability_per_cell / numeric_values_scale) * input_mask_float
# Sum of the cell probabilities per example, used for the COUNT aggregation
count_result = torch.sum(scaled_probability_per_cell, dim=1)
# Replace NaN (non-numeric) table values with zero using torch.where
numeric_values_masked = torch.where(
torch.isnan(numeric_values), torch.zeros_like(numeric_values), numeric_values
)  # Mask non-numeric table values to zero.
# Weighted sum of the numeric table values, weighted by the cell probabilities
sum_result = torch.sum(scaled_probability_per_cell * numeric_values_masked, dim=1)
# Pick the configured approximation for the AVERAGE aggregation
avg_approximation = config.average_approximation_function
if avg_approximation == AverageApproximationFunction.RATIO:
# Ratio approximation: the weighted sum divided by the (soft) count
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
# First-order approximation of the average, following the X_c formula in appendix D of the TAPAS paper
ex = torch.sum(scaled_probability_per_cell, dim=1, keepdim=True) - scaled_probability_per_cell + 1
average_result = torch.sum(numeric_values_masked * scaled_probability_per_cell / ex, dim=1)
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
# Second-order approximation of the average, following appendix D of the TAPAS paper
ex = torch.sum(scaled_probability_per_cell, dim=1, keepdim=True) - scaled_probability_per_cell + 1
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
var = torch.sum(pointwise_var, dim=1, keepdim=True) - pointwise_var
multiplier = (var / torch.square(ex) + 1) / ex
average_result = torch.sum(numeric_values_masked * scaled_probability_per_cell * multiplier, dim=1)
else:
# Invalid average approximation function in the config
raise ValueError(f"Invalid average_approximation_function: {config.average_approximation_function}")
if config.use_gumbel_for_aggregation:
# If the config uses a Gumbel distribution for aggregation, build a RelaxedOneHotCategorical distribution
gumbel_dist = torch.distributions.RelaxedOneHotCategorical(
config.aggregation_temperature, logits=logits_aggregation[:, 1:]
)
# <float32>[batch_size, num_aggregation_labels - 1] - sample the aggregation operator probabilities
aggregation_op_only_probs = gumbel_dist.sample()
else:
# Otherwise, take the softmax of the logits (excluding the first, "no aggregation" column),
# scaled by the aggregation temperature
aggregation_op_only_probs = nn.functional.softmax(
logits_aggregation[:, 1:] / config.aggregation_temperature, dim=-1
)
# Concatenate the three candidate results (sum, average, count) along a new column dimension
all_results = torch.cat(
[
torch.unsqueeze(sum_result, dim=1),
torch.unsqueeze(average_result, dim=1),
torch.unsqueeze(count_result, dim=1),
],
dim=1,
)
# Expected aggregation result: weighted sum of the candidate results with the aggregation probabilities
expected_result = torch.sum(all_results * aggregation_op_only_probs, dim=1)
# Return the expected result
return expected_result
# PyTorch does not currently support Huber loss with a custom delta, so we define it ourselves
def huber_loss(input, target, delta: float = 1.0):
# Absolute error between input and target, of shape (batch_size,)
errors = torch.abs(input - target)
# Quadratic for errors smaller than delta, linear beyond delta
return torch.where(errors < delta, 0.5 * errors**2, errors * delta - (0.5 * delta**2))
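A small worked example (added for illustration): with delta = 1.0, an error of 0.5 stays in the quadratic regime (0.5 * 0.5² = 0.125) while an error of 3.0 falls in the linear regime (3.0 * 1.0 - 0.5 = 2.5). In recent PyTorch versions the same values can also be obtained from the built-in `torch.nn.functional.huber_loss` with `reduction="none"`, although the code above keeps its own definition:

import torch
from torch.nn import functional as F

input = torch.tensor([1.5, 4.0])
target = torch.tensor([1.0, 1.0])

custom = huber_loss(input, target, delta=1.0)                      # tensor([0.1250, 2.5000])
builtin = F.huber_loss(input, target, reduction="none", delta=1.0)

assert torch.allclose(custom, builtin)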
def _calculate_regression_loss(
answer,
aggregate_mask,
dist_per_cell,
numeric_values,
numeric_values_scale,
input_mask_float,
logits_aggregation,
config,
):
"""
Calculates the regression loss per example.
Args:
answer (`torch.FloatTensor` of shape `(batch_size,)`):
Answer for every example in the batch. NaN if there is no scalar answer.
aggregate_mask (`torch.FloatTensor` of shape `(batch_size,)`):
A mask set to 1 for examples that should use aggregation functions.
dist_per_cell (`torch.distributions.Bernoulli`):
Cell selection distribution for each cell.
numeric_values (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Numeric values of every token. NaN for tokens which are not numeric values.
numeric_values_scale (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Scale of the numeric values of every token.
input_mask_float (`torch.FloatTensor` of shape `(batch_size, seq_length)`):
Mask for the table, without question tokens and table headers.
logits_aggregation (`torch.FloatTensor` of shape `(batch_size, num_aggregation_labels)`):
Logits per aggregation operation.
config ([`TapasConfig`]):
Model configuration class with all the parameters of the model.
Returns:
per_example_answer_loss_scaled (`torch.FloatTensor` of shape `(batch_size,)`): Scaled answer loss for every
example in the batch.
large_answer_loss_mask (`torch.FloatTensor` of shape `(batch_size,)`): A mask which is 1 for examples whose
answer loss does not exceed the answer_loss_cutoff (and 0 for examples whose loss exceeds it).
"""
# float32 (batch_size,)
# Compute the expected result from the cell and aggregation probabilities
expected_result = _calculate_expected_result(
dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config
)
# float32 (batch_size,)
# Replace NaN answers with zero
answer_masked = torch.where(torch.isnan(answer), torch.zeros_like(answer), answer)
if config.use_normalized_answer_loss:
# Normalizer, with a small epsilon to avoid division by zero
normalizer = (torch.max(torch.abs(expected_result), torch.abs(answer_masked)) + EPSILON_ZERO_DIVISION).detach()
# Normalize both the answer and the expected result
normalized_answer_masked = answer_masked / normalizer
normalized_expected_result = expected_result / normalizer
# Huber loss on the normalized values (with the default delta)
per_example_answer_loss = huber_loss(
normalized_expected_result * aggregate_mask, normalized_answer_masked * aggregate_mask
)
else:
# Huber loss on the raw values, with the delta from the config
per_example_answer_loss = huber_loss(
expected_result * aggregate_mask, answer_masked * aggregate_mask, delta=config.huber_loss_delta
)
# If no answer loss cutoff is configured, keep all examples (mask of ones)
if config.answer_loss_cutoff is None:
large_answer_loss_mask = torch.ones_like(per_example_answer_loss, dtype=torch.float32)
else:
# Otherwise, zero out examples whose answer loss exceeds the cutoff
large_answer_loss_mask = torch.where(
per_example_answer_loss > config.answer_loss_cutoff,  # condition: the answer loss exceeds the cutoff
torch.zeros_like(per_example_answer_loss, dtype=torch.float32),  # value when the loss exceeds the cutoff
torch.ones_like(per_example_answer_loss, dtype=torch.float32),  # value when it does not
)
# Scale the per-example answer loss by its importance weight and the aggregate mask
per_example_answer_loss_scaled = config.answer_loss_importance * (per_example_answer_loss * aggregate_mask)
# Return the scaled answer loss and the large-answer-loss mask
return per_example_answer_loss_scaled, large_answer_loss_mask