Transformers Source Code Analysis (67)
.\models\longt5\modeling_longt5.py
""" PyTorch LongT5 model."""
import copy
import math
import warnings
from typing import Any, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from .configuration_longt5 import LongT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LongT5Config"
_CHECKPOINT_FOR_DOC = "google/long-t5-local-base"
LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/long-t5-local-base",
"google/long-t5-local-large",
"google/long-t5-tglobal-base",
"google/long-t5-tglobal-large",
]
def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
"""
Pad a tensor so that a sequence length will be a multiple of `block_len`.
Args:
x (torch.Tensor): Input tensor to be padded.
block_len (int): Desired block length for padding.
dim (int): Dimension along which padding will be applied.
pad_value (int, optional): Value used for padding. Defaults to 0.
Returns:
torch.Tensor: Padded tensor.
"""
pad_len = -x.shape[dim] % block_len
if not all(x.shape):
new_shape = list(x.shape)
new_shape[dim] += pad_len
return torch.zeros(new_shape, dtype=x.dtype, device=x.device)
pad = [(0, 0)] * x.ndim
pad[dim] = (0, pad_len)
pad = sum(pad[::-1], ())
x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
return x
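# A minimal sanity check (hypothetical, not part of the original file): a length-5
# sequence padded to a multiple of block_len=4 gains 3 trailing pad positions.
#
#   x = torch.ones(2, 5, 8)
#   _pad_to_multiple(x, block_len=4, dim=1).shape  # -> torch.Size([2, 8, 8])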
def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
"""
Split an input tensor into blocks of a given `block_len` along the given `dim`.
If the dimension length is not a multiple of `block_len`, it will be padded first
with selected `pad_value`.
Args:
x (torch.Tensor): Input tensor to be split into blocks.
block_len (int): Length of each block.
dim (int): Dimension along which to split the tensor.
Returns:
torch.Tensor: Tensor split into blocks.
"""
if x.shape[dim] % block_len != 0:
x = _pad_to_multiple(x, block_len, dim, pad_value=0)
num_blocks = x.shape[dim] // block_len
output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[(dim + 1) :]
if 0 in output_shape:
return torch.empty(output_shape, dtype=x.dtype, device=x.device)
return x.reshape(output_shape)
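# Hypothetical example: the (possibly padded) sequence dimension is reshaped into
# (num_blocks, block_len) while every other dimension stays untouched.
#
#   x = torch.ones(2, 5, 8)
#   _split_into_blocks(x, block_len=4, dim=1).shape  # -> torch.Size([2, 2, 4, 8])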
def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
"""Concatenate three consecutive blocks for each input block for local attentiont.
For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
"""
num_blocks = x.shape[block_dim]
pad = [(0, 0)] * x.ndim
pad[block_dim] = (1, 1)
pad = sum(pad[::-1], ())
x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
blocks_list: List[torch.Tensor] = []
for i in range(3):
indices = [slice(0, None)] * x.ndim
indices[block_dim] = slice(i, i + num_blocks)
indices = tuple(indices)
blocks_list.append(x[indices])
return torch.cat(blocks_list, dim=sequence_dim)
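# Hypothetical example: each block is concatenated with its left and right
# neighbors (zero blocks at the edges), tripling the per-block sequence length.
#
#   blocked = torch.ones(2, 2, 4, 8)  # (batch, num_blocks, block_len, dim)
#   _concatenate_3_blocks(blocked, block_dim=1, sequence_dim=2).shape
#   # -> torch.Size([2, 2, 12, 8])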
def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor:
"""Makes 3-blocked relative position ids for local attention."""
position_ids = torch.arange(3 * block_len, dtype=torch.int32)
center_position_ids = position_ids[block_len:-block_len]
relative_position_ids = position_ids.unsqueeze(0) - center_position_ids.unsqueeze(1)
return relative_position_ids
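# Hypothetical example with block_len=2: each of the two center positions gets
# its signed offset to all 3 * block_len positions of the concatenated window.
#
#   _make_3block_relative_position_ids(2)
#   # tensor([[-2, -1,  0,  1,  2,  3],
#   #         [-3, -2, -1,  0,  1,  2]], dtype=torch.int32)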
def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor:
"""Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius."""
relative_position_ids = _make_3block_relative_position_ids(block_len)
locality_mask = torch.abs(relative_position_ids) < block_len
locality_mask = locality_mask[None, None, :, :]
locality_mask = locality_mask.to(local_attention_mask.device)
return torch.logical_and(local_attention_mask, locality_mask)
def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor:
"""Prepare attention mask to be applied for a local attention."""
_blocked_attention_mask = _split_into_blocks(attention_mask, block_len, dim=1)
_3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_dim=1, sequence_dim=2)
_blocked_attention_mask = _blocked_attention_mask.unsqueeze(-1)
_3blocked_attention_mask = _3blocked_attention_mask.unsqueeze(-2)
local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len)
return local_attention_mask.unsqueeze(1).to(device)
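# Hypothetical shape walk-through: a (batch, seq_len) padding mask becomes a
# (batch, 1, num_blocks, block_len, 3 * block_len) boolean mask that is True only
# where both positions are real tokens within `local_radius` of each other.
#
#   mask = torch.ones(2, 8)
#   _get_local_attention_mask(mask, block_len=4, device=mask.device).shape
#   # -> torch.Size([2, 1, 2, 4, 12])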
def _make_global_fixed_block_ids(
attention_mask: torch.Tensor, global_block_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
"""获取每个输入标记对应的“固定块”全局ID。
这个实现是从以下地址采用的 Flaxformr 原始实现的简化版本:
https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.
在我们的场景中,由于我们仅用于解码器,孤立的标记(即不组成整个固定块的标记)将被分配到前一个块。
原始序列中的填充标记由 -1 表示。
"""
batch_size, seq_len = attention_mask.shape[:2]
def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor:
block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1
block_ends = block_ends.to(block_ids.device)
true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1
block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks)
return block_ids
fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size
fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype)
global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype)
_global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device)
global_block_ids = torch.where(
global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound
)
global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
global_block_ids = handle_orphan_tokens(global_block_ids)
num_globals = seq_len // global_block_size
if num_globals > 0:
_sequence_block_ids_max = torch.max(global_block_ids, dim=-1).values.repeat(num_globals, 1).transpose(0, 1)
else:
_sequence_block_ids_max = torch.zeros(
batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device
)
global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1
global_segment_ids = global_segment_ids.to(attention_mask.device)
global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
return global_block_ids.type(torch.int), global_segment_ids.type(torch.int)
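# Hypothetical example with global_block_size=2 and one padded position: the
# orphan token of the final, incomplete block is folded into the preceding block
# and the padding position is marked with -1.
#
#   mask = torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 0.0]])
#   _make_global_fixed_block_ids(mask, global_block_size=2)
#   # -> (tensor([[ 0,  0,  1,  1,  1, -1]], dtype=torch.int32),
#   #     tensor([[1, 1, 0]], dtype=torch.int32))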
def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor:
"""Create the relative position tensor for local -> global attention."""
block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
global_seq_len = global_segment_ids.shape[-1]
global_positions = torch.arange(global_seq_len, device=block_ids.device)
side_relative_position = global_positions - block_ids[..., None]
return side_relative_position.type(torch.int64)
def _create_global_aggregates(
hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int
) -> torch.Tensor:
"""Compute individual block aggregates by summing over individual blocks."""
block_ids = block_ids.where(
block_ids >= 0, torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device)
)
one_hot_block_ids = nn.functional.one_hot(block_ids.type(torch.int64), global_seq_len + 1)[:, :, :-1]
return torch.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids.type(hidden_states.dtype))
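# Hypothetical example: token states are summed into their assigned global block;
# tokens with block id -1 (padding) are routed to a throwaway extra column that
# the `[:, :, :-1]` slice above drops.
#
#   hidden = torch.ones(1, 4, 3)                 # (batch, seq_len, dim)
#   block_ids = torch.tensor([[0, 0, 1, -1]])
#   _create_global_aggregates(hidden, block_ids, global_seq_len=2).shape
#   # -> torch.Size([1, 2, 3]); block 0 sums two tokens, block 1 sums one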
class LongT5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
try:
from apex.normalization import FusedRMSNorm
LongT5LayerNorm = FusedRMSNorm
logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
except ImportError:
pass
except Exception:
logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
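# A minimal equivalence sketch (assumed values, default initialization, no apex):
# this layer norm is a T5-style RMSNorm, x / sqrt(mean(x**2) + eps) times a
# learned weight, with no mean subtraction and no bias.
#
#   norm = LongT5LayerNorm(hidden_size=4)
#   x = torch.randn(2, 3, 4)
#   ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
#   torch.allclose(norm(x), ref)  # -> True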
class LongT5DenseActDense(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_states = self.wi(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class LongT5DenseGatedActDense(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
hidden_states = self.wo(hidden_states)
return hidden_states
class LongT5LayerFF(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
if config.is_gated_act:
self.DenseReluDense = LongT5DenseGatedActDense(config)
else:
self.DenseReluDense = LongT5DenseActDense(config)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
forwarded_states = self.DenseReluDense(forwarded_states)
hidden_states = hidden_states + self.dropout(forwarded_states)
return hidden_states
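# In short, the feed-forward block applies a pre-norm residual update:
# hidden_states = hidden_states + Dropout(FFN(LayerNorm(hidden_states))).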
class LongT5Attention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query
bidirectional: a boolean - indicates if attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to divide the range of relative positions
max_distance: an integer - maximum allowed distance for relative positions
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
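# A hypothetical worked example with the defaults (bidirectional=True,
# num_buckets=32, max_distance=128): half the buckets cover each sign, offsets
# below max_exact=8 map one-to-one, and larger offsets map logarithmically.
#
#   rel = torch.tensor([0, 1, -7, 8, 200, -300])
#   LongT5Attention._relative_position_bucket(rel)
#   # -> tensor([ 0, 17,  7, 24, 31, 15])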
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
if device is None:
device = self.relative_attention_bias.weight.device
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
return values
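# The returned bias has shape (1, n_heads, query_length, key_length) and is
# added to the raw attention scores before the softmax, one scalar per head and
# relative-position bucket.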
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5LocalAttention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query (memory_position - query_position)
bidirectional: a boolean - indicates if attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to divide the range of relative positions
max_distance: an integer - maximum allowed distance; relative positions beyond it all map to the same bucket
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, block_length: int):
"""Compute binned relative position bias"""
target_device = (
self.relative_attention_bias.weight.device
if self.relative_attention_bias.weight.device.type != "meta"
else None
)
memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
context_position = memory_position[block_length:-block_length]
relative_position = memory_position[None, :] - context_position[:, None]
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
return values
def forward(
self,
hidden_states,
mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5TransientGlobalAttention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.global_block_size = config.global_block_size
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
if self.has_relative_attention_bias:
self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query
bidirectional: a boolean - whether the attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to quantize relative positions into
max_distance: an integer - maximum distance that a relative position can have
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, block_length: int):
"""Compute binned relative position bias"""
target_device = (
self.relative_attention_bias.weight.device
if self.relative_attention_bias.weight.device.type != "meta"
else None
)
memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
context_position = memory_position[block_length:-block_length]
relative_position = memory_position[None, :] - context_position[:, None]
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
return values
def compute_side_bias(self, mask: torch.Tensor, global_segment_ids: torch.Tensor) -> torch.Tensor:
side_attention_mask = torch.eq(mask[..., None], global_segment_ids[:, None, :])[:, None, ...]
attention_side_bias = torch.where(side_attention_mask > 0, 0.0, -1e10)
side_relative_position = _make_side_relative_position_ids(mask, self.global_block_size)
side_relative_position_bucket = self._relative_position_bucket(
side_relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
side_bias = self.global_relative_attention_bias(side_relative_position_bucket)
side_bias = side_bias.permute([0, 3, 1, 2])
attention_side_bias = attention_side_bias + side_bias
return attention_side_bias
def forward(
self,
hidden_states,
mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5LayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.SelfAttention = LongT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerLocalSelfAttention(nn.Module):
"""用于编码器的局部自注意力"""
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
**kwargs: Any,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.LocalSelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerTransientGlobalSelfAttention(nn.Module):
"""用于编码器的瞬时全局自注意力"""
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention(
config, has_relative_attention_bias=has_relative_attention_bias
)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
**kwargs: Any,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.TransientGlobalSelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
layer_output = hidden_states + self.dropout(attention_output[0])
outputs = (layer_output,) + attention_output[1:]
return outputs
class LongT5Block(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
if config.is_decoder:
attention_layer = LongT5LayerSelfAttention
elif config.encoder_attention_type == "local":
attention_layer = LongT5LayerLocalSelfAttention
elif config.encoder_attention_type == "transient-global":
attention_layer = LongT5LayerTransientGlobalSelfAttention
else:
raise ValueError(
"For encoder attention mechanism, either `local` or `transient-global` attention type is expected, "
f"but got {config.encoder_attention_type}."
)
self.layer = nn.ModuleList()
self.layer.append(attention_layer(config, has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(LongT5LayerCrossAttention(config))
self.layer.append(LongT5LayerFF(config))
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
):
# Dispatch to each sub-layer in turn: self-attention first, then (for decoder
# blocks) cross-attention, and finally the feed-forward layer. This is a
# simplified version of the dispatch; the full implementation also splits
# `past_key_value` between self- and cross-attention and clamps fp16 overflows.
self_attention_outputs = self.layer[0](
hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = self_attention_outputs[0]
attention_outputs = self_attention_outputs[1:]
if self.is_decoder and encoder_hidden_states is not None:
cross_attention_outputs = self.layer[1](
hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
layer_head_mask=cross_attn_layer_head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = cross_attention_outputs[0]
attention_outputs = attention_outputs + cross_attention_outputs[1:]
hidden_states = self.layer[-1](hidden_states)
return (hidden_states,) + attention_outputs
"""
模型的基类,LongT5PreTrainedModel,继承自父类 T5PreTrainedModel,并针对 LongT5 进行特定设置和调整。
"""
config_class = LongT5Config
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["LongT5Block"]
@property
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {
"decoder_input_ids": input_ids,
"input_ids": input_ids,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. "
"See LongT5 docs for more information."
)
if is_torch_fx_proxy(input_ids):
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
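# Hypothetical example: labels are shifted one step to the right, the
# decoder_start_token_id (the pad id, 0, for LongT5 checkpoints) is prepended,
# and any -100 ignore index remaining after the shift is replaced by the pad token.
#
#   labels                    -> [[ 42,  43,  44, -100]]
#   self._shift_right(labels) -> [[  0,  42,  43,  44]]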
class LongT5Stack(LongT5PreTrainedModel):
def __init__(self, config, embed_tokens=None):
super().__init__(config)
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.is_decoder = config.is_decoder
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.block = nn.ModuleList(
[LongT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
...  # forward body omitted in this excerpt
LONGT5_START_DOCSTRING = r"""
The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long
Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo
Ni, Yun-Hsuan Sung and Yinfei Yang. It's an encoder-decoder transformer pre-trained in a text-to-text denoising
generative setting. LongT5 model is an extension of T5 model, and it enables using one of the two different
efficient attention mechanisms - (1) Local attention, or (2) Transient-Global attention.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LongT5Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LONGT5_INPUTS_DOCSTRING = r"""
"""
LONGT5_ENCODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so you
should be able to pad the inputs on both the right and the left.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for detail.
To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
Training](./longt5#training).
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is
useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, num_heads)`.
"""
@add_start_docstrings(
"The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top.",
LONGT5_START_DOCSTRING,
)
class LongT5Model(LongT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = LongT5Stack(decoder_config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(LONGT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
...  # forward body omitted in this excerpt
@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.model_dim = config.d_model
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = LongT5Stack(decoder_config, self.shared)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_output_embeddings(self):
return self.lm_head
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(LONGT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
...  # forward body omitted in this excerpt
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# Cut decoder_input_ids when past key values are used: only the new tokens
# need to be fed through the decoder.
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
def _reorder_cache(self, past_key_values, beam_idx):
if past_key_values is None:
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
reordered_decoder_past = ()
for layer_past_states in past_key_values:
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
assert len(reordered_layer_past_states) == len(layer_past_states)
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
return reordered_decoder_past
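# Hypothetical note: during beam search, `beam_idx` picks which batch rows of
# every cached key/value tensor survive the beam reshuffle, e.g.
# beam_idx = torch.tensor([0, 0, 2]) duplicates beam 0 and keeps beam 2.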
class LongT5EncoderModel(LongT5PreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight"]
_keys_to_ignore_on_load_unexpected = [r"decoder"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(LONGT5_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
r"""
返回一个元组,包含 torch.FloatTensor 或 BaseModelOutput 类型的对象。
Returns:
返回模型的输出结果。
Example:
示例代码展示如何使用这个模型:
```
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
... 100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return encoder_outputs
.\models\longt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {
"configuration_longt5": ["LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongT5Config", "LongT5OnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_longt5"] = [
"LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST",
"LongT5EncoderModel",
"LongT5ForConditionalGeneration",
"LongT5Model",
"LongT5PreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_longt5"] = [
"FlaxLongT5ForConditionalGeneration",
"FlaxLongT5Model",
"FlaxLongT5PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_longt5 import LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP, LongT5Config, LongT5OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_longt5 import (
LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST,
LongT5EncoderModel,
LongT5ForConditionalGeneration,
LongT5Model,
LongT5PreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_longt5 import (
FlaxLongT5ForConditionalGeneration,
FlaxLongT5Model,
FlaxLongT5PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\luke\configuration_luke.py
""" LUKE configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"studio-ousia/luke-base": "https://huggingface.co/studio-ousia/luke-base/resolve/main/config.json",
"studio-ousia/luke-large": "https://huggingface.co/studio-ousia/luke-large/resolve/main/config.json",
}
class LukeConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LukeModel`]. It is used to instantiate a LUKE
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LUKE
[studio-ousia/luke-base](https://huggingface.co/studio-ousia/luke-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import LukeConfig, LukeModel
>>> # Initializing a LUKE configuration
>>> configuration = LukeConfig()
>>> # Initializing a model from the configuration
>>> model = LukeModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "luke"
def __init__(
self,
vocab_size=50267,
entity_vocab_size=500000,
hidden_size=768,
entity_emb_size=256,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
use_entity_aware_attention=True,
classifier_dropout=None,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
"""
Initialize a LUKE configuration with default values.
Args:
vocab_size (int): Size of the token vocabulary.
entity_vocab_size (int): Size of the entity vocabulary.
hidden_size (int): Size of the encoder layers and the pooler layer.
entity_emb_size (int): Dimensionality of the entity embeddings.
num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
num_attention_heads (int): Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (int): Size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (str): The non-linear activation function (function or string) in the encoder and pooler.
hidden_dropout_prob (float): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (float): The dropout ratio for the attention probabilities.
max_position_embeddings (int): The maximum sequence length that this model might ever be used with.
type_vocab_size (int): The vocabulary size of the "type" (i.e., token type IDs) embeddings.
initializer_range (float): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (float): The epsilon used by the layer normalization layers.
use_entity_aware_attention (bool): Whether to use entity-aware attention in the model.
classifier_dropout (float or None): The dropout probability for the classifier layer (None means no dropout).
pad_token_id (int): The ID of the padding token in the token vocabulary.
bos_token_id (int): The ID of the beginning-of-sequence token in the token vocabulary.
eos_token_id (int): The ID of the end-of-sequence token in the token vocabulary.
**kwargs: Additional configuration arguments.
"""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.entity_vocab_size = entity_vocab_size
self.hidden_size = hidden_size
self.entity_emb_size = entity_emb_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_entity_aware_attention = use_entity_aware_attention
self.classifier_dropout = classifier_dropout
.\models\luke\convert_luke_original_pytorch_checkpoint_to_pytorch.py
"""Convert LUKE checkpoint."""
import argparse
import json
import os
import torch
from transformers import LukeConfig, LukeModel, LukeTokenizer, RobertaTokenizer
from transformers.tokenization_utils_base import AddedToken
@torch.no_grad()
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, pytorch_dump_folder_path, model_size):
with open(metadata_path) as metadata_file:
metadata = json.load(metadata_file)
config = LukeConfig(use_entity_aware_attention=True, **metadata["model_config"])
state_dict = torch.load(checkpoint_path, map_location="cpu")
entity_vocab = load_entity_vocab(entity_vocab_path)
tokenizer = RobertaTokenizer.from_pretrained(metadata["model_config"]["bert_model_name"])
entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False)
entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False)
tokenizer.add_special_tokens({"additional_special_tokens": [entity_token_1, entity_token_2]})
config.vocab_size += 2
print(f"Saving tokenizer to {pytorch_dump_folder_path}")
tokenizer.save_pretrained(pytorch_dump_folder_path)
with open(os.path.join(pytorch_dump_folder_path, LukeTokenizer.vocab_files_names["entity_vocab_file"]), "w") as f:
json.dump(entity_vocab, f)
tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path)
word_emb = state_dict["embeddings.word_embeddings.weight"]
ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
state_dict["embeddings.word_embeddings.weight"] = torch.cat([word_emb, ent_emb, ent2_emb])
for layer_index in range(config.num_hidden_layers):
for matrix_name in ["query.weight", "query.bias"]:
prefix = f"encoder.layer.{layer_index}.attention.self."
state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name]
state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name]
state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name]
entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"]
entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]]
model = LukeModel(config=config).eval()
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
if not (len(missing_keys) == 1 and missing_keys[0] == "embeddings.position_ids"):
raise ValueError(f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids")
if not (all(key.startswith("entity_predictions") or key.startswith("lm_head") for key in unexpected_keys)):
raise ValueError(
"Unexpected keys"
f" {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}"
)
tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path, task="entity_classification")
text = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the"
" new world number one avoid a humiliating second- round exit at Wimbledon ."
)
span = (39, 42)
encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")
outputs = model(**encoding)
if model_size == "large":
expected_shape = torch.Size((1, 42, 1024))
expected_slice = torch.tensor(
[[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]]
)
else:
expected_shape = torch.Size((1, 42, 768))
expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]])
if not (outputs.last_hidden_state.shape == expected_shape):
raise ValueError(
f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}"
)
if not torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
raise ValueError
if model_size == "large":
expected_shape = torch.Size((1, 1, 1024))
expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]])
else:
expected_shape = torch.Size((1, 1, 768))
expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]])
if not (outputs.entity_last_hidden_state.shape == expected_shape):
raise ValueError(
f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is"
f" {expected_shape}"
)
if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
raise ValueError
print("Saving PyTorch model to {}".format(pytorch_dump_folder_path))
model.save_pretrained(pytorch_dump_folder_path)
def load_entity_vocab(entity_vocab_path):
entity_vocab = {}
with open(entity_vocab_path, "r", encoding="utf-8") as f:
for index, line in enumerate(f):
title, _ = line.rstrip().split("\t")
entity_vocab[title] = index
return entity_vocab
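A quick sketch of how this loader behaves, assuming the `title<TAB>count` row format used by the released LUKE entity vocabulary (the file contents below are made up for illustration):
```
# Hypothetical two-row entity_vocab.tsv; the count column is discarded by the loader.
import os
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False, encoding="utf-8") as f:
    f.write("[PAD]\t0\n[MASK]\t12345\n")
    tsv_path = f.name

print(load_entity_vocab(tsv_path))  # {'[PAD]': 0, '[MASK]': 1}
os.remove(tsv_path)
```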
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, help="Path to a pytorch_model.bin file.")
parser.add_argument(
"--metadata_path", default=None, type=str, help="Path to a metadata.json file, defining the configuration."
)
parser.add_argument(
"--entity_vocab_path",
default=None,
type=str,
help="Path to an entity_vocab.tsv file, containing the entity vocabulary.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to where to dump the output PyTorch model."
)
parser.add_argument(
"--model_size", default="base", type=str, choices=["base", "large"], help="Size of the model to be converted."
)
args = parser.parse_args()
convert_luke_checkpoint(
args.checkpoint_path,
args.metadata_path,
args.entity_vocab_path,
args.pytorch_dump_folder_path,
args.model_size,
)
.\models\luke\modeling_luke.py
"""PyTorch LUKE model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_luke import LukeConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LukeConfig"
_CHECKPOINT_FOR_DOC = "studio-ousia/luke-base"
LUKE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"studio-ousia/luke-base",
"studio-ousia/luke-large",
]
@dataclass
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
"""
Base class for outputs of the LUKE model.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层输出的隐藏状态序列。
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
实体的最后一层隐藏状态序列。
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
经过线性层和Tanh激活函数处理过的序列中第一个标记(分类标记)的最后一层隐藏状态。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含模型每一层的隐藏状态(从嵌入层开始,每层一个张量),形状为 `(batch_size, sequence_length, hidden_size)`。
当 `output_hidden_states=True` 时返回。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含实体的每一层隐藏状态(从嵌入层开始,每层一个张量),形状为 `(batch_size, entity_length, hidden_size)`。
当 `output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组,包含每一层的注意力权重张量,形状为 `(batch_size, num_heads, sequence_length + entity_length, sequence_length + entity_length)`。
注意力权重经过 softmax 处理,用于计算自注意力头中的加权平均值。
entity_last_hidden_state: torch.FloatTensor = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class BaseLukeModelOutput(BaseModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
entity_last_hidden_state: torch.FloatTensor = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeMaskedLMOutput(ModelOutput):
    """
    Base class for model's outputs for masked language modeling and masked entity prediction.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            The sum of masked language modeling (MLM) loss and entity prediction loss.
        mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked entity prediction (MEP) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    mlm_loss: Optional[torch.FloatTensor] = None
    mep_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    entity_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class EntityClassificationOutput(ModelOutput):
"""
    Outputs for entity classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类损失。
如果提供了 `labels` 参数,则返回分类损失。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类分数(SoftMax 之前的输出)。
形状为 `(batch_size, config.num_labels)` 的分类分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型隐藏状态,包括每一层的输出和初始嵌入的输出。
形状为 `(batch_size, sequence_length, hidden_size)` 的元组,第一个元素是嵌入的输出,后续元素是每一层的输出。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
实体的隐藏状态,包括每一层的输出和初始实体嵌入的输出。
形状为 `(batch_size, entity_length, hidden_size)` 的元组,第一个元素是实体嵌入的输出,后续元素是每一层的输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重,用于计算自注意力头中的加权平均值。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的元组,每个元素对应一个层的注意力权重。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class EntitySpanClassificationOutput(ModelOutput):
"""
实体跨度分类模型的输出结果。
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当提供 `labels` 时返回):
分类损失值。
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
分类分数(SoftMax 之前的)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
由两部分组成的元组 `torch.FloatTensor` (一个用于嵌入的输出 + 一个用于每层输出),形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每层输出结束时的隐藏状态,加上初始嵌入输出。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
由两部分组成的元组 `torch.FloatTensor` (一个用于嵌入的输出 + 一个用于每层输出),形状为 `(batch_size, entity_length, hidden_size)`。
模型在每层输出结束时实体的隐藏状态,加上初始实体嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
由每一层的 `torch.FloatTensor` 组成的元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
@dataclass
class LukeSequenceClassifierOutput(ModelOutput):
"""
句子分类模型的输出结果。
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(或回归,如果 `config.num_labels==1`)的损失值。
如果提供 `labels`,则返回损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(或回归,如果 `config.num_labels==1`)的分数(SoftMax 之前)。
模型的输出分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层的隐藏状态以及可选的初始嵌入输出的元组。
形状为 `(batch_size, sequence_length, hidden_size)`。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
实体的隐藏状态,包括每一层的输出以及初始实体嵌入输出的元组。
形状为 `(batch_size, entity_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每个层级一个。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在自注意力机制中用于计算加权平均值的注意力权重。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeQuestionAnsweringModelOutput(ModelOutput):
"""
Outputs of question answering models.
"""
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeMultipleChoiceModelOutput(ModelOutput):
"""
    Outputs for multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors (see `input_ids` above). Classification scores
            (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class LukeEmbeddings(nn.Module):
"""
与 BertEmbeddings 类似,但稍作修改以支持位置嵌入索引。
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self,
input_ids=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
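For contrast with the sequential ids above, here is a standalone sketch of the padding-aware logic that the `create_position_ids_from_input_ids` helper referenced earlier implements (the helper itself lives elsewhere in the library; `_position_ids_from_input_ids` below is my own name): pad positions keep `padding_idx`, real tokens count upward from `padding_idx + 1`.
```
import torch

def _position_ids_from_input_ids(input_ids, padding_idx):
    # Running count of non-pad tokens, shifted past the padding index.
    mask = input_ids.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1) * mask).long() + padding_idx

ids = torch.tensor([[0, 5, 6, 1, 1]])  # assume padding_idx = 1
print(_position_ids_from_input_ids(ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]]) -> pad positions keep padding_idx
```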
class LukeEntityEmbeddings(nn.Module):
def __init__(self, config: LukeConfig):
super().__init__()
self.config = config
self.entity_embeddings = nn.Embedding(config.entity_vocab_size, config.entity_emb_size, padding_idx=0)
if config.entity_emb_size != config.hidden_size:
self.entity_embedding_dense = nn.Linear(config.entity_emb_size, config.hidden_size, bias=False)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, entity_ids: torch.LongTensor, position_ids: torch.LongTensor, token_type_ids: torch.LongTensor = None
):
if token_type_ids is None:
token_type_ids = torch.zeros_like(entity_ids)
entity_embeddings = self.entity_embeddings(entity_ids)
if self.config.entity_emb_size != self.config.hidden_size:
entity_embeddings = self.entity_embedding_dense(entity_embeddings)
position_embeddings = self.position_embeddings(position_ids.clamp(min=0))
position_embedding_mask = (position_ids != -1).type_as(position_embeddings).unsqueeze(-1)
position_embeddings = position_embeddings * position_embedding_mask
position_embeddings = torch.sum(position_embeddings, dim=-2)
position_embeddings = position_embeddings / position_embedding_mask.sum(dim=-2).clamp(min=1e-7)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = entity_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
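A tiny numeric check of the span averaging above: entity `position_ids` are padded with -1 inside each span, and the mean is taken over the valid positions only.
```
import torch

position_ids = torch.tensor([[[3, 4, -1]]])             # one entity covering tokens 3-4
emb = torch.nn.Embedding(10, 4)
pos = emb(position_ids.clamp(min=0))                    # (1, 1, 3, 4)
mask = (position_ids != -1).type_as(pos).unsqueeze(-1)  # zeroes out the -1 slot
pooled = (pos * mask).sum(dim=-2) / mask.sum(dim=-2).clamp(min=1e-7)
print(pooled.shape)  # torch.Size([1, 1, 4]) -> mean of the embeddings for positions 3 and 4
```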
class LukeSelfAttention(nn.Module):
    def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.use_entity_aware_attention = config.use_entity_aware_attention
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
if self.use_entity_aware_attention:
self.w2e_query = nn.Linear(config.hidden_size, self.all_head_size)
self.e2w_query = nn.Linear(config.hidden_size, self.all_head_size)
self.e2e_query = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
        output_attentions=False,
    ):
        pass
class LukeSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LukeAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LukeSelfAttention(config)
self.output = LukeSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError("LUKE does not support the pruning of attention heads")
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
word_size = word_hidden_states.size(1)
self_outputs = self.self(
word_hidden_states,
entity_hidden_states,
attention_mask,
head_mask,
output_attentions,
)
if entity_hidden_states is None:
concat_self_outputs = self_outputs[0]
concat_hidden_states = word_hidden_states
else:
concat_self_outputs = torch.cat(self_outputs[:2], dim=1)
concat_hidden_states = torch.cat([word_hidden_states, entity_hidden_states], dim=1)
attention_output = self.output(concat_self_outputs, concat_hidden_states)
word_attention_output = attention_output[:, :word_size, :]
if entity_hidden_states is None:
entity_attention_output = None
else:
entity_attention_output = attention_output[:, word_size:, :]
outputs = (word_attention_output, entity_attention_output) + self_outputs[2:]
return outputs
class LukeIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LukeOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LukeLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LukeAttention(config)
self.intermediate = LukeIntermediate(config)
self.output = LukeOutput(config)
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
word_size = word_hidden_states.size(1)
self_attention_outputs = self.attention(
word_hidden_states,
entity_hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
if entity_hidden_states is None:
concat_attention_output = self_attention_outputs[0]
else:
concat_attention_output = torch.cat(self_attention_outputs[:2], dim=1)
outputs = self_attention_outputs[2:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, concat_attention_output
)
word_layer_output = layer_output[:, :word_size, :]
if entity_hidden_states is None:
entity_layer_output = None
else:
entity_layer_output = layer_output[:, word_size:, :]
outputs = (word_layer_output, entity_layer_output) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
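`apply_chunking_to_forward` splits the chosen dimension into chunks and runs the feed-forward on each slice to bound peak memory; the result is identical to a direct call. A toy check, assuming the utility's `(forward_fn, chunk_size, chunk_dim, *tensors)` signature:
```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def double(x):
    return x * 2

x = torch.ones(1, 4, 3)
chunked = apply_chunking_to_forward(double, 2, 1, x)  # chunks of size 2 along dim 1
print(torch.equal(chunked, double(x)))  # True
```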
class LukeEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([LukeLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_word_hidden_states = () if output_hidden_states else None
all_entity_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
word_hidden_states,
entity_hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(
word_hidden_states,
entity_hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
word_hidden_states = layer_outputs[0]
if entity_hidden_states is not None:
entity_hidden_states = layer_outputs[1]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[2],)
if output_hidden_states:
all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)
if not return_dict:
return tuple(
v
for v in [
word_hidden_states,
all_word_hidden_states,
all_self_attentions,
entity_hidden_states,
all_entity_hidden_states,
]
if v is not None
)
return BaseLukeModelOutput(
last_hidden_state=word_hidden_states,
hidden_states=all_word_hidden_states,
attentions=all_self_attentions,
entity_last_hidden_state=entity_hidden_states,
entity_hidden_states=all_entity_hidden_states,
)
class LukePooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class EntityPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.entity_emb_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.entity_emb_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class EntityPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.transform = EntityPredictionHeadTransform(config)
self.decoder = nn.Linear(config.entity_emb_size, config.entity_vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.entity_vocab_size))
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states
class LukePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LukeConfig
base_model_prefix = "luke"
supports_gradient_checkpointing = True
_no_split_modules = ["LukeAttention", "LukeEntityEmbeddings"]
def _init_weights(self, module: nn.Module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
if module.embedding_dim == 1:
module.weight.data.zero_()
else:
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
LUKE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LukeConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LUKE_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any"
" specific head on top.",
LUKE_START_DOCSTRING,
)
class LukeModel(LukePreTrainedModel):
def __init__(self, config: LukeConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
self.embeddings = LukeEmbeddings(config)
self.entity_embeddings = LukeEntityEmbeddings(config)
self.encoder = LukeEncoder(config)
self.pooler = LukePooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def get_entity_embeddings(self):
return self.entity_embeddings.entity_embeddings
def set_entity_embeddings(self, value):
self.entity_embeddings.entity_embeddings = value
def _prune_heads(self, heads_to_prune):
raise NotImplementedError("LUKE does not support the pruning of attention heads")
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BaseLukeModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Executes the forward pass for the model.
Arguments:
input_ids (`torch.LongTensor`, *optional*):
Indices of input sequence tokens in the vocabulary.
attention_mask (`torch.FloatTensor`, *optional*):
Mask to avoid performing attention on padding tokens.
token_type_ids (`torch.LongTensor`, *optional*):
Segment token indices to indicate first and second portions of the inputs.
position_ids (`torch.LongTensor`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings.
entity_ids (`torch.LongTensor`, *optional*):
Indices of entity sequence tokens in the vocabulary.
entity_attention_mask (`torch.FloatTensor`, *optional*):
Mask to avoid performing attention on padding tokens in entity tokens.
entity_token_type_ids (`torch.LongTensor`, *optional*):
Segment token indices to indicate first and second portions of the entity inputs.
entity_position_ids (`torch.LongTensor`, *optional*):
Indices of positions of each entity sequence tokens in the position embeddings.
head_mask (`torch.FloatTensor`, *optional*):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (`torch.FloatTensor`, *optional*):
Optionally instead of input_ids, you can pass pre-computed embeddings.
output_attentions (`bool`, *optional*):
Whether to output the attentions tensors.
output_hidden_states (`bool`, *optional*):
Whether to output the hidden states tensors.
return_dict (`bool`, *optional*):
Whether to return a dict instead of a tuple.
        Returns:
            [`BaseLukeModelOutputWithPooling`] or `tuple(torch.FloatTensor)`.
"""
pass
def get_extended_attention_mask(
self, word_attention_mask: torch.LongTensor, entity_attention_mask: Optional[torch.LongTensor]
):
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
word_attention_mask (`torch.LongTensor`):
Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
entity_attention_mask (`torch.LongTensor`, *optional*):
Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
attention_mask = word_attention_mask
if entity_attention_mask is not None:
attention_mask = torch.cat([attention_mask, entity_attention_mask], dim=-1)
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape})")
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
return extended_attention_mask
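A quick numeric check of the mask arithmetic above (float32 assumed): attended positions become an additive 0, padded positions become a large negative number that vanishes under softmax.
```
import torch

attention_mask = torch.tensor([[1, 1, 0]])
ext = attention_mask[:, None, None, :].to(torch.float32)
ext = (1.0 - ext) * torch.finfo(torch.float32).min
print(ext)  # 0 where attended, ~-3.4e38 (additive -inf) on the padding position
```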
@add_start_docstrings(
"""
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
""",
LUKE_START_DOCSTRING,
)
class LukeForMaskedLM(LukePreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"]
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.lm_head = LukeLMHead(config)
self.entity_predictions = EntityPredictionHead(config)
self.loss_fn = nn.CrossEntropyLoss()
self.post_init()
def tie_weights(self):
super().tie_weights()
self._tie_or_clone_weights(self.entity_predictions.decoder, self.luke.entity_embeddings.entity_embeddings)
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LukeMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.LongTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
entity_labels: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
token) for entity classification tasks, such as Open Entity.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntityClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
tokens) for entity pair classification tasks, such as TACRED.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntityPairClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels, False)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityPairClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
such as named entity recognition.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntitySpanClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntitySpanClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.LongTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
entity_start_positions: Optional[torch.LongTensor] = None,
entity_end_positions: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
LUKE_START_DOCSTRING,
)
class LukeForSequenceClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
class.
""",
LUKE_START_DOCSTRING,
)
class LukeForTokenClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, LukeTokenClassifierOutput]:
r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.luke(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
entity_ids=entity_ids,
entity_attention_mask=entity_attention_mask,
entity_token_type_ids=entity_token_type_ids,
entity_position_ids=entity_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
)
sequence_output = outputs.last_hidden_state
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
return tuple(
v
for v in [loss, logits, outputs.hidden_states, outputs.entity_hidden_states, outputs.attentions]
if v is not None
)
return LukeTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
entity_hidden_states=outputs.entity_hidden_states,
attentions=outputs.attentions,
)
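A hedged usage sketch for the token classification head (the base checkpoint carries no fine-tuned classifier, so the logits are untrained; `num_labels=5` and the dummy labels are illustrative):
```
import torch
from transformers import LukeForTokenClassification, LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=5)

inputs = tokenizer("Beyoncé lives in Los Angeles", return_tensors="pt")
labels = torch.zeros_like(inputs["input_ids"])  # dummy label per token
outputs = model(**inputs, labels=labels)
print(outputs.loss, outputs.logits.shape)  # scalar loss, (1, seq_len, 5)
```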
@add_start_docstrings(
"""
The LUKE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
LUKE_START_DOCSTRING,
)
class LukeForQuestionAnswering(LukePreTrainedModel):
"""
LUKE模型,用于支持抽取式问答任务(如SQuAD),在隐藏状态输出的顶部增加一个用于计算“起始位置logits”和“结束位置logits”的线性层。
继承自LukePreTrainedModel。
"""
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
正向传播方法,接收多种输入并生成相应的输出。
参数与返回值的详细说明参见LUKE_INPUTS_DOCSTRING。
"""
pass
@add_start_docstrings(
"""
The LUKE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
LUKE_START_DOCSTRING,
)
class LukeForMultipleChoice(LukePreTrainedModel):
"""
LUKE模型,用于多选分类任务(例如RocStories/SWAG),在汇总输出的顶部增加一个线性层和softmax激活函数。
继承自LukePreTrainedModel。
"""
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
    )
    def forward(self, *args, **kwargs):
        pass
.\models\luke\tokenization_luke.py
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings.
    """
    # Printable/visible byte ranges map to themselves
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    # Every remaining byte (whitespace/control characters) is assigned a fresh code point above 255
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    # Convert the code points to the corresponding unicode characters
    cs = [chr(n) for n in cs]
    # Return a dictionary mapping each byte in bs to its unicode character in cs
    return dict(zip(bs, cs))
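Two spot checks of the mapping: printable bytes map to themselves, while control/whitespace bytes are shifted to fresh code points at 256 and above.
```
b2u = bytes_to_unicode()
print(b2u[ord("A")])  # 'A'  (printable byte keeps its own character)
print(b2u[0])         # 'Ā'  (chr(256): the first remapped control byte)
```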
# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    # Walk the word from its second symbol on, pairing each symbol with its predecessor
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
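For example, the word `("h", "e", "l", "l", "o")` yields four adjacent symbol pairs (duplicates collapse because a set is returned):
```
print(get_pairs(("h", "e", "l", "l", "o")))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}
```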
# The LUKE tokenizer class, inheriting from PreTrainedTokenizer
class LukeTokenizer(PreTrainedTokenizer):
"""
Constructs a LUKE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import LukeTokenizer
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods. It also creates entity sequences, namely
`entity_ids`, `entity_attention_mask`, `entity_token_type_ids`, and `entity_position_ids` to be used by the LUKE
model.
"""
    # Class attributes
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
    # Initializer: sets up the vocab/merges/entity-vocab files, the optional task, and the entity special tokens
def __init__(
self,
vocab_file,
merges_file,
entity_vocab_file,
task=None,
max_entity_length=32,
max_mention_length=30,
entity_token_1="<ent>",
entity_token_2="<ent2>",
entity_unk_token="[UNK]",
entity_pad_token="[PAD]",
entity_mask_token="[MASK]",
entity_mask2_token="[MASK2]",
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
        # Initialize the parent class
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
def vocab_size(self):
return len(self.encoder)
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
    # Returns the encoder vocabulary merged with any added special tokens
def get_vocab(self):
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
            # Pick the pair with the lowest BPE merge rank as the next bigram to merge
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
                # If the bigram matches at this position, merge it into a single token
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Luke, RoBERTa->LUKE
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # 将所有字节映射为 Unicode 字符串,避免 BPE 的控制标记(在我们的情况下是空格)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Luke, RoBERTa->LUKE
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Luke, RoBERTa->LUKE
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Luke, RoBERTa->LUKE
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
        # Join the tokens into a single string
        text = "".join(tokens)
        # Decode the byte-level characters back into UTF-8 text
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens with Roberta->Luke, RoBERTa->LUKE
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A LUKE sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
# Single sequence: the input IDs wrapped in the special tokens `<s>` and `</s>`
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Sequence pair: `<s>`, the first sequence, a double `</s>` separator, the second sequence, and a final `</s>`
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
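For illustration, assuming the RoBERTa/LUKE convention of `cls_token_id = 0` and `sep_token_id = 2` (the content ids are hypothetical), the two formats look like this:

# Hypothetical content ids; 0 and 2 follow the RoBERTa/LUKE cls/sep convention.
token_ids_0 = [31414, 232]     # first sequence
token_ids_1 = [713, 16, 1531]  # second sequence

single = [0] + token_ids_0 + [2]                       # <s> A </s>
pair = [0] + token_ids_0 + [2, 2] + token_ids_1 + [2]  # <s> A </s></s> B </s>
print(single)  # [0, 31414, 232, 2]
print(pair)    # [0, 31414, 232, 2, 2, 713, 16, 1531, 2]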
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Luke, RoBERTa->LUKE
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
# If special tokens are already present, delegate to the parent method to get the mask
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
# Single-sequence mask: 1 for the leading special token, 0s for the sequence tokens, and a trailing 1
return [1] + ([0] * len(token_ids_0)) + [1]
# Sequence-pair mask: 1 for `<s>`, 0s for the first sequence, 1, 1 for the double separator, 0s for the second sequence, and a final 1
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
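A quick sketch of the two mask shapes, for a first sequence of 3 tokens and a second of 2 (the lengths are arbitrary):

# Mask layout for len(token_ids_0) == 3 and len(token_ids_1) == 2.
mask_single = [1] + [0] * 3 + [1]                   # <s> x x x </s>
mask_pair = [1] + [0] * 3 + [1, 1] + [0] * 2 + [1]  # <s> A </s></s> B </s>
print(mask_single)  # [1, 0, 0, 0, 1]
print(mask_pair)    # [1, 0, 0, 0, 1, 1, 0, 0, 1]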
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Luke, RoBERTa->LUKE
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. LUKE does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]  # separator token ID
cls = [self.cls_token_id]  # classification (CLS) token ID
if token_ids_1 is None:
# Single sequence: a list of zeros as long as the sequence plus its special tokens
return len(cls + token_ids_0 + sep) * [0]
# Sequence pair: a list of zeros as long as both sequences plus all their special tokens
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Luke, RoBERTa->LUKE
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
# If the text is pre-split into words or a prefix space is requested, and the text is non-empty and does not start with whitespace, prepend a space
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, List[TextInput]],
text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
entity_spans: Optional[Union[EntitySpanInput, List[EntitySpanInput]]] = None,
entity_spans_pair: Optional[Union[EntitySpanInput, List[EntitySpanInput]]] = None,
entities: Optional[Union[EntityInput, List[EntityInput]]] = None,
entities_pair: Optional[Union[EntityInput, List[EntityInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
# The decorator above appends ENCODE_KWARGS_DOCSTRING and ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING to this method's docstring; the body is elided here
pass
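Although the body is elided in this walkthrough, `__call__` is the tokenizer's public entry point. A minimal usage sketch, assuming the standard `studio-ousia/luke-base` checkpoint and character-level spans for the two mentions:

from transformers import LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
text = "Beyoncé lives in Los Angeles."
entity_spans = [(0, 7), (17, 28)]  # character spans of "Beyoncé" and "Los Angeles"
encoding = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
# Besides input_ids/attention_mask, the output carries entity_ids, entity_attention_mask,
# and entity_position_ids for the two spans.
print(list(encoding.keys()))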
# `_encode_plus` encodes a single text (pair) and generates the entity-related features
def _encode_plus(
self,
text: Union[TextInput],  # input text: a string, or a list of strings
text_pair: Optional[Union[TextInput]] = None,  # optional second text
entity_spans: Optional[EntitySpanInput] = None,  # optional entity spans for the first text
entity_spans_pair: Optional[EntitySpanInput] = None,  # optional entity spans for the second text
entities: Optional[EntityInput] = None,  # optional entity names for the first text
entities_pair: Optional[EntityInput] = None,  # optional entity names for the second text
add_special_tokens: bool = True,  # whether to add special tokens such as CLS and SEP
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy; defaults to no padding
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy; defaults to no truncation
max_length: Optional[int] = None,  # maximum sequence length
max_entity_length: Optional[int] = None,  # maximum entity sequence length
stride: int = 0,  # stride; defaults to 0
is_split_into_words: Optional[bool] = False,  # whether the input is already split into words
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type IDs
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to log verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Offsets mapping is not available for slow (Python) tokenizers, so raise NotImplementedError
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
# Pre-split input is likewise unsupported, so raise NotImplementedError
if is_split_into_words:
raise NotImplementedError("is_split_into_words is not supported in this tokenizer.")
# Build the parts of the input sequence via _create_input_sequence
(
first_ids,
second_ids,
first_entity_ids,
second_entity_ids,
first_entity_token_spans,
second_entity_token_spans,
) = self._create_input_sequence(
text=text,
text_pair=text_pair,
entities=entities,
entities_pair=entities_pair,
entity_spans=entity_spans,
entity_spans_pair=entity_spans_pair,
**kwargs,
)
# prepare_for_model will create the attention_mask and token_type_ids
return self.prepare_for_model(
first_ids,
pair_ids=second_ids,
entity_ids=first_entity_ids,
pair_entity_ids=second_entity_ids,
entity_token_spans=first_entity_token_spans,
pair_entity_token_spans=second_entity_token_spans,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
# Batch-encode texts together with their entity information and return the encoded results
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[List[TextInput], List[TextInputPair]],
batch_entity_spans_or_entity_spans_pairs: Optional[
Union[List[EntitySpanInput], List[Tuple[EntitySpanInput, EntitySpanInput]]]
] = None,
batch_entities_or_entities_pairs: Optional[
Union[List[EntityInput], List[Tuple[EntityInput, EntityInput]]]
] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
...
# Validate the format of the entity inputs
def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spans: Optional[EntitySpanInput]):
# entity_spans must be a list; otherwise raise a ValueError
if not isinstance(entity_spans, list):
raise ValueError("entity_spans should be given as a list")
# If entity_spans is non-empty, its first element must be a tuple; otherwise raise a ValueError
elif len(entity_spans) > 0 and not isinstance(entity_spans[0], tuple):
raise ValueError(
"entity_spans should be given as a list of tuples containing the start and end character indices"
)
# When entities are provided, validate them as well
if entities is not None:
# entities must be a list; otherwise raise a ValueError
if not isinstance(entities, list):
raise ValueError("If you specify entities, they should be given as a list")
# If entities is non-empty, its first element must be a string; otherwise raise a ValueError
if len(entities) > 0 and not isinstance(entities[0], str):
raise ValueError("If you specify entities, they should be given as a list of entity names")
# entities and entity_spans must have the same length; otherwise raise a ValueError
if len(entities) != len(entity_spans):
raise ValueError("If you specify entities, entities and entity_spans must be the same length")
# Build the input sequences from the texts, the entity information, and any extra keyword arguments
def _create_input_sequence(
self,
text: Union[TextInput],
text_pair: Optional[Union[TextInput]] = None,
entities: Optional[EntityInput] = None,
entities_pair: Optional[EntityInput] = None,
entity_spans: Optional[EntitySpanInput] = None,
entity_spans_pair: Optional[EntitySpanInput] = None,
**kwargs,
):
...
# The ENCODE_KWARGS_DOCSTRING and ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING annotations are appended to the method below
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# `_batch_prepare_for_model` prepares a batch of inputs for the model
def _batch_prepare_for_model(
self,
# Each sample in the batch is a tuple of a list of input ids and a (possibly None) second list
batch_ids_pairs: List[Tuple[List[int], None]],
# Each sample in the batch is a tuple of two optional lists of entity ids
batch_entity_ids_pairs: List[Tuple[Optional[List[int]], Optional[List[int]]]],
# Each sample in the batch is a tuple of two optional lists of entity token spans
batch_entity_token_spans_pairs: List[Tuple[Optional[List[Tuple[int, int]]], Optional[List[Tuple[int, int]]]]],
# Whether to add special tokens
add_special_tokens: bool = True,
# Padding strategy; defaults to no padding
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
# Truncation strategy; defaults to no truncation
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
# Maximum sequence length
max_length: Optional[int] = None,
# Maximum entity sequence length
max_entity_length: Optional[int] = None,
# Stride; defaults to 0
stride: int = 0,
# Pad to a multiple of this value; defaults to no multiple padding
pad_to_multiple_of: Optional[int] = None,
# Type of tensors to return; defaults to no tensor conversion
return_tensors: Optional[str] = None,
# Whether to return token_type_ids
return_token_type_ids: Optional[bool] = None,
# Whether to return the attention_mask
return_attention_mask: Optional[bool] = None,
# Whether to return overflowing tokens
return_overflowing_tokens: bool = False,
# Whether to return the special-tokens mask
return_special_tokens_mask: bool = False,
# Whether to return length information
return_length: bool = False,
# Whether to log verbose information; defaults to True
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
manages a moving window (with user-defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
batch_entity_ids_pairs: list of entity ids or entity ids pairs
batch_entity_token_spans_pairs: list of entity spans or entity spans pairs
max_entity_length: The maximum length of the entity sequence.
"""
# Initialize an empty dictionary to store batch outputs
batch_outputs = {}
# Iterate over input sequences and corresponding entity information
for input_ids, entity_ids, entity_token_span_pairs in zip(
batch_ids_pairs, batch_entity_ids_pairs, batch_entity_token_spans_pairs
):
# Unpack input sequences into first and second parts
first_ids, second_ids = input_ids
# Unpack entity ids into first and second parts
first_entity_ids, second_entity_ids = entity_ids
# Unpack entity token spans into first and second parts
first_entity_token_spans, second_entity_token_spans = entity_token_span_pairs
# Prepare inputs for the model using specified parameters
outputs = self.prepare_for_model(
first_ids,
second_ids,
entity_ids=first_entity_ids,
pair_entity_ids=second_entity_ids,
entity_token_spans=first_entity_token_spans,
pair_entity_token_spans=second_entity_token_spans,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # Specify padding strategy
truncation=truncation_strategy.value, # Specify truncation strategy
max_length=max_length, # Maximum length of the sequences
max_entity_length=max_entity_length, # Maximum length of the entity sequence
stride=stride, # Stride for handling overflowing tokens
pad_to_multiple_of=None, # We pad in batch afterward
return_attention_mask=False, # We pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # Convert batch to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
# Aggregate outputs from each batch iteration
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
# Perform padding on batch outputs using specified parameters
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value, # Specify padding strategy
max_length=max_length, # Maximum length of the sequences
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
# Convert batch outputs to BatchEncoding format
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
# Return the processed batch outputs
return batch_outputs
# Prepare the input data for the model, applying the requested processing and conversions
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
entity_ids: Optional[List[int]] = None,
pair_entity_ids: Optional[List[int]] = None,
entity_token_spans: Optional[List[Tuple[int, int]]] = None,
pair_entity_token_spans: Optional[List[Tuple[int, int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
# Preprocess the inputs: add special tokens, pad, truncate, and so on (body elided)
...
# Pad the encoded inputs so the batch has a consistent length
def pad(
self,
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
):
# Pad the encoded inputs so they share the same length or satisfy the requested padding constraints (body elided)
...
# Internal method: low-level padding of the encoded inputs
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
...
# Save the vocabulary files to the given directory
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the vocabulary file path
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Build the merges file path
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Write the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
# Initialize the running index
index = 0
# Write the merges file
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
# Iterate over the BPE merges sorted by rank and write them in order
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
# If the BPE merge indices are not consecutive, log a warning
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
# Write the BPE token pair to the file
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# Build the entity vocabulary file path
entity_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
)
# Write the entity vocabulary file
with open(entity_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
# Return the tuple of saved file paths
return vocab_file, merge_file, entity_vocab_file
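A short usage sketch, assuming the standard LUKE vocabulary file names (`vocab.json`, `merges.txt`, `entity_vocab.json`):

import os
from transformers import LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
os.makedirs("./luke-vocab", exist_ok=True)
files = tokenizer.save_vocabulary("./luke-vocab")
print(files)  # ('./luke-vocab/vocab.json', './luke-vocab/merges.txt', './luke-vocab/entity_vocab.json')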
.\models\luke\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig"],
"tokenization_luke": ["LukeTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_luke"] = [
"LUKE_PRETRAINED_MODEL_ARCHIVE_LIST",
"LukeForEntityClassification",
"LukeForEntityPairClassification",
"LukeForEntitySpanClassification",
"LukeForMultipleChoice",
"LukeForQuestionAnswering",
"LukeForSequenceClassification",
"LukeForTokenClassification",
"LukeForMaskedLM",
"LukeModel",
"LukePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig
from .tokenization_luke import LukeTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_luke import (
LUKE_PRETRAINED_MODEL_ARCHIVE_LIST,
LukeForEntityClassification,
LukeForEntityPairClassification,
LukeForEntitySpanClassification,
LukeForMaskedLM,
LukeForMultipleChoice,
LukeForQuestionAnswering,
LukeForSequenceClassification,
LukeForTokenClassification,
LukeModel,
LukePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\lxmert\configuration_lxmert.py
""" LXMERT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/config.json",
}
class LxmertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LxmertModel`] or a [`TFLxmertModel`]. It is used
to instantiate a LXMERT model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to that of the Lxmert
[unc-nlp/lxmert-base-uncased](https://huggingface.co/unc-nlp/lxmert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "lxmert"
attribute_map = {}
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_attention_heads=12,
num_qa_labels=9500,
num_object_labels=1600,
num_attr_labels=400,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
l_layers=9,
x_layers=5,
r_layers=5,
visual_feat_dim=2048,
visual_pos_dim=4,
visual_loss_normalizer=6.67,
task_matched=True,
task_mask_lm=True,
task_obj_predict=True,
task_qa=True,
visual_obj_loss=True,
visual_attr_loss=True,
visual_feat_loss=True,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.num_qa_labels = num_qa_labels
self.num_object_labels = num_object_labels
self.num_attr_labels = num_attr_labels
self.l_layers = l_layers
self.x_layers = x_layers
self.r_layers = r_layers
self.visual_feat_dim = visual_feat_dim
self.visual_pos_dim = visual_pos_dim
self.visual_loss_normalizer = visual_loss_normalizer
self.task_matched = task_matched
self.task_mask_lm = task_mask_lm
self.task_obj_predict = task_obj_predict
self.task_qa = task_qa
self.visual_obj_loss = visual_obj_loss
self.visual_attr_loss = visual_attr_loss
self.visual_feat_loss = visual_feat_loss
self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
super().__init__(**kwargs)
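A brief sketch showing how the three layer counts end up in the `num_hidden_layers` dict; instantiating the model from a config yields randomly initialized weights:

from transformers import LxmertConfig, LxmertModel

config = LxmertConfig(l_layers=9, x_layers=5, r_layers=5)
print(config.num_hidden_layers)  # {'vision': 5, 'cross_encoder': 5, 'language': 9}
model = LxmertModel(config)  # randomly initialized; no pretrained weights are loaded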
.\models\lxmert\convert_lxmert_original_tf_checkpoint_to_pytorch.py
"""Convert LXMERT checkpoint."""
import argparse
import torch
from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config = LxmertConfig.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
model = LxmertForPreTraining(config)
load_tf_weights_in_lxmert(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
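For reference, the conversion can also be driven directly from Python by calling the function defined above; the paths below are placeholders:

# Equivalent to invoking the script from the command line; the paths are placeholders.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./lxmert_tf/model.ckpt",
    config_file="./lxmert_tf/config.json",
    pytorch_dump_path="./lxmert_pytorch.bin",
)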
.\models\lxmert\modeling_lxmert.py
import math
import os
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, SmoothL1Loss
from ...activations import ACT2FN, gelu
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_lxmert import LxmertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "unc-nlp/lxmert-base-uncased"
_CONFIG_FOR_DOC = "LxmertConfig"
LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"unc-nlp/lxmert-base-uncased",
]
class GeLU(nn.Module):
"""
实现Gaussian Error Linear Unit (GELU)激活函数的PyTorch模块。
"""
def __init__(self):
super().__init__()
def forward(self, x):
"""
对输入张量应用GELU激活函数。
Args:
x (torch.Tensor): 输入张量
Returns:
torch.Tensor: 经过GELU激活函数后的张量
"""
return gelu(x)
@dataclass
class LxmertModelOutput(ModelOutput):
"""
LXMERT模型的输出,包含语言编码器、视觉编码器和跨模态编码器的最后隐藏状态、汇总输出和注意力概率。
(注意:在LXMERT中,视觉编码器称为“关系-语义”编码器)
"""
pass
Args:
language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一层语言编码器的隐藏状态序列。
vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一层视觉编码器的隐藏状态序列。
pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
序列中第一个令牌(分类、CLS令牌)的最后一层隐藏状态,通过一个线性层和Tanh激活函数进一步处理。
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
语言编码器的隐藏状态元组,形状为 `(batch_size, sequence_length, hidden_size)`。
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
视觉编码器的隐藏状态元组,形状为 `(batch_size, sequence_length, hidden_size)`。
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算自注意力头中的加权平均值。
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算自注意力头中的加权平均值。
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算交叉编码器注意力头中的加权平均值。
"""
language_output: Optional[torch.FloatTensor] = None
vision_output: Optional[torch.FloatTensor] = None
pooled_output: Optional[torch.FloatTensor] = None
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field language_attentions, initialized to None
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field vision_attentions, initialized to None
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field cross_encoder_attentions, initialized to None
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Output class storing the results of LXMERT question answering
@dataclass
class LxmertForQuestionAnsweringOutput(ModelOutput):
"""
Output type of [`LxmertForQuestionAnswering`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of the question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
# Loss, an optional float tensor
loss: Optional[torch.FloatTensor] = None
# Question answering score, an optional float tensor
question_answering_score: Optional[torch.FloatTensor] = None
# Language hidden states, an optional tuple of tensors
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Vision hidden states, an optional tuple of tensors
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Language attention weights, an optional tuple of tensors
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Vision attention weights, an optional tuple of tensors
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# cross_encoder_attentions may be a tuple of torch.FloatTensor, or None
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class LxmertForPreTrainingOutput(ModelOutput):
"""
Output type of [`LxmertForPreTraining`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
# Loss, an optional float tensor
loss: Optional[torch.FloatTensor] = None
# Prediction logits, an optional float tensor
prediction_logits: Optional[torch.FloatTensor] = None
# Cross-relationship score, an optional float tensor
cross_relationship_score: Optional[torch.FloatTensor] = None
# Question answering score, an optional float tensor
question_answering_score: Optional[torch.FloatTensor] = None
# Language hidden states, an optional tuple of float tensors
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Vision hidden states, an optional tuple of float tensors
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Language attentions, an optional tuple of float tensors
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Vision attentions, an optional tuple of float tensors
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-encoder attentions, an optional tuple of float tensors
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re  # regular expressions
import numpy as np  # NumPy
import tensorflow as tf  # TensorFlow
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)  # absolute path of the TensorFlow checkpoint
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")  # log the checkpoint path being converted
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)  # list all variables in the TensorFlow model with their shapes
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")  # log the name and shape of each TF weight
array = tf.train.load_variable(tf_path, name)  # load the variable's value from the TF model
names.append(name)  # collect the variable name
arrays.append(array)  # collect the variable value
for name, array in zip(names, arrays):
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if any(
n
in [
"adam_v",
"adam_m",
"AdamWeightDecayOptimizer",
"AdamWeightDecayOptimizer_1",
"global_step",
]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}") # 记录日志:跳过特定的TensorFlow变量
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
# Map the variable-name prefix to the corresponding attribute of the PyTorch model
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}") # 记录日志:跳过特定的PyTorch变量
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]  # index into the nested module list
# Handle special variable-name suffixes and set the matching PyTorch pointer
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)  # transpose the weight array
try:
assert pointer.shape == array.shape  # the PyTorch parameter and the TF array must have the same shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}") # 记录日志:初始化PyTorch权重的名称
pointer.data = torch.from_numpy(array) # 使用NumPy数组初始化PyTorch模型的权重
return model # 返回加载了TensorFlow权重的PyTorch模型
"""Construct the embeddings from word, position and token_type embeddings."""
# 初始化函数,接受一个配置对象config作为参数
def __init__(self, config):
super().__init__()
# Word embedding layer mapping vocabulary indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
# Position embedding layer mapping position indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)
# Token type embedding layer mapping token type indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size, padding_idx=0)
# self.LayerNorm is not snake-cased to stay consistent with the TensorFlow variable names, so any TensorFlow
# checkpoint file can be loaded; eps=1e-12 is a tiny constant for numerical stability
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
# Dropout layer that randomly zeroes units during training to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass taking vocabulary IDs (input_ids), token type IDs (token_type_ids), and precomputed embeddings (inputs_embeds)
def forward(self, input_ids, token_type_ids=None, inputs_embeds=None):
# If input_ids is provided, take its shape and device; otherwise use those of inputs_embeds
if input_ids is not None:
input_shape = input_ids.size()
device = input_ids.device
else:
input_shape = inputs_embeds.size()[:-1]
device = inputs_embeds.device
seq_length = input_shape[1]
# Build the position ID tensor for the sequence length; dtype=torch.long, placed on the chosen device
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand(input_shape)
# If token_type_ids is None, default to an all-zeros tensor of long dtype on the device of self.position_ids
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
# If inputs_embeds is None, look up the word embeddings for input_ids
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
# Look up the position embeddings from the position IDs
position_embeddings = self.position_embeddings(position_ids)
# Look up the token type embeddings from the token type IDs
token_type_embeddings = self.token_type_embeddings(token_type_ids)
# Sum the word, position, and token type embeddings to get the final embedding
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
# Apply LayerNorm to the summed embeddings
embeddings = self.LayerNorm(embeddings)
# Apply dropout to the normalized embeddings
embeddings = self.dropout(embeddings)
return embeddings
# LxmertAttention implements the attention mechanism used throughout the LXMERT model (self- and cross-attention)
class LxmertAttention(nn.Module):
def __init__(self, config, ctx_dim=None):
super().__init__()
# The hidden size must be divisible by the number of attention heads
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.head_size = self.num_attention_heads * self.attention_head_size
# If no context dimension is given, use the hidden size from the config
if ctx_dim is None:
ctx_dim = config.hidden_size
# Linear projections for the query, key, and value
self.query = nn.Linear(config.hidden_size, self.head_size)
self.key = nn.Linear(ctx_dim, self.head_size)
self.value = nn.Linear(ctx_dim, self.head_size)
# Dropout applied to the attention probabilities
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Reshape the input tensor x for multi-head attention
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (
self.num_attention_heads,
self.attention_head_size,
)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
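The reshape performed by `transpose_for_scores` is easiest to see on concrete shapes; a standalone sketch with the base config's sizes:

import torch

batch, seq_len, hidden = 2, 4, 768
num_heads, head_size = 12, 64  # hidden == num_heads * head_size

x = torch.randn(batch, seq_len, hidden)
x = x.view(batch, seq_len, num_heads, head_size)  # split the hidden dim into heads
x = x.permute(0, 2, 1, 3)                         # move the head dim in front of the sequence dim
print(x.shape)  # torch.Size([2, 12, 4, 64]) -> (batch, heads, seq, head_size)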
# Forward pass performing the attention computation
def forward(self, hidden_states, context, attention_mask=None, output_attentions=False):
# Project the hidden states and context into mixed query, key, and value tensors
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(context)
mixed_value_layer = self.value(context)
# Reshape the mixed query, key, and value tensors for multi-head attention
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Raw attention scores: dot product of queries and keys, scaled by the square root of the head size
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# If an attention mask is provided, add it to the attention scores
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
# Normalize the attention scores into probabilities via softmax
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Apply dropout to the attention probabilities
attention_probs = self.dropout(attention_probs)
# Context tensor: weighted sum of the value layer with the attention probabilities
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
# Reshape the context tensor back to the expected output shape
new_context_layer_shape = context_layer.size()[:-2] + (self.head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Include the attention weights in the output if requested
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class LxmertAttentionOutput(nn.Module):
# Initializer setting up the parameters and layers of this module
def __init__(self, config):
# Call the parent class initializer
super().__init__()
# Linear layer with both input and output sizes equal to config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# LayerNorm layer normalizing the hidden states
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
# Dropout layer that randomly zeroes part of the input tensor during training
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass defining the module's computation
def forward(self, hidden_states, input_tensor):
# Apply the linear transformation to the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout to the transformed hidden states to reduce overfitting
hidden_states = self.dropout(hidden_states)
# Add the residual input tensor and apply LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
class LxmertCrossAttentionLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Cross-attention layer: an attention module followed by its output module
self.att = LxmertAttention(config)
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, ctx_tensor, ctx_att_mask=None, output_attentions=False):
# Run the attention module and collect its outputs
output = self.att(input_tensor, ctx_tensor, ctx_att_mask, output_attentions=output_attentions)
if output_attentions:
# Grab the attention probabilities when they are requested
attention_probs = output[1]
# Combine the attention output with the input tensor through the output module
attention_output = self.output(output[0], input_tensor)
# Include the attention weights in the result only when requested
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
class LxmertSelfAttentionLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Self-attention layer: an attention module followed by its output module
self.self = LxmertAttention(config)
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, attention_mask, output_attentions=False):
# Self-attention forward pass over the input tensor and attention mask
# Note: the keys and queries of self-attention are the same (i.e., the input tensor)
output = self.self(
input_tensor,
input_tensor,
attention_mask,
output_attentions=output_attentions,
)
if output_attentions:
# Grab the attention probabilities when they are requested
attention_probs = output[1]
# Combine the attention output with the input tensor through the output module
attention_output = self.output(output[0], input_tensor)
# Include the attention weights in the result only when requested
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
class LxmertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Intermediate layer: a linear transformation followed by the activation function
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.intermediate_act_fn = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
# Intermediate forward pass: linear transformation, then the activation function
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LxmertOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Output layer: linear transformation, LayerNorm, and dropout
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
# Output forward pass: linear transformation, dropout, then LayerNorm over the residual sum
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LxmertLayer(nn.Module):
def __init__(self, config):
super().__init__()
# A full LXMERT layer: self-attention, intermediate, and output sub-modules
self.attention = LxmertSelfAttentionLayer(config)
self.intermediate = LxmertIntermediate(config)
self.output = LxmertOutput(config)
# Forward pass taking hidden states plus an optional attention mask and an output-attentions flag
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# Run the self-attention layer
outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
# Take the attention output from the attention layer's results
attention_output = outputs[0]
# Feed the attention output through the intermediate layer
intermediate_output = self.intermediate(attention_output)
# Feed the intermediate output through the output layer to get the final layer output
layer_output = self.output(intermediate_output, attention_output)
# If attention outputs were requested, append them to the result tuple
outputs = (layer_output,) + outputs[1:]  # add attentions if we output them
# Return all outputs (the final layer output plus any attention weights)
return outputs
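Putting the pieces together, a minimal sketch that runs a single `LxmertLayer` on random hidden states (the module lives in `transformers.models.lxmert.modeling_lxmert`):

import torch
from transformers import LxmertConfig
from transformers.models.lxmert.modeling_lxmert import LxmertLayer

config = LxmertConfig()
layer = LxmertLayer(config)
hidden = torch.randn(2, 8, config.hidden_size)  # (batch, seq_len, hidden)
(layer_out,) = layer(hidden, attention_mask=None, output_attentions=False)
print(layer_out.shape)  # torch.Size([2, 8, 768])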
class LxmertXLayer(nn.Module):
def __init__(self, config):
super().__init__()
# The cross-attention Layer
self.visual_attention = LxmertCrossAttentionLayer(config)
# Self-attention Layers
self.lang_self_att = LxmertSelfAttentionLayer(config)
self.visn_self_att = LxmertSelfAttentionLayer(config)
# Intermediate and Output Layers (FFNs)
self.lang_inter = LxmertIntermediate(config)
self.lang_output = LxmertOutput(config)
self.visn_inter = LxmertIntermediate(config)
self.visn_output = LxmertOutput(config)
def cross_att(
self,
lang_input,
lang_attention_mask,
visual_input,
visual_attention_mask,
output_x_attentions=False,
):
# Cross Attention between language and visual inputs
lang_att_output = self.visual_attention(
lang_input,
visual_input,
ctx_att_mask=visual_attention_mask,
output_attentions=output_x_attentions,
)
# Cross Attention between visual and language inputs
visual_att_output = self.visual_attention(
visual_input,
lang_input,
ctx_att_mask=lang_attention_mask,
output_attentions=False,
)
return lang_att_output, visual_att_output
def self_att(self, lang_input, lang_attention_mask, visual_input, visual_attention_mask):
# Self Attention for language input
lang_att_output = self.lang_self_att(lang_input, lang_attention_mask, output_attentions=False)
# Self Attention for visual input
visual_att_output = self.visn_self_att(visual_input, visual_attention_mask, output_attentions=False)
return lang_att_output[0], visual_att_output[0]
def output_fc(self, lang_input, visual_input):
# Feed-forward layers for language input
lang_inter_output = self.lang_inter(lang_input)
# Feed-forward layers for visual input
visual_inter_output = self.visn_inter(visual_input)
# Output layers for language input
lang_output = self.lang_output(lang_inter_output, lang_input)
# Output layers for visual input
visual_output = self.visn_output(visual_inter_output, visual_input)
return lang_output, visual_output
# Forward pass: run cross attention, then self attention, then the output layers over the language and visual features
def forward(
self,
lang_feats,  # input language features
lang_attention_mask,  # language attention mask
visual_feats,  # input visual features
visual_attention_mask,  # visual attention mask
output_attentions=False,  # whether to return the attention matrices; defaults to False
):
# Run cross attention to get the language and visual attention outputs
lang_att_output, visual_att_output = self.cross_att(
lang_input=lang_feats,
lang_attention_mask=lang_attention_mask,
visual_input=visual_feats,
visual_attention_mask=visual_attention_mask,
output_x_attentions=output_attentions,
)
# Everything after the first element of the language attention output is attention probabilities
attention_probs = lang_att_output[1:]
# Run self attention on the cross-attention outputs with their respective masks
lang_att_output, visual_att_output = self.self_att(
lang_att_output[0],
lang_attention_mask,
visual_att_output[0],
visual_attention_mask,
)
# Feed the attended language and visual features through the output fully connected layers
lang_output, visual_output = self.output_fc(lang_att_output, visual_att_output)
# The return format depends on whether attention matrices were requested
return (
(
lang_output,  # language output
visual_output,  # visual output
attention_probs[0],  # the first attention matrix (when attentions are returned)
)
if output_attentions  # if attention matrices were requested
else (lang_output, visual_output)  # otherwise return only the language and visual outputs
)
# The LXMERT encoder, which processes the multi-modal inputs
class LxmertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
# Object-level visual feature encoding layer
self.visn_fc = LxmertVisualFeatureEncoder(config)
self.config = config
# Number of layers of each type
self.num_l_layers = config.l_layers
self.num_x_layers = config.x_layers
self.num_r_layers = config.r_layers
# Layer initialization
# Using self.layer instead of self.l_layer to support loading BERT weights
self.layer = nn.ModuleList([LxmertLayer(config) for _ in range(self.num_l_layers)])
self.x_layers = nn.ModuleList([LxmertXLayer(config) for _ in range(self.num_x_layers)])
self.r_layers = nn.ModuleList([LxmertLayer(config) for _ in range(self.num_r_layers)])
def forward(
self,
lang_feats,
lang_attention_mask,
visual_feats,
visual_pos,
visual_attention_mask=None,
output_attentions=None,
):
vision_hidden_states = ()
language_hidden_states = ()
# If attention outputs are requested (via the argument or the config), start the attention tuples as empty; otherwise set them to None
vision_attentions = () if output_attentions or self.config.output_attentions else None
language_attentions = () if output_attentions or self.config.output_attentions else None
cross_encoder_attentions = () if output_attentions or self.config.output_attentions else None
visual_feats = self.visn_fc(visual_feats, visual_pos)
# Run the language layers
for layer_module in self.layer:
# Forward pass through each language layer
l_outputs = layer_module(lang_feats, lang_attention_mask, output_attentions=output_attentions)
lang_feats = l_outputs[0]
# Append each layer's hidden state to the language hidden-states tuple
language_hidden_states = language_hidden_states + (lang_feats,)
# If attention weights are being recorded, append this layer's weights to the language attentions tuple
if language_attentions is not None:
language_attentions = language_attentions + (l_outputs[1],)
# Run the relational (visual) layers
for layer_module in self.r_layers:
# Forward pass through each relational layer
v_outputs = layer_module(visual_feats, visual_attention_mask, output_attentions=output_attentions)
visual_feats = v_outputs[0]
# Append each layer's hidden state to the vision hidden-states tuple
vision_hidden_states = vision_hidden_states + (visual_feats,)
# If attention weights are being recorded, append this layer's weights to the vision attentions tuple
if vision_attentions is not None:
vision_attentions = vision_attentions + (v_outputs[1],)
# Run the cross-modality layers
for layer_module in self.x_layers:
# Forward pass through each cross-modality layer
x_outputs = layer_module(
lang_feats,
lang_attention_mask,
visual_feats,
visual_attention_mask,
output_attentions=output_attentions,
)
lang_feats, visual_feats = x_outputs[:2]
# Append each layer's hidden states to both the vision and language hidden-state tuples
vision_hidden_states = vision_hidden_states + (visual_feats,)
language_hidden_states = language_hidden_states + (lang_feats,)
# If attention weights are being recorded, append this layer's weights to the cross-encoder attentions tuple
if cross_encoder_attentions is not None:
cross_encoder_attentions = cross_encoder_attentions + (x_outputs[2],)
visual_encoder_outputs = (
vision_hidden_states,
vision_attentions if output_attentions else None,
)
lang_encoder_outputs = (
language_hidden_states,
language_attentions if output_attentions else None,
)
# Return the visual encoder outputs, the language encoder outputs, and (if requested) the cross-encoder attentions
return (
visual_encoder_outputs,
lang_encoder_outputs,
cross_encoder_attentions if output_attentions else None,
)
class LxmertVisualObjHead(nn.Module):
def __init__(self, config):
super().__init__()
hid_dim = config.hidden_size
self.vis_fc = nn.Sequential(
nn.Linear(hid_dim, hid_dim * 2),
GeLU(),
nn.LayerNorm(hid_dim * 2, eps=1e-12),
)
def forward(self, hidden_states):
# Predict the visual features by transforming and normalizing through the fully connected stack
visual_feats = self.vis_fc(hidden_states)
return visual_feats
def __init__(self, config):
super().__init__()
self.transform = LxmertPredictionHeadTransform(config)
# Decide the use of visual losses
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {"shape": (-1,), "num": config.num_object_labels}
if config.visual_attr_loss:
visual_losses["attr"] = {"shape": (-1,), "num": config.num_attr_labels}
if config.visual_feat_loss:
visual_losses["feat"] = {
"shape": (-1, config.visual_feat_dim),
"num": config.visual_feat_dim,
}
self.visual_losses = visual_losses
# visual_losses stores the configuration of each visual loss type in use
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder_dict = nn.ModuleDict(
{key: nn.Linear(config.hidden_size, self.visual_losses[key]["num"]) for key in self.visual_losses}
)
# nn.ModuleDict maps each visual-loss key to a Linear layer that projects
# hidden states to that loss type's output size
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
# Transform the input hidden states
output = {}
for key in self.visual_losses:
output[key] = self.decoder_dict[key](hidden_states)
# Compute each loss type's output with its corresponding Linear layer
return output
class LxmertPreTrainingHeads(nn.Module):
def __init__(self, config, lxmert_model_embedding_weights):
super(LxmertPreTrainingHeads, self).__init__()
# Prediction head: language modeling
self.predictions = LxmertLMPredictionHead(config, lxmert_model_embedding_weights)
# Prediction head: sequence relationship
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
# Score the language-modeling predictions
prediction_scores = self.predictions(sequence_output)
# Score the sequence relationship
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class LxmertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LxmertConfig
load_tf_weights = load_tf_weights_in_lxmert
base_model_prefix = "lxmert"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Initialize linear-layer weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# Initialize embedding weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
# Initialize the LayerNorm bias to zero and the weight to 1.0
module.bias.data.zero_()
module.weight.data.fill_(1.0)
LXMERT_START_DOCSTRING = r"""
The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from
Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer
model, pretrained on a variety of multi-modal datasets comprising GQA, VQAv2.0, MSCOCO captions, and Visual
Genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss
for question answering attribute prediction, and object tag prediction.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
Args:
batch_size (int): The batch size of the input data.
sequence_length (int): The length of the input sequences.
"""
@add_start_docstrings(
"The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
LXMERT_START_DOCSTRING,
)
class LxmertModel(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Initialize the embeddings module with the provided configuration
self.embeddings = LxmertEmbeddings(config)
# Initialize the encoder module with the provided configuration
self.encoder = LxmertEncoder(config)
# Initialize the pooler module with the provided configuration
self.pooler = LxmertPooler(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the word embeddings from the embeddings module
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
# Update the word embeddings in the embeddings module with new_embeddings
self.embeddings.word_embeddings = new_embeddings
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LxmertModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Forward pass through the LxmertModel (body elided)
...
class LxmertForPreTraining(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Configuration
self.config = config
self.num_qa_labels = config.num_qa_labels  # number of QA labels from the config
self.visual_loss_normalizer = config.visual_loss_normalizer  # normalizer for the visual losses from the config
# Use of pretraining tasks
self.task_mask_lm = config.task_mask_lm  # whether to run the masked language modeling task
self.task_obj_predict = config.task_obj_predict  # whether to run the object prediction task
self.task_matched = config.task_matched  # whether to run the matching task
self.task_qa = config.task_qa  # whether to run the question answering task
# Lxmert backbone
self.lxmert = LxmertModel(config)  # initialize the LXMERT backbone
# Pre-training heads
self.cls = LxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings.weight)  # initialize the pre-training heads
if self.task_obj_predict:
self.obj_predict_head = LxmertVisualObjHead(config)  # object prediction head, only when that task is enabled
if self.task_qa:
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)  # QA head, only when that task is enabled
# Weight initialization
# Initialize weights and apply final processing
self.post_init()  # run post-initialization: weight init and final processing
# Loss functions
self.loss_fcts = {
"l2": SmoothL1Loss(reduction="none"), # 平滑的L1损失函数,不进行降维
"visual_ce": CrossEntropyLoss(reduction="none"), # 视觉交叉熵损失函数,不进行降维
"ce": CrossEntropyLoss(), # 交叉熵损失函数,进行降维
}
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {
"shape": (-1,), # 形状为一维向量
"num": config.num_object_labels, # 目标标签数量
"loss": "visual_ce", # 使用视觉交叉熵损失
}
if config.visual_attr_loss:
visual_losses["attr"] = {
"shape": (-1,), # 形状为一维向量
"num": config.num_attr_labels, # 属性标签数量
"loss": "visual_ce", # 使用视觉交叉熵损失
}
if config.visual_feat_loss:
visual_losses["feat"] = {
"shape": (-1, config.visual_feat_dim), # 形状为二维张量,其中维度为视觉特征维度
"num": config.visual_feat_dim, # 视觉特征的维度
"loss": "l2", # 使用平滑的L1损失
}
self.visual_losses = visual_losses # 存储视觉损失的配置信息
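The (elided) pretraining forward consumes this dictionary roughly as follows; a minimal, self-contained sketch rather than the exact upstream code (the tensors and the class count are made-up stand-ins, and the real reduction also applies a per-region confidence mask before scaling by `visual_loss_normalizer`):

import torch
from torch.nn import CrossEntropyLoss
# Stand-ins: batch of 1 image, 36 regions, 1600 object classes.
scores = torch.randn(1, 36, 1600)  # logits from the object head
label = torch.randint(0, 1600, (1, 36))  # gold object ids
cfg = {"shape": (-1,), "num": 1600, "loss": "visual_ce"}
loss_fct = CrossEntropyLoss(reduction="none")  # the "visual_ce" entry
per_elem = loss_fct(scores.view(-1, cfg["num"]), label.view(*cfg["shape"]))
visual_loss = per_elem.mean() * 6.67  # stand-in for the confidence-masked, normalized reduction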
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or `None`, just
returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing anything.
Returns:
`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
if num_labels is None or cur_qa_logit_layer is None:
return
new_qa_logit_layer = self._resize_qa_labels(num_labels)
self.config.num_qa_labels = num_labels
self.num_qa_labels = num_labels
return new_qa_logit_layer
def _resize_qa_labels(self, num_labels):
"""
Resize the current question answering prediction linear layer to the given number of labels.
Args:
num_labels (`int`): New number of labels in the linear layer weight matrix
Returns:
`nn.Module`: The resized question answering prediction linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
new_qa_logit_layer = self._get_resized_qa_labels(cur_qa_logit_layer, num_labels)
self._set_qa_logit_layer(new_qa_logit_layer)
return self.get_qa_logit_layer()
def get_qa_logit_layer(self) -> nn.Module:
"""
Returns the linear layer that produces the question answering logits.
Returns:
`nn.Module`: A torch module mapping the question answering hidden states to logits, or `None` if LXMERT
does not have a visual answering head.
"""
if hasattr(self, "answer_head"):
return self.answer_head.logit_fc[-1]
def _set_qa_logit_layer(self, qa_logit_layer):
"""
Set the question answering prediction linear layer.
Args:
qa_logit_layer (`nn.Module`): The new question answering prediction linear layer
"""
self.answer_head.logit_fc[-1] = qa_logit_layer
def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels):
# If num_labels is None, return the current layer unchanged
if num_labels is None:
return cur_qa_logit_layer
# Read the current label count and hidden dimension from the weight matrix
cur_qa_labels, hidden_dim = cur_qa_logit_layer.weight.size()
# If the label count already matches, return the layer as-is
if cur_qa_labels == num_labels:
return cur_qa_logit_layer
# Build the new output layer, with a bias only if the current layer has one
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels)
else:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels, bias=False)
# Place the new layer on the same device as the current one
new_qa_logit_layer.to(cur_qa_logit_layer.weight.device)
# Initialize the weights of the new layer
self._init_weights(new_qa_logit_layer)
# Copy over as many label rows as fit from the previous weights
num_labels_to_copy = min(cur_qa_labels, num_labels)
new_qa_logit_layer.weight.data[:num_labels_to_copy, :] = cur_qa_logit_layer.weight.data[:num_labels_to_copy, :]
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer.bias.data[:num_labels_to_copy] = cur_qa_logit_layer.bias.data[:num_labels_to_copy]
# Return the newly built linear layer
return new_qa_logit_layer
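A hedged usage sketch of the resizing path above (the new label count is arbitrary; the default config is assumed to enable `task_qa`, so the answer head exists):

from transformers import LxmertConfig, LxmertForPreTraining
config = LxmertConfig()  # assumption: defaults enable task_qa
model = LxmertForPreTraining(config)
layer = model.resize_num_qa_labels(3129)  # arbitrary new label count
assert layer.out_features == 3129
assert model.config.num_qa_labels == 3129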
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LxmertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
obj_labels: Optional[Dict[str, Tuple[torch.FloatTensor, torch.FloatTensor]]] = None,
matched_label: Optional[torch.LongTensor] = None,
ans: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
# Forward pass of the model, taking the inputs above plus an optional return-type flag
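For reference, the visual targets fed to this forward are keyed like `visual_losses`, each value a `(label, confidence mask)` pair; a hedged sketch with made-up shapes (1 image, 36 regions, 2048-dim features):

import torch
# Hypothetical obj_labels for a single image with 36 regions:
obj_labels = {
"obj": (torch.zeros(1, 36, dtype=torch.long), torch.ones(1, 36)),  # object ids + mask
"attr": (torch.zeros(1, 36, dtype=torch.long), torch.ones(1, 36)),  # attribute ids + mask
"feat": (torch.randn(1, 36, 2048), torch.ones(1, 36)),  # RoI features + mask
}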
@add_start_docstrings(
"""Lxmert Model with a visual-answering head on top for downstream QA tasks""",
LXMERT_START_DOCSTRING,
)
class LxmertForQuestionAnswering(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Configuration
self.config = config  # store the model configuration
self.num_qa_labels = config.num_qa_labels  # number of question-answering labels
self.visual_loss_normalizer = config.visual_loss_normalizer  # normalizer for the visual losses
# Lxmert backbone
self.lxmert = LxmertModel(config)  # LXMERT backbone
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)  # visual answering head
# Weight initialization
# Initialize weights and apply final processing
self.post_init()  # run post-initialization: weight init and final processing
# Loss function
self.loss = CrossEntropyLoss()  # cross-entropy loss over the answer logits
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or `None`, just
returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing anything.
Returns:
`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()  # current QA logit layer
if num_labels is None or cur_qa_logit_layer is None:
return  # nothing to do without num_labels or an existing layer
new_qa_logit_layer = self._resize_qa_labels(num_labels)  # resize the layer
self.config.num_qa_labels = num_labels  # update the config
self.num_qa_labels = num_labels  # update the instance attribute
return new_qa_logit_layer  # return the resized layer
def _resize_qa_labels(self, num_labels):
cur_qa_logit_layer = self.get_qa_logit_layer()  # current QA logit layer
new_qa_logit_layer = self._get_resized_qa_labels(cur_qa_logit_layer, num_labels)  # build the resized layer
self._set_qa_logit_layer(new_qa_logit_layer)  # install it in the answer head
return self.get_qa_logit_layer()  # return the installed layer
def get_qa_logit_layer(self) -> nn.Module:
"""
Returns the linear layer that produces question answering logits
Returns:
`nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A NoneType
object if Lxmert does not have the visual answering head.
"""
if hasattr(self, "answer_head"):
return self.answer_head.logit_fc[-1] # 返回最后一个问题回答对数层
def _set_qa_logit_layer(self, qa_logit_layer):
self.answer_head.logit_fc[-1] = qa_logit_layer  # install the new logit layer
def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels):
# If num_labels is None, return the current layer unchanged
if num_labels is None:
return cur_qa_logit_layer
# Read the current label count and hidden dimension from the weight matrix
cur_qa_labels, hidden_dim = cur_qa_logit_layer.weight.size()
# If the label count already matches, return the layer as-is
if cur_qa_labels == num_labels:
return cur_qa_logit_layer
# Build the new output layer, with a bias only if the current layer has one
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels)
else:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels, bias=False)
# Place the new layer on the same device as the current one
new_qa_logit_layer.to(cur_qa_logit_layer.weight.device)
# Initialize the weights of the new layer
self._init_weights(new_qa_logit_layer)
# Copy over as many label rows as fit from the previous weights
num_labels_to_copy = min(cur_qa_labels, num_labels)
new_qa_logit_layer.weight.data[:num_labels_to_copy, :] = cur_qa_logit_layer.weight.data[:num_labels_to_copy, :]
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer.bias.data[:num_labels_to_copy] = cur_qa_logit_layer.bias.data[:num_labels_to_copy]
# Return the newly built linear layer
return new_qa_logit_layer
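The row-copy above can be seen in isolation with a bare `nn.Linear`; a standalone sketch (sizes are illustrative):

import torch
from torch import nn
old = nn.Linear(16, 4)  # 4 labels, hidden dim 16
new = nn.Linear(16, 6)  # resized to 6 labels
n = min(old.out_features, new.out_features)
new.weight.data[:n, :] = old.weight.data[:n, :]
new.bias.data[:n] = old.bias.data[:n]
assert torch.equal(new.weight[:4], old.weight)  # leading rows preserved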
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LxmertForQuestionAnsweringOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[LxmertForQuestionAnsweringOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`Torch.Tensor` of shape `(batch_size)`, *optional*):
A one-hot representation of the correct answer
"""
# Decide whether to return dict-style output
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LXMERT backbone
lxmert_output = self.lxmert(
input_ids=input_ids,  # input token IDs
visual_feats=visual_feats,  # visual features
visual_pos=visual_pos,  # visual position encodings
token_type_ids=token_type_ids,  # token type IDs
attention_mask=attention_mask,  # attention mask over the text tokens
visual_attention_mask=visual_attention_mask,  # attention mask over the visual features
inputs_embeds=inputs_embeds,  # precomputed input embeddings
output_hidden_states=output_hidden_states,  # whether to output hidden states
output_attentions=output_attentions,  # whether to output attentions
return_dict=return_dict,  # whether to return dict-style output
)
# Pooled cross-modality output from the backbone
pooled_output = lxmert_output[2]
# Score the pooled output with the answer head
answer_score = self.answer_head(pooled_output)
# The loss is only computed when labels are provided
loss = None
if labels is not None:
loss = self.loss(answer_score.view(-1, self.num_qa_labels), labels.view(-1))
# Tuple-style output when return_dict is disabled
if not return_dict:
output = (answer_score,) + lxmert_output[3:]
return (loss,) + output if loss is not None else output
# Otherwise build the dedicated output object
return LxmertForQuestionAnsweringOutput(
loss=loss,  # loss value
question_answering_score=answer_score,  # question answering scores
language_hidden_states=lxmert_output.language_hidden_states,  # language hidden states
vision_hidden_states=lxmert_output.vision_hidden_states,  # vision hidden states
language_attentions=lxmert_output.language_attentions,  # language attentions
vision_attentions=lxmert_output.vision_attentions,  # vision attentions
cross_encoder_attentions=lxmert_output.cross_encoder_attentions,  # cross-encoder attentions
)
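A minimal end-to-end sketch of the forward contract above. Hedged: the visual tensors are random stand-ins for a detector's RoI output, and `unc-nlp/lxmert-base-uncased` is the commonly used public checkpoint:

import torch
from transformers import LxmertTokenizer, LxmertForQuestionAnswering
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-base-uncased")
inputs = tokenizer("What color is the cat?", return_tensors="pt")
visual_feats = torch.randn(1, 36, 2048)  # stand-in RoI features: (batch, regions, visual_feat_dim)
visual_pos = torch.rand(1, 36, 4)  # stand-in normalized boxes: (batch, regions, 4)
outputs = model(**inputs, visual_feats=visual_feats, visual_pos=visual_pos)
print(outputs.question_answering_score.shape)  # (1, num_qa_labels)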