Transformers Source Code Analysis (113)
.\models\tvlt\processing_tvlt.py
"""
Processor class for TVLT.
"""
from ...processing_utils import ProcessorMixin
class TvltProcessor(ProcessorMixin):
r"""
    Constructs a TVLT processor which wraps a TVLT image processor and a TVLT feature extractor into a single processor.
    [`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
    docstring of [`~TvltProcessor.__call__`] for more information.
    Args:
        image_processor (`TvltImageProcessor`):
            An instance of [`TvltImageProcessor`]. The image processor is a required input.
        feature_extractor (`TvltFeatureExtractor`):
            An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
"""
attributes = ["image_processor", "feature_extractor"]
image_processor_class = "TvltImageProcessor"
feature_extractor_class = "TvltFeatureExtractor"
def __init__(self, image_processor, feature_extractor):
super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
self.image_processor = image_processor
self.feature_extractor = feature_extractor
def __call__(
self,
images=None,
audio=None,
images_mixed=None,
sampling_rate=None,
mask_audio=False,
mask_pixel=False,
*args,
**kwargs,
):
"""
Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
above two methods for more information.
"""
if images is None and audio is None:
raise ValueError("You need to specify either an `images` or `audio` input to process.")
images_mixed_dict = None
if images is not None:
images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
if images_mixed is not None:
images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
if audio is not None:
audio_dict = self.feature_extractor(
audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
)
output_dict = {}
if audio is not None:
output_dict.update(audio_dict)
if images is not None:
output_dict.update(images_dict)
if images_mixed_dict is not None:
output_dict.update(images_mixed_dict)
return output_dict
@property
def model_input_names(self):
image_processor_input_names = self.image_processor.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(image_processor_input_names + feature_extractor_input_names))
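# A minimal usage sketch (not from the source file): wire the two sub-processors together
# and feed dummy inputs. The frame count, audio length and sampling rate are illustrative
# assumptions, not values required by the API.
import numpy as np
from transformers import TvltFeatureExtractor, TvltImageProcessor, TvltProcessor

processor = TvltProcessor(
    image_processor=TvltImageProcessor(),
    feature_extractor=TvltFeatureExtractor(),
)
video = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8) for _ in range(8)]  # 8 RGB frames
audio = np.random.randn(44100).astype(np.float32)  # roughly 1 second of mono audio
inputs = processor(images=video, audio=audio, sampling_rate=44100, return_tensors="pt")
print(sorted(inputs.keys()))  # pixel-related keys from the image processor plus audio-related keys from the feature extractor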
.\models\tvlt\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_tvlt": ["TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP", "TvltConfig"],
"feature_extraction_tvlt": ["TvltFeatureExtractor"],
"processing_tvlt": ["TvltProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tvlt"] = [
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvltModel",
"TvltForPreTraining",
"TvltForAudioVisualClassification",
"TvltPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_tvlt"] = ["TvltImageProcessor"]
if TYPE_CHECKING:
from .configuration_tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig
from .processing_tvlt import TvltProcessor
from .feature_extraction_tvlt import TvltFeatureExtractor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tvlt import (
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
TvltForAudioVisualClassification,
TvltForPreTraining,
TvltModel,
TvltPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_tvlt import TvltImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\tvp\configuration_tvp.py
""" TVP model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
TVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Intel/tvp-base": "https://huggingface.co/Intel/tvp-base/resolve/main/config.json",
}
class TvpConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`TvpModel`]. It is used to instantiate a Tvp
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Tvp
[Intel/tvp-base](https://huggingface.co/Intel/tvp-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "tvp"
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
use_timm_backbone=False,
backbone_kwargs=None,
distance_loss_weight=1.0,
duration_loss_weight=0.1,
visual_prompter_type="framepad",
visual_prompter_apply="replace",
visual_prompt_size=96,
max_img_size=448,
num_frames=48,
vocab_size=30522,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
max_position_embeddings=512,
max_grid_col_position_embeddings=100,
max_grid_row_position_embeddings=100,
hidden_dropout_prob=0.1,
hidden_act="gelu",
layer_norm_eps=1e-12,
initializer_range=0.02,
attention_probs_dropout_prob=0.1,
**kwargs,
):
"""
Initialize the TvpConfig with specific model configuration parameters.
Args:
backbone_config (Optional): Configuration for the backbone, default is None.
backbone (Optional): Backbone model, default is None.
use_pretrained_backbone (bool): Whether to use a pretrained backbone model, default is False.
use_timm_backbone (bool): Whether to use a backbone model from the timm library, default is False.
backbone_kwargs (Optional): Additional parameters for the backbone model, default is None.
distance_loss_weight (float): Weight for the distance loss, default is 1.0.
duration_loss_weight (float): Weight for the duration loss, default is 0.1.
visual_prompter_type (str): Type of visual prompter, default is "framepad".
visual_prompter_apply (str): Application method of visual prompter, default is "replace".
visual_prompt_size (int): Size of the visual prompt, default is 96.
max_img_size (int): Maximum image size, default is 448.
num_frames (int): Number of frames in the image, default is 48.
vocab_size (int): Size of the vocabulary, default is 30522.
hidden_size (int): Size of the hidden layers, default is 768.
intermediate_size (int): Size of the intermediate layers, default is 3072.
num_hidden_layers (int): Number of hidden layers, default is 12.
num_attention_heads (int): Number of attention heads, default is 12.
max_position_embeddings (int): Maximum position embeddings, default is 512.
max_grid_col_position_embeddings (int): Maximum grid column position embeddings, default is 100.
max_grid_row_position_embeddings (int): Maximum grid row position embeddings, default is 100.
hidden_dropout_prob (float): Dropout probability for hidden layers, default is 0.1.
hidden_act (str): Activation function for hidden layers, default is "gelu".
layer_norm_eps (float): Epsilon value for layer normalization, default is 1e-12.
initializer_range (float): Range for weight initialization, default is 0.02.
attention_probs_dropout_prob (float): Dropout probability for attention probabilities, default is 0.1.
**kwargs: Additional keyword arguments for potential future updates.
"""
        super().__init__(**kwargs)
if use_pretrained_backbone:
raise ValueError("Pretrained backbones are not supported yet.")
if backbone_config is not None and backbone is not None:
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
if backbone_config is None and backbone is None:
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
elif isinstance(backbone_config, dict):
backbone_model_type = backbone_config.get("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
self.backbone_config = backbone_config
self.backbone = backbone
self.use_pretrained_backbone = use_pretrained_backbone
self.use_timm_backbone = use_timm_backbone
self.backbone_kwargs = backbone_kwargs
self.distance_loss_weight = distance_loss_weight
self.duration_loss_weight = duration_loss_weight
self.visual_prompter_type = visual_prompter_type
self.visual_prompter_apply = visual_prompter_apply
self.visual_prompt_size = visual_prompt_size
self.max_img_size = max_img_size
self.num_frames = num_frames
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_dropout_prob = hidden_dropout_prob
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.attention_probs_dropout_prob = attention_probs_dropout_prob
@classmethod
def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
"""Instantiate a [`TvpConfig`] (or a derived class) from a pre-trained backbone model configuration.
Args:
backbone_config ([`PretrainedConfig`]):
The backbone configuration.
Returns:
[`TvpConfig`]: An instance of a configuration object
"""
return cls(backbone_config=backbone_config, **kwargs)
def to_dict(self):
"""
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`] method.
        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
"""
output = copy.deepcopy(self.__dict__)
if output["backbone_config"] is not None:
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
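# A short sketch of the backbone handling above (illustrative values): with no backbone
# arguments a default ResNet config is created, and from_backbone_config accepts an
# explicit backbone configuration instead.
from transformers import ResNetConfig, TvpConfig

default_config = TvpConfig()  # logs that the default ResNet backbone config is used
print(default_config.backbone_config.model_type)  # resnet
backbone_config = ResNetConfig(out_features=["stage4"])
custom_config = TvpConfig.from_backbone_config(backbone_config, hidden_size=768)
print(custom_config.to_dict()["backbone_config"]["out_features"])  # ['stage4']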
.\models\tvp\image_processing_tvp.py
"""用于 TVP 的图像处理器类。"""
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
PaddingMode,
flip_channel_order,
pad,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
is_valid_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
return [videos]
elif is_valid_image(videos):
return [[videos]]
raise ValueError(f"Could not make batched video from {videos}")
def get_resize_output_image_size(
input_image: np.ndarray,
max_size: int = 448,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
height, width = get_image_size(input_image, input_data_format)
if height >= width:
ratio = width * 1.0 / height
new_height = max_size
new_width = int(new_height * ratio)
else:
ratio = height * 1.0 / width
new_width = max_size
new_height = int(new_width * ratio)
size = (new_height, new_width)
return size
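# Standalone recomputation of the longest-edge math above (pure Python, values for
# illustration): the longer side is scaled to max_size, the shorter side keeps the aspect ratio.
def longest_edge_size(height: int, width: int, max_size: int = 448) -> tuple:
    if height >= width:
        return max_size, int(max_size * width / height)
    return int(max_size * height / width), max_size

print(longest_edge_size(720, 1280))  # (252, 448)
print(longest_edge_size(448, 448))   # (448, 448)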
class TvpImageProcessor(BaseImageProcessor):
r"""
    Constructs a Tvp image processor.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_pad: bool = True,
pad_size: Dict[str, int] = None,
constant_values: Union[float, Iterable[float]] = 0,
pad_mode: PaddingMode = PaddingMode.CONSTANT,
do_normalize: bool = True,
do_flip_channel_order: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"longest_edge": 448}
crop_size = crop_size if crop_size is not None else {"height": 448, "width": 448}
pad_size = pad_size if pad_size is not None else {"height": 448, "width": 448}
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_pad = do_pad
self.pad_size = pad_size
self.constant_values = constant_values
self.pad_mode = pad_mode
self.do_normalize = do_normalize
self.do_flip_channel_order = do_flip_channel_order
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self._valid_processor_keys = [
"videos",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_pad",
"pad_size",
"constant_values",
"pad_mode",
"do_normalize",
"do_flip_channel_order",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"longest_edge": s}`, the output image will have its
longest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
if "height" in size and "width" in size:
output_size = (size["height"], size["width"])
elif "longest_edge" in size:
output_size = get_resize_output_image_size(image, size["longest_edge"], input_data_format)
else:
raise ValueError(f"Size must have 'height' and 'width' or 'longest_edge' as keys. Got {size.keys()}")
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
    def pad_image(
        self,
        image: np.ndarray,
        pad_size: Dict[str, int] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        pad_mode: PaddingMode = PaddingMode.CONSTANT,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
"""
Pad an image with zeros to the given size.
Args:
image (`np.ndarray`):
Image to pad.
pad_size (`Dict[str, int]`)
Size of the output image with pad.
constant_values (`Union[float, Iterable[float]]`)
The fill value to use when padding the image.
pad_mode (`PaddingMode`)
The pad mode, default to PaddingMode.CONSTANT
data_format (`ChannelDimension` or `str`, *optional*)
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
height, width = get_image_size(image, channel_dim=input_data_format)
max_height = pad_size.get("height", height)
max_width = pad_size.get("width", width)
pad_right, pad_bottom = max_width - width, max_height - height
if pad_right < 0 or pad_bottom < 0:
raise ValueError("The padding size must be greater than image size")
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=pad_mode,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_pad: bool = True,
pad_size: Dict[str, int] = None,
constant_values: Union[float, Iterable[float]] = None,
pad_mode: PaddingMode = None,
do_normalize: bool = None,
do_flip_channel_order: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""Preprocesses a single image."""
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_pad=do_pad,
size_divisibility=pad_size,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
image = to_numpy_array(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
if do_center_crop:
image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
if do_normalize:
image = self.normalize(
image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format
)
if do_pad:
image = self.pad_image(
image=image,
pad_size=pad_size,
constant_values=constant_values,
pad_mode=pad_mode,
input_data_format=input_data_format,
)
if do_flip_channel_order:
image = flip_channel_order(image=image, input_data_format=input_data_format)
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
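# A minimal end-to-end sketch of the pipeline above (default settings assumed): an 8-frame
# dummy clip is resized, center-cropped, rescaled, normalized, padded to 448x448 and
# channel-flipped, then stacked into a single pixel_values tensor.
import numpy as np
from transformers import TvpImageProcessor

image_processor = TvpImageProcessor()
clip = [np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8) for _ in range(8)]
batch = image_processor(clip, return_tensors="pt")
print(batch.pixel_values.shape)  # torch.Size([1, 8, 3, 448, 448])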
.\models\tvp\modeling_tvp.py
"""PyTorch TVP Model"""
import math
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import prune_linear_layer
from ...utils import logging
from ...utils.backbone_utils import load_backbone
from .configuration_tvp import TvpConfig
logger = logging.get_logger(__name__)
TVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Intel/tvp-base",
"Intel/tvp-base-ANet",
]
@dataclass
class TvpVideoGroundingOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Temporal-Distance IoU loss for video grounding.
logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
input texts.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class TvpLoss(nn.Module):
"""
Placeholder for TvpLoss class definition.
"""
This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervise class and box).
Args:
losses (`List[str]`):
List of all the losses to be applied.
"""
# 定义一个用于视频定位损失计算的类
class TvpLossCalculator:
# 初始化方法,接收损失列表并进行初始化
def __init__(self, losses):
super().__init__()
        # Mapping from loss name to the method that computes it
        self.loss_map = {
            "iou": self.loss_iou,
            "distance": self.loss_distance,
            "duration": self.loss_duration,
        }
        # Check that every requested loss is supported, raising a ValueError otherwise
        for loss in losses:
            if loss not in self.loss_map:
                raise ValueError(f"Loss {loss} not supported")
        self.losses = losses
    # IoU loss
    def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the intersection over union.
        """
        # Intersection of the predicted and ground-truth spans
        inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
        # Union of the predicted and ground-truth spans
        union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
        # IoU loss = 1 - IoU
        iou = 1 - inter.clamp(min=0) / union
        return iou
    # Distance loss
    def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the distance of mid points.
        """
        # Midpoint of the predicted span
        mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
        # Midpoint of the ground-truth span
        mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
        # Normalized midpoint distance, clamped from below at 0.2
        distance_diff = torch.div(
            torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
        ).clamp(min=0.2)
        return distance_diff
    # Duration loss
    def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the difference of duration.
        """
        # Duration of the predicted span
        duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
        # Duration of the ground-truth span
        duration_groundtruth = torch.sub(end_time, start_time)
        # Squared normalized duration difference, clamped from below at 0.4
        duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
        duration_diff = duration_diff.clamp(min=0.4)
        return duration_diff
def forward(self, logits, labels):
"""
This performs the loss computation.
Args:
logits (`torch.FloatTensor`):
The output logits of head module.
labels (`List[torch.FloatTensor]`):
List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
"""
        # Unpack the video duration, start time and end time from the labels
        duration, start_time, end_time = labels
        # Scale the logits by the video duration to get candidate start/end times
        candidates = torch.mul(logits, duration)
        # Split the candidates into start and end times as float tensors
        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()
        # Dictionary collecting the individual losses
        losses_dict = {}
        # Compute every requested loss and store it under its name
        for loss in self.losses:
            losses_dict.update(
                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
            )
        # Return the dictionary of losses
return losses_dict
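# Standalone recomputation of the three loss terms above for one example (values picked for
# illustration), mirroring the formulas in loss_iou, loss_distance and loss_duration.
import torch

duration = torch.tensor(30.0)                                  # clip length in seconds
start_time, end_time = torch.tensor(6.0), torch.tensor(18.0)   # ground-truth span
logits = torch.tensor([[0.3, 0.7]])                            # predicted (start, end) as fractions of the duration
candidates = logits * duration                                 # [[9.0, 21.0]]
cand_start, cand_end = candidates[:, 0], candidates[:, 1]
inter = torch.min(cand_end, end_time) - torch.max(cand_start, start_time)    # 18 - 9 = 9
union = torch.max(cand_end, end_time) - torch.min(cand_start, start_time)    # 21 - 6 = 15
iou_loss = 1 - inter.clamp(min=0) / union                                    # 1 - 9/15 = 0.4
mid_pred, mid_gt = (cand_start + cand_end) / 2, (start_time + end_time) / 2  # 15.0 vs 12.0
distance_loss = ((mid_pred - mid_gt).abs() / duration).clamp(min=0.2)        # max(0.1, 0.2) = 0.2
duration_loss = (((cand_end - cand_start) - (end_time - start_time)) / duration) ** 2
duration_loss = duration_loss.clamp(min=0.4)                                 # max(0.0, 0.4) = 0.4
print(iou_loss.item(), distance_loss.item(), duration_loss.item())           # ~0.4 0.2 0.4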
class TvpVisionModel(nn.Module):
    # Vision model: a CNN backbone followed by a grid-encoder convolution
    def __init__(self, config):
        super().__init__()
        # Load the backbone specified by the config as the feature extractor
        self.backbone = load_backbone(config)
        # Convolution that projects backbone feature maps to the model's hidden size
        self.grid_encoder_conv = nn.Conv2d(
            config.backbone_config.hidden_sizes[-1],  # input channels: last hidden size of the backbone
            config.hidden_size,  # output channels: hidden size from the config
            kernel_size=3,  # 3x3 kernel
            stride=1,  # stride 1
            padding=1,  # padding 1
            groups=1,  # no grouped convolution
            bias=False,  # no bias term
        )
    def forward(self, pixel_values):
        # Read off the input shape
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        # Fold the frame dimension into the batch: (batch_size * num_frames, num_channels, height, width)
        pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
        # Run the backbone and take the first returned feature map
        grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
        # Project the feature map with the grid encoder convolution
        grid = self.grid_encoder_conv(grid_feat_outputs)
        # 2x2 max pooling with stride 2
        grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
        # ReLU activation applied in place
        grid = nn.functional.relu(grid, inplace=True)
        # Channel, height and width of the pooled grid
        new_channel, new_height, new_width = grid.shape[-3:]
        # Unfold back into (batch_size, num_frames, new_channel, new_height, new_width)
        grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
        # Move channels last: (batch_size, num_frames, height, width, num_channels)
        grid = grid.permute(0, 1, 3, 4, 2)
        # Return the processed grid
return grid
class TvpVisualInputEmbedding(nn.Module):
"""
Takes input of both image and video (multi-frame)
"""
def __init__(self, config):
super().__init__()
        # Position embeddings for the flattened token sequence
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Row position embeddings for the spatial grid
        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
        # Column position embeddings for the spatial grid
        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
        # Token type embeddings used to distinguish token types
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
        # Layer norm over the hidden dimension
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout applied during training to prevent overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def add_2d_positional_embeddings(self, grid):
"""
Args:
grid: (batch_size, height, width, hidden_dim)
Returns:
grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
"""
batch_size, height, width, hidden_dim = grid.shape
        # Add row-wise position embeddings
        row_position_ids = torch.arange(height, dtype=torch.long, device=grid.device)  # (height, )
        row_position_embeddings = self.row_position_embeddings(row_position_ids)  # (height, hidden_dim)
        row_shape = (1,) * (len(grid.shape) - 3) + (height, 1, hidden_dim)  # (1, height, 1, hidden_dim)
        grid = grid + row_position_embeddings.view(*row_shape)  # broadcast automatically
        # Add column-wise position embeddings
        col_position_ids = torch.arange(width, dtype=torch.long, device=grid.device)  # (width, )
        col_position_embeddings = self.col_position_embeddings(col_position_ids)  # (width, hidden_dim)
        col_shape = (batch_size, 1, width, hidden_dim)  # (1, 1, width, hidden_dim)
        return grid + col_position_embeddings.view(*col_shape)  # broadcast automatically
def forward(self, grid):
"""
Args:
grid: Array of shape (batch_size, num_frames, height, width, num_channels).
It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
num_frames can be 1
Returns:
embeddings: The embedding of grid with size (batch_size, height*width, num_channels)
"""
batch_size, num_frames, height, width, num_channels = grid.shape
        # Temporal mean pooling, giving (batch_size, height, width, hidden_size)
        grid = grid.mean(1)
        grid = self.add_2d_positional_embeddings(grid)
        # Flatten into an image token sequence of shape (batch_size, height*width, num_channels)
        visual_tokens = grid.view(batch_size, -1, num_channels)
        visual_tokens_shape = visual_tokens.shape[:-1]
        device = visual_tokens.device
        # Token type embeddings for the image tokens
token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = visual_tokens + token_type_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
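# Shape sketch of the forward above (plain tensors mirroring the code, not the module itself):
# the frame axis is averaged away and the spatial grid is flattened into a token sequence.
import torch

grid = torch.randn(2, 8, 14, 14, 768)   # (batch, frames, height, width, hidden), e.g. from TvpVisionModel
grid = grid.mean(1)                      # temporal average pooling -> (2, 14, 14, 768)
visual_tokens = grid.view(2, -1, 768)    # flatten the grid -> (batch, height*width, hidden)
print(visual_tokens.shape)               # torch.Size([2, 196, 768])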
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
# 初始化词嵌入层,将词汇表大小映射到隐藏大小,支持填充索引
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 初始化位置嵌入层,将最大位置嵌入数映射到隐藏大小
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# 初始化标记类型嵌入层,将类型词汇表大小映射到隐藏大小
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# 初始化层归一化,对隐藏大小的张量进行归一化处理
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 初始化丢弃层,根据隐藏丢弃概率进行随机丢弃
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
device = input_ids.device if input_ids is not None else inputs_embeds.device
        # If no position ids are given, build them from the sequence length
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        # If no token type ids are given, default to zeros with the input shape
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # If no input embeddings are given, look them up from the input ids
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        # Look up position and token type embeddings
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        # Sum word, position and token type embeddings
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class TvpAttention(nn.Module):
def __init__(self, config):
super().__init__()
        # Raise an error if the hidden size is not divisible by the number of attention heads
        # (unless the config defines a separate embedding size)
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
            )
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Query, key and value projections from hidden size to the combined head size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        # Dropout applied to the attention probabilities
        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # Output projection and layer norm applied after attention
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with the configured hidden dropout probability
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Set of attention heads that have already been pruned
        self.pruned_heads = set()
    # Prune attention heads, skipping heads that have already been pruned
    def prune_heads(self, heads):
        # Nothing to do if no heads are requested
        if len(heads) == 0:
            return
        # Mask of ones with shape (num_attention_heads, attention_head_size)
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        # Convert to a set and drop heads that were pruned before
        heads = set(heads) - self.pruned_heads
        # Iterate over the remaining heads
        for head in heads:
            # Shift the index down by the number of already-pruned heads that come before it
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            # Zero out the mask entries of this head
            mask[head] = 0
        # Flatten the mask and collect the indices of the entries to keep
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        # Prune the linear layers accordingly
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)
        # Update the hyperparameters and remember the pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)
    # Reshape a tensor of shape (batch_size, sequence_length, all_head_size)
    # into (batch_size, num_attention_heads, sequence_length, attention_head_size)
    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
        return (
            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)  # swap the sequence_length and num_attention_heads dimensions
            .contiguous()  # make sure the tensor is contiguous in memory
        )
    # Forward pass
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions: Optional[bool] = None,
):
        # Batch size and sequence length of the hidden states
        batch_size, sequence_length = hidden_states.shape[:2]
        # Project the hidden states into the query space
        mixed_query_layer = self.query(hidden_states)
        # Project the hidden states into the key space
        mixed_key_layer = self.key(hidden_states)
        # Project the hidden states into the value space
        mixed_value_layer = self.value(hidden_states)
        # Reshape the query, key and value projections into per-head tensors
        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)
        # Dot product between "query" and "key" gives the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Add the attention mask to the scores, if one is given
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask
        # Normalize the scores into attention probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # Apply attention dropout to the probabilities
        attention_probs = self.attn_dropout(attention_probs)
        # Apply the head mask, if one is given
        if head_mask is not None:
            attention_probs = attention_probs * head_mask
        # Weighted sum of the value layer with the attention probabilities
        attn_output = torch.matmul(attention_probs, value_layer)
        attn_output = attn_output.transpose(1, 2).contiguous()
        # Reshape the attention output back to (batch_size, sequence_length, all_head_size)
        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)
        # Output projection
        attn_output = self.dense(attn_output)
        # Dropout on the attention output
        attn_output = self.dropout(attn_output)
        # Residual connection followed by layer norm
        attn_output = self.layer_norm(attn_output + hidden_states)
        # Include the attention probabilities in the outputs if requested
outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
return outputs
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Tvp
class TvpIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer mapping config.hidden_size to config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Pick the activation: look it up by name if hidden_act is a string, otherwise use it directly
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Project the hidden states with the linear layer
        hidden_states = self.dense(hidden_states)
        # Apply the chosen activation function
        hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# TvpOutputLayer: a linear layer followed by dropout and a residual LayerNorm
class TvpOutputLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer mapping config.intermediate_size back to config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # LayerNorm over tensors of size config.hidden_size
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with probability config.hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Project the hidden states with the linear layer
        hidden_states = self.dense(hidden_states)
        # Apply dropout to the projected output
        hidden_states = self.dropout(hidden_states)
        # Add the residual input and apply LayerNorm
hidden_states = self.layer_norm(hidden_states + input_tensor)
return hidden_states
# TvpEncodeLayer: one encoder block made of TvpAttention, TvpIntermediate and TvpOutputLayer
class TvpEncodeLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Self-attention block
        self.attention = TvpAttention(config)
        # Intermediate (feed-forward up-projection) block
        self.intermediate = TvpIntermediate(config)
        # Output (down-projection + residual LayerNorm) block
        self.output = TvpOutputLayer(config)
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        # Run self-attention on the hidden states
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        # Feed the attention output through the intermediate block
        intermediate_output = self.intermediate(attention_output)
        # Combine intermediate_output and attention_output in the output block
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output,) + outputs
return outputs
# TvpEncoder: a stack of TvpEncodeLayer blocks
class TvpEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # nn.ModuleList holding config.num_hidden_layers TvpEncodeLayer instances
        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is disabled by default
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
        # Fall back to the config defaults when the flags are not specified
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Tuple collecting the hidden states of every layer
        all_hidden_states = ()
        # Tuple collecting the attention weights of every layer
        all_attentions = ()
        # Iterate over the Transformer layers
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the hidden states entering this layer
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Run the layer under gradient checkpointing
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    (head_mask[i] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                # Run the Transformer layer normally
                layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], output_attentions)
            # The first element of the layer output is the updated hidden states
            hidden_states = layer_outputs[0]
            if output_attentions:
                # Record this layer's attention weights
                all_attentions = all_attentions + (layer_outputs[1],)
        # Add the hidden states of the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        # Return a plain tuple when return_dict is False
        if not return_dict:
            # The tuple holds the last hidden state, plus all hidden states and attentions when requested
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_attentions,)
            return outputs  # last hidden state, all hidden states, all attentions
        # Otherwise return a BaseModelOutput object
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states if output_hidden_states else None,
attentions=all_attentions if output_attentions else None,
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Tvp
class TvpPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer with config.hidden_size inputs and outputs
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Tanh activation
        self.activation = nn.Tanh()
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Pool by taking the hidden state of the first token of each sample
        first_token_tensor = hidden_states[:, 0]
        # Transform with the linear layer
        pooled_output = self.dense(first_token_tensor)
        # Apply the activation
pooled_output = self.activation(pooled_output)
return pooled_output
class TvpPreTrainedModel(PreTrainedModel):
"""一个抽象类,用于处理权重初始化和预训练模型的下载加载的简单接口。"""
config_class = TvpConfig # 使用TvpConfig作为配置类
base_model_prefix = "model" # 基础模型前缀为"model"
supports_gradient_checkpointing = True # 支持梯度检查点
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, (nn.Linear, nn.Embedding)):
# 使用正态分布初始化权重,均值为0,标准差为self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, nn.LayerNorm):
# 将偏置项初始化为零,将权重初始化为1
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
# 如果是线性层且存在偏置项,则将偏置项初始化为零
module.bias.data.zero_()
if isinstance(module, nn.Conv2d):
# 使用Kaiming正态分布初始化卷积层的权重
nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
if module.bias is not None:
# 如果存在偏置项,则将偏置项初始化为零
nn.init.constant_(module.bias, 0)
TVP_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) sub-class. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TVP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See [`TvpImageProcessor.__call__`]
            for details.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
"""
Pad frames extracted from videos in the surroundings.
"""
class TvpFrameDownPadPrompter(nn.Module):
"""
Pad frames extracted from videos only at the bottom.
"""
def __init__(self, config):
if config.visual_prompter_apply not in ("add", "replace", "remove"):
raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
super().__init__()
self.visual_prompt_size = config.visual_prompt_size
self.frame_num = config.frame_num
self.max_img_size = config.max_img_size
self.visual_prompter_apply = config.visual_prompter_apply
self.pad_down = nn.Parameter(
torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
)
def forward(self, pixel_values):
if self.visual_prompter_apply != "add":
visual_prompt_mask = torch.ones(
[self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
)
visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
pixel_values *= visual_prompt_mask
if self.visual_prompter_apply != "remove":
prompt = torch.zeros(
[pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
device=pixel_values.device,
)
start_point = self.max_img_size - self.visual_prompt_size
prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
pixel_values += prompt.to(pixel_values.dtype)
return pixel_values
class TvpFramePadPrompter(nn.Module):
"""
Pad frames extracted from videos in the surroundings.
"""
def __init__(self, config):
if config.visual_prompter_apply not in ("add", "replace", "remove"):
raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
super().__init__()
self.num_frames = config.num_frames
self.max_img_size = config.max_img_size
self.visual_prompter_apply = config.visual_prompter_apply
self.base_size = config.max_img_size - config.visual_prompt_size * 2
self.pad_up = nn.Parameter(
torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
)
self.pad_down = nn.Parameter(
torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
)
self.pad_left = nn.Parameter(
torch.randn(
[
1,
config.num_frames,
3,
config.max_img_size - config.visual_prompt_size * 2,
config.visual_prompt_size,
]
)
)
self.pad_right = nn.Parameter(
torch.randn(
[
1,
config.num_frames,
3,
config.max_img_size - config.visual_prompt_size * 2,
config.visual_prompt_size,
]
)
)
def forward(self, pixel_values):
if self.visual_prompter_apply not in ("add", "remove", "replace"):
raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
if self.visual_prompter_apply in ("replace", "remove"):
visual_prompt_mask = torch.ones(
[self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
)
pixel_values *= visual_prompt_mask
if self.visual_prompter_apply in ("replace", "add"):
base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
prompt = torch.cat(pixel_values.size(0) * [prompt])
pixel_values = pixel_values + prompt.to(pixel_values.dtype)
return pixel_values
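# Geometry sketch of the "framepad" prompt assembled above (plain tensors, default sizes):
# four learned pads frame a zero base of size max_img_size - 2 * visual_prompt_size.
import torch

num_frames, visual_prompt_size, max_img_size = 48, 96, 448
base_size = max_img_size - 2 * visual_prompt_size                         # 256
pad_up = torch.randn(1, num_frames, 3, visual_prompt_size, max_img_size)
pad_down = torch.randn(1, num_frames, 3, visual_prompt_size, max_img_size)
pad_left = torch.randn(1, num_frames, 3, base_size, visual_prompt_size)
pad_right = torch.randn(1, num_frames, 3, base_size, visual_prompt_size)
base = torch.zeros(1, num_frames, 3, base_size, base_size)
prompt = torch.cat([pad_left, base, pad_right], dim=4)                    # width: 96 + 256 + 96 = 448
prompt = torch.cat([pad_up, prompt, pad_down], dim=3)                     # height: 96 + 256 + 96 = 448
print(prompt.shape)                                                       # torch.Size([1, 48, 3, 448, 448])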
TVP_PROMPTER_CLASSES_MAPPING = {
"framedownpad": TvpFrameDownPadPrompter,
"framepad": TvpFramePadPrompter,
}
@add_start_docstrings(
"The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on" " top.",
TVP_START_DOCSTRING,
)
class TvpModel(TvpPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.vision_model = TvpVisionModel(config)
self.embeddings = TvpTextInputEmbeddings(config)
self.visual_embeddings = TvpVisualInputEmbedding(config)
self.encoder = TvpEncoder(config)
self.pooler = TvpPooler(config)
self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=TvpConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
class TvpForVideoGrounding(TvpPreTrainedModel):
    def __init__(self, config):
super().__init__(config)
self.config = config
self.model = TvpModel(config)
self.video_grounding_head = TvpVideoGroundingHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class=TvpConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
labels: Tuple[torch.Tensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
The labels contains duration, start time, and end time of the video corresponding to the text.
Returns:
Examples:
```
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding
>>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")
>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")
>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```"""
return_dict = return_dict if return_dict is not None else self.config.return_dict
outputs = self.model(
input_ids,
pixel_values,
attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooler_output = outputs[1]
logits = self.video_grounding_head(pooler_output)
loss = None
if labels is not None:
criterion = TvpLoss(["iou", "distance", "duration"])
criterion.to(self.device)
loss_dict = criterion(logits, labels)
loss = (
loss_dict["iou"]
+ self.config.distance_loss_weight * loss_dict["distance"]
+ self.config.duration_loss_weight * loss_dict["duration"]
)
if not return_dict:
outputs = (logits,) + outputs[2:]
if loss is not None:
outputs = (loss,) + outputs
return outputs
return TvpVideoGroundingOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\tvp\processing_tvp.py
"""
Processor class for TVP.
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class TvpProcessor(ProcessorMixin):
"""
    Constructs a TVP processor which wraps a TVP image processor and a Bert tokenizer into a single processor.
[`TvpProcessor`] offers all the functionalities of [`TvpImageProcessor`] and [`BertTokenizerFast`]. See the
[`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information.
Args:
image_processor ([`TvpImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`BertTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "TvpImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
"""
Initialize the TVP processor with an image processor and a tokenizer.
Args:
image_processor ([`TvpImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`BertTokenizerFast`], *optional*):
The tokenizer is a required input.
Raises:
ValueError: If either `image_processor` or `tokenizer` is not provided.
"""
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def batch_decode(self, *args, **kwargs):
"""
Forward all arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`] method.
Returns:
Decoded outputs corresponding to the input tokens.
See Also:
[`~PreTrainedTokenizer.batch_decode`] for more details.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
Forward all arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`] method.
Returns:
Decoded string corresponding to the input token.
See Also:
[`~PreTrainedTokenizer.decode`] for more details.
"""
return self.tokenizer.decode(*args, **kwargs)
def post_process_video_grounding(self, logits, video_durations):
"""
Compute the time of the video.
Args:
logits (`torch.Tensor`):
The logits output of TvpForVideoGrounding.
video_durations (`float`):
The video's duration.
Returns:
start (`float`):
The start time of the video.
end (`float`):
The end time of the video.
"""
start, end = (
round(logits.tolist()[0][0] * video_durations, 1),
round(logits.tolist()[0][1] * video_durations, 1),
)
return start, end
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
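# Numeric sketch of post_process_video_grounding above (illustrative values): the two
# logits are fractions of the clip duration, scaled and rounded to one decimal place.
import torch

logits = torch.tensor([[0.25, 0.60]])    # (start, end) fractions as produced by TvpForVideoGrounding
video_duration = 32.0                     # seconds
start = round(logits.tolist()[0][0] * video_duration, 1)
end = round(logits.tolist()[0][1] * video_duration, 1)
print(start, end)                         # 8.0 19.2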
.\models\tvp\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_tvp": [
"TVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TvpConfig",
],
"processing_tvp": ["TvpProcessor"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_tvp"] = ["TvpImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tvp"] = [
"TVP_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvpModel",
"TvpPreTrainedModel",
"TvpForVideoGrounding",
]
if TYPE_CHECKING:
from .configuration_tvp import (
TVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
TvpConfig,
)
from .processing_tvp import TvpProcessor
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_tvp import TvpImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tvp import (
TVP_PRETRAINED_MODEL_ARCHIVE_LIST,
TvpForVideoGrounding,
TvpModel,
TvpPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\udop\configuration_udop.py
""" UDOP model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/config.json",
}
class UdopConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`UdopForConditionalGeneration`]. It is used to
instantiate a UDOP model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the UDOP
[microsoft/udop-large](https://huggingface.co/microsoft/udop-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "udop"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=33201,
d_model=1024,
d_kv=64,
d_ff=4096,
num_layers=24,
num_decoder_layers=None,
num_heads=16,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
relative_bias_args=[{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}],
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
feed_forward_proj="relu",
is_encoder_decoder=True,
use_cache=True,
pad_token_id=0,
eos_token_id=1,
max_2d_position_embeddings=1024,
image_size=224,
patch_size=16,
num_channels=3,
**kwargs,
):
self.vocab_size = vocab_size
self.d_model = d_model
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_decoder_layers = (
num_decoder_layers if num_decoder_layers is not None else self.num_layers
)
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
self.feed_forward_proj = feed_forward_proj
self.use_cache = use_cache
self.max_2d_position_embeddings = max_2d_position_embeddings
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
if not isinstance(relative_bias_args, list):
raise ValueError("`relative_bias_args` should be a list of dictionaries.")
self.relative_bias_args = relative_bias_args
act_info = self.feed_forward_proj.split("-")
self.dense_act_fn = act_info[-1]
self.is_gated_act = act_info[0] == "gated"
if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
raise ValueError(
f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer."
"Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
"'gated-gelu' or 'relu'"
)
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
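The constructor above derives a few attributes from its arguments: `feed_forward_proj` is split into `dense_act_fn`/`is_gated_act`, and `num_decoder_layers` falls back to `num_layers` when unset. The snippet below is not part of the original file; it is a minimal sketch (assuming a transformers version that ships UDOP) illustrating that parsing.
from transformers import UdopConfig

config = UdopConfig(feed_forward_proj="gated-gelu", num_decoder_layers=None)
print(config.is_gated_act)        # True: the string starts with "gated-"
print(config.dense_act_fn)        # "gelu": the part after the dash
print(config.num_decoder_layers)  # 24: falls back to num_layers when not set
print(config.hidden_size)         # 1024: mapped to d_model through attribute_map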
.\models\udop\convert_udop_to_hf.py
def get_image():
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")
return image
def prepare_dummy_inputs(tokenizer, image_processor):
words = ['7', 'ITC', 'Limited', 'REPORT', 'AND', 'ACCOUNTS', '2013', 'ITC’s', 'Brands:', 'An', 'Asset', 'for', 'the', 'Nation', 'The', 'consumer', 'needs', 'and', 'aspirations', 'they', 'fulfil,', 'the', 'benefit', 'they', 'generate', 'for', 'millions', 'across', 'ITC’s', 'value', 'chains,', 'the', 'future-ready', 'capabilities', 'that', 'support', 'them,', 'and', 'the', 'value', 'that', 'they', 'create', 'for', 'the', 'country,', 'have', 'made', 'ITC’s', 'brands', 'national', 'assets,', 'adding', 'to', 'India’s', 'competitiveness.', 'It', 'is', 'ITC’s', 'aspiration', 'to', 'be', 'the', 'No', '1', 'FMCG', 'player', 'in', 'the', 'country,', 'driven', 'by', 'its', 'new', 'FMCG', 'businesses.', 'A', 'recent', 'Nielsen', 'report', 'has', 'highlighted', 'that', "ITC's", 'new', 'FMCG', 'businesses', 'are', 'the', 'fastest', 'growing', 'among', 'the', 'top', 'consumer', 'goods', 'companies', 'operating', 'in', 'India.', 'ITC', 'takes', 'justifiable', 'pride', 'that,', 'along', 'with', 'generating', 'economic', 'value,', 'these', 'celebrated', 'Indian', 'brands', 'also', 'drive', 'the', 'creation', 'of', 'larger', 'societal', 'capital', 'through', 'the', 'virtuous', 'cycle', 'of', 'sustainable', 'and', 'inclusive', 'growth.', 'DI', 'WILLS', '*', ';', 'LOVE', 'DELIGHTFULLY', 'SOFT', 'SKIN?', 'aia', 'Ans', 'Source:', 'https://www.industrydocuments.ucsf.edu/docs/snbx0223']
text_list = []
bbox_list = []
for text, box in zip(words, boxes):
if text == "":
continue
sub_tokens = tokenizer.tokenize(text)
for sub_token in sub_tokens:
text_list.append(sub_token)
bbox_list.append(box)
input_ids = tokenizer.convert_tokens_to_ids(text_list)
input_ids = prompt_ids + input_ids
bbox = [[0, 0, 0, 0]] * len(prompt_ids) + bbox_list
pixel_values = image_processor(image, return_tensors="pt").pixel_values
original_pixel_values = original_transform(image, image_size=image_processor.size["height"]).unsqueeze(0)
assert torch.allclose(original_pixel_values, pixel_values)
print("Pixel values are ok!")
return torch.tensor(input_ids).unsqueeze(0), torch.tensor(bbox).unsqueeze(0).float(), pixel_values
def convert_udop_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
name_to_checkpoint_path = {
"udop-large": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-224/pytorch_model.bin",
"udop-large-512": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-512/pytorch_model.bin",
"udop-large-512-300k": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-512-300k-steps/pytorch_model.bin",
}
checkpoint_path = name_to_checkpoint_path[model_name]
state_dict = torch.load(checkpoint_path, map_location="cpu")
print("Checkpoint path:", checkpoint_path)
image_size = 512 if "512" in model_name else 224
config = UdopConfig(decoder_start_token_id=0, image_size=image_size)
model = UdopForConditionalGeneration(config)
model.eval()
state_dict = {k.replace("cell2dembedding", "cell_2d_embedding"): v for k, v in state_dict.items()}
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
assert missing_keys == ["encoder.embed_patches.proj.weight", "encoder.embed_patches.proj.bias"]
assert unexpected_keys == ["pos_embed"]
tokenizer = UdopTokenizer.from_pretrained("t5-base", legacy=True)
size = {"height": image_size, "width": image_size}
image_processor = LayoutLMv3ImageProcessor(
image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size=size
)
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
input_ids, bbox, image = prepare_dummy_inputs(tokenizer, image_processor)
prompt = "Question answering. In which year is the report made?"
encoding = processor(images=get_image(), text=prompt, return_tensors="pt")
input_ids = encoding.input_ids
try:
EXPECTED_INPUT_IDS = torch.tensor([[11860, 18243, 5, 86, 84, 215, 19, 8, 934, 263, 58, 1, 489, 27, 3838, 7363, 4083, 14536, 3430, 5686, 5911, 17161, 134, 2038, 27, 3838, 22, 7, 4688, 7, 10, 389, 18202, 21, 8, 11046, 37, 3733, 523, 11, 38, 2388, 1628, 3, 13133, 23334, 6, 8, 1656, 79, 3806, 21, 4040, 640, 27, 3838, 22, 7, 701, 16534, 6, 8, 3, 76, 2693, 18, 23015, 5644, 24, 380, 3, 6015, 6, 11, 8, 701, 24, 79, 482, 21, 3, 88, 684, 6, 43, 263, 27, 3838, 22, 7, 3635, 1157, 4089, 6, 2651, 12, 1547, 22, 7, 3265, 655, 5, 19, 27, 3838, 22, 7, 38, 2388, 257, 12, 36, 8, 465, 209, 13409, 12150, 1959, 16, 8, 684, 6, 6737, 57, 165, 126, 13409, 12150, 1623, 5, 71, 1100, 30298, 934, 65, 12566, 24, 27, 3838, 31, 7, 126, 13409, 12150, 1623, 33, 8, 10391, 1710, 859, 8, 420, 3733, 4968, 688, 2699, 16, 1547, 5, 27, 3838, 1217, 131, 99, 23, 179, 6064, 24, 6, 590, 28, 3, 11600, 1456, 701, 6, 175, 9443, 2557, 3635, 92, 1262, 8, 3409, 13, 2186, 3, 27908, 1784, 190, 8, 3, 5771, 17, 13281, 4005, 13, 5086, 11, 13066, 1170, 5, 10826, 16309, 134, 3, 2, 276, 26, 3, 55, 391, 13570, 5, 10315, 309, 3577, 19114, 371, 4254, 5121, 5055, 6245, 3, 10047, 3162, 58, 3, 9, 61, 1713, 2703, 476, 667, 25158, 301, 6058, 6038, 476, 3765, 9149, 10, 4893, 1303, 1986, 5, 13580, 7, 8224, 28244, 7, 5, 76, 75, 7, 89, 5, 15, 1259, 87, 7171, 7, 87, 7, 29, 115, 226, 4305, 2773, 1]])
torch.testing.assert_close(EXPECTED_INPUT_IDS, input_ids)
bbox = encoding.bbox.float()
pixel_values = encoding.pixel_values
except Exception:
print("Input_ids don't match, preparing dummy inputs")
input_ids, bbox, pixel_values = prepare_dummy_inputs(tokenizer, image_processor)
print("Testing single forward pass..")
with torch.no_grad():
decoder_input_ids = torch.tensor([[101]])
outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
print("Shape of logits:", outputs.logits.shape)
print("First values of logits:", outputs.logits[0, :3, :3])
try:
assert torch.allclose(outputs.logits[0, :3, :3], torch.tensor([[-18.5262, 1.5087, -15.7051]]), atol=1e-4)
print("Looks ok!")
except Exception:
print("logits don't match let's try to generate")
print("Testing generation...")
model_kwargs = {"bbox": bbox, "pixel_values": pixel_values}
outputs = model.generate(input_ids=input_ids, **model_kwargs, max_new_tokens=20)
print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
print("Testing generation with original inputs...")
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="input_ids_udop.pt", repo_type="dataset")
input_ids = torch.load(filepath)
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="bbox_udop.pt", repo_type="dataset")
bbox = torch.load(filepath)
pixel_values_filename = "pixel_values_udop_512.pt" if "512" in model_name else "pixel_values_udop_224.pt"
filepath = hf_hub_download(repo_id="nielsr/test-image", filename=pixel_values_filename, repo_type="dataset")
pixel_values = torch.load(filepath)
print("Decoded input ids:", tokenizer.decode(input_ids[0], skip_special_tokens=True))
print("Bbox shape:", bbox.shape)
model_kwargs = {"bbox": bbox, "pixel_values": pixel_values}
outputs = model.generate(input_ids=input_ids, **model_kwargs, max_new_tokens=20)
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Generated:", generated_text)
if pytorch_dump_folder_path is not None:
model.save_pretrained(pytorch_dump_folder_path)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model.push_to_hub(f"microsoft/{model_name}")
processor.push_to_hub(f"microsoft/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="udop-large",
type=str,
choices=["udop-large", "udop-large-512", "udop-large-512-300k"],
help=("Name of the UDOP model you'd like to convert."),
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_udop_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\udop\modeling_udop.py
""" PyTorch UDOP model."""
import collections
import logging
import math
import random
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple, Union
import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss
from transformers import UdopConfig
from transformers.modeling_outputs import (
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
logger = logging.getLogger(__name__)
UDOP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/udop-large",
]
_CONFIG_FOR_DOC = "UdopConfig"
UDOP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Args:
config ([`UdopConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
UDOP_INPUTS_DOCSTRING = r"""
"""
UDOP_ENCODER_INPUTS_DOCSTRING = r"""
"""
@dataclass
class BaseModelOutputWithAttentionMask(ModelOutput):
"""
Class for the model's outputs that may also contain a past key/values (to speed up sequential decoding). Includes
an additional attention mask.
"""
last_hidden_state: torch.FloatTensor = None
attention_mask: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
"""
合并图像和文本嵌入,作为UDOP编码器/解码器的输入。
首先,通过检查每个视觉补丁是否在标记边界框内,创建图像嵌入。如果是,则将视觉补丁与标记嵌入组合。
然后,将视觉边界框与文本边界框结合起来。
最后,将视觉边界框与文本注意力掩码结合起来。
"""
sequence_length = num_patches
ocr_points_x = torch.clip(
torch.floor((bbox[:, :, 0] + bbox[:, :, 2]) / 2.0 * sequence_length).long(), 0, sequence_length - 1
)
ocr_points_y = (
torch.clip(torch.floor((bbox[:, :, 1] + bbox[:, :, 3]) / 2.0 * sequence_length).long(), 0, sequence_length - 1)
* sequence_length
)
ocr_points = ocr_points_x + ocr_points_y
bbox = bbox.to(torch.float64)
target_seg = (bbox.mean(-1) == 0.0) | (bbox.mean(-1) == 1.0)
repeated_vision_embeds = torch.gather(
image_embeddings, 1, ocr_points.unsqueeze(-1).repeat(1, 1, image_embeddings.size(-1))
)
repeated_vision_embeds[target_seg] = 0.0
inputs_embeds += repeated_vision_embeds
patch_inds = torch.full_like(image_embeddings[:, :, 0], True).bool()
ind = torch.cat(
[
torch.arange(len(ocr_points))[:, None].repeat(1, ocr_points.size(-1))[:, :, None].to(ocr_points),
ocr_points[:, :, None],
],
dim=-1,
)
ind = ind.flatten(0, 1)
rows, cols = zip(*ind)
patch_inds[rows, cols] = False
input_vision_patches = [image_embeddings[i][patch_inds[i]] for i in range(len(patch_inds))]
if visual_bbox is None:
visual_bbox = get_visual_bbox(image_size=image_size, patch_size=patch_size)
visual_bbox = visual_bbox.unsqueeze(0).repeat(image_embeddings.size(0), 1, 1)
visual_bbox = visual_bbox.to(image_embeddings.device)
visual_bbox = [visual_bbox[i][patch_inds[i]] for i in range(len(patch_inds))]
if attention_mask is not None:
visual_attention_mask = [torch.tensor([1] * len(item)).to(attention_mask) for item in visual_bbox]
if max_len == 0:
max_len = image_embeddings.size(1)
else:
max_len = max_len - inputs_embeds.size(1)
inputs_vision_patches = torch.stack(
[pad_sequence(item, max_len, torch.zeros_like(image_embeddings[0, 0])) for item in input_vision_patches]
)
visual_bbox = torch.stack([pad_sequence(item, max_len, torch.zeros_like(bbox[0, 0])) for item in visual_bbox])
if attention_mask is not None:
visual_attention_mask = torch.stack(
[pad_sequence(item, max_len, torch.zeros_like(attention_mask[0, 0])) for item in visual_attention_mask]
)
inputs_embeds = torch.cat([inputs_embeds, inputs_vision_patches], 1)
bbox = torch.cat([bbox, visual_bbox], 1)
if attention_mask is not None:
attention_mask = torch.cat([attention_mask, visual_attention_mask], 1)
return inputs_embeds, bbox, attention_mask
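The key step above is mapping each token's box center onto the patch grid: the x and y centers are discretized into `[0, num_patches - 1]` and combined into a flat patch index. The following standalone sketch (not from the original file; it only re-runs the arithmetic shown above with a made-up box) makes the mapping concrete.
import torch

num_patches = 14                               # patches per side for a 224/16 model
bbox = torch.tensor([[[0.1, 0.2, 0.3, 0.4]]])  # one normalized token box (x0, y0, x1, y1)

# Center x = 0.2 -> column 2; center y = 0.3 -> row 4; flat index = 4 * 14 + 2 = 58
ocr_points_x = torch.clip(
    torch.floor((bbox[:, :, 0] + bbox[:, :, 2]) / 2.0 * num_patches).long(), 0, num_patches - 1
)
ocr_points_y = (
    torch.clip(torch.floor((bbox[:, :, 1] + bbox[:, :, 3]) / 2.0 * num_patches).long(), 0, num_patches - 1)
    * num_patches
)
print((ocr_points_x + ocr_points_y).item())  # 58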
class UdopPatchEmbeddings(nn.Module):
"""2D Image to Patch Embeddings"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.proj = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values):
batch_size, num_channels, height, width = pixel_values.shape
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model"
f" ({self.image_size[0]}*{self.image_size[1]})."
)
embeddings = self.proj(pixel_values)
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings
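As a quick sanity check (not part of the original file, and assuming torch plus a UDOP-capable transformers install), the default configuration turns a 224x224 image into 196 patch embeddings of size `d_model`:
import torch
from transformers import UdopConfig
from transformers.models.udop.modeling_udop import UdopPatchEmbeddings

config = UdopConfig()  # image_size=224, patch_size=16, d_model=1024
patch_embed = UdopPatchEmbeddings(config)
pixel_values = torch.randn(2, 3, 224, 224)
print(patch_embed(pixel_values).shape)  # torch.Size([2, 196, 1024]); (224 // 16) ** 2 = 196 patches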
class UdopPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. Based on `T5PreTrainedModel`.
"""
config_class = UdopConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["UdopBlock"]
_keep_in_fp32_modules = ["wo"]
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
assert decoder_start_token_id is not None, (
"self.model.config.decoder_start_token_id has to be defined. In Udop it is usually set to the"
" pad_token_id. See Udop docs for more information"
)
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
return shifted_input_ids
class UdopLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
构造一个 Udop 风格的 layernorm 模块。无偏置和无均值减法。
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
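UdopLayerNorm is a T5-style RMS norm: it scales by the root mean square of the last dimension, without subtracting the mean or adding a bias. A tiny check (not from the original file) under those assumptions:
import torch
from transformers.models.udop.modeling_udop import UdopLayerNorm

norm = UdopLayerNorm(hidden_size=4, eps=1e-6)
x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
rms = x.pow(2).mean(-1, keepdim=True).sqrt()
print(torch.allclose(norm(x), x / rms, atol=1e-5))  # True, since the weight is initialized to ones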
class UdopDenseActDense(nn.Module):
def __init__(self, config: UdopConfig):
super().__init__()
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_states = self.wi(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class UdopDenseGatedActDense(nn.Module):
def __init__(self, config: UdopConfig):
super().__init__()
self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class UdopLayerFF(nn.Module):
def __init__(self, config: UdopConfig):
super().__init__()
if config.is_gated_act:
self.DenseReluDense = UdopDenseGatedActDense(config)
else:
self.DenseReluDense = UdopDenseActDense(config)
self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
forwarded_states = self.DenseReluDense(forwarded_states)
hidden_states = hidden_states + self.dropout(forwarded_states)
return hidden_states
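Which feed-forward variant is used depends purely on `config.is_gated_act`, i.e. on whether `feed_forward_proj` starts with `gated-`. A short illustration (not from the original file; it builds throwaway configs just to show the dispatch):
import torch
from transformers import UdopConfig
from transformers.models.udop.modeling_udop import UdopLayerFF

ff_plain = UdopLayerFF(UdopConfig(feed_forward_proj="relu"))
ff_gated = UdopLayerFF(UdopConfig(feed_forward_proj="gated-gelu"))
print(type(ff_plain.DenseReluDense).__name__)  # UdopDenseActDense
print(type(ff_gated.DenseReluDense).__name__)  # UdopDenseGatedActDense

hidden = torch.randn(1, 5, 1024)               # d_model defaults to 1024
print(ff_gated(hidden).shape)                  # torch.Size([1, 5, 1024]); the residual keeps the shape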
class UdopAttention(nn.Module):
def __init__(self, config: UdopConfig, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - tensor of relative position offsets
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer - number of buckets, which determines the output range [0, num_buckets)
max_distance: an integer - maximum relative distance; all relative positions beyond it map to the same bucket
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
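A small worked example (not from the original file) of the bucketing: with `bidirectional=True` the 32 buckets are split into 16 per sign, distances below `max_exact = 8` keep their own bucket, and larger distances are placed logarithmically up to `max_distance`:
import torch
from transformers.models.udop.modeling_udop import UdopAttention

rel_pos = torch.tensor([[-3, -1, 0, 1, 64, 200]])
buckets = UdopAttention._relative_position_bucket(
    rel_pos, bidirectional=True, num_buckets=32, max_distance=128
)
print(buckets)  # tensor([[ 3,  1,  0, 17, 30, 31]])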
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
if device is None:
device = self.relative_attention_bias.weight.device
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
return values
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
class UdopLayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.SelfAttention = UdopAttention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class UdopLayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False)
self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
layer_output = hidden_states + self.dropout(attention_output[0])
outputs = (layer_output,) + attention_output[1:]
return outputs
class UdopBlock(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.layer = nn.ModuleList()
self.layer.append(UdopLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(UdopLayerCrossAttention(config))
self.layer.append(UdopLayerFF(config))
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
class UdopCellEmbeddings(nn.Module):
def __init__(self, max_2d_position_embeddings=501, hidden_size=1024):
super(UdopCellEmbeddings, self).__init__()
self.max_2d_position_embeddings = max_2d_position_embeddings
self.x_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)
self.y_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)
def forward(self, bbox):
bbox = torch.clip(bbox, 0.0, 1.0)
bbox = (bbox * (self.max_2d_position_embeddings - 1)).long()
left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
embeddings = (
left_position_embeddings
+ upper_position_embeddings
+ right_position_embeddings
+ lower_position_embeddings
)
return embeddings
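Each normalized box contributes four embeddings (left, top, right, bottom) that are summed into a single layout embedding per token. A minimal shape check (not from the original file):
import torch
from transformers.models.udop.modeling_udop import UdopCellEmbeddings

cell_embed = UdopCellEmbeddings(max_2d_position_embeddings=501, hidden_size=32)
bbox = torch.tensor([[[0.1, 0.2, 0.3, 0.4], [0.0, 0.0, 1.0, 1.0]]])  # (batch, seq_len, 4), values in [0, 1]
print(cell_embed(bbox).shape)  # torch.Size([1, 2, 32])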
get_relative_position_bucket = UdopAttention._relative_position_bucket
AUGMENTATION_RANGE = (0.80, 1.25)
class RelativePositionBiasBase(nn.Module, ABC):
"""
相对位置偏置的基础类。
Args:
num_heads (`int`):
模型中的注意力头数,将创建大小为 `num_heads` 的嵌入,将添加到每个令牌对的分数上。
relative_attention_num_buckets (`int`, *optional*, 默认为 32):
令牌对度量(序列中的距离、像素中的距离等)将被分桶化,该参数定义了这种桶的数量。
bidirectional (`bool`, *optional*, 默认为 `True`):
令牌对之间的距离是否应该是双向的。如果为 `False`,则距离(tok1, tok2) == 距离(tok2, tok1)。
scaling_factor (`int`, *optional*, 默认为 1):
用于缩放相对距离的因子。
max_distance (`int`, *optional*, 默认为 128):
所有大于此值的距离将进入同一个桶中。
augmentation (`bool`, *optional*, 默认为 `False`):
是否将相对距离乘以随机标量。
expand (`bool`, *optional*, 默认为 `False`):
是否扩展现有的预训练模型,并在后续添加中添加 prefix_bucket。
"""
def __init__(
self,
num_heads=None,
relative_attention_num_buckets=32,
bidirectional=True,
scaling_factor=1,
max_distance=128,
level="tokens",
augmentation=False,
prefix_bucket=False,
expand=False,
*args,
**kwargs
):
super(RelativePositionBiasBase, self).__init__()
self.prefix_bucket = prefix_bucket
self.augmentation = augmentation
self.level = level
self.max_distance = max_distance
self.scaling_factor = scaling_factor
self.bidirectional = bidirectional
self.num_heads = num_heads
self.expand = expand
self.relative_attention_num_buckets = relative_attention_num_buckets
extra_head = 2 if prefix_bucket and not self.expand else 0
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets + extra_head, self.num_heads)
@abstractmethod
def prepare_input(
self,
attention_mask: Optional[Tensor] = None,
bbox: Optional[Dict[str, Any]] = None,
) -> Tensor:
pass
def get_bucket(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
relative_position = self.prepare_input(attention_mask, bbox)
rp_bucket: Tensor = get_relative_position_bucket(
relative_position,
bidirectional=self.bidirectional,
num_buckets=self.relative_attention_num_buckets,
max_distance=self.max_distance,
)
return rp_bucket
def get_relative_position(self, positions):
context_position = positions[:, :, None]
memory_position = positions[:, None, :]
relative_position = memory_position - context_position
if self.augmentation and self.training:
relative_position *= random.uniform(*AUGMENTATION_RANGE)
relative_position *= self.scaling_factor
return relative_position.to(torch.long)
def forward(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
# Re-use a pretrained model, expanding it with the subsequently added prefix buckets
if self.expand and self.prefix_bucket:
new_bias = nn.Embedding(self.relative_attention_num_buckets + 2, self.num_heads)
new_bias.weight.data[: self.relative_attention_num_buckets] = self.relative_attention_bias.weight.data
new_bias.weight.data[self.relative_attention_num_buckets :] = 0.1
self.relative_attention_bias = new_bias
self.expand = False
rp_bucket = self.get_bucket(attention_mask, bbox)
if self.prefix_bucket:
if rp_bucket.size(0) == 1 and attention_mask.size(0) > 1:
rp_bucket = rp_bucket.repeat(attention_mask.size(0), 1, 1)
is_prefix = bbox[:, :, 1] < 0
num_prefix = is_prefix.sum(-1)
for idx, num_prefix_row in enumerate(num_prefix.cpu().numpy()):
rp_bucket[idx, :num_prefix_row, num_prefix_row:] = self.relative_attention_num_buckets
rp_bucket[idx, num_prefix_row:, :num_prefix_row] = self.relative_attention_num_buckets + 1
values: Tensor = self.relative_attention_bias(rp_bucket)
if values.dim() != 4:
raise ValueError("Wrong dimension of values tensor")
values = values.permute([0, 3, 1, 2])
return values
class RelativePositionBias1D(RelativePositionBiasBase):
def __init__(self, scaling_factor=1, max_distance=128, **kwargs):
"""
Reimplementation of T5 relative position bias. Distance between given tokens is their distance in the sequence.
Parameters are the same as in base class
"""
super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)
def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
if self.scaling_factor != 1:
raise ValueError("No need to scale 1d features")
relative_position = self.get_relative_position(
torch.arange(attention_mask.size(1), dtype=torch.long, device=attention_mask.device)[None, :]
)
return relative_position
class RelativePositionBiasHorizontal(RelativePositionBiasBase):
def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
"""
Represents in the bucket embeddings horizontal distance between two tokens. Parameters are the same as in base
class
"""
super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)
def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
if not self.scaling_factor > 1.0:
raise ValueError("Need to scale the values of bboxes, as they are in small (0,1) range")
if bbox is None:
raise ValueError("Bbox is required for horizontal relative position bias")
horizontal_position: Tensor = bbox[:, :, [0, 2]].mean(dim=-1)
return self.get_relative_position(horizontal_position)
class RelativePositionBiasVertical(RelativePositionBiasBase):
def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
"""
Represents in the bucket embeddings vertical distance between two tokens. Parameters are the same as in base
class
"""
super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)
def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
if not self.scaling_factor > 1.0:
raise ValueError("Need to scale the values of bboxes, as they are in small (0,1) range")
if bbox is None:
raise ValueError("Bbox is required for vertical relative position bias")
vertical_position: Tensor = bbox[:, :, [1, 3]].mean(dim=-1)
return self.get_relative_position(vertical_position)
class RelativePositionBiasAggregated(nn.Module):
def __init__(self, modules: Sequence[RelativePositionBiasBase]):
"""
Class which sums up various computed biases.
Args:
modules (Sequence[RelativePositionBiasBase]):
List of relative bias modules.
"""
super().__init__()
self.biases = nn.ModuleList(modules)
def forward(
self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None
) -> Union[float, Tensor]:
output = 0.0
for bias in self.biases:
output = bias(attention_mask, bbox) + output
return output
BIAS_CLASSES = {
"1d": RelativePositionBias1D,
"horizontal": RelativePositionBiasHorizontal,
"vertical": RelativePositionBiasVertical,
}
def create_relative_bias(config: UdopConfig) -> Sequence[RelativePositionBiasBase]:
"""
创建一个空列表或一个/多个相对偏置对象。
:param config: 模型的配置对象
:return: 创建的偏置模块序列
"""
bias_list = []
if hasattr(config, "relative_bias_args"):
for bias_kwargs_org in config.relative_bias_args:
bias_kwargs = deepcopy(bias_kwargs_org)
bias_type = bias_kwargs.pop("type")
model_num_heads = config.num_heads if hasattr(config, "num_heads") else config.num_attention_heads
if "num_heads" in bias_kwargs:
if bias_kwargs["num_heads"] != model_num_heads:
raise ValueError("Number of heads must match num of heads in the model")
else:
bias_kwargs["num_heads"] = model_num_heads
bias_list.append(BIAS_CLASSES[bias_type](**bias_kwargs))
return bias_list
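With the default `relative_bias_args` from `UdopConfig`, this factory produces one 1D, one horizontal and one vertical bias, all sized for the model's number of heads; `RelativePositionBiasAggregated` then simply sums them. A sketch of that wiring (not from the original file, assuming a UDOP-capable transformers install):
from transformers import UdopConfig
from transformers.models.udop.modeling_udop import (
    RelativePositionBiasAggregated,
    create_relative_bias,
)

config = UdopConfig()  # relative_bias_args defaults to [{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}]
biases = create_relative_bias(config)
print([type(bias).__name__ for bias in biases])
# ['RelativePositionBias1D', 'RelativePositionBiasHorizontal', 'RelativePositionBiasVertical']
aggregated = RelativePositionBiasAggregated(biases)  # its forward(attention_mask, bbox) sums the three biases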
class UdopStack(UdopPreTrainedModel):
"""
这个类基于 `T5Stack`,但修改以考虑图像模态以及2D位置嵌入。
"""
def __init__(self, config, embed_tokens=None, embed_patches=None):
super().__init__(config)
self.embed_tokens = embed_tokens
self.embed_patches = embed_patches
self.is_decoder = config.is_decoder
self._max_length = config.max_length
self.num_layers = config.num_layers
self.block = nn.ModuleList(
[UdopBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(self.num_layers)]
)
self.final_layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
if not self.is_decoder:
self.cell_2d_embedding = UdopCellEmbeddings(config.max_2d_position_embeddings, config.hidden_size)
self.relative_bias = self._get_relative_bias(config)
for bias in self.relative_bias.biases:
if isinstance(bias, RelativePositionBias1D):
self._tie_or_clone_weights(
bias.relative_attention_bias, self.block[0].layer[0].SelfAttention.relative_attention_bias
)
@staticmethod
def _get_relative_bias(config: UdopConfig) -> RelativePositionBiasAggregated:
relative_bias_list = create_relative_bias(config)
return RelativePositionBiasAggregated(relative_bias_list)
def get_input_embeddings(self):
return self.embed_tokens
def get_output_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
def forward(
self,
input_ids=None,
attention_mask=None,
bbox=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=None,
pixel_values=None,
visual_bbox=None,
image_embeddings=None,
position_bias=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
@add_start_docstrings(
"The bare UDOP encoder-decoder Transformer outputting raw hidden-states without any specific head on top.",
UDOP_START_DOCSTRING,
)
class UdopModel(UdopPreTrainedModel):
_tied_weights_keys = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_patches.proj.weight",
"encoder.embed_patches.proj.bias",
"encoder.relative_bias.biases.0.relative_attention_bias.weight",
"decoder.relative_bias.biases.0.relative_attention_bias.weight",
]
def __init__(self, config):
super(UdopModel, self).__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
self.patch_embed = UdopPatchEmbeddings(config)
encoder_config = deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)
decoder_config = deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = UdopStack(decoder_config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(UDOP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Tensor = None,
attention_mask: Tensor = None,
bbox: Dict[str, Any] = None,
pixel_values: Optional[Tensor] = None,
visual_bbox: Dict[str, Any] = None,
decoder_input_ids: Optional[Tensor] = None,
decoder_attention_mask: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
encoder_outputs: Optional[Tensor] = None,
past_key_values: Optional[Tensor] = None,
head_mask: Optional[Tensor] = None,
decoder_inputs_embeds: Optional[Tensor] = None,
decoder_head_mask: Optional[Tensor] = None,
cross_attn_head_mask: Optional[Tensor] = None,
use_cache=True,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
# Note: the code above defines the UDOP model class, an encoder-decoder Transformer used for text generation,
# holding the shared embedding layer and the initialization of the encoder and decoder stacks.
@add_start_docstrings(
"""This class is based on [`T5ForConditionalGeneration`], extended to deal with images and layout (2D) data.""",
UDOP_START_DOCSTRING,
)
# Defines a new model class, UdopForConditionalGeneration, inheriting from UdopPreTrainedModel and used for
# conditional generation tasks
class UdopForConditionalGeneration(UdopPreTrainedModel):
# List of tied-weight keys; these weights are shared across the model
_tied_weights_keys = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_patches.proj.weight",
"encoder.embed_patches.proj.bias",
"encoder.relative_bias.biases.0.relative_attention_bias.weight",
"decoder.relative_bias.biases.0.relative_attention_bias.weight",
"lm_head.weight",
]
# Constructor, takes a configuration object `config`
def __init__(self, config):
# Call the parent class constructor with the configuration object
super(UdopForConditionalGeneration, self).__init__(config)
# Shared text/image embedding layer, an nn.Embedding of shape (vocab_size, d_model)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# UdopPatchEmbeddings instance used to embed image patches
self.patch_embed = UdopPatchEmbeddings(config)
# Deep-copy the config to build the encoder config and set a few flags
encoder_config = deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# Build the encoder UdopStack from the copied config, the shared embeddings and the patch embeddings
self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)
# Deep-copy the config to build the decoder config, set its flags and the number of decoder layers
decoder_config = deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
# Build the decoder UdopStack from the copied config and the shared embeddings
self.decoder = UdopStack(decoder_config, self.shared)
# Language modeling head, an nn.Linear mapping d_model to vocab_size
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
# Run the extra initialization (weight init and final processing)
self.post_init()
# Returns the shared input embedding layer
def get_input_embeddings(self):
return self.shared
# Sets the shared input embedding layer to `new_embeddings`
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
# Propagate the new embeddings to both the encoder and the decoder
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
# Sets the output embeddings (the language modeling head) to `new_embeddings`
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# Returns the output embeddings (the language modeling head)
def get_output_embeddings(self):
return self.lm_head
# Returns the encoder
def get_encoder(self):
return self.encoder
# Returns the decoder
def get_decoder(self):
return self.decoder
# The forward method below is decorated with @add_start_docstrings_to_model_forward and @replace_return_docstrings,
# which attach the input docstring and replace the return docstring
# `forward` defines the model's forward pass
def forward(
self,
input_ids: Tensor = None,
attention_mask: Tensor = None,
bbox: Dict[str, Any] = None,
pixel_values: Optional[Tensor] = None,
visual_bbox: Dict[str, Any] = None,
decoder_input_ids: Optional[Tensor] = None,
decoder_attention_mask: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
encoder_outputs: Optional[Tensor] = None,
past_key_values: Optional[Tensor] = None,
head_mask: Optional[Tensor] = None,
decoder_inputs_embeds: Optional[Tensor] = None,
decoder_head_mask: Optional[Tensor] = None,
cross_attn_head_mask: Optional[Tensor] = None,
use_cache=True,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[Tensor] = None,
):
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# If `past_key_values` is used, only keep the last position of `input_ids`
if past_key_values is not None:
input_ids = input_ids[:, -1:]
# Return a dictionary with everything the generation loop needs as model inputs
return {
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
# Use `kwargs.get` to forward the optional UDOP-specific inputs
"bbox": kwargs.get("bbox", None),
"pixel_values": kwargs.get("pixel_values", None),
"visual_bbox": kwargs.get("visual_bbox", None),
}
# Copied from `transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache`
# Reorders the cached past key/values so they follow the beam-search indices
def _reorder_cache(self, past_key_values, beam_idx):
# If the decoder past is not included in the outputs,
# speedy decoding is disabled and there is nothing to reorder
if past_key_values is None:
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
# Initialize the tuple of reordered decoder past states
reordered_decoder_past = ()
# Iterate over the past states of each layer
for layer_past_states in past_key_values:
# Initialize the tuple of reordered past states for this layer
reordered_layer_past_states = ()
# Iterate over the past states within this layer
for layer_past_state in layer_past_states:
# Re-order the layer past state to the correct batch index according to the beam indices
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
# Check that the shape of the first reordered layer past state matches the original shape
if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
raise ValueError(
f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
)
# Check that the number of reordered layer past states matches the original count
if len(reordered_layer_past_states) != len(layer_past_states):
raise ValueError(
f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
)
# Append the reordered layer past states to the reordered decoder past
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
# Return the reordered decoder past states
return reordered_decoder_past
# Define a new class for the UDOP encoder, inheriting from UdopPreTrainedModel
@add_start_docstrings(
"The bare UDOP Model transformer outputting encoder's raw hidden-states without any specific head on top.",
UDOP_START_DOCSTRING,
)
class UdopEncoderModel(UdopPreTrainedModel):
# List of keys whose weights are tied (shared) across layers
_tied_weights_keys = [
"encoder.embed_tokens.weight",
"encoder.embed_patches.proj.weight",
"encoder.embed_patches.proj.bias",
"encoder.relative_bias.biases.0.relative_attention_bias.weight",
]
# Constructor, takes a UdopConfig instance
def __init__(self, config: UdopConfig):
super().__init__(config)
# Shared text and image embedding layer
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# Embedding layer for image patches
self.patch_embed = UdopPatchEmbeddings(config)
# Deep-copy the config to build the encoder configuration
encoder_config = deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# Build the UdopStack encoder
self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)
# Initialize weights and apply final processing
self.post_init()
# Returns the input embedding layer
def get_input_embeddings(self):
return self.shared
# Sets the input embedding layer to `new_embeddings`
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
# Update the encoder's input embeddings as well
self.encoder.set_input_embeddings(new_embeddings)
# Returns the encoder
def get_encoder(self):
return self.encoder
# Prunes model heads, given a `heads_to_prune` dictionary
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over each layer and the corresponding list of heads to prune
for layer, heads in heads_to_prune.items():
# Call the pruning method of the self-attention module in that encoder layer
self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
# Forward pass, taking several inputs; the decorators below attach the input/return docstrings
@add_start_docstrings_to_model_forward(UDOP_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithAttentionMask, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Tensor = None,
bbox: Dict[str, Any] = None,
attention_mask: Tensor = None,
pixel_values: Optional[Tensor] = None,
visual_bbox: Dict[str, Any] = None,
head_mask: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Resolve output_attentions: use the explicit argument if given, otherwise fall back to the model config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Resolve output_hidden_states the same way
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Resolve whether to return a dict-style output, falling back to the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Call the encoder's forward method with all inputs and options
encoder_outputs = self.encoder(
input_ids=input_ids,  # input token IDs
bbox=bbox,  # bounding boxes of the text tokens
visual_bbox=visual_bbox,  # bounding boxes of the visual patches
pixel_values=pixel_values,  # image pixel values
attention_mask=attention_mask,  # attention mask controlling which tokens take part in attention
inputs_embeds=inputs_embeds,  # embeddings passed instead of token IDs
head_mask=head_mask,  # mask for the attention heads
output_attentions=output_attentions,  # whether to return attention weights
output_hidden_states=output_hidden_states,  # whether to return hidden states
return_dict=return_dict,  # whether to return a dict-style output
)
# Return the encoder outputs
return encoder_outputs
.\models\udop\processing_udop.py
"""
Processor class for UDOP.
UDOP 的处理器类。
"""
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class UdopProcessor(ProcessorMixin):
r"""
Constructs a UDOP processor which combines a LayoutLMv3 image processor and a UDOP tokenizer into a single processor.

[`UdopProcessor`] offers all the functionalities you need to prepare data for the model.

It first uses [`LayoutLMv3ImageProcessor`] to resize, rescale and normalize document images, and optionally applies OCR
to get words and normalized bounding boxes. These are then provided to [`UdopTokenizer`] or [`UdopTokenizerFast`],
which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
classification tasks (such as FUNSD, CORD).

Additionally, it also supports passing `text_target` and `text_pair_target` to the tokenizer, which can be used to
prepare labels for language modeling tasks.

Args:
image_processor (`LayoutLMv3ImageProcessor`):
An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
tokenizer (`UdopTokenizer` or `UdopTokenizerFast`):
An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv3ImageProcessor"
tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast")
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
):
"""
Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.__call__
Method defining the behavior of the object when called as a function.
"""
def get_overflowing_images(self, images, overflow_to_sample_mapping):
"""
This method ensures each `input_ids` sample is mapped to its corresponding image in case of overflow.
"""
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's `batch_decode`.
Please refer to the docstring of that method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's `decode`.
Please refer to the docstring of that method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "bbox", "attention_mask", "pixel_values"]
.\models\udop\tokenization_udop.py
""" Tokenization classes for UDOP model."""
import os
import re
import warnings
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import (
AddedToken,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/spiece.model",
},
"tokenizer_file": {
"microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/tokenizer.json",
},
}
class UdopTokenizer(PreTrainedTokenizer):
"""
从 LayoutXLMTokenizer 和 T5Tokenizer 改编而来。基于 SentencePiece 实现的 tokenizer。
继承自 PreTrainedTokenizer 类,该类包含大多数主要方法。用户应参考超类以获取有关这些方法的更多信息。
属性:
sp_model (`SentencePieceProcessor`):
每次转换(字符串、token 和 ID)所使用的 SentencePiece 处理器。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
sep_token="</s>",
pad_token="<pad>",
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
legacy=True,
add_prefix_space=True,
**kwargs,
) -> None:
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.legacy = legacy
self.add_prefix_space = add_prefix_space
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
legacy=legacy,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
else:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def get_sentinel_tokens(self):
"""
Retrieves sentinel tokens from the list of additional special tokens.
Returns:
list: List of sentinel tokens identified by regex pattern "<extra_id_\d+>".
"""
return list(
set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)), self.additional_special_tokens))
)
def get_sentinel_token_ids(self):
"""
Retrieves token IDs for sentinel tokens using the tokenizer's vocabulary.
Returns:
list: List of token IDs corresponding to sentinel tokens.
"""
return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""
Adds an end-of-sequence (EOS) token to token_ids if it's not already present.
Args:
token_ids (List[int]): List of token IDs.
Returns:
List[int]: List of token IDs with EOS appended if not already present.
"""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates token type IDs for sequences, distinguishing between token_ids_0 and token_ids_1.
Args:
token_ids_0 (List[int]): List of token IDs for the first sequence.
token_ids_1 (List[int], optional): List of token IDs for the second sequence (if exists).
Returns:
List[int]: List of token type IDs where 0 corresponds to token_ids_0 and 1 to token_ids_1 (if provided).
"""
def create_model_input_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
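In terms of ids this looks as follows (a sketch with hypothetical ids, writing `eos` for `self.eos_token_id`):
tokenizer.build_inputs_with_special_tokens([10, 20])        # -> [10, 20, eos]
tokenizer.build_inputs_with_special_tokens([10, 20], [30])  # -> [10, 20, eos, 30, eos]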
def __getstate__(self):
"""
Serialize the T5Tokenizer instance, preparing it for pickling.
"""
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
"""
Deserialize and restore a previously serialized T5Tokenizer instance.
"""
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
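Together, `__getstate__` and `__setstate__` make the tokenizer picklable without serializing the SentencePiece processor directly; a minimal sketch:
import pickle
restored = pickle.loads(pickle.dumps(tokenizer))  # sp_model is dropped on dump and rebuilt from vocab_file on load
restored.tokenize("hello")                        # works again, provided the vocab_file path still exists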
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
"""
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
first token is special.
"""
if self.legacy or len(text) == 0:
return super().tokenize(text, **kwargs)
text = text.replace(SPIECE_UNDERLINE, " ")
if self.add_prefix_space:
text = SPIECE_UNDERLINE + text
tokens = super().tokenize(text, **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
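A sketch of the non-legacy path (the exact pieces depend on the underlying SentencePiece model):
tokenizer.tokenize("Hello world")
# e.g. ["▁Hello", "▁world"] when add_prefix_space is True; a lone leading "▁" produced right before a
# special token is stripped again by the check above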
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
tokens[0] = tokens[0][1:]
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
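Typical use (a sketch; for UDOP the vocabulary file is the SentencePiece model, usually named `spiece.model`):
paths = tokenizer.save_vocabulary("./udop-tokenizer")
# copies the existing SentencePiece file into the directory, or re-serializes it from memory if the
# original file is gone; returns a 1-tuple with the written path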
@add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
**kwargs,
) -> BatchEncoding:
if text is None and text_target is None:
raise ValueError("You need to specify either `text` or `text_target`.")
if text is not None:
if not self._in_target_context_manager:
self._switch_to_input_mode()
encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)
if text_target is not None:
self._switch_to_target_mode()
target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)
self._switch_to_input_mode()
if text_target is None:
return encodings
elif text is None:
return target_encodings
else:
encodings["labels"] = target_encodings["input_ids"]
return encodings
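A minimal usage sketch of this dispatch logic (the checkpoint name, words, and box values are illustrative only):
from transformers import UdopTokenizer
tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")  # example checkpoint
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]                               # one hypothetical bounding box per word
encoding = tokenizer(words, boxes=boxes)                           # input path -> call_boxes
encoding = tokenizer(words, boxes=boxes, text_target="hello world")  # additionally fills encoding["labels"]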
def call_boxes(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
def batch_encode_plus_boxes(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
Args:
batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded. This can be a list of
string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
details in `encode_plus`).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus_boxes(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def encode_boxes(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Args:
Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
`self.convert_tokens_to_ids(self.tokenize(text))`.
text (`str`, `List[str]` or `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
"""
encoded_inputs = self.encode_plus_boxes(
text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
return_tensors=return_tensors,
**kwargs,
)
return encoded_inputs["input_ids"]
def encode_plus_boxes(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences, together with bounding boxes and,
optionally, word labels.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
boxes (`List[List[int]]`, *optional*):
Bounding box for each word of the (pre-tokenized) input.
word_labels (`List[int]`, *optional*):
Labels for the words of the (pre-tokenized) input, used for token classification.
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus_boxes(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus_boxes(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
batch_outputs = self._batch_prepare_for_model_boxes(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
def _batch_prepare_for_model_boxes(
self,
batch_text_or_text_pairs,
is_pair: bool = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
batch_outputs = {}
for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
batch_text_or_text_pair, boxes_example = example
if is_pair:
text_or_text_pair = batch_text_or_text_pair[0]
else:
text_or_text_pair = batch_text_or_text_pair
outputs = self.prepare_for_model_boxes(
text_or_text_pair,
batch_text_or_text_pair[1] if is_pair else None,
boxes_example,
word_labels=word_labels[idx] if word_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
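The pattern used here, encoding every example unpadded, gathering per-key lists, then padding once at the end, can be sketched independently of the tokenizer as follows (all names below are hypothetical):
def batch_then_pad(examples, encode_one, pad):
    # encode each example without padding and collect the outputs per key
    batch = {}
    for example in examples:
        for key, value in encode_one(example).items():
            batch.setdefault(key, []).append(value)
    # pad the whole batch in a single pass so every key ends up with a consistent length
    return pad(batch)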
def _encode_plus_boxes(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
return self.prepare_for_model_boxes(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
def prepare_for_model_boxes(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
def truncate_sequences(
self,
ids: List[int],
token_boxes: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_token_boxes: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
):
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
.\models\udop\tokenization_udop_fast.py
"""
定义一个 UdopTokenizerFast 类,继承自 PreTrainedTokenizerFast 类,用于实现快速的 UDOP 分词器,基于 HuggingFace 的 tokenizers 库。
该类提供了从 LayoutXLMTokenizer 和 T5Tokenizer 中适配的功能,并基于 BPE 模型实现。
继承自 PreTrainedTokenizerFast 类,包含了大部分主要方法,用户可以参考其超类以获取更多关于这些方法的信息。
"""
class UdopTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" UDOP tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
[`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`, *optional*):
Path to the vocabulary file. 词汇表文件的路径。
tokenizer_file (`str`, *optional*):
Path to the tokenizer file. 标记器文件的路径。
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token. 序列结束标记,默认为 `"</s>"`。
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
分隔符标记,在构建多个序列时使用,例如用于序列分类或问题回答中的文本和问题。还用作使用特殊标记构建的序列的最后一个标记。
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
未知标记,词汇表中不存在的标记会被设置为此标记。
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
用于填充的标记,例如在批处理具有不同长度序列时使用。
sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
The bounding box to use for the special [SEP] token.
用于特殊 [SEP] 标记的边界框。
pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [PAD] token.
用于特殊 [PAD] 标记的边界框。
pad_token_label (`int`, *optional*, defaults to -100):
The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
CrossEntropyLoss.
用于填充标记的标签。默认为 -100,这是 PyTorch CrossEntropyLoss 的 `ignore_index`。
only_label_first_subword (`bool`, *optional*, defaults to `True`):
Whether or not to only label the first subword, in case word labels are provided.
是否仅标记第一个子词,如果提供了单词标签。
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
标记器使用的额外特殊标记。
"""
# 定义用于加载预训练模型的相关常量和类
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = UdopTokenizer
# 初始化方法,用于设置类的属性
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
eos_token="</s>",
sep_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
additional_special_tokens=None,
**kwargs,
):
# Call the parent class's __init__, forwarding the required arguments and keyword arguments
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
# 添加额外的属性
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
@property
def can_save_slow_tokenizer(self) -> bool:
# 检查是否可以保存慢速的分词器,需要检查词汇文件是否存在
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
**kwargs,
) -> BatchEncoding:
# 检查输入参数,确保至少有 `text` 或 `text_target` 被指定
if text is None and text_target is None:
raise ValueError("You need to specify either `text` or `text_target`.")
if text is not None:
# 如果没有处于目标文本模式,则切换到输入文本模式
if not self._in_target_context_manager:
self._switch_to_input_mode()
# 调用 `call_boxes` 方法处理文本、文本对、框和词标签等参数
encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)
if text_target is not None:
# 切换到目标文本模式
self._switch_to_target_mode()
# 调用 `_call_one` 方法处理目标文本、目标文本对等参数
target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)
# 回到输入文本模式
self._switch_to_input_mode()
# 根据是否有目标文本,返回相应的编码结果
if text_target is None:
return encodings
elif text is None:
return target_encodings
else:
# 将目标文本的 `input_ids` 放入编码结果的 `labels` 键中
encodings["labels"] = target_encodings["input_ids"]
return encodings
@add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
# 定义一个方法用于处理文本、文本对、文本列表或预分词输入,同时接收盒子坐标和词标签等参数
def call_boxes(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
# Build a batched input from the text and the optional text pair
batched_input = [(text, pair)] if pair else [text]
# Encode the batch with the underlying fast tokenizer
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
# Return the token list of the first (and only) encoding
return encodings[0].tokens
# 定义一个方法用于将文本或文本对列表批量编码并处理盒子坐标和词标签等参数
def batch_encode_plus_boxes(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded. This can be a list of
string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
details in `encode_plus`).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# 获取填充和截断策略以及其他相关参数,以确保向后兼容性
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# 调用底层方法 `_batch_encode_plus_boxes` 进行批量编码
return self._batch_encode_plus_boxes(
batch_text_or_text_pairs=batch_text_or_text_pairs, # 待编码的文本或文本对
is_pair=is_pair, # 是否是文本对
boxes=boxes, # 区域框
word_labels=word_labels, # 单词标签
add_special_tokens=add_special_tokens, # 是否添加特殊标记
padding_strategy=padding_strategy, # 填充策略
truncation_strategy=truncation_strategy, # 截断策略
max_length=max_length, # 最大长度
stride=stride, # 步长
is_split_into_words=is_split_into_words, # 是否已拆分为单词
pad_to_multiple_of=pad_to_multiple_of, # 填充至倍数长度
return_tensors=return_tensors, # 是否返回张量
return_token_type_ids=return_token_type_ids, # 是否返回 token 类型 id
return_attention_mask=return_attention_mask, # 是否返回注意力掩码
return_overflowing_tokens=return_overflowing_tokens, # 是否返回溢出的 token
return_special_tokens_mask=return_special_tokens_mask, # 是否返回特殊 token 掩码
return_offsets_mapping=return_offsets_mapping, # 是否返回偏移映射
return_length=return_length, # 是否返回长度
verbose=verbose, # 是否详细输出
**kwargs, # 其他参数
)
# 定义一个方法用于批量编码文本或文本对,支持多种输入类型
def _batch_encode_plus_boxes(
self,
batch_text_or_text_pairs: Union[
List[TextInput], # 输入为单个文本
List[TextInputPair], # 输入为文本对
List[PreTokenizedInput], # 输入为预分词文本
],
is_pair: bool = None, # 标志是否为文本对
boxes: Optional[List[List[List[int]]]] = None, # 相关文本的边框坐标
word_labels: Optional[List[List[int]]] = None, # 文本中单词的标签
add_special_tokens: bool = True, # 是否添加特殊标记
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, # 填充策略
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, # 截断策略
max_length: Optional[int] = None, # 最大长度限制
stride: int = 0, # 截断和填充时的步长
pad_to_multiple_of: Optional[int] = None, # 填充到倍数长度
return_tensors: Optional[str] = None, # 返回的张量类型
return_token_type_ids: Optional[bool] = None, # 是否返回token类型id
return_attention_mask: Optional[bool] = None, # 是否返回attention mask
return_overflowing_tokens: bool = False, # 是否返回超出最大长度的token
return_special_tokens_mask: bool = False, # 是否返回特殊token的mask
return_offsets_mapping: bool = False, # 是否返回偏移映射
return_length: bool = False, # 是否返回编码后的长度
verbose: bool = True, # 是否输出详细信息
**kwargs, # 其他关键字参数
):
# 定义一个方法用于编码单个文本或文本对,支持多种输入类型
def _encode_plus_boxes(
self,
text: Union[TextInput, PreTokenizedInput], # 输入的文本
text_pair: Optional[PreTokenizedInput] = None, # 可选的第二个文本
boxes: Optional[List[List[int]]] = None, # 相关文本的边框坐标
word_labels: Optional[List[int]] = None, # 文本中单词的标签
add_special_tokens: bool = True, # 是否添加特殊标记
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, # 填充策略
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, # 截断策略
max_length: Optional[int] = None, # 最大长度限制
stride: int = 0, # 截断和填充时的步长
pad_to_multiple_of: Optional[int] = None, # 填充到倍数长度
return_tensors: Optional[bool] = None, # 返回的张量类型
return_token_type_ids: Optional[bool] = None, # 是否返回token类型id
return_attention_mask: Optional[bool] = None, # 是否返回attention mask
return_overflowing_tokens: bool = False, # 是否返回超出最大长度的token
return_special_tokens_mask: bool = False, # 是否返回特殊token的mask
return_offsets_mapping: bool = False, # 是否返回偏移映射
return_length: bool = False, # 是否返回编码后的长度
verbose: bool = True, # 是否输出详细信息
**kwargs, # 其他关键字参数
) -> BatchEncoding:
# 将输入组成批处理输入
# 两种选项:
# 1) 只有文本,如果文本必须是一个字符串列表
# 2) 文本 + 文本对,此时文本是字符串,text_pair 是字符串列表
batched_input = [(text, text_pair)] if text_pair else [text]
batched_boxes = [boxes] # 将盒子坐标转为批处理列表
batched_word_labels = [word_labels] if word_labels is not None else None # 将单词标签转为批处理列表,如果不存在则为 None
batched_output = self._batch_encode_plus_boxes(
batched_input,
is_pair=bool(text_pair is not None), # 如果存在 text_pair 则设置为 True
boxes=batched_boxes,
word_labels=batched_word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# 如果返回的张量为 None,并且不返回溢出的 tokens,则移除前导的批处理轴
# 在这种情况下,溢出的 tokens 作为批处理输出返回
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings, # 将编码添加到批处理输出中
)
# 检查是否需要提醒序列过长
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
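The effect of that squeeze can be sketched as follows (assuming `fast_tokenizer` is a loaded `UdopTokenizerFast`; values are illustrative):
enc = fast_tokenizer(["hello", "world"], boxes=[[1, 2, 3, 4], [5, 6, 7, 8]])
enc["input_ids"]   # a flat list of ids: the internal singleton batch axis is removed
enc = fast_tokenizer(["hello", "world"], boxes=[[1, 2, 3, 4], [5, 6, 7, 8]], return_tensors="pt")
enc["input_ids"]   # shape (1, sequence_length): kept batched when tensors are requested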
def encode_boxes(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
`self.convert_tokens_to_ids(self.tokenize(text))`.
Args:
text (`str`, `List[str]` or `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
"""
# 使用 `encode_plus_boxes` 方法对输入文本及其可选的文本对进行编码,同时处理其他参数
encoded_inputs = self.encode_plus_boxes(
text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
return_tensors=return_tensors,
**kwargs,
)
# 返回编码后的输入文本的 `input_ids` 列表
return encoded_inputs["input_ids"]
def encode_plus_boxes(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# 获取填充和截断策略,以及其他参数
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# 调用内部方法 `_encode_plus_boxes` 进行编码,并返回结果
return self._encode_plus_boxes(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast._pad
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
# 方法 `_pad` 负责对编码后的输入进行填充操作,根据传入的参数进行相应的处理
pass
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A UDOP sequence has the following format:
- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# If only one sequence is provided, append the separator token to the end of token_ids_0
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
# Define the separator token as a list
sep = [self.sep_token_id]
# Concatenate token_ids_0, separator, token_ids_1, and another separator
return token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. UDOP does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Define the separator token as a list
sep = [self.sep_token_id]
# If only one sequence is provided, return a list of zeros of length equal to token_ids_0 + separator
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
# If two sequences are provided, return a list of zeros of length equal to token_ids_0 + separator + token_ids_1 + separator
return len(token_ids_0 + sep + token_ids_1 + sep) * [0]
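Concretely (a sketch with hypothetical ids), the returned mask only encodes the total length, separators included:
fast_tokenizer.create_token_type_ids_from_sequences([10, 20])        # -> [0, 0, 0]
fast_tokenizer.create_token_type_ids_from_sequences([10, 20], [30])  # -> [0, 0, 0, 0, 0]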
# Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary to a directory. This method is adapted from the LayoutXLMTokenizerFast class.
Args:
save_directory (`str`):
Directory where the vocabulary will be saved.
filename_prefix (`str`, *optional*):
Optional prefix to prepend to the vocabulary filename.
Returns:
`Tuple[str]`: Tuple containing the path to the saved vocabulary file.
Raises:
ValueError: If the fast tokenizer cannot save the vocabulary.
"""
# Check if the fast tokenizer has the capability to save the vocabulary
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# Ensure save_directory exists and is a directory; log an error if not
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocabulary file path is different from the desired output path, copy the vocabulary file
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return the path to the saved vocabulary file
return (out_vocab_file,)