Transformers 源码解析(六十一)
.\models\kosmos2\processing_kosmos2.py
"""KOSMOS-2 的处理器类。"""
import copy
import math
import re
from typing import List, Optional, Tuple, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, is_batched
from ...processing_utils import ProcessorMixin
from ...tokenization_utils import AddedToken
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
from ...utils import TensorType
# Accepted shapes for the `bboxes` argument: a list (one entry per `<phrase>` pair) whose entries are either
# a single box or a list of boxes. A box is a pair of patch indices `(int, int)` or four normalized
# coordinates `(float, float, float, float)`.
BboxInput = Union[
    List[Tuple[int, int]],
    List[Tuple[float, float, float, float]],
    List[List[Tuple[int, int]]],
    # Coordinate boxes always carry four floats; the nested variant previously listed only three.
    List[List[Tuple[float, float, float, float]]],
]
class Kosmos2Processor(ProcessorMixin):
"""
Constructs a KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
processor.
[`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
[`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and
[`~Kosmos2Processor.decode`] for more information.
Args:
image_processor (`CLIPImageProcessor`):
An instance of [`CLIPImageProcessor`]. The image processor is a required input.
tokenizer (`XLMRobertaTokenizerFast`):
An instance of [`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
num_patch_index_tokens (`int`, *optional*, defaults to 1024):
The number of tokens that represent patch indices.
"""
# Names of the two wrapped components, in the order they are passed to `ProcessorMixin.__init__`.
attributes = ["image_processor", "tokenizer"]
# Class names of the wrapped components (presumably resolved by `ProcessorMixin` — confirm there).
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024):
    """
    Register the KOSMOS-2 tag tokens and patch-index tokens on `tokenizer`, then wrap both components.

    Args:
        image_processor: The image processor to wrap.
        tokenizer: The tokenizer to wrap. It is mutated: `return_token_type_ids` is set to `False` and the
            tag / patch-index tokens are added to it.
        num_patch_index_tokens (`int`, defaults to 1024): How many `<patch_index_XXXX>` tokens to create.
    """
    tokenizer.return_token_type_ids = False

    # Structural tag tokens.
    self.eod_token = "</doc>"
    self.boi_token = "<image>"
    self.eoi_token = "</image>"
    self.eoc_token = "</chunk>"
    self.eol_token = "</line>"
    self.bop_token = "<phrase>"
    self.eop_token = "</phrase>"
    self.boo_token = "<object>"
    self.eoo_token = "</object>"
    self.dom_token = "<|delimiter_of_multi_objects|>"
    self.grd_token = "<grounding>"

    self.tag_tokens = [
        self.eod_token,
        self.boi_token,
        self.eoi_token,
        self.eoc_token,
        self.eol_token,
        self.bop_token,
        self.eop_token,
        self.boo_token,
        self.eoo_token,
        self.dom_token,
        self.grd_token,
    ]

    self.num_patch_index_tokens = num_patch_index_tokens
    # One token per patch index, zero-padded to four digits.
    patch_index_tokens = [f"<patch_index_{index:04d}>" for index in range(self.num_patch_index_tokens)]

    tokenizer.add_tokens(
        [
            AddedToken(token_text, lstrip=True, rstrip=False, normalized=False)
            for token_text in self.tag_tokens + patch_index_tokens
        ]
    )

    super().__init__(image_processor, tokenizer)
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, List[TextInput]] = None,
bboxes: BboxInput = None,
num_image_tokens: Optional[int] = 64,
first_image_token_id: Optional[int] = None,
add_special_tokens: bool = True,
add_eos_token: bool = False,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
def _check_bboxes_for_single_text(self, bboxes):
    """
    Validate the `bboxes` given for one text example.

    `bboxes` may be `None` (no boxes for the text) or a list with one entry per `<phrase> ... </phrase>` pair.
    Each entry may be:
        - `None`: the phrase has no bounding box;
        - a tuple of 2 integers: one box given as patch indices;
        - a tuple of 4 floats: one box given as (normalized) coordinates;
        - a list of such tuples: several boxes for the same phrase.

    Raises:
        ValueError: If `bboxes` is neither `None` nor a list, or if any entry has another shape.
    """
    if bboxes is None:
        return
    if not isinstance(bboxes, list):
        raise ValueError("`bboxes` (for a single text example) should be `None` or a list.")

    def _is_valid_box(candidate):
        # A box is a 2-tuple of ints or a 4-tuple of floats; anything else is rejected.
        if not isinstance(candidate, tuple):
            return False
        if len(candidate) == 2:
            return all(isinstance(value, int) for value in candidate)
        if len(candidate) == 4:
            return all(isinstance(value, float) for value in candidate)
        return False

    for entry in bboxes:
        if entry is None:
            continue
        # Normalize a single box into a one-element list so both forms share one check.
        boxes = entry if isinstance(entry, list) else [entry]
        for candidate in boxes:
            if not _is_valid_box(candidate):
                raise ValueError(
                    "Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing "
                    "2 integers or 4 float point numbers, or a list containing such tuples. Also "
                    "make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in "
                    "batches or both for a single example."
                )
def _preprocess_single_example(self, text, image, bboxes, img_info_tokens):
    """
    Prepare one text example: strip it, prepend the image placeholder tokens when an image is given, and
    insert the patch-index tokens for `bboxes`.
    """
    stripped = text.strip()
    # The image placeholder block is only added for examples that actually have an image.
    prefixed = stripped if image is None else f"{img_info_tokens} {stripped}"
    return self._insert_patch_index_tokens(prefixed, bboxes)
) -> Union[str, List[str]]:
"""Add image and bounding box information to `texts` as image and patch index tokens.
Args:
texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
images (`ImageInput`, *optional*): The images associated to `texts`.
bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
The bounding bboxes associated to `texts`.
num_image_tokens (`int`, *optional*, defaults to 64):
The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
attribute in `Kosmos2Config`.
Returns:
`Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
"""
img_tokens = [self.boi_token] * num_image_tokens
img_info_tokens = " ".join([self.boi_token] + img_tokens + [self.eoi_token])
batched = True
if isinstance(texts, str):
batched = False
texts = [texts]
if images is None:
images = [None] * len(texts)
elif not is_batched(images):
images = [images]
if len(texts) != len(images):
raise ValueError(
f"The number of examples in `texts` and `images` should be the same. Got {len(texts)} v.s. {len(images)} instead."
)
if not batched:
self._check_bboxes_for_single_text(bboxes)
bboxes = [bboxes]
elif bboxes is not None:
if not isinstance(bboxes, list):
raise ValueError("`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.")
for x in bboxes:
self._check_bboxes_for_single_text(x)
else:
bboxes = [None] * len(texts)
if len(bboxes) != len(texts):
raise ValueError(
f"The number of examples in `texts` and `bboxes` should be the same. Got {len(texts)} v.s. {len(bboxes)} instead."
)
result = [
self._preprocess_single_example(text, image, bbox, img_info_tokens)
for text, image, bbox in zip(texts, images, bboxes)
]
if not batched:
result = result[0]
return result
def batch_decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's `batch_decode`, passing every positional and keyword argument through
    unchanged. See [`~PreTrainedTokenizer.batch_decode`] for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's `decode`, passing every positional and keyword argument through
    unchanged. See [`~PreTrainedTokenizer.decode`] for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
def post_process_generation(self, text, cleanup_and_extract=True):
    """
    Post-process generated text.

    Everything up to and including the last end-of-image token is dropped. When `cleanup_and_extract` is true
    the remaining caption is passed to `clean_text_and_extract_entities_with_bboxes` and that result is
    returned; otherwise the raw caption string is returned.
    """
    segments = text.split(self.eoi_token)
    caption = segments[-1]
    if not cleanup_and_extract:
        return caption
    return clean_text_and_extract_entities_with_bboxes(caption)
@property
def model_input_names(self):
    """Tokenizer input names followed by image-processor input names, with duplicates removed (first wins)."""
    combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
    ordered = {}
    for input_name in combined:
        # A dict keeps insertion order, so this de-duplicates while preserving first occurrence.
        ordered.setdefault(input_name, None)
    return list(ordered)
def _insert_patch_index_tokens(self, text: str, bboxes: Union[List[Tuple[int]], List[Tuple[float]]]) -> str:
    """
    Insert `<object> ... </object>` patch-index annotations after each `<phrase> ... </phrase>` pair.

    Args:
        text: Text containing zero or more `<phrase> ... </phrase>` pairs.
        bboxes: `None`/empty (text is returned unchanged) or one entry per phrase pair. An entry is `None`
            (nothing inserted), a single box tuple, or a list of box tuples.

    Returns:
        The text with an `<object>` block appended right after every phrase that has at least one box.

    Raises:
        ValueError: If the number of entries differs from the number of phrase pairs, or a list of boxes
            for one phrase contains `None`.
    """
    if bboxes is None or len(bboxes) == 0:
        return text

    phrase_matches = list(re.finditer(r"<phrase>.+?</phrase>", string=text))
    if len(phrase_matches) != len(bboxes):
        raise ValueError(
            f"The number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got {len(phrase_matches)} v.s. {len(bboxes)} instead."
        )

    pieces = []
    cursor = 0
    for phrase_match, phrase_boxes in zip(phrase_matches, bboxes):
        # Copy everything up to the end of this phrase, then decide whether to annotate it.
        phrase_end = phrase_match.end()
        pieces.append(text[cursor:phrase_end])
        cursor = phrase_end

        if phrase_boxes is None:
            continue
        if isinstance(phrase_boxes, tuple):
            phrase_boxes = [phrase_boxes]
        if any(box is None for box in phrase_boxes):
            raise ValueError(
                "The multiple bounding boxes for a single phrase should not contain any `None` value."
            )

        index_pairs = []
        for box in phrase_boxes:
            first_token, second_token = self._convert_bbox_to_patch_index_tokens(box)
            index_pairs.append(f"{first_token} {second_token}")
        if not index_pairs:
            continue

        joined_pairs = " <|delimiter_of_multi_objects|> ".join(index_pairs)
        pieces.append(f"<object> {joined_pairs} </object>")

    # Whatever follows the last phrase (possibly nothing) is copied verbatim.
    pieces.append(text[cursor:])
    return "".join(pieces)
def _convert_bbox_to_patch_index_tokens(
    self, bbox: Union[Tuple[int, int], Tuple[float, float, float, float]]
) -> Tuple[str, str]:
    """
    Turn one bounding box into its two `<patch_index_XXXX>` token strings.

    A 2-element `bbox` is taken as the patch indices directly; any other length is treated as coordinates
    and converted with `coordinate_to_patch_index` on a grid of `int(sqrt(num_patch_index_tokens))` patches
    per side.
    """
    if len(bbox) != 2:
        patches_per_side = int(math.sqrt(self.num_patch_index_tokens))
        bbox = coordinate_to_patch_index(bbox, patches_per_side)
    first_index, second_index = bbox
    first_token = f"<patch_index_{str(first_index).zfill(4)}>"
    second_token = f"<patch_index_{str(second_index).zfill(4)}>"
    return first_token, second_token
def coordinate_to_patch_index(bbox: Tuple[float, float, float, float], num_patches_per_side: int) -> Tuple[int, int]:
    """Map a coordinate bounding box onto the patch grid.

    Args:
        bbox (`Tuple[float, float, float, float]`):
            `(x1, y1, x2, y2)` giving the upper-left and lower-right corners; requires `x2 > x1` and `y2 > y1`.
        num_patches_per_side (`int`): number of patches along each side of the grid.

    Returns:
        `Tuple[int, int]`: row-major indices of the upper-left patch and of the lower-right patch.

    Raises:
        ValueError: if the corners are not strictly ordered.
    """
    x1, y1, x2, y2 = bbox

    if not (x2 > x1 and y2 > y1):
        raise ValueError("The coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.")

    side = num_patches_per_side
    # Upper-left corner rounds down to the cell containing it; lower-right rounds up to the last cell touched.
    left_col = math.floor(x1 * side)
    top_row = math.floor(y1 * side)
    right_col = math.ceil(x2 * side - 1)
    bottom_row = math.ceil(y2 * side - 1)

    return top_row * side + left_col, bottom_row * side + right_col
def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):
    """
    Convert a pair of row-major patch indices back into a normalized bounding box.

    Args:
        ul_idx (`int`): index of the grid cell holding the upper-left corner.
        lr_idx (`int`): index of the grid cell holding the lower-right corner.
        num_patches_per_side (`int`): number of patches along each side of the grid.

    Returns:
        `Tuple[float]`: `(x1, y1, x2, y2)`. When both cells share a column or a row (including the same cell),
        the box runs from the upper-left cell's top-left corner to the lower-right cell's bottom-right corner;
        otherwise both corners are placed at the centers of their cells.
    """
    cell_size = 1.0 / num_patches_per_side

    ul_y, ul_x = divmod(ul_idx, num_patches_per_side)
    lr_y, lr_x = divmod(lr_idx, num_patches_per_side)

    # Identical indices imply a shared column, so that case is covered by this condition as well.
    if ul_x == lr_x or ul_y == lr_y:
        return (
            ul_x * cell_size,
            ul_y * cell_size,
            lr_x * cell_size + cell_size,
            lr_y * cell_size + cell_size,
        )

    return (
        ul_x * cell_size + cell_size / 2,
        ul_y * cell_size + cell_size / 2,
        lr_x * cell_size + cell_size / 2,
        lr_y * cell_size + cell_size / 2,
    )
def extract_entities_with_patch_indices(text):
    """Extract the entities contained in `text`, with bounding boxes given as patch-index pairs.

    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes`, where
    further processing happens (conversion to normalized coordinates and whitespace cleanup).

    Args:
        text (`str`): Generated text containing `<object>...</object>` blocks, each optionally preceded by a
            `<phrase>...</phrase>` pair.

    Returns:
        `List[Tuple[str, Tuple[int, int], List[Tuple[int, int]]]]`: one `(name, span, boxes)` tuple per entity.
        For an `<object>` block preceded by a phrase, `name` is the phrase text, `span` its character span in
        `text`, and `boxes` all patch-index pairs of the block. For a block without a phrase, one entity is
        emitted per box, named after its patch indices, with an empty span at the start of the block.

    Examples:

    ```
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```"""
    delimiter = "<|delimiter_of_multi_objects|>"
    index_pair = r"<patch_index_\d+><patch_index_\d+>"
    # The delimiter contains `|`, which is regex alternation unless escaped. Unescaped, blocks holding
    # several boxes never matched and were silently dropped.
    pattern = (
        r"(?:(<phrase>([^<]+)</phrase>))?"
        rf"<object>((?:{index_pair}{re.escape(delimiter)})*{index_pair})</object>"
    )

    entities_with_patch_indices = []
    for match in re.finditer(pattern, text):
        phrase_tag, phrase, match_content = match.groups()
        if phrase_tag:
            span = match.span(2)
        else:
            # No phrase: anchor the entity with an empty span at the start of the `<object>` block.
            phrase = None
            span = (match.start(0), match.start(0))

        entity_bboxes = []
        for pair_text in match_content.split(delimiter):
            indices = re.findall(r"<patch_index_(\d+)>", pair_text)
            if len(indices) == 2:
                entity_bboxes.append((int(indices[0]), int(indices[1])))

        if phrase:
            entities_with_patch_indices.append((phrase, span, entity_bboxes))
        else:
            for bbox in entity_bboxes:
                entity = f"<patch_index_{bbox[0]}><patch_index_{bbox[1]}>"
                entities_with_patch_indices.append((entity, span, [bbox]))

    return entities_with_patch_indices
def adjust_entity_positions(entity, text):
    """Re-express an entity's `(start, end)` span relative to `text` with all `<...>` fields removed.

    `entity` is `(name, (start, end))` with offsets into the tagged `text`; the returned tuple has the same
    name and offsets counted after the tags preceding each position are stripped.
    """
    entity_name, (start, end) = entity

    def _visible_length(prefix):
        # Length of the prefix once every `<...>` field has been deleted.
        return len(re.sub("<.*?>", "", prefix))

    return (entity_name, (_visible_length(text[:start]), _visible_length(text[:end])))
def _cleanup_spaces(text, entities):
    """Strip surrounding whitespace from `text` and from each entity name, shifting spans to match.

    Returns `(stripped_text, entities)` where each entity is `(stripped_name, (start, end), bboxes)`; `start`
    and `end` are corrected for the whitespace removed from the front of `text` and around the name.
    """
    text_offset = len(text) - len(text.lstrip())

    cleaned_entities = []
    for name, span, boxes in entities:
        start, end = span
        name_lead = len(name) - len(name.lstrip())
        name_trail = len(name) - len(name.rstrip())
        adjusted_span = (start - text_offset + name_lead, end - text_offset - name_trail)
        cleaned_entities.append((name.strip(), adjusted_span, boxes))

    return text.strip(), cleaned_entities
def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):
    """Remove the tag tokens from `text`, extract the entities in it, and clean up some whitespace.

    Args:
        text (`str`): Generated text containing tag tokens.
        num_patches_per_side (`int`, defaults to 32): Grid size used to turn patch indices into coordinates.

    Returns:
        `Tuple[str, list]`: the cleaned text and a list of `(name, (start, end), boxes)` entities, where
        `boxes` holds normalized `(x1, y1, x2, y2)` coordinates.

    Examples:

    ```
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'
    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```"""
    untagged_text = re.sub("<.*?>", "", text)

    entities = []
    for name, span, patch_pairs in extract_entities_with_patch_indices(text):
        # Spans are first re-based onto the untagged text; whitespace is handled at the very end.
        adjusted = adjust_entity_positions((name, span), text)
        coordinates = [
            patch_index_to_coordinate(pair[0], pair[1], num_patches_per_side) for pair in patch_pairs
        ]
        entities.append(adjusted + (coordinates,))

    return _cleanup_spaces(untagged_text, entities)
.\models\kosmos2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
# Maps each submodule name to the public names it exposes; handed to `_LazyModule` at the bottom.
_import_structure = {
"configuration_kosmos2": ["KOSMOS2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Kosmos2Config"],
"processing_kosmos2": ["Kosmos2Processor"],
}
# The modeling submodule is only registered when PyTorch is available.
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_kosmos2"] = [
"KOSMOS2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Kosmos2ForConditionalGeneration",
"Kosmos2Model",
"Kosmos2PreTrainedModel",
]
# Under static type checking the names are imported eagerly, mirroring `_import_structure`.
if TYPE_CHECKING:
from .configuration_kosmos2 import KOSMOS2_PRETRAINED_CONFIG_ARCHIVE_MAP, Kosmos2Config
from .processing_kosmos2 import Kosmos2Processor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_kosmos2 import (
KOSMOS2_PRETRAINED_MODEL_ARCHIVE_LIST,
Kosmos2ForConditionalGeneration,
Kosmos2Model,
Kosmos2PreTrainedModel,
)
else:
import sys
# At runtime this module is replaced in `sys.modules` by a `_LazyModule` built from `_import_structure`.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\layoutlm\configuration_layoutlm.py
""" LayoutLM model configuration"""
from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from ... import PretrainedConfig, PreTrainedTokenizer
from ...onnx import OnnxConfig, PatchingSpec
from ...utils import TensorType, is_torch_available, logging
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
# Maps each pretrained LayoutLM checkpoint identifier to the URL of its hosted `config.json`.
LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/layoutlm-base-uncased": (
"https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/config.json"
),
"microsoft/layoutlm-large-uncased": (
"https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/config.json"
),
}
class LayoutLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LayoutLMModel`]. It is used to instantiate a
LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the LayoutLM
[microsoft/layoutlm-base-uncased](https://huggingface.co/microsoft/layoutlm-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`]; extra keyword arguments are forwarded to its
constructor. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Number of rows in the word embedding table.
hidden_size (`int`, *optional*, defaults to 768):
Width of the embeddings and of the attention / feed-forward projections.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of `LayoutLMLayer` modules stacked in the encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads. `hidden_size` is expected to be a multiple of it.
intermediate_size (`int`, *optional*, defaults to 3072):
Output width of the intermediate (feed-forward) dense layer.
hidden_act (`str` or callable, *optional*, defaults to `"gelu"`):
Activation of the intermediate layer. A string is looked up in `ACT2FN`; anything else is used as is.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
Dropout probability applied to the embeddings and to the attention / feed-forward output projections.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
Dropout probability configured on the self-attention module.
max_position_embeddings (`int`, *optional*, defaults to 512):
Number of rows in the 1D position embedding table.
type_vocab_size (`int`, *optional*, defaults to 2):
Number of rows in the token type embedding table.
initializer_range (`float`, *optional*, defaults to 0.02):
Stored on the configuration; presumably the standard deviation used for weight initialization
(the consumer is not visible in this file).
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
Epsilon passed to the layer normalization layers.
pad_token_id (`int`, *optional*, defaults to 0):
Forwarded to [`PretrainedConfig`]; also used as `padding_idx` of the word embedding table.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
Read by the self-attention module; `"relative_key"` and `"relative_key_query"` additionally create a
distance embedding there.
use_cache (`bool`, *optional*, defaults to `True`):
Stored on the configuration (the consumer is not visible in this file).
max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
Number of rows in each of the x / y / height / width 2D position embedding tables.
Examples:
```
>>> from transformers import LayoutLMConfig, LayoutLMModel
>>> # Initializing a LayoutLM configuration
>>> configuration = LayoutLMConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = LayoutLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "layoutlm"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
use_cache=True,
max_2d_position_embeddings=1024,
**kwargs,
):
# `pad_token_id` and any extra keyword arguments are handled by the base configuration class.
super().__init__(pad_token_id=pad_token_id, **kwargs)
# Every remaining argument is stored verbatim as an attribute of the same name.
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.max_2d_position_embeddings = max_2d_position_embeddings
class LayoutLMOnnxConfig(OnnxConfig):
    """ONNX export configuration for LayoutLM, adding the `bbox` input to the exported inputs."""

    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
    ):
        super().__init__(config, task=task, patching_specs=patching_specs)
        # Largest valid index into the 2D position embedding tables.
        self.max_2d_positions = config.max_2d_position_embeddings - 1

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to its dynamic axes (axis 0 is batch, axis 1 is sequence)."""
        input_names = ("input_ids", "bbox", "attention_mask", "token_type_ids")
        return OrderedDict((input_name, {0: "batch", 1: "sequence"}) for input_name in input_names)

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Generate inputs to provide to the ONNX exporter for the specific framework.

        Args:
            tokenizer: The tokenizer associated with this model configuration.
            batch_size: The batch size (int) to export the model for (-1 means dynamic axis).
            seq_length: The sequence length (int) to export the model for (-1 means dynamic axis).
            is_pair: Indicate if the input is a pair (sentence 1, sentence 2).
            framework: The framework (optional) the tokenizer will generate tensors for.

        Returns:
            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function.

        Raises:
            NotImplementedError: If `framework` is not PyTorch.
            ValueError: If PyTorch is not installed.
        """
        input_dict = super().generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        if framework != TensorType.PYTORCH:
            raise NotImplementedError("Exporting LayoutLM to ONNX is currently only supported for PyTorch.")
        if not is_torch_available():
            raise ValueError("Cannot generate dummy inputs without PyTorch installed.")

        import torch

        # The same dummy box is repeated for every token of every sequence in the batch.
        dummy_box = [48, 84, 73, 128]
        batch_size, seq_length = input_dict["input_ids"].shape
        input_dict["bbox"] = torch.tensor([dummy_box] * seq_length).tile(batch_size, 1, 1)
        return input_dict
.\models\layoutlm\modeling_layoutlm.py
""" PyTorch LayoutLM 模型。"""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlm import LayoutLMConfig
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
# Names used by the docstring decorators imported above (config class and example checkpoint).
_CONFIG_FOR_DOC = "LayoutLMConfig"
_CHECKPOINT_FOR_DOC = "microsoft/layoutlm-base-uncased"
# Identifiers of the known pretrained LayoutLM checkpoints.
LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"layoutlm-base-uncased",
"layoutlm-large-uncased",
]
# LayoutLM uses the stock PyTorch layer normalization under its own alias.
LayoutLMLayerNorm = nn.LayerNorm
class LayoutLMEmbeddings(nn.Module):
    """Build the input embeddings from word, 1D position, 2D bounding-box and token-type embeddings."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        max_2d = config.max_2d_position_embeddings

        self.word_embeddings = nn.Embedding(config.vocab_size, hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden_size)
        # Separate tables for the box's x / y coordinates and for its height / width.
        self.x_position_embeddings = nn.Embedding(max_2d, hidden_size)
        self.y_position_embeddings = nn.Embedding(max_2d, hidden_size)
        self.h_position_embeddings = nn.Embedding(max_2d, hidden_size)
        self.w_position_embeddings = nn.Embedding(max_2d, hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden_size)

        self.LayerNorm = LayoutLMLayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Default position ids `[[0, 1, ..., max_position_embeddings - 1]]`; not saved in the state dict.
        default_position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_position_ids, persistent=False)

    def forward(
        self,
        input_ids=None,
        bbox=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        """
        Sum all embedding components, then apply layer normalization and dropout.

        Either `input_ids` or `inputs_embeds` must be given. `bbox` is indexed as `bbox[:, :, k]` with `k` in
        0..3 for the left, upper, right and lower coordinates. Missing `position_ids` default to the leading
        `seq_length` entries of the registered buffer, and missing `token_type_ids` default to zeros.

        Raises:
            IndexError: If a `bbox` coordinate is outside the 2D position embedding tables.
        """
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            input_shape = input_ids.size()
            device = input_ids.device
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)

        try:
            left = self.x_position_embeddings(bbox[:, :, 0])
            upper = self.y_position_embeddings(bbox[:, :, 1])
            right = self.x_position_embeddings(bbox[:, :, 2])
            lower = self.y_position_embeddings(bbox[:, :, 3])
        except IndexError as e:
            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e

        height = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1])
        width = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0])
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # Accumulate left to right in a fixed order so the floating-point result is reproducible.
        embeddings = inputs_embeds
        for component in (
            position_embeddings,
            left,
            upper,
            right,
            lower,
            height,
            width,
            token_type_embeddings,
        ):
            embeddings = embeddings + component

        return self.dropout(self.LayerNorm(embeddings))
class LayoutLMSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional relative position scores and key/value caching."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # An explicit argument wins; otherwise fall back to the config value, then to "absolute".
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            # One embedding per possible signed distance in [-(max - 1), max - 1].
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape `(batch, seq, all_head_size)` into `(batch, heads, seq, head_size)`."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).

        The body used to be a bare `pass`, so it returned `None` and `LayoutLMAttention.forward` failed on
        `self_outputs[0]`. The tuple layout below is the one `LayoutLMLayer.forward` consumes.

        Args:
            hidden_states: Query source, indexed as `(batch, seq, hidden_size)` by `transpose_for_scores`.
            attention_mask: Optional additive mask, added to the raw attention scores.
            head_mask: Optional multiplicative mask applied to the attention probabilities.
            encoder_hidden_states: When given, keys and values are computed from it (cross-attention).
            encoder_attention_mask: Mask used in place of `attention_mask` for cross-attention.
            past_key_value: Optional cached `(key, value)` pair, each `(batch, heads, seq, head_size)`.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(context,)`, followed by the attention probabilities when `output_attentions` is set, followed by
            the `(key, value)` pair when the module is a decoder.
        """
        mixed_query_layer = self.query(hidden_states)

        # When `encoder_hidden_states` is given the module acts as cross-attention, and the
        # encoder's mask replaces the self-attention mask.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # Cross-attention keys/values do not change between decoding steps: reuse the cache.
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Self-attention with a cache: append the new keys/values along the sequence axis.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # Decoders hand their keys/values back so the caller can cache them for the next step.
            past_key_value = (key_layer, value_layer)

        # Raw attention scores: dot product between queries and keys.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                # With a cache only the newest query position is present.
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            # Shift distances into [0, 2 * max - 2] so they index the embedding table.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # Dropout here removes whole tokens to attend to.
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size).
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
class LayoutLMSelfOutput(nn.Module):
    """Output projection of the attention block: dense -> dropout -> residual add -> layer norm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, add the residual `input_tensor`, and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class LayoutLMAttention(nn.Module):
    """Self-attention followed by its output projection, with support for pruning attention heads."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = LayoutLMSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = LayoutLMSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections."""
        if len(heads) == 0:
            return

        attention = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attention.num_attention_heads, attention.attention_head_size, self.pruned_heads
        )

        # Shrink the three input projections, then the output projection along its input dimension.
        for projection_name in ("query", "key", "value"):
            pruned = prune_linear_layer(getattr(attention, projection_name), index)
            setattr(attention, projection_name, pruned)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping in sync with the new head count.
        attention.num_attention_heads = attention.num_attention_heads - len(heads)
        attention.all_head_size = attention.attention_head_size * attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run self-attention, project its first output, and pass any extra outputs through unchanged."""
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(self_outputs[0], hidden_states)
        return (projected,) + self_outputs[1:]
class LayoutLMIntermediate(nn.Module):
    """First half of the feed-forward block: a dense expansion followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string names an entry of `ACT2FN`; anything else is used as the activation directly.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dense layer and then the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))
class LayoutLMOutput(nn.Module):
    """Second half of the feed-forward block: dense -> dropout -> residual add -> layer norm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` back to the hidden size, add the residual `input_tensor`, and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class LayoutLMLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LayoutLMAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = LayoutLMAttention(config, position_embedding_type="absolute")
self.intermediate = LayoutLMIntermediate(config)
self.output = LayoutLMOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
    """
    Apply the feed-forward sub-block to one chunk of the attention output.

    The chunk is passed through `self.intermediate`; the result and the untouched chunk are then handed to
    `self.output`, whose return value is returned.
    """
    return self.output(self.intermediate(attention_output), attention_output)
class LayoutLMEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `LayoutLMLayer` modules applied one after another."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        layers = [LayoutLMLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """
        Feed `hidden_states` through every layer and collect the requested intermediate results.

        Args:
            hidden_states: Input of the first layer.
            attention_mask: Passed unchanged to every layer.
            head_mask: Indexed per layer (`head_mask[i]`) when not `None`.
            encoder_hidden_states: Passed unchanged to every layer.
            encoder_attention_mask: Passed unchanged to every layer.
            past_key_values: Indexed per layer (`past_key_values[i]`) when not `None`.
            use_cache: When truthy, the last output of every layer is collected as cache. Forced to `False`
                while gradient checkpointing is active in training mode.
            output_attentions: When truthy, collect the self-attentions (and the cross-attentions if
                `config.add_cross_attention` is set).
            output_hidden_states: When truthy, collect the input of every layer plus the final output.
            return_dict: When falsy, return a plain tuple of the non-`None` results.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions`, or a tuple of its non-`None` fields.
        """
        collected_hidden = () if output_hidden_states else None
        collected_self_attn = () if output_attentions else None
        collected_cross_attn = () if output_attentions and self.config.add_cross_attention else None

        use_checkpointing = self.gradient_checkpointing and self.training
        if use_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        cache = () if use_cache else None

        for idx, block in enumerate(self.layer):
            if output_hidden_states:
                collected_hidden = collected_hidden + (hidden_states,)

            # Positional arguments are identical for the checkpointed and the direct call.
            block_args = (
                hidden_states,
                attention_mask,
                None if head_mask is None else head_mask[idx],
                encoder_hidden_states,
                encoder_attention_mask,
                None if past_key_values is None else past_key_values[idx],
                output_attentions,
            )
            if use_checkpointing:
                block_out = self._gradient_checkpointing_func(block.__call__, *block_args)
            else:
                block_out = block(*block_args)

            hidden_states = block_out[0]
            if use_cache:
                cache += (block_out[-1],)
            if output_attentions:
                collected_self_attn = collected_self_attn + (block_out[1],)
                if self.config.add_cross_attention:
                    collected_cross_attn = collected_cross_attn + (block_out[2],)

        if output_hidden_states:
            collected_hidden = collected_hidden + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=cache,
                hidden_states=collected_hidden,
                attentions=collected_self_attn,
                cross_attentions=collected_cross_attn,
            )

        candidates = [hidden_states, cache, collected_hidden, collected_self_attn, collected_cross_attn]
        return tuple(item for item in candidates if item is not None)
class LayoutLMPooler(nn.Module):
    """Pool a sequence by projecting the hidden state at index 0 through a linear layer and a tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # Only the element at index 0 along dimension 1 is used for pooling.
        return self.activation(self.dense(hidden_states[:, 0]))
class LayoutLMPredictionHeadTransform(nn.Module):
    """Linear projection, activation and layer normalization applied before the prediction decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # `hidden_act` is either a key of `ACT2FN` or already a callable.
        activation = config.hidden_act
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(act(dense(hidden_states)))`."""
        projected = self.dense(hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)
class LayoutLMLMPredictionHead(nn.Module):
    """Transform hidden states and project them onto the vocabulary."""

    def __init__(self, config):
        super().__init__()
        self.transform = LayoutLMPredictionHeadTransform(config)

        # The decoder is created without a bias; a separate parameter is created and attached below.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Link the two so that `self.bias` and `self.decoder.bias` are the same parameter object.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return `decoder(transform(hidden_states))`."""
        return self.decoder(self.transform(hidden_states))
class LayoutLMOnlyMLMHead(nn.Module):
    """Thin wrapper exposing a `LayoutLMLMPredictionHead` under the attribute name `predictions`."""

    def __init__(self, config):
        super().__init__()
        self.predictions = LayoutLMLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """Return the prediction scores computed by `self.predictions` for `sequence_output`."""
        return self.predictions(sequence_output)
class LayoutLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = LayoutLMConfig
    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "layoutlm"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Initialize the weights of a single sub-module.

        Linear and embedding weights are drawn from a normal distribution with standard deviation
        `config.initializer_range`; linear biases and the embedding row at `padding_idx` are zeroed. Layer-norm
        modules get a zero bias and a weight of 1.0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, LayoutLMLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
# Shared introduction prepended to the class docstrings of the LayoutLM models.
LAYOUTLM_START_DOCSTRING = r"""
    The LayoutLM model was proposed in [LayoutLM: Pre-training of Text and Layout for Document Image
    Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei and
    Ming Zhou.

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared description of the forward arguments. The `{0}` placeholder is filled with the leading input shape via
# `LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")`, so the text must not contain other braces.
LAYOUTLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence token, in (x0, y0, x1, y1) format where (x0, y0) is the upper left
            corner and (x1, y1) the lower right corner. Selected in the range `[0,
            config.max_2d_position_embeddings - 1]`.
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: `1` for
            tokens that are NOT MASKED, `0` for MASKED tokens.
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs.
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: `1`
            indicates the head is not masked, `0` indicates the head is masked.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        output_attentions (`bool`, *optional*):
            If set to `True`, the attentions tensors of all attention layers are returned.
        output_hidden_states (`bool`, *optional*):
            If set to `True`, the hidden states of all layers are returned.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# NOTE: the class docstring is assembled by `add_start_docstrings`; none is written here so that nothing gets
# appended to the generated text.
@add_start_docstrings(
    "The bare LayoutLM Model transformer outputting raw hidden-states without any specific head on top.",
    LAYOUTLM_START_DOCSTRING,
)
class LayoutLMModel(LayoutLMPreTrainedModel):
    def __init__(self, config):
        super(LayoutLMModel, self).__init__(config)
        self.config = config

        self.embeddings = LayoutLMEmbeddings(config)
        self.encoder = LayoutLMEncoder(config)
        self.pooler = LayoutLMPooler(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the `word_embeddings` attribute of the embeddings module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the `word_embeddings` attribute of the embeddings module with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model.

        Args:
            heads_to_prune (dict): Dictionary of {layer_num: list of heads to prune in this layer}.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        Returns:

        """
        # The argument description is prepended by `add_start_docstrings_to_model_forward`, and the bare
        # `Returns:` line above is the anchor that `replace_return_docstrings` fills in.
        #
        # Flow: resolve defaults from the config, build default mask/segment/bbox tensors, embed the inputs,
        # run the encoder and pool its output. No ancestor of this class implements `forward`, so the
        # computation has to live here.
        #
        # NOTE(review): `encoder_hidden_states` / `encoder_attention_mask` are accepted for interface
        # compatibility but are not forwarded to the encoder (matches the upstream implementation) — confirm.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing embedding dimension.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Defaults: attend everywhere, a single segment, all-zero bounding boxes.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if bbox is None:
            bbox = torch.zeros(input_shape + (4,), dtype=torch.long, device=device)

        # Turn the 0/1 mask into an additive mask that broadcasts over two extra dimensions:
        # kept positions become 0, masked positions the most negative value of the model dtype.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        if head_mask is not None:
            # Expand a 1-D or 2-D head mask so it can be indexed per layer by the encoder.
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)
        else:
            head_mask = [None] * self.config.num_hidden_layers

        # NOTE(review): `LayoutLMEmbeddings` is defined outside this excerpt; keyword names assumed — confirm.
        embedding_output = self.embeddings(
            input_ids=input_ids,
            bbox=bbox,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
# LayoutLMForMaskedLM derives from LayoutLMPreTrainedModel and combines a LayoutLMModel backbone (`self.layoutlm`)
# with a LayoutLMOnlyMLMHead (`self.cls`).
class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
# Names of the decoder parameters of the prediction head that are declared as tied (shared) weights.
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Constructor; `config` is handed to the parent class, the backbone and the head.
def __init__(self, config):
# Run the parent-class initialization.
super().__init__(config)
# Backbone model.
self.layoutlm = LayoutLMModel(config)
# Masked-language-modeling head.
self.cls = LayoutLMOnlyMLMHead(config)
# Final initialization hook; `post_init` is defined outside this excerpt.
self.post_init()
# Return the word-embedding object of the backbone's embeddings module.
def get_input_embeddings(self):
return self.layoutlm.embeddings.word_embeddings
# Return the decoder of the prediction head (used as the output embeddings).
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Replace the decoder of the prediction head with `new_embeddings`.
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# NOTE(review): the original comments mention two decorators here (one adding the forward docstring, one
# replacing the return docstring); the decorator lines themselves are missing from this excerpt.
# NOTE(review): the signature below is never closed and no body follows, so the implementation appears to have
# been elided. It must be restored before this class can be imported.
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# (Placeholder left by the annotator: the forward docstring would go here.)
# `return_dict` is a boolean flag; presumably it selects a dict-style output, as in the other heads — TODO confirm.
# LayoutLMForSequenceClassification derives from LayoutLMPreTrainedModel and combines a LayoutLMModel backbone
# with dropout and a linear classifier producing `config.num_labels` outputs.
class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
# Constructor; `config` is handed to the parent class and the backbone.
def __init__(self, config):
# Run the parent-class initialization.
super().__init__(config)
# Number of target classes, taken from the configuration.
self.num_labels = config.num_labels
# Backbone model.
self.layoutlm = LayoutLMModel(config)
# Dropout with probability `config.hidden_dropout_prob`.
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear layer mapping `hidden_size` features to `num_labels` outputs.
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Final initialization hook; `post_init` is defined outside this excerpt.
self.post_init()
# Return the word-embedding object of the backbone's embeddings module.
def get_input_embeddings(self):
return self.layoutlm.embeddings.word_embeddings
# NOTE(review): the original comments mention two decorators here (one adding the forward docstring, one
# replacing the return docstring); the decorator lines themselves are missing from this excerpt.
# NOTE(review): this first `forward` signature is never closed and has no body. It also lists
# `encoder_hidden_states` / `encoder_attention_mask`, which the second signature below does not, so it looks
# like a stray copy — confirm and remove.
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# (Placeholder left by the annotator: the forward docstring would go here.)
# Forward pass of the model.
# NOTE(review): this signature is also unterminated and has no body; the implementation appears to have been
# elided and must be restored before this class can be imported.
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    sequence labeling (information extraction) tasks such as the [FUNSD](https://guillaumejaume.github.io/FUNSD/)
    dataset and the [SROIE](https://rrc.cvc.uab.es/?ch=13) dataset.
    """,
    LAYOUTLM_START_DOCSTRING,
)
class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
    """
    LayoutLM model with a token classification head on top (a linear layer over the hidden-states output), e.g. for
    sequence labeling (information extraction) tasks such as the FUNSD and SROIE datasets.
    Inherits from LayoutLMPreTrainedModel.
    """

    def __init__(self, config):
        """
        Set up the backbone, the dropout layer and the per-token classifier.

        Args:
            config (LayoutLMConfig): Configuration object of the model.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        self.layoutlm = LayoutLMModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding layer, i.e. `layoutlm.embeddings.word_embeddings`."""
        return self.layoutlm.embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Run the backbone and classify every position of its sequence output.

        Args:
            labels (torch.LongTensor, optional): Targets for the token classification loss. When given, a
                cross-entropy loss between the flattened logits and the flattened labels is computed.

        Returns:

        """
        # Fall back to the configuration only when the caller did not decide.
        use_return_dict = self.config.use_return_dict if return_dict is None else return_dict

        backbone_out = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=use_return_dict,
        )

        # The first backbone output is the sequence output; apply dropout, then the classifier.
        token_logits = self.classifier(self.dropout(backbone_out[0]))

        loss = None
        if labels is not None:
            criterion = CrossEntropyLoss()
            loss = criterion(token_logits.view(-1, self.num_labels), labels.view(-1))

        if use_return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=token_logits,
                hidden_states=backbone_out.hidden_states,
                attentions=backbone_out.attentions,
            )

        # Tuple form: logits followed by the backbone outputs from index 2 on, with the loss in front if present.
        tuple_out = (token_logits,) + backbone_out[2:]
        if loss is None:
            return tuple_out
        return (loss,) + tuple_out
# The decorator prepends the description below plus LAYOUTLM_START_DOCSTRING to the class documentation.
@add_start_docstrings(
"""
LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
start logits` and `span end logits`).
""",
LAYOUTLM_START_DOCSTRING,
)
class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
"""
LayoutLMForQuestionAnswering extends LayoutLMPreTrainedModel for question answering tasks.
It includes a span classification head on top for tasks like DocVQA.
"""
# NOTE(review): `has_visual_segment_embedding` is accepted but never read in this constructor; the description
# in the docstring below is not backed by any code visible here.
def __init__(self, config, has_visual_segment_embedding=True):
"""
Initializes LayoutLMForQuestionAnswering model with the provided configuration.
Args:
config: The configuration object that defines the model architecture.
has_visual_segment_embedding: Whether to include visual segment embeddings. Default is True.
"""
super().__init__(config)
# Output width of the QA head, taken from the configuration.
self.num_labels = config.num_labels
# Initialize LayoutLM model backbone
self.layoutlm = LayoutLMModel(config)
# Linear layer for question answering outputs
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# NOTE(review): the docstring below says `torch.Tensor`; the method returns the backbone's `word_embeddings`
# attribute, whose type is not visible in this excerpt — verify.
def get_input_embeddings(self):
"""
Returns the word embeddings from the LayoutLM model.
Returns:
torch.Tensor: Word embeddings.
"""
return self.layoutlm.embeddings.word_embeddings
# NOTE(review): the signature below is never closed and no body follows (the next file starts right after it),
# so the implementation appears to have been elided and must be restored before this class can be imported.
@replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\layoutlm\modeling_tf_layoutlm.py
""" TF 2.0 LayoutLM model."""
from __future__ import annotations
import math
import warnings
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFMaskedLMOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlm import LayoutLMConfig
# Module-level logger obtained from the library's logging utilities.
logger = logging.get_logger(__name__)
# Configuration class name used by the docstring decorators.
_CONFIG_FOR_DOC = "LayoutLMConfig"
# Identifiers of the pretrained LayoutLM checkpoints listed for the TensorFlow models.
TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/layoutlm-base-uncased",
"microsoft/layoutlm-large-uncased",
]
class TFLayoutLMEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.max_2d_position_embeddings = config.max_2d_position_embeddings
        self.initializer_range = config.initializer_range
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """
        Create the embedding tables and build the layer normalization.

        Every table is created inside its own name scope. Six of the seven weights are registered under the
        same name `"embeddings"`, so without distinct scopes they cannot be told apart by name.
        """

        def add_table(scope: str, weight_name: str, num_rows: int):
            # All tables have `hidden_size` columns and use the same initializer.
            with tf.name_scope(scope):
                return self.add_weight(
                    name=weight_name,
                    shape=[num_rows, self.hidden_size],
                    initializer=get_initializer(self.initializer_range),
                )

        self.weight = add_table("word_embeddings", "weight", self.config.vocab_size)
        self.token_type_embeddings = add_table("token_type_embeddings", "embeddings", self.config.type_vocab_size)
        self.position_embeddings = add_table("position_embeddings", "embeddings", self.max_position_embeddings)
        self.x_position_embeddings = add_table(
            "x_position_embeddings", "embeddings", self.max_2d_position_embeddings
        )
        self.y_position_embeddings = add_table(
            "y_position_embeddings", "embeddings", self.max_2d_position_embeddings
        )
        self.h_position_embeddings = add_table(
            "h_position_embeddings", "embeddings", self.max_2d_position_embeddings
        )
        self.w_position_embeddings = add_table(
            "w_position_embeddings", "embeddings", self.max_2d_position_embeddings
        )

        if self.built:
            return
        self.built = True
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        bbox: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        The word, position, token-type and 2-D layout embeddings are summed, layer-normalized and passed
        through dropout. `bbox` is indexed as `[..., 0]`/`[..., 2]` for the x table and `[..., 1]`/`[..., 3]` for
        the y table; the height and width tables are indexed with `bbox[..., 3] - bbox[..., 1]` and
        `bbox[..., 2] - bbox[..., 0]`.

        Args:
            input_ids: Token ids; looked up in the word table when given.
            bbox: Bounding boxes; defaults to all zeros.
            position_ids: Position indices; default to `0 .. sequence_length - 1`.
            token_type_ids: Segment indices; default to all zeros.
            inputs_embeds: Pre-computed word embeddings, used when `input_ids` is `None`.
            training: Forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.

        Raises:
            AssertionError: If both `input_ids` and `inputs_embeds` are `None`.
            IndexError: Re-raised with a hint about the 0-1000 range if a `bbox` lookup raises `IndexError`.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        if bbox is None:
            bbox = tf.fill(input_shape + [4], value=0)

        try:
            left_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 0])
            upper_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 1])
            right_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 2])
            lower_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 3])
        except IndexError as e:
            raise IndexError("The `bbox`coordinate values should be within 0-1000 range.") from e

        h_position_embeddings = tf.gather(self.h_position_embeddings, bbox[:, :, 3] - bbox[:, :, 1])
        w_position_embeddings = tf.gather(self.w_position_embeddings, bbox[:, :, 2] - bbox[:, :, 0])

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)

        final_embeddings = (
            inputs_embeds
            + position_embeds
            + token_type_embeds
            + left_position_embeddings
            + upper_position_embeddings
            + right_position_embeddings
            + lower_position_embeddings
            + h_position_embeddings
            + w_position_embeddings
        )
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
# Self-attention building block: holds the query/key/value projections, the attention dropout and the head
# reshaping helper.
class TFLayoutLMSelfAttention(keras.layers.Layer):
def __init__(self, config: LayoutLMConfig, **kwargs):
super().__init__(**kwargs)
# The hidden size must split evenly across the attention heads.
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Square root of the per-head size, precomputed once.
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
# Three dense projections, each with `all_head_size` output units.
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
# Reshape `tensor` to (batch_size, -1, num_attention_heads, attention_head_size) and swap axes 1 and 2.
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
# NOTE(review): `call` has a complete signature but no body in this excerpt; the attention computation appears
# to have been elided and must be restored before this class can be used.
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
):
# Build the three projections once, each inside its own name scope, for inputs whose last dimension is
# `config.hidden_size`.
def build(self, input_shape=None):
# Nothing to do if the layer was already built.
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFLayoutLMSelfOutput(keras.layers.Layer):
    """Dense projection and dropout, followed by layer normalization of the sum with the layer input."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(inputs=self.dense(inputs=hidden_states), training=training)
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        # Both sub-layers are built for inputs whose last dimension is `config.hidden_size`.
        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])
class TFLayoutLMAttention(keras.layers.Layer):
    """Combine `TFLayoutLMSelfAttention` with its `TFLayoutLMSelfOutput` post-processing."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        self.self_attention = TFLayoutLMSelfAttention(config, name="self")
        self.dense_output = TFLayoutLMSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run the attention sub-layer and post-process its first output.

        Returns:
            A tuple whose first element is the processed attention output, followed by the remaining outputs
            of the attention sub-layer unchanged.
        """
        attn_results = self.self_attention(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        context, *remaining = attn_results
        processed = self.dense_output(hidden_states=context, input_tensor=input_tensor, training=training)
        return (processed,) + tuple(remaining)

    def build(self, input_shape=None):
        """Build both sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("self_attention", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
class TFLayoutLMIntermediate(keras.layers.Layer):
    """Dense projection to `config.intermediate_size` followed by the configured activation."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation or already a callable.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `act(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(inputs=hidden_states))

    def build(self, input_shape=None):
        """Build the dense sub-layer once, inside its own name scope."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])
class TFLayoutLMOutput(keras.layers.Layer):
    """Project the intermediate activations back to `config.hidden_size`, then dropout, sum and layer-normalize."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(inputs=self.dense(inputs=hidden_states), training=training)
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            # The dense layer is built for inputs whose last dimension is `config.intermediate_size`.
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.intermediate_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])
class TFLayoutLMLayer(keras.layers.Layer):
def __init__(self, config: LayoutLMConfig, **kwargs):
    """
    Create the attention, optional cross-attention, intermediate and output sub-layers.

    Raises:
        ValueError: If `config.add_cross_attention` is set while `config.is_decoder` is not.
    """
    super().__init__(**kwargs)

    self.attention = TFLayoutLMAttention(config, name="attention")
    self.is_decoder = config.is_decoder
    self.add_cross_attention = config.add_cross_attention

    # Cross-attention is only allowed for decoder layers.
    if self.add_cross_attention and not self.is_decoder:
        raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
    if self.add_cross_attention:
        self.crossattention = TFLayoutLMAttention(config, name="crossattention")

    self.intermediate = TFLayoutLMIntermediate(config, name="intermediate")
    self.bert_output = TFLayoutLMOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_value: Tuple[tf.Tensor] | None,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFLayoutLMEncoder(keras.layers.Layer):
    """Applies `config.num_hidden_layers` `TFLayoutLMLayer` blocks one after another."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # Sub-layers are named "layer_._0", "layer_._1", ...
        self.layer = [
            TFLayoutLMLayer(config, name=f"layer_._{layer_idx}") for layer_idx in range(config.num_hidden_layers)
        ]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor | None,
        encoder_attention_mask: tf.Tensor | None,
        past_key_values: Tuple[Tuple[tf.Tensor]] | None,
        use_cache: Optional[bool],
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
        """Run all layers and collect the optional per-layer outputs.

        Returns a `TFBaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy.
        Otherwise returns a tuple of the non-None values among the final hidden states, all
        hidden states, all attentions and all cross-attentions (the cache is not included).
        """
        collect_cross = output_attentions and self.config.add_cross_attention
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if collect_cross else None
        next_decoder_cache = () if use_cache else None

        for layer_idx, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input of each layer; the final output is appended after the loop.
                all_hidden_states += (hidden_states,)

            cached = None if past_key_values is None else past_key_values[layer_idx]
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[layer_idx],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=cached,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_attentions += (layer_outputs[1],)
                if self.config.add_cross_attention and encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_dict:
            return TFBaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_attentions,
                cross_attentions=all_cross_attentions,
            )

        candidates = [hidden_states, all_hidden_states, all_attentions, all_cross_attentions]
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build every layer of the stack once, each inside a name scope named after it."""
        if not self.built:
            self.built = True
            stack = getattr(self, "layer", None)
            if stack is not None:
                for block in stack:
                    with tf.name_scope(block.name):
                        block.build(None)
class TFLayoutLMPooler(keras.layers.Layer):
    """Pools a sequence by passing its first position through a tanh-activated dense layer."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)
        dense_kwargs = {
            "units": config.hidden_size,
            "kernel_initializer": get_initializer(config.initializer_range),
            "activation": "tanh",
            "name": "dense",
        }
        self.dense = keras.layers.Dense(**dense_kwargs)
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the dense projection of `hidden_states[:, 0]`."""
        return self.dense(inputs=hidden_states[:, 0])

    def build(self, input_shape=None):
        """Build the `dense` sub-layer once, inside a name scope named after it."""
        if not self.built:
            self.built = True
            projection = getattr(self, "dense", None)
            if projection is not None:
                with tf.name_scope(projection.name):
                    projection.build([None, None, self.config.hidden_size])
class TFLayoutLMPredictionHeadTransform(keras.layers.Layer):
    """Dense layer, activation and LayerNorm applied before the LM prediction head."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)
        kernel_init = get_initializer(config.initializer_range)
        self.dense = keras.layers.Dense(units=config.hidden_size, kernel_initializer=kernel_init, name="dense")

        # `hidden_act` is either an activation name to look up or the activation itself.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply dense -> activation -> LayerNorm to `hidden_states`."""
        projected = self.dense(inputs=hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(inputs=activated)

    def build(self, input_shape=None):
        """Build `dense` and `LayerNorm` once, each inside its own name scope."""
        if not self.built:
            self.built = True
            feature_shape = [None, None, self.config.hidden_size]

            projection = getattr(self, "dense", None)
            if projection is not None:
                with tf.name_scope(projection.name):
                    projection.build(list(feature_shape))

            norm = getattr(self, "LayerNorm", None)
            if norm is not None:
                with tf.name_scope(norm.name):
                    norm.build(list(feature_shape))
class TFLayoutLMLMPredictionHead(keras.layers.Layer):
    """LM head that scores hidden states against the input embedding matrix plus a bias.

    The output projection reuses `input_embeddings.weight`; only the per-token bias is
    created by this layer.
    """

    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.hidden_size = config.hidden_size
        self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform")
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the output bias, then build the `transform` sub-layer once."""
        # NOTE(review): the bias is added before the `built` guard, so a second call to
        # `build` would call `add_weight` again. Kept as-is; confirm whether this is intended.
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if not self.built:
            self.built = True
            head_transform = getattr(self, "transform", None)
            if head_transform is not None:
                with tf.name_scope(head_transform.name):
                    head_transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer whose weight is used as the output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and update the embedding layer's `vocab_size` to match."""
        embeddings = self.input_embeddings
        embeddings.weight = value
        embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the output bias wrapped in a dict under the key "bias"."""
        return {"bias": self.bias}

    def set_bias(self, value: tf.Variable):
        """Replace the bias from `value["bias"]` and update `config.vocab_size` to its length."""
        new_bias = value["bias"]
        self.bias = new_bias
        self.config.vocab_size = shape_list(new_bias)[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Transform `hidden_states` and project them onto the vocabulary."""
        transformed = self.transform(hidden_states=hidden_states)
        seq_length = shape_list(transformed)[1]
        # Flatten to 2-D for the matmul against the embedding matrix, then restore the sequence axis.
        flat = tf.reshape(tensor=transformed, shape=[-1, self.hidden_size])
        scores = tf.matmul(a=flat, b=self.input_embeddings.weight, transpose_b=True)
        scores = tf.reshape(tensor=scores, shape=[-1, seq_length, self.config.vocab_size])
        return tf.nn.bias_add(value=scores, bias=self.bias)
class TFLayoutLMMLMHead(keras.layers.Layer):
    """Thin wrapper that owns a `TFLayoutLMLMPredictionHead` named "predictions"."""

    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        self.predictions = TFLayoutLMLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction scores computed by the wrapped head."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped prediction head once, inside a name scope named after it."""
        if not self.built:
            self.built = True
            head = getattr(self, "predictions", None)
            if head is not None:
                with tf.name_scope(head.name):
                    head.build(None)
@keras_serializable
class TFLayoutLMMainLayer(keras.layers.Layer):
    """Core LayoutLM layer holding the embeddings, the encoder and an optional pooler."""

    config_class = LayoutLMConfig

    def __init__(self, config: LayoutLMConfig, add_pooling_layer: bool = True, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.embeddings = TFLayoutLMEmbeddings(config, name="embeddings")
        self.encoder = TFLayoutLMEncoder(config, name="encoder")
        # `pooler` is None when pooling is disabled; `build` skips it in that case.
        self.pooler = TFLayoutLMPooler(config, name="pooler") if add_pooling_layer else None

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings sub-layer."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and update the embeddings' `vocab_size` to match."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    # NOTE(review): the `call` signature is never closed and its body is missing from this
    # file, so the forward pass cannot be documented (or run) from here. Restore it from
    # the original module before use.
    def build(self, input_shape=None):
        """Build embeddings, encoder and (if present) pooler once, each in its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)
"""
This model class extends TFPreTrainedModel and provides methods for weights initialization, downloading pretrained models,
and handling input signatures.
"""
class TFLayoutLMPreTrainedModel(TFPreTrainedModel):
    """
    Base class shared by the LayoutLM TensorFlow models.

    It pins the configuration class and base-model prefix, and extends the inherited input
    signature with the `bbox` input.
    """

    # Configuration class associated with these models.
    config_class = LayoutLMConfig
    # Prefix used for the base model.
    base_model_prefix = "layoutlm"

    @property
    def input_signature(self):
        """
        Return the parent class's input signature with a `bbox` entry added.

        The added entry is a `tf.TensorSpec` of shape `(None, None, 4)` and dtype `tf.int32`.
        """
        bbox_spec = tf.TensorSpec(shape=(None, None, 4), dtype=tf.int32, name="bbox")
        merged = super().input_signature
        merged["bbox"] = bbox_spec
        return merged
# Docstring fragments passed to the `add_start_docstrings*` decorators on the model classes.
# In this file each literal holds only a newline (the documentation text is not present here).
LAYOUTLM_START_DOCSTRING = r"""
"""
# `.format("batch_size, sequence_length")` is called on this one where it is used.
LAYOUTLM_INPUTS_DOCSTRING = r"""
"""
class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
    """Bare LayoutLM model: wraps a `TFLayoutLMMainLayer` and returns its outputs unchanged."""

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")

    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ):
        r"""
        Returns:
            The output of the wrapped main layer, documented as
            `TFBaseModelOutputWithPoolingAndCrossAttentions` or `Tuple[tf.Tensor]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFLayoutLMModel
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        >>> model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")

        >>> words = ["Hello", "world"]
        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        >>> token_boxes = []
        >>> for word, box in zip(words, normalized_word_boxes):
        ...     word_tokens = tokenizer.tokenize(word)
        ...     token_boxes.extend([box] * len(word_tokens))
        >>> # add bounding boxes of cls + sep tokens
        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
        >>> input_ids = encoding["input_ids"]
        >>> attention_mask = encoding["attention_mask"]
        >>> token_type_ids = encoding["token_type_ids"]
        >>> bbox = tf.convert_to_tensor([token_boxes])

        >>> outputs = model(
        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
        ... )

        >>> last_hidden_states = outputs.last_hidden_state
        ```
        """
        # NOTE(review): `encoder_hidden_states` and `encoder_attention_mask` are accepted by
        # this method but are not forwarded to the main layer. Behaviour is kept as-is;
        # confirm whether they should be passed through.
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def build(self, input_shape=None):
        """Build the wrapped main layer once, inside a name scope named after it."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingLoss):
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"cls.seq_relationship",
        r"cls.predictions.decoder.weight",
        r"nsp___cls",
    ]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        if config.is_decoder:
            logger.warning(
                "If you want to use `TFLayoutLMForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
        # The MLM head shares the main layer's embeddings for its output projection.
        self.mlm = TFLayoutLMMLMHead(config, input_embeddings=self.layoutlm.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the prediction head inside the MLM head."""
        return self.mlm.predictions

    def get_prefix_bias_name(self) -> str:
        """Deprecated: return the "<model>/<mlm>/<predictions>" name prefix of the bias."""
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFLayoutLMForMaskedLM
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        >>> model = TFLayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")

        >>> words = ["Hello", "[MASK]"]
        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        >>> token_boxes = []
        >>> for word, box in zip(words, normalized_word_boxes):
        ...     word_tokens = tokenizer.tokenize(word)
        ...     token_boxes.extend([box] * len(word_tokens))
        >>>
        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
        >>> input_ids = encoding["input_ids"]
        >>> attention_mask = encoding["attention_mask"]
        >>> token_type_ids = encoding["token_type_ids"]
        >>> bbox = tf.convert_to_tensor([token_boxes])

        >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]

        >>> outputs = model(
        ...     input_ids=input_ids,
        ...     bbox=bbox,
        ...     attention_mask=attention_mask,
        ...     token_type_ids=token_type_ids,
        ...     labels=labels,
        ... )

        >>> loss = outputs.loss
        ```
        """
        # Run the LayoutLM backbone.
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first element of the backbone output is the sequence output.
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        # Tuple form: (loss?, prediction_scores, *backbone outputs from index 2 on).
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the backbone and the MLM head once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
"""
LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
"""
# LayoutLM model for sequence classification or regression: a linear layer on top of the pooled output, e.g. for GLUE tasks.
@add_start_docstrings(
    """
    LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    LAYOUTLM_START_DOCSTRING,
)
class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Linear classification head with one unit per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        labels (`tf.Tensor` or `np.ndarray`, *optional*):
            Labels handed to `hf_compute_loss` together with the logits. When omitted, no loss is computed.

        Returns:
        """
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The classification head works on the pooled output.
        # NOTE(review): assumes the main layer returns the pooled output at index 1
        # (it is created here with its default pooling layer) - confirm against its `call`.
        pooled_output = outputs[1]
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=pooled_output)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # Tuple form: (loss?, logits, *backbone outputs from index 2 on).
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the backbone and the classifier once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
# Keys to ignore when loading the model, for handling unexpected keys.
# NOTE(review): the `class` statement these attributes belong to is missing from this file.
_keys_to_ignore_on_load_unexpected = [
    r"pooler",  # ignore keys matching "pooler"
    r"mlm___cls",  # ignore keys matching "mlm___cls"
    r"nsp___cls",  # ignore keys matching "nsp___cls"
    r"cls.predictions",  # ignore keys matching "cls.predictions"
    r"cls.seq_relationship",  # ignore keys matching "cls.seq_relationship"
]
# Keys to ignore when loading the model, for handling missing keys.
_keys_to_ignore_on_load_missing = [r"dropout"]
# Constructor: delegates to the parent initialiser, then creates the backbone, dropout and classifier.
def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
    """Create the LayoutLM main layer, a dropout layer and a per-label dense classifier."""
    super().__init__(config, *inputs, **kwargs)
    label_count = config.num_labels
    self.num_labels = label_count

    # Backbone, created with its pooling layer enabled.
    self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
    # Dropout rate comes from `config.hidden_dropout_prob`.
    self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
    # Dense head with one unit per label, initialised from `config.initializer_range`.
    kernel_init = get_initializer(config.initializer_range)
    self.classifier = keras.layers.Dense(units=label_count, kernel_initializer=kernel_init, name="classifier")
    self.config = config
# Forward pass; the decorators unpack the inputs and assemble the docstring.
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def call(
    self,
    input_ids: TFModelInputType | None = None,
    bbox: np.ndarray | tf.Tensor | None = None,
    attention_mask: np.ndarray | tf.Tensor | None = None,
    token_type_ids: np.ndarray | tf.Tensor | None = None,
    position_ids: np.ndarray | tf.Tensor | None = None,
    head_mask: np.ndarray | tf.Tensor | None = None,
    inputs_embeds: np.ndarray | tf.Tensor | None = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: np.ndarray | tf.Tensor | None = None,
    training: Optional[bool] = False,
):
    r"""
    labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

    Returns:

    Examples:

    ```python
    >>> import tensorflow as tf
    >>> from transformers import AutoTokenizer, TFLayoutLMForTokenClassification

    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
    >>> model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")

    >>> words = ["Hello", "world"]
    >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

    >>> token_boxes = []
    >>> for word, box in zip(words, normalized_word_boxes):
    ...     word_tokens = tokenizer.tokenize(word)
    ...     token_boxes.extend([box] * len(word_tokens))
    >>>
    >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
    >>> input_ids = encoding["input_ids"]
    >>> attention_mask = encoding["attention_mask"]
    >>> token_type_ids = encoding["token_type_ids"]
    >>> bbox = tf.convert_to_tensor([token_boxes])
    >>> token_labels = tf.convert_to_tensor([1, 1, 0, 0])

    >>> outputs = model(
    ...     input_ids=input_ids,
    ...     bbox=bbox,
    ...     attention_mask=attention_mask,
    ...     token_type_ids=token_type_ids,
    ...     labels=token_labels,
    ... )

    >>> loss = outputs.loss
    >>> logits = outputs.logits
    ```"""
    # Run the LayoutLM backbone.
    outputs = self.layoutlm(
        input_ids=input_ids,
        bbox=bbox,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )
    # The first element of the backbone output is the sequence output.
    sequence_output = outputs[0]
    sequence_output = self.dropout(inputs=sequence_output, training=training)
    # The classifier maps the sequence output to the logits.
    logits = self.classifier(inputs=sequence_output)
    # The loss is only computed when labels are supplied.
    loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

    # Tuple form: (loss?, logits, *backbone outputs from index 2 on).
    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return TFTokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
# Builds the backbone and the classifier sub-layers.
def build(self, input_shape=None):
    """Build `layoutlm` and `classifier` once, each inside a name scope named after it."""
    # Nothing to do on repeated calls.
    if not self.built:
        self.built = True

        backbone = getattr(self, "layoutlm", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        head = getattr(self, "classifier", None)
        if head is not None:
            with tf.name_scope(head.name):
                # The classifier is built for inputs of width `config.hidden_size`.
                head.build([None, None, self.config.hidden_size])
"""
LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
start logits` and `span end logits`).
"""
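# LayoutLM model with a span classification head on top for extractive question-answering tasks such as [DocVQA](https://rrc.cvc.uab.es/?ch=17).
# The head is a linear layer on top of the final hidden-states output that computes the "span start logits" and "span end logits".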
class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnsweringLoss):
    """
    LayoutLM TensorFlow model for question answering: a dense `qa_outputs` layer on top of the
    backbone's sequence output produces the span start and span end logits.
    """

    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"mlm___cls",
        r"nsp___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        """
        Create the LayoutLM main layer and the dense layer that produces the QA logits.

        Args:
            config (LayoutLMConfig): model configuration.
            *inputs: extra positional arguments forwarded to the parent initialiser.
            **kwargs: extra keyword arguments forwarded to the parent initialiser.
        """
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        start_positions (`tf.Tensor` or `np.ndarray`, *optional*):
            Start positions of the labelled spans. Used for the loss only when `end_positions` is also given.
        end_positions (`tf.Tensor` or `np.ndarray`, *optional*):
            End positions of the labelled spans. Used for the loss only when `start_positions` is also given.

        Returns:
        """
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]

        # One dense layer yields both logits; split them along the last axis.
        # NOTE(review): assumes `config.num_labels == 2` so the split yields one start and
        # one end channel - confirm against the checkpoints' configuration.
        logits = self.qa_outputs(inputs=sequence_output)
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            # NOTE(review): the label keys follow the `TFQuestionAnsweringLoss` convention
            # ("start_position" / "end_position") - confirm against that mixin.
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        # Tuple form: (loss?, start_logits, end_logits, *backbone outputs from index 2 on).
        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """
        Build the backbone and the `qa_outputs` layer once, each inside its own name scope.

        Args:
            input_shape: accepted but not read.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])
.\models\layoutlm\tokenization_layoutlm.py
""" Tokenization class for model LayoutLM."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
# Module-level logger.
logger = logging.get_logger(__name__)
# File name used for the vocabulary file.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Checkpoint name -> URL of that checkpoint's vocabulary file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/layoutlm-base-uncased": (
            "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt"
        ),
        "microsoft/layoutlm-large-uncased": (
            "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt"
        ),
    }
}
# Checkpoint name -> size value (512 for both); presumably the maximum number of positions - confirm.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/layoutlm-base-uncased": 512,
    "microsoft/layoutlm-large-uncased": 512,
}
# Checkpoint name -> default initialisation settings (both checkpoints lowercase their input).
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
    "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered token -> line-index mapping.

    Only the trailing newline is stripped from each line. A token that appears more than
    once keeps its first position in the mapping but ends up with its last line index.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    return collections.OrderedDict((line.rstrip("\n"), line_no) for line_no, line in enumerate(lines))
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace; blank input yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class LayoutLMTokenizer(PreTrainedTokenizer):
    r"""
    Construct a LayoutLM tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original LayoutLM).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """
        Load the vocabulary and set up the basic and WordPiece tokenizers.

        Raises:
            ValueError: If `vocab_file` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = LayoutLMTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup: id -> token.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # The vocabulary and sub-tokenizers are set before the parent initializer runs.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Lower-casing flag, read from the basic tokenizer."""
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the vocabulary merged with `self.added_tokens_encoder` as a plain dict."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, split_special_tokens=False):
        """Split `text` into WordPiece tokens, optionally running basic tokenization first."""
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # Tokens in the basic tokenizer's never-split set bypass WordPiece.
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab; unknown tokens map to the id of `unk_token`."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab; unknown ids map to `unk_token`."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string, merging `##` continuation pieces."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding special tokens.

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        # The mask length counts the [CLS]/[SEP] tokens around each sequence.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary, one token per line ordered by index.

        Args:
            save_directory (`str`):
                Target directory. If it is not an existing directory it is used as the file path itself.
            filename_prefix (`str`, *optional*):
                Prefix joined with `-` in front of the file name (or of `save_directory` when it is a file path).

        Returns:
            `Tuple[str]`: One-element tuple with the path of the written file.
        """
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # A gap in the indices is reported, then counting resumes from the actual index.
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        # Stored as a set for membership tests during tokenization.
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.

        Args:
            text (`str`):
                Text to tokenize.
            never_split (`List[str]`, *optional*):
                Extra tokens not to split for this call; merged with the instance-level `never_split` set.

        Returns:
            `List[str]`: The resulting tokens.
        """
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # NFC-normalize before splitting on whitespace.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    # When lower-casing, accents are stripped unless explicitly disabled.
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strip accents from `text` by removing combining marks (category `Mn`) after NFD decomposition."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            if unicodedata.category(char) == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """
        Split `text` so that every punctuation character becomes its own piece.

        Returns `[text]` unchanged when punctuation splitting is disabled or `text` is in `never_split`.
        """
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        pieces = []
        start_new_word = True
        for char in text:
            if _is_punctuation(char):
                pieces.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    pieces.append([])
                start_new_word = False
                pieces[-1].append(char)
        return ["".join(piece) for piece in pieces]

    def _tokenize_chinese_chars(self, text):
        """Add a space on both sides of every CJK character in `text`."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Check whether the code point `cp` falls in one of the CJK ideograph ranges listed below."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters. Modern Korean Hangul and the
        # Japanese Hiragana and Katakana alphabets live in different blocks; they are used to write
        # space-separated words, so they are not treated specially here.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters from `text` and replace each whitespace character by a space."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # `vocab` is only used for membership tests (`substr in self.vocab`).
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            # Over-long words are mapped to the unknown token without being searched.
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                # Shrink the candidate from the right until it is found in the vocabulary.
                while start < end:
                    substr = "".join(chars[start:end])
                    # Pieces that do not start the word carry the "##" continuation prefix.
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                # No prefix matched at this position: the whole word becomes unknown.
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
.\models\layoutlm\tokenization_layoutlm_fast.py
""" Tokenization class for model LayoutLM."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_layoutlm import LayoutLMTokenizer
logger = logging.get_logger(__name__)
# Resource name -> file name used for each tokenizer resource.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Resource name -> checkpoint id -> download URL of that resource.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt",
        "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt",
    },
    "tokenizer_file": {
        "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/tokenizer.json",
        "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/tokenizer.json",
    },
}

# Checkpoint id -> positional-embedding size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/layoutlm-base-uncased": 512,
    "microsoft/layoutlm-large-uncased": 512,
}

# Checkpoint id -> default tokenizer init kwargs.
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
    "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}
class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" LayoutLM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = LayoutLMTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """Initialize the backend tokenizer and align its normalizer with the requested options."""
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # If the loaded normalizer disagrees with the arguments passed here, rebuild it with the
        # requested lowercase / strip_accents / handle_chinese_chars settings.
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A LayoutLM sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        # The mask length counts the [CLS]/[SEP] tokens around each sequence.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer model to `save_directory`.

        Args:
            save_directory (`str`):
                Directory passed to the backend model's `save`.
            filename_prefix (`str`, *optional*):
                Passed to the backend model's `save` as `name`.

        Returns:
            `Tuple[str]`: The values returned by the backend `save` call, as a tuple.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
.\models\layoutlm\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Submodule name -> public names it provides; consumed by `_LazyModule` below.
_import_structure = {
    "configuration_layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMOnnxConfig"],
    "tokenization_layoutlm": ["LayoutLMTokenizer"],
}

# Each optional backend registers its names only when the corresponding availability check passes.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_layoutlm_fast"] = ["LayoutLMTokenizerFast"]

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_layoutlm"] = [
        "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LayoutLMForMaskedLM",
        "LayoutLMForSequenceClassification",
        "LayoutLMForTokenClassification",
        "LayoutLMForQuestionAnswering",
        "LayoutLMModel",
        "LayoutLMPreTrainedModel",
    ]

try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_layoutlm"] = [
        "TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFLayoutLMForMaskedLM",
        "TFLayoutLMForSequenceClassification",
        "TFLayoutLMForTokenClassification",
        "TFLayoutLMForQuestionAnswering",
        "TFLayoutLMMainLayer",
        "TFLayoutLMModel",
        "TFLayoutLMPreTrainedModel",
    ]


if TYPE_CHECKING:
    # Real imports, executed only for static type checkers.
    from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMOnnxConfig
    from .tokenization_layoutlm import LayoutLMTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_layoutlm_fast import LayoutLMTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_layoutlm import (
            LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            LayoutLMForMaskedLM,
            LayoutLMForQuestionAnswering,
            LayoutLMForSequenceClassification,
            LayoutLMForTokenClassification,
            LayoutLMModel,
            LayoutLMPreTrainedModel,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_layoutlm import (
            TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFLayoutLMForMaskedLM,
            TFLayoutLMForQuestionAnswering,
            TFLayoutLMForSequenceClassification,
            TFLayoutLMForTokenClassification,
            TFLayoutLMMainLayer,
            TFLayoutLMModel,
            TFLayoutLMPreTrainedModel,
        )

else:
    # `sys` is needed for the module replacement below and is imported only on this branch.
    import sys

    # At runtime the package module is replaced by a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\layoutlmv2\configuration_layoutlmv2.py
""" LayoutLMv2 model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import is_detectron2_available, logging
logger = logging.get_logger(__name__)
# Checkpoint id -> URL of its config.json.
LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "layoutlmv2-base-uncased": "https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/config.json",
    "layoutlmv2-large-uncased": "https://huggingface.co/microsoft/layoutlmv2-large-uncased/resolve/main/config.json",
}

# detectron2 is optional: it is imported only when the availability check passes.
if is_detectron2_available():
    import detectron2
class LayoutLMv2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used to instantiate an
    LayoutLMv2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the LayoutLMv2
    [microsoft/layoutlmv2-base-uncased](https://huggingface.co/microsoft/layoutlmv2-base-uncased) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import LayoutLMv2Config, LayoutLMv2Model

    >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
    >>> configuration = LayoutLMv2Config()

    >>> # Initializing a model (with random weights) from the microsoft/layoutlmv2-base-uncased style configuration
    >>> model = LayoutLMv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "layoutlmv2"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        max_2d_position_embeddings=1024,
        max_rel_pos=128,
        rel_pos_bins=32,
        fast_qkv=True,
        max_rel_2d_pos=256,
        rel_2d_pos_bins=64,
        convert_sync_batchnorm=True,
        image_feature_pool_shape=[7, 7, 256],
        coordinate_size=128,
        shape_size=128,
        has_relative_attention_bias=True,
        has_spatial_attention_bias=True,
        has_visual_segment_embedding=False,
        detectron2_config_args=None,
        **kwargs,
    ):
        # These arguments are forwarded to the parent initializer rather than set here.
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range,
            layer_norm_eps=layer_norm_eps,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        # LayoutLMv2-specific settings are stored directly on the instance.
        self.max_2d_position_embeddings = max_2d_position_embeddings
        self.max_rel_pos = max_rel_pos
        self.rel_pos_bins = rel_pos_bins
        self.fast_qkv = fast_qkv
        self.max_rel_2d_pos = max_rel_2d_pos
        self.rel_2d_pos_bins = rel_2d_pos_bins
        self.convert_sync_batchnorm = convert_sync_batchnorm
        # NOTE(review): the default list object is shared by every instance that does not pass its own
        # value; avoid mutating it in place.
        self.image_feature_pool_shape = image_feature_pool_shape
        self.coordinate_size = coordinate_size
        self.shape_size = shape_size
        self.has_relative_attention_bias = has_relative_attention_bias
        self.has_spatial_attention_bias = has_spatial_attention_bias
        self.has_visual_segment_embedding = has_visual_segment_embedding
        # Fall back to the built-in detectron2 overrides when none are given.
        self.detectron2_config_args = (
            detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config()
        )

    @classmethod
    def get_default_detectron2_config(cls):
        """Return a fresh dict of default detectron2 overrides, keyed by dotted config path."""
        return {
            "MODEL.MASK_ON": True,
            "MODEL.PIXEL_STD": [57.375, 57.120, 58.395],
            "MODEL.BACKBONE.NAME": "build_resnet_fpn_backbone",
            "MODEL.FPN.IN_FEATURES": ["res2", "res3", "res4", "res5"],
            "MODEL.ANCHOR_GENERATOR.SIZES": [[32], [64], [128], [256], [512]],
            "MODEL.RPN.IN_FEATURES": ["p2", "p3", "p4", "p5", "p6"],
            "MODEL.RPN.PRE_NMS_TOPK_TRAIN": 2000,
            "MODEL.RPN.PRE_NMS_TOPK_TEST": 1000,
            "MODEL.RPN.POST_NMS_TOPK_TRAIN": 1000,
            "MODEL.POST_NMS_TOPK_TEST": 1000,
            "MODEL.ROI_HEADS.NAME": "StandardROIHeads",
            "MODEL.ROI_HEADS.NUM_CLASSES": 5,
            "MODEL.ROI_HEADS.IN_FEATURES": ["p2", "p3", "p4", "p5"],
            "MODEL.ROI_BOX_HEAD.NAME": "FastRCNNConvFCHead",
            "MODEL.ROI_BOX_HEAD.NUM_FC": 2,
            "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION": 14,
            "MODEL.ROI_MASK_HEAD.NAME": "MaskRCNNConvUpsampleHead",
            "MODEL.ROI_MASK_HEAD.NUM_CONV": 4,
            "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION": 7,
            "MODEL.RESNETS.DEPTH": 101,
            "MODEL.RESNETS.SIZES": [[32], [64], [128], [256], [512]],
            "MODEL.RESNETS.ASPECT_RATIOS": [[0.5, 1.0, 2.0]],
            "MODEL.RESNETS.OUT_FEATURES": ["res2", "res3", "res4", "res5"],
            "MODEL.RESNETS.NUM_GROUPS": 32,
            "MODEL.RESNETS.WIDTH_PER_GROUP": 8,
            "MODEL.RESNETS.STRIDE_IN_1X1": False,
        }

    def get_detectron2_config(self):
        """
        Build a detectron2 config object with `self.detectron2_config_args` applied.

        Each key is a dotted attribute path: all but the last component are walked with `getattr`, and the last
        one is set with `setattr`. Requires the module-level `detectron2` import to have succeeded.
        """
        detectron2_config = detectron2.config.get_cfg()
        for k, v in self.detectron2_config_args.items():
            attributes = k.split(".")
            to_set = detectron2_config
            for attribute in attributes[:-1]:
                to_set = getattr(to_set, attribute)
            setattr(to_set, attributes[-1], v)

        return detectron2_config
.\models\layoutlmv2\feature_extraction_layoutlmv2.py
"""
Feature extractor class for LayoutLMv2.
"""
import warnings
from ...utils import logging
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessor
logger = logging.get_logger(__name__)
class LayoutLMv2FeatureExtractor(LayoutLMv2ImageProcessor):
    """Deprecated alias of `LayoutLMv2ImageProcessor`; emits a `FutureWarning` on construction."""

    def __init__(self, *args, **kwargs) -> None:
        warnings.warn(
            "The class LayoutLMv2FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use LayoutLMv2ImageProcessor instead.",
            FutureWarning,
        )
        # All arguments are forwarded unchanged to the image processor.
        super().__init__(*args, **kwargs)
.\models\layoutlmv2\image_processing_layoutlmv2.py
def normalize_box(box, width, height):
    """
    Scale a pixel-space box to a 0-1000 coordinate range.

    Args:
        box: Sequence of four numbers; indices 0 and 2 are divided by `width`, indices 1 and 3 by `height`.
        width: Divisor for the horizontal coordinates.
        height: Divisor for the vertical coordinates.

    Returns:
        `List[int]`: The four scaled coordinates, each truncated with `int()`.

    Raises:
        ZeroDivisionError: If `width` or `height` is zero.
    """
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]
def apply_tesseract(
    image: np.ndarray,
    lang: Optional[str],
    tesseract_config: Optional[str] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """Apply Tesseract OCR on a document image and return recognized words + normalized bounding boxes.

    Args:
        image: Image to run OCR on; it is converted with `to_pil_image` first.
        lang: Language passed to Tesseract as `lang`.
        tesseract_config: Extra flags forwarded as Tesseract's `config`; `None` is treated as `""`.
        input_data_format: Channel layout hint forwarded to `to_pil_image`.

    Returns:
        A `(words, normalized_boxes)` tuple. Entries whose text is empty or whitespace-only are dropped;
        each box is `[x0, y0, x1, y1]` rescaled by `normalize_box` with the PIL image size.
    """
    tesseract_config = tesseract_config if tesseract_config is not None else ""

    pil_image = to_pil_image(image, input_data_format=input_data_format)
    image_width, image_height = pil_image.size
    data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

    # A set keeps each membership test O(1); the previous list made every filter pass quadratic.
    irrelevant_indices = {idx for idx, word in enumerate(words) if not word.strip()}

    def _keep_relevant(values):
        # Drop the entries that belong to empty / whitespace-only words.
        return [value for idx, value in enumerate(values) if idx not in irrelevant_indices]

    words = _keep_relevant(words)
    left = _keep_relevant(left)
    top = _keep_relevant(top)
    width = _keep_relevant(width)
    height = _keep_relevant(height)

    # Turn (left, top, width, height) into (x0, y0, x1, y1) and rescale relative to the image size.
    normalized_boxes = [
        normalize_box([x, y, x + w, y + h], image_width, image_height)
        for x, y, w, h in zip(left, top, width, height)
    ]

    # Internal consistency check: every kept word must have exactly one box.
    assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
    return words, normalized_boxes
r"""
Constructs a LayoutLMv2 image processor.
构造一个 LayoutLMv2 图像处理器。
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
overridden by `do_resize` in `preprocess`.
是否将图像的 (height, width) 尺寸调整为 `(size["height"], size["width"])`。可以在 `preprocess` 中通过 `do_resize` 覆盖。
size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
调整大小后的图像尺寸。可以在 `preprocess` 中通过 `size` 覆盖。
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
如果调整图像大小,要使用的重采样滤波器。可以在 `preprocess` 方法中通过 `resample` 参数覆盖。
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
`apply_ocr` in `preprocess`.
是否应用 Tesseract OCR 引擎来获取单词 + 标准化边界框。可以在 `preprocess` 中通过 `apply_ocr` 覆盖。
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by `ocr_lang` in `preprocess`.
Tesseract OCR 引擎使用的语言,使用 ISO 代码指定。默认使用英语。可以在 `preprocess` 中通过 `ocr_lang` 覆盖。
tesseract_config (`str`, *optional*, defaults to `""`):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
调用 Tesseract 时转发给 `config` 参数的任何额外自定义配置标志。例如:'--psm 6'。可以在 `preprocess` 中通过 `tesseract_config` 覆盖。
"""
model_input_names = ["pixel_values"]
def __init__(
    self,
    do_resize: bool = True,
    size: Dict[str, int] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    apply_ocr: bool = True,
    ocr_lang: Optional[str] = None,
    tesseract_config: Optional[str] = "",
    **kwargs,
) -> None:
    """Store the preprocessing defaults on the instance.

    Args:
        do_resize: Stored as `self.do_resize`.
        size: Target size dictionary; when `None`, `{"height": 224, "width": 224}` is used. The value is
            passed through `get_size_dict` before being stored.
        resample: Stored as `self.resample`.
        apply_ocr: Stored as `self.apply_ocr`.
        ocr_lang: Stored as `self.ocr_lang`.
        tesseract_config: Stored as `self.tesseract_config`.
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)

    if size is None:
        size = {"height": 224, "width": 224}
    normalized_size = get_size_dict(size)

    self.do_resize = do_resize
    self.size = normalized_size
    self.resample = resample
    self.apply_ocr = apply_ocr
    self.ocr_lang = ocr_lang
    self.tesseract_config = tesseract_config
    # Names of the keyword arguments listed as valid for this processor.
    self._valid_processor_keys = [
        "images",
        "do_resize",
        "size",
        "resample",
        "apply_ocr",
        "ocr_lang",
        "tesseract_config",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]
def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Resize `image` to `(size["height"], size["width"])` by delegating to the module-level `resize` helper.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Size specification; after `get_size_dict` it must contain the keys `height` and `width`.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter forwarded to the helper.
        data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format requested for the output image; forwarded to the helper.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format of the input image; forwarded to the helper.
        **kwargs:
            Any additional keyword arguments, forwarded to the helper unchanged.

    Returns:
        `np.ndarray`: The resized image.

    Raises:
        ValueError: If the normalized `size` lacks `height` or `width`.
    """
    size = get_size_dict(size)
    if not ("height" in size and "width" in size):
        raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")

    target_height, target_width = size["height"], size["width"]
    return resize(
        image,
        size=(target_height, target_width),
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )
.\models\layoutlmv2\modeling_layoutlmv2.py
""" PyTorch LayoutLMv2 模型。"""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_detectron2_available,
logging,
replace_return_docstrings,
requires_backends,
)
from .configuration_layoutlmv2 import LayoutLMv2Config
if is_detectron2_available():
import detectron2
from detectron2.modeling import META_ARCH_REGISTRY
# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)
# Names used by the documentation decorators in this module (`_CONFIG_FOR_DOC` is passed as
# `config_class` to `replace_return_docstrings` on the model `forward` methods).
_CHECKPOINT_FOR_DOC = "microsoft/layoutlmv2-base-uncased"
_CONFIG_FOR_DOC = "LayoutLMv2Config"
# Checkpoint identifiers; assigned to `LayoutLMv2PreTrainedModel.pretrained_model_archive_map`.
LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/layoutlmv2-base-uncased",
    "microsoft/layoutlmv2-large-uncased",
]
class LayoutLMv2Embeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        """Create the embedding tables, the layer norm, the dropout and the `position_ids` buffer."""
        super().__init__()
        hidden_size = config.hidden_size
        max_2d_positions = config.max_2d_position_embeddings

        self.word_embeddings = nn.Embedding(config.vocab_size, hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden_size)

        # Separate tables for x / y coordinates and for box height / width.
        self.x_position_embeddings = nn.Embedding(max_2d_positions, config.coordinate_size)
        self.y_position_embeddings = nn.Embedding(max_2d_positions, config.coordinate_size)
        self.h_position_embeddings = nn.Embedding(max_2d_positions, config.shape_size)
        self.w_position_embeddings = nn.Embedding(max_2d_positions, config.shape_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Non-persistent buffer of shape (1, max_position_embeddings): not written to the state dict.
        position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", position_ids, persistent=False)

    def _calc_spatial_position_embeddings(self, bbox):
        """Embed every box and concatenate the six pieces along the last dimension.

        Args:
            bbox: Integer tensor indexed as `bbox[:, :, k]` for `k` in 0..3; the four slices are looked up
                in the x, y, x, y tables respectively.

        Returns:
            Tensor concatenating (left, upper, right, lower, height, width) embeddings on `dim=-1`.

        Raises:
            IndexError: If one of the four coordinate lookups fails.
        """
        coordinate_tables = (
            self.x_position_embeddings,
            self.y_position_embeddings,
            self.x_position_embeddings,
            self.y_position_embeddings,
        )
        try:
            corner_embeddings = [table(bbox[:, :, axis]) for axis, table in enumerate(coordinate_tables)]
        except IndexError as e:
            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e

        # Height is index 3 minus index 1, width is index 2 minus index 0.
        height_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1])
        width_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0])

        return torch.cat([*corner_embeddings, height_embeddings, width_embeddings], dim=-1)
def __init__(self, config):
    """Set up the projections, optional fused-QKV biases and the attention dropout.

    Raises:
        ValueError: If `config.hidden_size` is not divisible by `config.num_attention_heads` and the config
            has no `embedding_size` attribute.
    """
    super().__init__()
    head_count = config.num_attention_heads
    if config.hidden_size % head_count != 0 and not hasattr(config, "embedding_size"):
        raise ValueError(
            f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
            f"heads ({config.num_attention_heads})"
        )

    self.fast_qkv = config.fast_qkv
    self.num_attention_heads = head_count
    self.attention_head_size = int(config.hidden_size / head_count)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    self.has_relative_attention_bias = config.has_relative_attention_bias
    self.has_spatial_attention_bias = config.has_spatial_attention_bias

    if config.fast_qkv:
        # One bias-free projection produces q, k and v together; q and v get separate learned biases.
        self.qkv_linear = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=False)
        self.q_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))
        self.v_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))
    else:
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
    """Split the last dimension into `(num_attention_heads, attention_head_size)` and swap axes 1 and 2.

    The `permute(0, 2, 1, 3)` call means the reshaped tensor must be 4-D, i.e. `x` is 3-D on input.
    """
    leading_dims = x.size()[:-1]
    per_head = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)
    return per_head.permute(0, 2, 1, 3)
def compute_qkv(self, hidden_states):
    """Project `hidden_states` to query, key and value tensors.

    With `self.fast_qkv` set, a single projection is split into three equal chunks along the last
    dimension and the learned `q_bias` / `v_bias` are added to q and v (k gets no bias). Otherwise the
    three separate `query` / `key` / `value` projections are used.

    Returns:
        Tuple `(q, k, v)`.
    """
    if not self.fast_qkv:
        return self.query(hidden_states), self.key(hidden_states), self.value(hidden_states)

    fused = self.qkv_linear(hidden_states)
    q, k, v = torch.chunk(fused, 3, dim=-1)

    if q.ndimension() == self.q_bias.ndimension():
        q_bias, v_bias = self.q_bias, self.v_bias
    else:
        # Reshape the biases to (1, ..., 1, -1) so they broadcast against q / v of a different rank.
        broadcast_shape = (1,) * (q.ndimension() - 1) + (-1,)
        q_bias = self.q_bias.view(*broadcast_shape)
        v_bias = self.v_bias.view(*broadcast_shape)

    q = q + q_bias
    v = v + v_bias
    return q, k, v
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    output_attentions=False,
    rel_pos=None,
    rel_2d_pos=None,
):
    """Compute multi-head self-attention with optional relative-position biases.

    Args:
        hidden_states: Input passed to `compute_qkv`.
        attention_mask: Converted to bool; positions where it is true are filled with the minimum
            value of the score dtype before the softmax. It is used unconditionally, so it must not
            be `None`.
        head_mask: If given, multiplied into the attention probabilities.
        output_attentions: Whether to also return the attention probabilities.
        rel_pos: Added in place to the scores when `self.has_relative_attention_bias` is set.
        rel_2d_pos: Added in place to the scores when `self.has_spatial_attention_bias` is set.

    Returns:
        `(context,)`, or `(context, attention_probs)` when `output_attentions` is truthy.
    """
    queries, keys, values = (self.transpose_for_scores(part) for part in self.compute_qkv(hidden_states))

    # Scale the queries (rather than the scores) by sqrt(head size) before the dot product.
    queries = queries / math.sqrt(self.attention_head_size)
    scores = torch.matmul(queries, keys.transpose(-1, -2))

    if self.has_relative_attention_bias:
        scores += rel_pos
    if self.has_spatial_attention_bias:
        scores += rel_2d_pos

    # The fill value comes from the dtype of the scores *before* they are upcast to float32.
    fill_value = torch.finfo(scores.dtype).min
    scores = scores.float().masked_fill_(attention_mask.to(torch.bool), fill_value)

    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).type_as(values)
    probs = self.dropout(probs)
    if head_mask is not None:
        probs = probs * head_mask

    context = torch.matmul(probs, values)
    context = context.permute(0, 2, 1, 3).contiguous()
    # Merge the head and per-head dimensions back into a single `all_head_size` dimension.
    context = context.view(*context.size()[:-2], self.all_head_size)

    if output_attentions:
        return (context, probs)
    return (context,)
class LayoutLMv2Attention(nn.Module):
    """Self-attention sub-layer followed by its output sub-layer."""

    def __init__(self, config):
        """Instantiate the `self` (attention) and `output` sub-modules from `config`."""
        super().__init__()
        self.self = LayoutLMv2SelfAttention(config)
        self.output = LayoutLMv2SelfOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        rel_pos=None,
        rel_2d_pos=None,
    ):
        """Run self-attention, then combine its first output with `hidden_states` via `self.output`.

        Returns:
            Tuple whose first element is the output of `self.output`, followed by any extra elements
            returned by the self-attention module.
        """
        attention_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions,
            rel_pos=rel_pos,
            rel_2d_pos=rel_2d_pos,
        )
        context = attention_results[0]
        extras = attention_results[1:]
        projected = self.output(context, hidden_states)
        return (projected,) + extras
class LayoutLMv2SelfOutput(nn.Module):
    """Dense projection, dropout, then a residual connection followed by layer normalization."""

    def __init__(self, config):
        """Create the hidden-to-hidden projection, the layer norm and the dropout."""
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class LayoutLMv2Intermediate(nn.Module):
    """Projection from the hidden size to the intermediate size followed by an activation."""

    def __init__(self, config):
        """Create the projection and resolve the activation (`ACT2FN` lookup for string names)."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))
class LayoutLMv2Output(nn.Module):
    """Projection back to the hidden size, dropout, then residual add and layer normalization."""

    def __init__(self, config):
        """Create the intermediate-to-hidden projection, the layer norm and the dropout."""
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class LayoutLMv2Layer(nn.Module):
    """One encoder layer: an attention block followed by a (chunkable) feed-forward block."""

    def __init__(self, config):
        """Create the attention, intermediate and output sub-modules and store the chunking settings."""
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension along which `apply_chunking_to_forward` splits the feed-forward input.
        self.seq_len_dim = 1
        self.attention = LayoutLMv2Attention(config)
        self.intermediate = LayoutLMv2Intermediate(config)
        self.output = LayoutLMv2Output(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        rel_pos=None,
        rel_2d_pos=None,
    ):
        """Apply attention and then the feed-forward block.

        Returns:
            Tuple whose first element is the layer output, followed by any extra elements returned by
            the attention block.
        """
        attention_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            rel_pos=rel_pos,
            rel_2d_pos=rel_2d_pos,
        )
        attention_output, extras = attention_results[0], attention_results[1:]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return (layer_output,) + extras

    def feed_forward_chunk(self, attention_output):
        """Feed-forward block for one chunk: `output(intermediate(x), x)`."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    """
    Adapted from Mesh Tensorflow:
    https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

    Map relative positions to bucket indices for relative attention. Small absolute distances each get
    their own bucket; larger ones share logarithmically sized buckets, and every distance at or beyond
    `max_distance` falls into the last bucket. With `bidirectional=True` the buckets are split in half,
    the upper half being used for positive relative positions. With `bidirectional=False`, positive
    relative positions are clamped to distance 0.

    Args:
        relative_position: integer tensor of relative positions.
        bidirectional: whether positive and negative positions get distinct bucket ranges.
        num_buckets: total number of buckets.
        max_distance: distance from which all positions share the same bucket.

    Returns:
        A tensor with the same shape as `relative_position`, with values in the range [0, num_buckets).
    """
    if bidirectional:
        num_buckets = num_buckets // 2
        # Positive relative positions are shifted into the upper half of the bucket range.
        sign_offset = (relative_position > 0).long() * num_buckets
        distance = torch.abs(relative_position)
    else:
        sign_offset = 0
        distance = torch.max(-relative_position, torch.zeros_like(relative_position))

    # Distances below `max_exact` map to themselves; the rest are bucketed logarithmically.
    max_exact = num_buckets // 2
    log_ratio = torch.log(distance.float() / max_exact) / math.log(max_distance / max_exact)
    log_bucket = max_exact + (log_ratio * (num_buckets - max_exact)).to(torch.long)
    log_bucket = torch.min(log_bucket, torch.full_like(log_bucket, num_buckets - 1))

    return sign_offset + torch.where(distance < max_exact, distance, log_bucket)
def __init__(self, config):
    """Build the stack of layers and, when enabled, the relative-position bias projections."""
    super().__init__()
    self.config = config
    self.layer = nn.ModuleList([LayoutLMv2Layer(config) for _ in range(config.num_hidden_layers)])

    self.has_relative_attention_bias = config.has_relative_attention_bias
    self.has_spatial_attention_bias = config.has_spatial_attention_bias
    head_count = config.num_attention_heads

    if self.has_relative_attention_bias:
        # Bias table for 1D relative positions: one weight per (bucket, head).
        self.rel_pos_bins = config.rel_pos_bins
        self.max_rel_pos = config.max_rel_pos
        self.rel_pos_bias = nn.Linear(self.rel_pos_bins, head_count, bias=False)

    if self.has_spatial_attention_bias:
        # Separate bias tables for relative positions along x and along y.
        self.max_rel_2d_pos = config.max_rel_2d_pos
        self.rel_2d_pos_bins = config.rel_2d_pos_bins
        self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_bins, head_count, bias=False)
        self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_bins, head_count, bias=False)

    self.gradient_checkpointing = False
def _calculate_1d_position_embeddings(self, position_ids):
    """Turn pairwise differences of `position_ids` into a per-head attention bias.

    Entry `[..., i, j]` of the difference matrix is `position_ids[..., j] - position_ids[..., i]`.
    The differences are bucketed with `relative_position_bucket`, looked up in the transposed
    `rel_pos_bias` weight, and the head dimension is moved to axis 1.
    """
    pairwise_offsets = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
    buckets = relative_position_bucket(
        pairwise_offsets,
        num_buckets=self.rel_pos_bins,
        max_distance=self.max_rel_pos,
    )
    # Transposed weight has one row per bucket and one column per head.
    bias_table = self.rel_pos_bias.weight.t()
    return bias_table[buckets].permute(0, 3, 1, 2).contiguous()
def _calculate_2d_position_embeddings(self, bbox):
    """Build the spatial attention bias from `bbox[:, :, 0]` (x) and `bbox[:, :, 3]` (y).

    For each axis, pairwise coordinate differences are bucketed with `relative_position_bucket`,
    looked up in the matching bias weight, and permuted so that the head dimension is axis 1.
    The x and y biases are summed.
    """

    def axis_bias(coords, bias_layer):
        # Entry [..., i, j] is coords[..., j] - coords[..., i].
        offsets = coords.unsqueeze(-2) - coords.unsqueeze(-1)
        buckets = relative_position_bucket(
            offsets,
            num_buckets=self.rel_2d_pos_bins,
            max_distance=self.max_rel_2d_pos,
        )
        return bias_layer.weight.t()[buckets].permute(0, 3, 1, 2).contiguous()

    x_bias = axis_bias(bbox[:, :, 0], self.rel_pos_x_bias)
    y_bias = axis_bias(bbox[:, :, 3], self.rel_pos_y_bias)
    return x_bias + y_bias
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True,
    bbox=None,
    position_ids=None,
):
    """Run `hidden_states` through every layer in `self.layer`.

    Args:
        hidden_states: Input to the first layer.
        attention_mask: Passed unchanged to every layer.
        head_mask: Optional per-layer sequence; element `i` is passed to layer `i`.
        output_attentions: Whether to collect each layer's second output.
        output_hidden_states: Whether to collect the hidden states before each layer and after the last.
        return_dict: When falsy, return a plain tuple of the non-`None` results instead of a
            `BaseModelOutput`.
        bbox: Used to compute the spatial bias when `self.has_spatial_attention_bias` is set.
        position_ids: Used to compute the 1D bias when `self.has_relative_attention_bias` is set.

    Returns:
        `BaseModelOutput`, or a tuple `(hidden_states[, all_hidden_states][, all_self_attentions])`.
    """
    all_hidden_states = () if output_hidden_states else None
    all_self_attentions = () if output_attentions else None

    # The biases depend only on positions / boxes, so they are computed once and shared by all layers.
    rel_pos = self._calculate_1d_position_embeddings(position_ids) if self.has_relative_attention_bias else None
    rel_2d_pos = self._calculate_2d_position_embeddings(bbox) if self.has_spatial_attention_bias else None

    for i, layer_module in enumerate(self.layer):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        layer_head_mask = head_mask[i] if head_mask is not None else None

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                layer_module.__call__,
                hidden_states,
                attention_mask,
                layer_head_mask,
                output_attentions,
                rel_pos=rel_pos,
                rel_2d_pos=rel_2d_pos,
            )
        else:
            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                output_attentions,
                rel_pos=rel_pos,
                rel_2d_pos=rel_2d_pos,
            )

        hidden_states = layer_outputs[0]
        if output_attentions:
            all_self_attentions = all_self_attentions + (layer_outputs[1],)

    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(
            v
            for v in [
                hidden_states,
                all_hidden_states,
                all_self_attentions,
            ]
            if v is not None
        )
    return BaseModelOutput(
        last_hidden_state=hidden_states,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )
class LayoutLMv2PreTrainedModel(PreTrainedModel):
    """
    Abstract base class that provides weight initialization and the shared class-level settings
    (config class, checkpoint list, base model prefix) for the LayoutLMv2 models.
    """

    config_class = LayoutLMv2Config
    pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "layoutlmv2"

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with standard
        deviation `config.initializer_range`; linear biases and the embedding padding row are zeroed.
        `nn.LayerNorm` gets zero bias and unit weight. Other module types are left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
def my_convert_sync_batchnorm(module, process_group=None):
    """Recursively replace batch-norm layers with `torch.nn.SyncBatchNorm`.

    Standard PyTorch batch-norm modules are handed to `SyncBatchNorm.convert_sync_batchnorm`.
    detectron2 `FrozenBatchNorm2d` modules are rebuilt as `SyncBatchNorm`, reusing their weight, bias
    and running statistics. All other modules are kept and their children are converted in place.

    Args:
        module: Module (tree) to convert.
        process_group: Process group handed to the created `SyncBatchNorm` modules.

    Returns:
        The converted module; the same object as `module` unless it was itself a batch-norm layer.
    """
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        return nn.modules.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

    converted = module
    if isinstance(module, detectron2.layers.FrozenBatchNorm2d):
        converted = torch.nn.SyncBatchNorm(
            num_features=module.num_features,
            eps=module.eps,
            affine=True,
            track_running_stats=True,
            process_group=process_group,
        )
        # Carry the frozen statistics and affine parameters over to the new module.
        converted.weight = torch.nn.Parameter(module.weight)
        converted.bias = torch.nn.Parameter(module.bias)
        converted.running_mean = module.running_mean
        converted.running_var = module.running_var
        converted.num_batches_tracked = torch.tensor(0, dtype=torch.long, device=module.running_mean.device)

    for child_name, child in module.named_children():
        converted.add_module(child_name, my_convert_sync_batchnorm(child, process_group))
    return converted
class LayoutLMv2VisualBackbone(nn.Module):
    """Visual feature extractor built from the FPN backbone of a detectron2 model."""

    def __init__(self, config):
        """Build the detectron2 backbone, the pixel normalization buffers and the pooling layer.

        Note that `config.image_feature_pool_shape` is extended in place with the backbone's channel
        count when it only has two entries.
        """
        super().__init__()
        self.cfg = config.get_detectron2_config()
        model_cfg = self.cfg.MODEL
        detection_model = META_ARCH_REGISTRY.get(model_cfg.META_ARCHITECTURE)(self.cfg)
        assert isinstance(detection_model.backbone, detectron2.modeling.backbone.FPN)
        self.backbone = detection_model.backbone

        assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD)
        num_channels = len(self.cfg.MODEL.PIXEL_MEAN)
        # Per-channel statistics shaped (C, 1, 1) so they broadcast over height and width.
        self.register_buffer(
            "pixel_mean",
            torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1),
            persistent=False,
        )
        self.register_buffer(
            "pixel_std",
            torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1),
            persistent=False,
        )
        self.out_feature_key = "p2"

        if torch.are_deterministic_algorithms_enabled():
            logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`")
            input_shape = (224, 224)
            backbone_stride = self.backbone.output_shape()[self.out_feature_key].stride
            # Kernel size chosen from a fixed 224x224 input and the requested pooled shape.
            kernel_size = tuple(
                math.ceil(math.ceil(input_shape[axis] / backbone_stride) / config.image_feature_pool_shape[axis])
                for axis in (0, 1)
            )
            self.pool = nn.AvgPool2d(kernel_size)
        else:
            self.pool = nn.AdaptiveAvgPool2d(config.image_feature_pool_shape[:2])

        if len(config.image_feature_pool_shape) == 2:
            config.image_feature_pool_shape.append(self.backbone.output_shape()[self.out_feature_key].channels)
        assert self.backbone.output_shape()[self.out_feature_key].channels == config.image_feature_pool_shape[2]

    def forward(self, images):
        """Normalize `images`, run the backbone and return the pooled `out_feature_key` feature map.

        `images` may be a tensor or an object exposing the tensor as `.tensor`. The pooled map is
        flattened from dimension 2 on and transposed so that the channel dimension comes last.
        """
        pixel_values = images if torch.is_tensor(images) else images.tensor
        normalized = (pixel_values - self.pixel_mean) / self.pixel_std
        feature_maps = self.backbone(normalized)
        pooled = self.pool(feature_maps[self.out_feature_key])
        return pooled.flatten(start_dim=2).transpose(1, 2).contiguous()

    def synchronize_batch_norm(self):
        """Convert the backbone's batch-norm layers to `SyncBatchNorm`, synchronized per group of ranks.

        Ranks are partitioned into consecutive groups of `torch.cuda.device_count()` ranks, one process
        group is created per partition, and the backbone is converted using this rank's group.

        Raises:
            RuntimeError: If `torch.distributed` is not available / initialized, or if the world size is
                not a multiple of the local device count.
        """
        dist = torch.distributed
        distributed_ready = dist.is_available() and dist.is_initialized() and dist.get_rank() > -1
        if not distributed_ready:
            raise RuntimeError("Make sure torch.distributed is set up properly.")

        self_rank = dist.get_rank()
        node_size = torch.cuda.device_count()
        world_size = dist.get_world_size()
        if world_size % node_size != 0:
            raise RuntimeError("Make sure the number of processes can be divided by the number of nodes")

        group_count = world_size // node_size
        node_global_ranks = [list(range(node * node_size, (node + 1) * node_size)) for node in range(group_count)]
        # Groups are created in the same order on every rank.
        sync_bn_groups = [dist.new_group(ranks=ranks) for ranks in node_global_ranks]

        node_rank = self_rank // node_size
        self.backbone = my_convert_sync_batchnorm(self.backbone, process_group=sync_bn_groups[node_rank])
# Shared introduction text; passed to `add_start_docstrings` on the model classes in this module.
LAYOUTLMV2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`LayoutLMv2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Template for the `forward` argument documentation; `.format(...)` is called on it with the input
# shape before it is passed to `add_start_docstrings_to_model_forward`.
# NOTE(review): the template body is empty in this listing — confirm against the upstream file.
LAYOUTLMV2_INPUTS_DOCSTRING = r"""
"""
class LayoutLMv2Pooler(nn.Module):
    """Pool a sequence by passing the hidden state at index 0 through a dense layer and a tanh."""

    def __init__(self, config):
        """Create the hidden-to-hidden projection and the tanh activation."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))
@add_start_docstrings(
"The bare LayoutLMv2 Model transformer outputting raw hidden-states without any specific head on top.",
LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
def __init__(self, config):
    """Build the text embeddings, visual backbone, visual projection, encoder and pooler.

    Requires the `detectron2` backend; `requires_backends` is called before anything else.
    """
    requires_backends(self, "detectron2")
    super().__init__(config)
    self.config = config
    self.has_visual_segment_embedding = config.has_visual_segment_embedding

    self.embeddings = LayoutLMv2Embeddings(config)

    # The backbone is created before the projection, which reads the last entry of
    # `config.image_feature_pool_shape`.
    self.visual = LayoutLMv2VisualBackbone(config)
    hidden_size = config.hidden_size
    self.visual_proj = nn.Linear(config.image_feature_pool_shape[-1], hidden_size)
    if self.has_visual_segment_embedding:
        # Single learned vector of size `hidden_size`, taken from a one-row embedding table.
        self.visual_segment_embedding = nn.Parameter(nn.Embedding(1, hidden_size).weight[0])
    self.visual_LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
    self.visual_dropout = nn.Dropout(config.hidden_dropout_prob)

    self.encoder = LayoutLMv2Encoder(config)
    self.pooler = LayoutLMv2Pooler(config)

    self.post_init()
def get_input_embeddings(self):
    """Return the word embedding module stored at `self.embeddings.word_embeddings`."""
    text_embeddings = self.embeddings
    return text_embeddings.word_embeddings
def set_input_embeddings(self, value):
    """Replace `self.embeddings.word_embeddings` with `value`."""
    text_embeddings = self.embeddings
    text_embeddings.word_embeddings = value
def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids, inputs_embeds=None):
    """Compute the text embeddings: word + 1D position + spatial position + token type.

    Args:
        input_ids: Token ids, or `None` when `inputs_embeds` is given.
        bbox: Boxes passed to `self.embeddings._calc_spatial_position_embeddings`.
        position_ids: Position ids; when `None`, `0..seq_length-1` is used for every row.
        token_type_ids: Token type ids; when `None`, all zeros are used.
        inputs_embeds: Precomputed word embeddings, used instead of looking up `input_ids`.

    Returns:
        The summed embeddings after layer normalization and dropout.
    """
    # Shape and device come from whichever input is present, so the defaults below also work when
    # only `inputs_embeds` is supplied.
    if input_ids is not None:
        input_shape = input_ids.size()
        device = input_ids.device
    else:
        input_shape = inputs_embeds.size()[:-1]
        device = inputs_embeds.device

    seq_length = input_shape[1]

    if position_ids is None:
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).expand(input_shape)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    if inputs_embeds is None:
        inputs_embeds = self.embeddings.word_embeddings(input_ids)
    position_embeddings = self.embeddings.position_embeddings(position_ids)
    spatial_position_embeddings = self.embeddings._calc_spatial_position_embeddings(bbox)
    token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)

    embeddings = inputs_embeds + position_embeddings + spatial_position_embeddings + token_type_embeddings
    embeddings = self.embeddings.LayerNorm(embeddings)
    embeddings = self.embeddings.dropout(embeddings)
    return embeddings
def _calc_img_embeddings(self, image, bbox, position_ids):
    """Compute the visual embeddings: projected backbone features + 1D position + spatial position.

    When `self.has_visual_segment_embedding` is set, the learned segment vector is added in place
    before layer normalization and dropout.
    """
    projected_features = self.visual_proj(self.visual(image))
    position_part = self.embeddings.position_embeddings(position_ids)
    spatial_part = self.embeddings._calc_spatial_position_embeddings(bbox)

    combined = projected_features + position_part + spatial_part
    if self.has_visual_segment_embedding:
        combined += self.visual_segment_embedding
    return self.visual_dropout(self.visual_LayerNorm(combined))
@add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    bbox: Optional[torch.LongTensor] = None,
    image: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
):
    # NOTE(review): as listed, this body only validates `input_ids` / `inputs_embeds` and returns the
    # input shape; none of the other parameters are used. It looks like the body of an input-shape
    # helper rather than a full forward pass — confirm against the upstream file before relying on it.
    # Exactly one of `input_ids` and `inputs_embeds` must be provided.
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        # Shape of the token id tensor.
        return input_ids.size()
    elif inputs_embeds is not None:
        # Shape of the embeddings without the trailing embedding dimension.
        return inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")
@add_start_docstrings(
    """
LayoutLMv2 Model with a sequence classification head on top (a linear layer on top of the concatenation of the
final hidden state of the [CLS] token, average-pooled initial visual embeddings and average-pooled final visual
embeddings, e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
""",
    LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
    """
    LayoutLMv2 model with a sequence classification head on top: a linear layer whose input size is three
    times the hidden size, e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    """

    def __init__(self, config):
        """
        Build the base model, the dropout and the classification layer.

        Args:
            config (LayoutLMv2Config): Model configuration object.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.layoutlmv2 = LayoutLMv2Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The classifier input is three hidden-size vectors concatenated together.
        classifier_input_size = config.hidden_size * 3
        self.classifier = nn.Linear(classifier_input_size, config.num_labels)
        self.post_init()

    def get_input_embeddings(self):
        """
        Return the word embedding module of the wrapped LayoutLMv2 model.

        Returns:
            The object stored at `self.layoutlmv2.embeddings.word_embeddings`.
        """
        base_model = self.layoutlmv2
        return base_model.embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        image: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the LayoutLMv2 sequence classification model.

        Args:
            input_ids (torch.LongTensor, optional): Input token ids. Defaults to None.
            bbox (torch.LongTensor, optional): Bounding box information. Defaults to None.
            image (torch.FloatTensor, optional): Image input. Defaults to None.
            attention_mask (torch.FloatTensor, optional): Attention mask. Defaults to None.
            token_type_ids (torch.LongTensor, optional): Token type ids. Defaults to None.
            position_ids (torch.LongTensor, optional): Position ids. Defaults to None.
            head_mask (torch.FloatTensor, optional): Head mask. Defaults to None.
            inputs_embeds (torch.FloatTensor, optional): Input embeddings. Defaults to None.
            labels (torch.LongTensor, optional): Labels. Defaults to None.
            output_attentions (bool, optional): Whether to output attentions. Defaults to None.
            output_hidden_states (bool, optional): Whether to output hidden states. Defaults to None.
            return_dict (bool, optional): Whether to return a dict-style output. Defaults to None.

        Returns:
            SequenceClassifierOutput: Output object of the sequence classification task.
        """
        # NOTE(review): no computation follows the docstring in this listing, so as written the
        # method returns None — confirm against the upstream implementation.
@add_start_docstrings(
"""
LayoutLMv2 Model with a token classification head on top (a linear layer on top of the text part of the hidden
states) e.g. for sequence labeling (information extraction) tasks such as
[FUNSD](https://guillaumejaume.github.io/FUNSD/), [SROIE](https://rrc.cvc.uab.es/?ch=13),
[CORD](https://github.com/clovaai/cord) and [Kleister-NDA](https://github.com/applicaai/kleister-nda).
""",
LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
"""
LayoutLMv2模型,顶部带有标记分类头(一个线性层,位于隐藏状态的文本部分的顶部),
例如用于序列标记任务(信息提取),如FUNSD, SROIE, CORD和Kleister-NDA。
"""
def __init__(self, config):
    """
    Set up the LayoutLMv2 backbone, dropout and token-classification head.

    Args:
        config (LayoutLMv2Config): Model configuration. `num_labels`,
            `hidden_dropout_prob` and `hidden_size` are read from it.
    """
    super().__init__(config)
    self.num_labels = config.num_labels

    # Backbone first, then dropout, then the linear classification head.
    backbone = LayoutLMv2Model(config)
    self.layoutlmv2 = backbone
    self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
    self.classifier = nn.Linear(
        in_features=config.hidden_size,
        out_features=config.num_labels,
    )

    self.post_init()
def get_input_embeddings(self):
    """
    Return the input word-embedding layer.

    Returns:
        The `word_embeddings` attribute of the embeddings module owned by
        `self.layoutlmv2`.
    """
    embedding_module = self.layoutlmv2.embeddings
    return embedding_module.word_embeddings
@add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
image: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
LayoutLMv2 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
LAYOUTLMV2_START_DOCSTRING = """
class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
def __init__(self, config, has_visual_segment_embedding=True):
    """
    Set up the LayoutLMv2 backbone and the question-answering output layer.

    Args:
        config: Model configuration. `num_labels` and `hidden_size` are read
            from it, and `has_visual_segment_embedding` is written onto it.
        has_visual_segment_embedding (bool, optional): Value stored on
            `config.has_visual_segment_embedding` before the backbone is
            built. Defaults to True.
    """
    super().__init__(config)
    self.num_labels = config.num_labels

    # The flag is recorded on the config before the backbone is constructed from it.
    config.has_visual_segment_embedding = has_visual_segment_embedding
    backbone = LayoutLMv2Model(config)
    self.layoutlmv2 = backbone

    # Linear output layer: hidden_size inputs, num_labels outputs.
    self.qa_outputs = nn.Linear(
        in_features=config.hidden_size,
        out_features=config.num_labels,
    )

    # Finalise initialisation via the base-class hook.
    self.post_init()
def get_input_embeddings(self):
    """Return the `word_embeddings` layer held by the LayoutLMv2 backbone's embeddings module."""
    embedding_module = self.layoutlmv2.embeddings
    return embedding_module.word_embeddings
# 引用 LAYOUTLMV2_INPUTS_DOCSTRING,添加到模型前向方法的文档字符串中
@add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 替换模型前向方法的返回文档字符串,使用 QuestionAnsweringModelOutput 类型,引用 _CONFIG_FOR_DOC 配置类
@replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
image: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""