Transformers Source Code Analysis (62)
.\models\layoutlmv2\processing_layoutlmv2.py
"""
Processor class for LayoutLMv2.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutLMv2Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
single processor.
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
image_processor (`LayoutLMv2ImageProcessor`, *optional*):
An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*):
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv2ImageProcessor"
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
批量处理图像及其相关信息,将其转换为模型可以处理的格式。参数详细说明可以参考 `PreTrainedTokenizer.batch_decode` 方法的文档字符串。
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def get_overflowing_images(self, images, overflow_to_sample_mapping):
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
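# A minimal sketch of what this mapping does (page_0 / page_1 are hypothetical image objects):
# if sample 0 overflowed into two chunks, overflow_to_sample_mapping is [0, 0, 1], and
# get_overflowing_images([page_0, page_1], [0, 0, 1]) returns [page_0, page_0, page_1],
# i.e. every overflowing chunk keeps its own copy of the page image it came from.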
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
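As a usage illustration, the following is a minimal sketch (not part of the source file) of the typical document-understanding flow with this processor. It assumes the public checkpoint `microsoft/layoutlmv2-base-uncased`, a scanned page loaded with PIL, and that `pytesseract` is installed so the image processor can run OCR:

from PIL import Image
from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
image = Image.open("document_page.png").convert("RGB")  # hypothetical scanned page

# The image processor resizes the page and runs OCR; the tokenizer then builds
# input_ids, attention_mask, token_type_ids and bbox from the OCR words and boxes.
encoding = processor(image, return_tensors="pt")
print(encoding.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])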
.\models\layoutlmv2\tokenization_layoutlmv2.py
"""Tokenization class for LayoutLMv2."""
import collections
import os
import sys
import unicodedata
from typing import Dict, List, Optional, Tuple, Union
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/vocab.txt"
),
"microsoft/layoutlmv2-large-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-large-uncased/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv2-base-uncased": 512,
"microsoft/layoutlmv2-large-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlmv2-base-uncased": {"do_lower_case": True},
"microsoft/layoutlmv2-large-uncased": {"do_lower_case": True},
}
"""
"""
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
def subfinder(mylist, pattern):
matches = []
indices = []
for idx, i in enumerate(range(len(mylist))):
if mylist[i] == pattern[0] and mylist[i : i + len(pattern)] == pattern:
matches.append(pattern)
indices.append(idx)
if matches:
return matches[0], indices[0]
else:
return None, 0
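# A quick sketch of subfinder's behaviour: it returns the first occurrence of `pattern`
# inside `mylist` together with its start index, e.g.
# subfinder(["a", "b", "c", "b", "c"], ["b", "c"])  ->  (["b", "c"], 1)
# and (None, 0) when the pattern does not occur.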
class LayoutLMv2Tokenizer(PreTrainedTokenizer):
r"""
"""
构建一个 LayoutLMv2 的分词器。基于 WordPiece。[`LayoutLMv2Tokenizer`] 可以用于将单词、单词级别边界框和可选的单词标签转换为
标记级别的 `input_ids`、`attention_mask`、`token_type_ids`、`bbox`,以及可选的 `labels`(用于标记分类)。
该分词器继承自 [`PreTrainedTokenizer`],其中包含大多数主要方法。用户应参考此超类以获取有关这些方法的更多信息。
[`LayoutLMv2Tokenizer`] 运行端到端的分词:标点符号分割和 WordPiece。它还将单词级别的边界框转换为标记级别的边界框。
"""
# 定义预训练模型所需的词汇文件名列表
vocab_files_names = VOCAB_FILES_NAMES
# 定义预训练模型所需的词汇文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 定义预训练模型输入的最大长度列表
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 定义预训练模型的初始化配置
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
tokenize_chinese_chars=True,
strip_accents=None,
model_max_length: int = 512,
additional_special_tokens: Optional[List[str]] = None,
**kwargs,
):
# If sep_token is a string, wrap it in a special AddedToken object
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
# If unk_token is a string, wrap it in a special AddedToken object
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
# If pad_token is a string, wrap it in a special AddedToken object
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
# If cls_token is a string, wrap it in a special AddedToken object
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
# If mask_token is a string, wrap it in a special AddedToken object
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
# Raise a ValueError if the given vocabulary file does not exist
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Load the vocabulary file and store it in self.vocab
self.vocab = load_vocab(vocab_file)
# Build an ordered id-to-token mapping in self.ids_to_tokens
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
# Decide whether to run basic tokenization depending on do_basic_tokenize
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
# If basic tokenization is requested, create a BasicTokenizer instance
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
# Create a WordpieceTokenizer with the loaded vocabulary and the unk_token
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# Set the additional attributes
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
# Call the parent constructor to initialize the remaining arguments
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
model_max_length=model_max_length,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def do_lower_case(self):
# Return the lowercasing setting of the basic tokenizer
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
# Return the size of the vocabulary
return len(self.vocab)
def get_vocab(self):
# Return a dict containing the vocabulary plus the added special tokens
return dict(self.vocab, **self.added_tokens_encoder)
# Tokenize a piece of text and return the list of resulting tokens
def _tokenize(self, text):
split_tokens = []
# If basic tokenization is enabled
if self.do_basic_tokenize:
# Run the basic tokenizer, never splitting the special tokens
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# If the token is in the never-split set
if token in self.basic_tokenizer.never_split:
# Add it to the result as-is
split_tokens.append(token)
else:
# Otherwise, further split the token with the WordPiece tokenizer
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
# Otherwise, tokenize the whole text directly with the WordPiece tokenizer
split_tokens = self.wordpiece_tokenizer.tokenize(text)
# Return the list of tokens
return split_tokens
# Convert a token into its id using the vocabulary
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Convert an id back into its token using the vocabulary
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
# Join a list of tokens into a single string, removing the "##" subword markers
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
# Build an input sequence with special tokens for sequence classification tasks
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# If there is only a single sequence
if token_ids_1 is None:
# Return the sequence with [CLS] and [SEP] special tokens added
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# Define the [CLS] and [SEP] special token ids
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Return [CLS] + first sequence + [SEP] + second sequence + [SEP]
return cls + token_ids_0 + sep + token_ids_1 + sep
# Build the mask that marks the positions of special tokens in a token id sequence
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
# If special tokens are already present, delegate to the parent class to compute the mask
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
# With a second sequence, the mask covers [CLS] + token_ids_0 + [SEP] + token_ids_1 + [SEP]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
# With a single sequence, the mask covers [CLS] + token_ids_0 + [SEP]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Get the separator and classification token ids
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
# With a single sequence, return a list of zeros of length len(cls + token_ids_0 + sep)
return len(cls + token_ids_0 + sep) * [0]
# With two sequences, return zeros for [CLS] + token_ids_0 + [SEP] and ones for token_ids_1 + [SEP]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
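# A concrete illustration of the two methods above (a sketch, assuming the usual uncased BERT vocabulary
# where [CLS] = 101 and [SEP] = 102; the other ids are made up):
# tokenizer.build_inputs_with_special_tokens([7592], [2088])
#   -> [101, 7592, 102, 2088, 102]      # [CLS] A [SEP] B [SEP]
# tokenizer.create_token_type_ids_from_sequences([7592], [2088])
#   -> [0, 0, 0, 1, 1]                  # zeros cover [CLS] A [SEP], ones cover B [SEP]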
# Save the vocabulary to a file in the given directory and return the saved file path as a tuple
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Initialize the running index
index = 0
# If the save directory exists, build the vocabulary file path inside it
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
# Otherwise treat the given path directly as the target file
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
# Open the vocabulary file for writing
with open(vocab_file, "w", encoding="utf-8") as writer:
# Iterate over the vocabulary sorted by index and write the tokens in order
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
# If the stored index is not consecutive, log a warning
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
# Write the token to the file and advance the index
writer.write(token + "\n")
index += 1
# Return the path of the saved vocabulary file as a tuple
return (vocab_file,)
# Decorator that appends the shared encode kwargs docstrings to the __call__ method
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Batch-encode texts or text pairs and return a BatchEncoding
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[List[int]]]] = None,  # word-level bounding boxes (optional)
word_labels: Optional[Union[List[int], List[List[int]]]] = None,  # word-level labels (optional)
add_special_tokens: bool = True,  # whether to add special tokens
padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation strategy
max_length: Optional[int] = None,  # maximum length (optional)
stride: int = 0,  # stride for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (optional)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Resolve the padding and truncation strategies, handling backward-compatible legacy arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Delegate to the internal batch-encoding method
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# The _batch_encode_plus method batch-encodes texts or text pairs and returns a BatchEncoding object
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[List[int]]]] = None,  # word-level bounding boxes
word_labels: Optional[List[List[int]]] = None,  # word-level labels
add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS], [SEP])
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
max_length: Optional[int] = None,  # maximum length
stride: int = 0,  # stride, 0 by default
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:  # the method returns a BatchEncoding object
# Offset mappings are only available with fast (Rust) tokenizers, so raise if they are requested
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
# Prepare the whole batch for the model via _batch_prepare_for_model
batch_outputs = self._batch_prepare_for_model(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
# Wrap the batch outputs in a BatchEncoding object and return it
return BatchEncoding(batch_outputs)
# Append the shared kwargs docstrings to the method below
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Prepare a batch of texts or text pairs so that it can be consumed by the model
def _batch_prepare_for_model(
self,
batch_text_or_text_pairs,  # the batch of texts or text pairs
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes (optional)
word_labels: Optional[List[List[int]]] = None,  # word-level labels (optional)
add_special_tokens: bool = True,  # whether to add special tokens
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
max_length: Optional[int] = None,  # maximum length (optional)
stride: int = 0,  # stride for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
return_tensors: Optional[str] = None,  # tensor type to return (optional)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
# Initialize an empty dictionary to store batch outputs
batch_outputs = {}
# Iterate over each example in the batch, consisting of text or text pairs and corresponding boxes
for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
batch_text_or_text_pair, boxes_example = example
# Determine if the current example is a single text or a pair of texts
if is_pair:
input_ids_or_pair = batch_text_or_text_pair[0] # First sequence of input ids
else:
input_ids_or_pair = batch_text_or_text_pair # Single sequence of input ids
# Prepare inputs for the model using the specified parameters
outputs = self.prepare_for_model(
input_ids_or_pair,
batch_text_or_text_pair[1] if is_pair else None, # Second sequence of input ids if it exists
boxes_example,
word_labels=word_labels[idx] if word_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # Do not pad here; it's done in batch
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # Pad in batch afterward
return_attention_mask=False, # Do not return attention masks here; it's done in batch
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # Convert to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
# Aggregate outputs into batch_outputs dictionary
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
# Perform padding across the batch
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
# Convert batch_outputs to BatchEncoding format
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
# Return the final prepared batch_outputs
return batch_outputs
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING)
# The encode method turns the inputs into model-ready features and returns only the list of input ids
def encode(
self,
text: Union[TextInput, PreTokenizedInput],  # the main input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for sequence-pair tasks
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes
word_labels: Optional[List[int]] = None,  # word-level labels for token classification
add_special_tokens: bool = True,  # whether to add special tokens such as [CLS] and [SEP]
padding: Union[bool, str, PaddingStrategy] = False,  # padding behaviour
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation behaviour
max_length: Optional[int] = None,  # maximum sequence length
stride: int = 0,  # stride of the sliding window for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (e.g. "pt" for PyTorch)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return character offset mappings
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> List[int]:  # returns the list of encoded input ids
# Encode the inputs with encode_plus and get back the full encoding dictionary
encoded_inputs = self.encode_plus(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Return the value of the "input_ids" key, i.e. the encoded input id list
return encoded_inputs["input_ids"]
# Append the shared kwargs docstrings documenting the parameters of encode_plus
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],  # the main input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for sequence-pair tasks
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes
word_labels: Optional[List[int]] = None,  # word-level labels for token classification
add_special_tokens: bool = True,  # whether to add special tokens such as [CLS] and [SEP]
padding: Union[bool, str, PaddingStrategy] = False,  # padding behaviour
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation behaviour
max_length: Optional[int] = None,  # maximum sequence length
stride: int = 0,  # stride of the sliding window for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (e.g. "pt" for PyTorch)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return character offset mappings
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# Resolve the padding and truncation strategies and the related arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call _encode_plus to perform the actual encoding
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# _encode_plus takes the same arguments as encode_plus, with padding/truncation already resolved to strategy enums
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
# Offset mappings are only available with fast (Rust) tokenizers, so raise if they are requested
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
# Prepare the single example for the model
return self.prepare_for_model(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
# Prepares the inputs for the model according to the given arguments
# (see LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING and LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING for details);
# the method body is omitted in this walkthrough
pass
def truncate_sequences(
self,
ids: List[int],
token_boxes: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_token_boxes: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
):
# The private `_pad` method (its body is omitted here) pads encoded inputs up to a target length:
#   encoded_inputs: a dictionary of encoded inputs for a single example, or a BatchEncoding
#   max_length: optional maximum length to pad to
#   padding_strategy: the padding strategy, no padding by default
#   pad_to_multiple_of: optional value; the padded length will be a multiple of it
#   return_attention_mask: optional flag controlling whether an attention mask is returned
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
构建一个BasicTokenizer对象,用于执行基本的分词(如标点符号分割、转换为小写等)。
Args:
do_lower_case (`bool`, *可选*, 默认为 `True`):
在分词时是否将输入转换为小写。
never_split (`Iterable`, *可选*):
在分词时永远不会被分割的token集合。仅在`do_basic_tokenize=True`时生效。
tokenize_chinese_chars (`bool`, *可选*, 默认为 `True`):
是否分词中文字符。
对于日语,这可能需要禁用(参见这个
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *可选*):
是否去除所有的重音符号。如果没有指定此选项,则会由`lowercase`的值来确定(与原始BERT一样)。
do_split_on_punc (`bool`, *可选*, 默认为 `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便后续的分词可以捕捉到单词的完整上下文,例如缩写词。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case  # whether to lowercase the input
self.never_split = set(never_split)  # tokens that must never be split, stored as a set
self.tokenize_chinese_chars = tokenize_chinese_chars  # whether to split Chinese characters
self.strip_accents = strip_accents  # whether to strip accents
self.do_split_on_punc = do_split_on_punc  # whether to split on punctuation
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# union() returns a new set by concatenating the two sets.
# If never_split was provided, merge it with self.never_split into a single set of tokens to keep intact
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the text, e.g. remove invalid characters and normalize whitespace
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
# If tokenize_chinese_chars is set, surround CJK characters with whitespace
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# Normalize the text with Unicode NFC so that different encodings of the same character compare equal
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Split the text on whitespace to obtain the original tokens
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Process each token, lowercasing/stripping accents and splitting as needed
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
# If lowercasing is enabled, lowercase the token
token = token.lower()
if self.strip_accents is not False:
# Unless strip_accents is explicitly False, remove accents from the token
token = self._run_strip_accents(token)
elif self.strip_accents:
# Otherwise strip accents only if explicitly requested
token = self._run_strip_accents(token)
# Split the processed token on punctuation and extend the result list
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Re-join and re-split on whitespace to obtain the final output tokens
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text with Unicode NFD so that accents are represented as separate combining characters
text = unicodedata.normalize("NFD", text)
output = []
# Skip every character whose Unicode category is Mn (Nonspacing_Mark); keep all other characters
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
# Join the remaining characters back into a string and return it
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# If punctuation splitting is disabled or the text is in never_split, return it unchanged as a one-element list
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# Turn the text into a list of characters
chars = list(text)
i = 0
start_new_word = True
output = []
# Walk over the characters
while i < len(chars):
char = chars[i]
# A punctuation character becomes its own output item, and the next character starts a new word
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# A non-punctuation character
if start_new_word:
output.append([])  # start a new (empty) word
start_new_word = False  # the following characters belong to this word
output[-1].append(char)  # append the character to the current word
i += 1
# Join each character list into a string and return the list of pieces
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
# Walk over every character in the text
for char in text:
cp = ord(char)
# If the character is a CJK character, surround it with spaces
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)  # otherwise keep the character as-is
# Join the character list back into a string and return it
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# Check whether the codepoint falls inside one of the CJK Unicode blocks
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)  #
or (cp >= 0x20000 and cp <= 0x2A6DF)  #
or (cp >= 0x2A700 and cp <= 0x2B73F)  #
or (cp >= 0x2B740 and cp <= 0x2B81F)  #
or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
):  #
return True  # the codepoint is a CJK character
return False  # otherwise it is not
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
# Walk over every character in the text
for char in text:
cp = ord(char)
# Skip invalid characters and control characters
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# Replace any whitespace character with a single space, keep everything else as-is
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
# Join the character list back into a string and return it
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
# Initialize the WordpieceTokenizer with a vocabulary, the unknown token and the maximum characters per word
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
# Initialize the output token list
output_tokens = []
# Split the input text on whitespace and process each token in turn
for token in whitespace_tokenize(text):
# Turn the current token into a list of characters
chars = list(token)
# If the token is longer than the allowed maximum, emit the unknown token and move on
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
# Initialize the failure flag, the start position and the sub-token list
is_bad = False
start = 0
sub_tokens = []
# Keep consuming characters until the whole token has been processed
while start < len(chars):
end = len(chars)
cur_substr = None
# Shrink the candidate substring from the right until the longest piece present in the vocabulary is found
while start < end:
substr = "".join(chars[start:end])
# Pieces that do not start the word are prefixed with "##"
if start > 0:
substr = "##" + substr
# If the candidate is in the vocabulary, keep it and stop the inner loop
if substr in self.vocab:
cur_substr = substr
break
end -= 1
# If no piece was found in the vocabulary, mark the token as bad and stop
if cur_substr is None:
is_bad = True
break
# Record the piece that was found
sub_tokens.append(cur_substr)
# Continue from the end of the matched piece
start = end
# Emit either the unknown token or the collected word pieces
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
# Return the final list of wordpiece tokens
return output_tokens
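# A minimal sketch of the greedy longest-match-first behaviour with a toy vocabulary:
# vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
# wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
# wp.tokenize("unaffable")   -> ['un', '##aff', '##able']
# wp.tokenize("unreadable")  -> ['[UNK]']   # no matching pieces after "un", so the whole word maps to [UNK]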
.\models\layoutlmv2\tokenization_layoutlmv2_fast.py
"""
LayoutLMv2 的快速分词器类。覆盖了慢分词器类的两个方法:_batch_encode_plus 和 _encode_plus,其中使用了 Rust 分词器。
"""
import json
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import normalizers
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PaddingStrategy,
PreTokenizedInput,
TensorType,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import add_end_docstrings, logging
from .tokenization_layoutlmv2 import (
LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING,
LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
LayoutLMv2Tokenizer,
)
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv2-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlmv2-base-uncased": {"do_lower_case": True},
}
class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
# Vocabulary file names, taken from the module-level constant
vocab_files_names = VOCAB_FILES_NAMES
# Map from vocabulary file names to pretrained vocabulary file URLs
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Default initialization configuration of the pretrained models
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# Maximum input sizes, taken from the positional embedding sizes of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# The corresponding slow tokenizer class
slow_tokenizer_class = LayoutLMv2Tokenizer
# Constructor of the LayoutLMv2TokenizerFast object
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent constructor to set the shared attributes
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Read the serialized state of the backend tokenizer's normalizer
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the normalizer's lowercase / strip_accents options disagree with the given arguments, rebuild the normalizer
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
# Look up the normalizer class and update its parameters
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
# Instantiate the new normalizer object
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
# Set the instance attributes
self.do_lower_case = do_lower_case
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
# Use the decorator to append the documentation from LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING and
# LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING to the method below
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# The batch_encode_plus method batch-encodes texts or text pairs and returns the encoded result.
# Backward compatibility for the legacy 'truncation_strategy' and 'pad_to_max_length' arguments:
# resolve the padding and truncation strategies together with the remaining parameters
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Delegate to the internal batch-encoding method and return its result
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
# Wrap the input text (and the optional pair) into a one-element batch
batched_input = [(text, pair)] if pair else [text]
# Encode the batch with the backend (Rust) tokenizer
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
# Return the tokens attribute of the first encoding, i.e. the list of tokens
return encodings[0].tokens
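# For example (a sketch, assuming the standard uncased vocabulary of microsoft/layoutlmv2-base-uncased):
# tokenizer.tokenize("hello world")  ->  ['hello', 'world']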
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# Encodes a single text (with an optional text pair) together with its boxes and word labels,
# adding special tokens by default, applying the padding and truncation settings,
# and returning the encoded result, optionally as tensors
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# Resolve the padding and truncation strategies and the related arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call the internal _encode_plus method to perform the encoding
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
# (The body of _batch_encode_plus, which runs the batch through the Rust tokenizer and builds the
# token-level bounding boxes and labels, is omitted in this walkthrough.)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],  # the input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for text pairs
boxes: Optional[List[List[int]]] = None,  # optional word-level bounding boxes
word_labels: Optional[List[int]] = None,  # optional word-level labels
add_special_tokens: bool = True,  # whether to add special tokens, True by default
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
max_length: Optional[int] = None,  # optional maximum length
stride: int = 0,  # stride, 0 by default
pad_to_multiple_of: Optional[int] = None,  # optionally pad to a multiple of this value
return_tensors: Optional[bool] = None,  # optional tensor type to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens, False by default
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask, False by default
return_offsets_mapping: bool = False,  # whether to return the offsets mapping, False by default
return_length: bool = False,  # whether to return the length, False by default
verbose: bool = True,  # whether to print verbose information, True by default
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Turn the single input into a batched input.
# There are two options:
# 1) only text, in which case text must be a list of strings
# 2) text + text_pair, in which case text is a string and text_pair a list of strings
batched_input = [(text, text_pair)] if text_pair else [text]
# Wrap the bounding boxes into a batch
batched_boxes = [boxes]
# Wrap the word-level labels into a batch
batched_word_labels = [word_labels] if word_labels is not None else None
# Encode the one-element batch with _batch_encode_plus
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),  # whether the input is a text pair
boxes=batched_boxes,  # batched bounding boxes
word_labels=batched_word_labels,  # batched word-level labels
add_special_tokens=add_special_tokens,  # whether to add special tokens
padding_strategy=padding_strategy,  # padding strategy
truncation_strategy=truncation_strategy,  # truncation strategy
max_length=max_length,  # maximum length
stride=stride,  # stride
pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
return_tensors=return_tensors,  # tensor type to return
return_token_type_ids=return_token_type_ids,  # whether to return token type ids
return_attention_mask=return_attention_mask,  # whether to return the attention mask
return_overflowing_tokens=return_overflowing_tokens,  # whether to return overflowing tokens
return_special_tokens_mask=return_special_tokens_mask,  # whether to return the special tokens mask
return_offsets_mapping=return_offsets_mapping,  # whether to return the offsets mapping
return_length=return_length,  # whether to return the length
verbose=verbose,  # whether to print verbose information
**kwargs,  # additional keyword arguments
)
# If no tensors are requested and no overflowing tokens are returned, remove the leading batch axis
# from the output; overflowing tokens are returned as a batch, so in that case the batch axis is kept
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
# Warn if the processed sequence is too long for the model
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
# Return the processed output
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
"""
Pad encoded inputs according to specified parameters.
Args:
encoded_inputs (Union[Dict[str, EncodedInput], BatchEncoding]):
Dictionary or batch encoding containing encoded inputs.
max_length (Optional[int], *optional*):
Maximum length to pad or truncate the sequences.
padding_strategy (PaddingStrategy):
Strategy for padding the sequences.
pad_to_multiple_of (Optional[int], *optional*):
Pad to a multiple of this value.
return_attention_mask (Optional[bool], *optional*):
Whether to return attention mask.
Returns:
Union[Dict[str, torch.Tensor], BatchEncoding]:
Padded and encoded inputs.
"""
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequences by adding special tokens.
Args:
token_ids_0 (List[int]):
List of IDs for the first sequence.
token_ids_1 (List[int], *optional*):
Optional list of IDs for the second sequence.
Returns:
List[int]: List of input IDs with added special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1:
output += token_ids_1 + [self.sep_token_id]
return output
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs from sequences for sequence-pair classification tasks.
Args:
token_ids_0 (List[int]):
List of IDs for the first sequence.
token_ids_1 (List[int], *optional*):
Optional list of IDs for the second sequence.
Returns:
List[int]: List of token type IDs indicating the sequence segments.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary of the tokenizer model.
Args:
save_directory (str):
Directory to save the vocabulary files.
filename_prefix (Optional[str], *optional*):
Prefix for the vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the paths of the saved files.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
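As a usage illustration, here is a minimal sketch (not part of the source file) of encoding pre-tokenized words with their normalized bounding boxes using the fast tokenizer; it assumes the public checkpoint `microsoft/layoutlmv2-base-uncased`, and the word boxes are made-up values on the 0-1000 scale:

from transformers import LayoutLMv2TokenizerFast

tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
words = ["hello", "world"]
boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # one normalized box per word

encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
print(encoding["input_ids"].shape)  # torch.Size([1, 4]) -> [CLS] hello world [SEP]
print(encoding["bbox"][0])          # [0,0,0,0] for [CLS], the word boxes, [1000,1000,1000,1000] for [SEP]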
.\models\layoutlmv2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_layoutlmv2": ["LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv2Config"],
"processing_layoutlmv2": ["LayoutLMv2Processor"],
"tokenization_layoutlmv2": ["LayoutLMv2Tokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutlmv2_fast"] = ["LayoutLMv2TokenizerFast"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_layoutlmv2"] = ["LayoutLMv2FeatureExtractor"]
_import_structure["image_processing_layoutlmv2"] = ["LayoutLMv2ImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_layoutlmv2"] = [
"LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"LayoutLMv2ForQuestionAnswering",
"LayoutLMv2ForSequenceClassification",
"LayoutLMv2ForTokenClassification",
"LayoutLMv2Layer",
"LayoutLMv2Model",
"LayoutLMv2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_layoutlmv2 import LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv2Config
from .processing_layoutlmv2 import LayoutLMv2Processor
from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_layoutlmv2 import (
LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST,
LayoutLMv2ForQuestionAnswering,
LayoutLMv2ForSequenceClassification,
LayoutLMv2ForTokenClassification,
LayoutLMv2Layer,
LayoutLMv2Model,
LayoutLMv2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
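Because the module is registered lazily, importing the package does not eagerly load torch, tokenizers, or the vision dependencies; each submodule is only resolved when one of its symbols is first accessed. A minimal usage sketch, assuming `transformers` is installed:

```
# Importing the config requires neither torch nor Pillow; the lazy module only resolves
# configuration_layoutlmv2 at this point.
from transformers import LayoutLMv2Config

config = LayoutLMv2Config()
print(config.model_type)  # "layoutlmv2"
```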
.\models\layoutlmv3\configuration_layoutlmv3.py
""" LayoutLMv3 model configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import logging
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
logger = logging.get_logger(__name__)
LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json",
}
class LayoutLMv3Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate an
LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the LayoutLMv3
[microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import LayoutLMv3Config, LayoutLMv3Model
>>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
>>> configuration = LayoutLMv3Config()
>>> # Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration
>>> model = LayoutLMv3Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "layoutlmv3"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-5,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_2d_position_embeddings=1024,
coordinate_size=128,
shape_size=128,
has_relative_attention_bias=True,
rel_pos_bins=32,
max_rel_pos=128,
rel_2d_pos_bins=64,
max_rel_2d_pos=256,
has_spatial_attention_bias=True,
text_embed=True,
visual_embed=True,
input_size=224,
num_channels=3,
patch_size=16,
classifier_dropout=None,
**kwargs,
):
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.max_2d_position_embeddings = max_2d_position_embeddings
self.coordinate_size = coordinate_size
self.shape_size = shape_size
self.has_relative_attention_bias = has_relative_attention_bias
self.rel_pos_bins = rel_pos_bins
self.max_rel_pos = max_rel_pos
self.has_spatial_attention_bias = has_spatial_attention_bias
self.rel_2d_pos_bins = rel_2d_pos_bins
self.max_rel_2d_pos = max_rel_2d_pos
self.text_embed = text_embed
self.visual_embed = visual_embed
self.input_size = input_size
self.num_channels = num_channels
self.patch_size = patch_size
self.classifier_dropout = classifier_dropout
class LayoutLMv3OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.12")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["question-answering", "sequence-classification"]:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("attention_mask", {0: "batch", 1: "sequence"}),
("bbox", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
else:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("bbox", {0: "batch", 1: "sequence"}),
("attention_mask", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
@property
def default_onnx_opset(self) -> int:
return 12
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
num_channels: int = 3,
image_width: int = 40,
image_height: int = 40,
) -> Mapping[str, Any]:
"""
Generate inputs to provide to the ONNX exporter for the specific framework
Args:
processor ([`ProcessorMixin`]):
The processor associated with this model configuration.
batch_size (`int`, *optional*, defaults to -1):
The batch size to export the model for (-1 means dynamic axis).
seq_length (`int`, *optional*, defaults to -1):
The sequence length to export the model for (-1 means dynamic axis).
is_pair (`bool`, *optional*, defaults to `False`):
Indicate if the input is a pair (sentence 1, sentence 2).
framework (`TensorType`, *optional*, defaults to `None`):
The framework (PyTorch or TensorFlow) that the processor will generate tensors for.
num_channels (`int`, *optional*, defaults to 3):
The number of channels of the generated images.
image_width (`int`, *optional*, defaults to 40):
The width of the generated images.
image_height (`int`, *optional*, defaults to 40):
The height of the generated images.
Returns:
Mapping[str, Any]: holding the kwargs to provide to the model's forward function
"""
setattr(processor.image_processor, "apply_ocr", False)
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size
dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size
dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
inputs = dict(
processor(
dummy_image,
text=dummy_text,
boxes=dummy_bboxes,
return_tensors=framework,
)
)
return inputs
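To make the placeholder structure concrete, here is a toy reconstruction (hypothetical batch size, sequence length, and `[UNK]` string; no real processor or tokenizer involved) of the dummy text and boxes assembled above:

```
# Hypothetical sizes for illustration; the real values come from compute_effective_axis_dimension.
batch_size, seq_length, unk_token = 2, 8, "[UNK]"

# One string of repeated unknown tokens and one dummy word box per batch element; the processor
# later expands these into token-level `input_ids` and `bbox` tensors.
dummy_text = [[" ".join([unk_token]) * seq_length]] * batch_size
dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size

print(dummy_text[0])    # ['[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]']
print(dummy_bboxes[0])  # [[48, 84, 73, 128]]
```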
.\models\layoutlmv3\feature_extraction_layoutlmv3.py
"""
Feature extractor class for LayoutLMv3.
"""
import warnings
from ...utils import logging
from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
logger = logging.get_logger(__name__)
class LayoutLMv3FeatureExtractor(LayoutLMv3ImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class LayoutLMv3FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use LayoutLMv3ImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\layoutlmv3\image_processing_layoutlmv3.py
"""LayoutLMv3 的图像处理器类。"""
from typing import Dict, Iterable, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_pytesseract_available, is_vision_available, logging, requires_backends
if is_vision_available():
import PIL
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__)
def normalize_box(box, width, height):
"""将边界框的坐标归一化为 [0, 1000] 的范围内。
Args:
box (list): 边界框的坐标 [left, top, right, bottom]。
width (int): 图像宽度。
height (int): 图像高度。
Returns:
list: 归一化后的边界框坐标 [left_norm, top_norm, right_norm, bottom_norm]。
"""
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
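A quick numeric check of the normalization (arbitrary pixel values): boxes from pages of any resolution end up in the same 0-1000 coordinate space the model expects.

```
# An 800x600 page with a word box of [40, 60, 120, 90] pixels.
box, width, height = [40, 60, 120, 90], 800, 600
normalized = [
    int(1000 * (box[0] / width)),
    int(1000 * (box[1] / height)),
    int(1000 * (box[2] / width)),
    int(1000 * (box[3] / height)),
]
print(normalized)  # [50, 100, 150, 150]
```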
def apply_tesseract(
image: np.ndarray,
lang: Optional[str],
tesseract_config: Optional[str],
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""对文档图像应用 Tesseract OCR,并返回识别的单词及归一化的边界框。
Args:
image (np.ndarray): 输入的图像数据。
lang (Optional[str]): OCR 使用的语言设置。
tesseract_config (Optional[str]): Tesseract 配置选项。
input_data_format (Optional[Union[ChannelDimension, str]]): 输入图像的通道格式。
Returns:
None
"""
pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
class LayoutLMv3ImageProcessor(BaseImageProcessor):
r"""
Constructs a LayoutLMv3 image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
overridden by `do_resize` in `preprocess`.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by
`do_rescale` in `preprocess`.
rescale_factor (`float`, *optional*, defaults to 1 / 255):
Value by which the image's pixel values are rescaled. Can be overridden by `rescale_factor` in
`preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_value: float = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = True,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = "",
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_value
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"apply_ocr",
"ocr_lang",
"tesseract_config",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = None,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\layoutlmv3\modeling_layoutlmv3.py
import collections
import math
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlmv3 import LayoutLMv3Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LayoutLMv3Config"
LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/layoutlmv3-base",
"microsoft/layoutlmv3-large",
]
LAYOUTLMV3_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LAYOUTLMV3_MODEL_INPUTS_DOCSTRING = r"""
"""
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING = r"""
"""
class LayoutLMv3PatchEmbeddings(nn.Module):
"""LayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
image sizes."""
def __init__(self, config):
super().__init__()
image_size = (
config.input_size
if isinstance(config.input_size, collections.abc.Iterable)
else (config.input_size, config.input_size)
)
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
self.patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values, position_embedding=None):
embeddings = self.proj(pixel_values)
if position_embedding is not None:
position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1)
position_embedding = position_embedding.permute(0, 3, 1, 2)
patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
position_embedding = F.interpolate(position_embedding, size=(patch_height, patch_width), mode="bicubic")
embeddings = embeddings + position_embedding
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings
"""
LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
def calculate_spatial_position_embeddings(self, bbox):
try:
left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
except IndexError as e:
raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e
h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))
spatial_position_embeddings = torch.cat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
dim=-1,
)
return spatial_position_embeddings
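Note that the six spatial embeddings are concatenated rather than summed, so their widths must add up to the hidden size; with the default configuration this works out exactly, which is why the result can later be added elementwise to the word embeddings:

```
# Default LayoutLMv3-base values (see the configuration earlier in this document).
coordinate_size, shape_size, hidden_size = 128, 128, 768

# left/upper/right/lower use coordinate_size each, height/width use shape_size each.
assert 4 * coordinate_size + 2 * shape_size == hidden_size
```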
def create_position_ids_from_input_ids(self, input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask)) * mask
return incremental_indices.long() + padding_idx
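A small self-contained check (hypothetical input ids, `padding_idx=1` as in the RoBERTa-style config above) of the padding-aware position numbering:

```
import torch

# 1 is the pad token; real tokens are numbered from padding_idx + 1, pads keep padding_idx.
input_ids = torch.tensor([[0, 52, 37, 2, 1, 1]])
padding_idx = 1
mask = input_ids.ne(padding_idx).int()
position_ids = (torch.cumsum(mask, dim=1).type_as(mask)) * mask + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 5, 1, 1]])
```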
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
def forward(
self,
input_ids=None,
bbox=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
):
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx).to(
input_ids.device
)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
embeddings = embeddings + spatial_position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class LayoutLMv3PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class LayoutLMv3SelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def cogview_attention(self, attention_scores, alpha=32):
"""
https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
(PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
"""
scaled_attention_scores = attention_scores / alpha
max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return nn.Softmax(dim=-1)(new_attention_scores)
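Since softmax is invariant to subtracting a per-row constant, the PB-Relax rewrite above produces (up to floating-point error) the same probabilities as a plain softmax while keeping intermediate values small. A standalone sketch verifying that:

```
import torch

def cogview_softmax(scores, alpha=32):
    # Same trick as above: divide by alpha, subtract the row max, multiply back by alpha.
    scaled = scores / alpha
    max_value = scaled.amax(dim=-1, keepdim=True)
    return torch.softmax((scaled - max_value) * alpha, dim=-1)

scores = torch.randn(2, 4, 8, 8) * 50  # large scores where low-precision softmax could overflow
print(torch.allclose(torch.softmax(scores, dim=-1), cogview_softmax(scores), atol=1e-6))  # True
```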
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))
if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
elif self.has_relative_attention_bias:
attention_scores += rel_pos / math.sqrt(self.attention_head_size)
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = self.cogview_attention(attention_scores)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class LayoutLMv3SelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LayoutLMv3Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LayoutLMv3SelfAttention(config)
self.output = LayoutLMv3SelfOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class LayoutLMv3Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LayoutLMv3Attention(config)
self.intermediate = LayoutLMv3Intermediate(config)
self.output = LayoutLMv3Output(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class LayoutLMv3Encoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_bias = nn.Linear(self.rel_pos_bins, config.num_attention_heads, bias=False)
if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
ret = 0
if bidirectional:
num_buckets //= 2
ret += (relative_position > 0).long() * num_buckets
n = torch.abs(relative_position)
else:
n = torch.max(-relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
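The bucketing is the T5-style scheme: small offsets each get their own bucket, larger offsets share logarithmically spaced buckets, and (in the bidirectional case) the sign is encoded in the upper half of the buckets. A standalone copy of the logic for experimentation (not imported from the library):

```
import math
import torch

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    ret = 0
    if bidirectional:
        num_buckets //= 2
        ret += (relative_position > 0).long() * num_buckets
        n = torch.abs(relative_position)
    else:
        n = torch.max(-relative_position, torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    ).to(torch.long)
    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
    ret += torch.where(is_small, n, val_if_large)
    return ret

offsets = torch.tensor([-64, -8, -1, 0, 1, 8, 64])
print(relative_position_bucket(offsets))  # tensor([14,  8,  1,  0, 17, 24, 30])
```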
def _cal_1d_pos_emb(self, position_ids):
rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
rel_pos = self.relative_position_bucket(
rel_pos_mat,
num_buckets=self.rel_pos_bins,
max_distance=self.max_rel_pos,
)
rel_pos = self.rel_pos_bias.weight.t()[rel_pos].permute(0, 3, 1, 2)
rel_pos = rel_pos.contiguous()
return rel_pos
def _cal_2d_pos_emb(self, bbox):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
rel_pos_x = self.relative_position_bucket(
rel_pos_x_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_y = self.relative_position_bucket(
rel_pos_y_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_x = self.rel_pos_x_bias.weight.t()[rel_pos_x].permute(0, 3, 1, 2)
rel_pos_y = self.rel_pos_y_bias.weight.t()[rel_pos_y].permute(0, 3, 1, 2)
rel_pos_x = rel_pos_x.contiguous()
rel_pos_y = rel_pos_y.contiguous()
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos
def forward(
self,
hidden_states,
bbox=None,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
position_ids=None,
patch_height=None,
patch_width=None,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos,
rel_2d_pos,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
all_hidden_states,
all_self_attentions,
]
if v is not None
)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class LayoutLMv3Intermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LayoutLMv3Output(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
@add_start_docstrings(
"The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
if config.text_embed:
self.embeddings = LayoutLMv3TextEmbeddings(config)
if config.visual_embed:
self.patch_embed = LayoutLMv3PatchEmbeddings(config)
size = int(config.input_size / config.patch_size)
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, config.hidden_size))
self.pos_drop = nn.Dropout(p=0.0)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
self.init_visual_bbox(image_size=(size, size))
self.norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
self.encoder = LayoutLMv3Encoder(config)
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def init_visual_bbox(self, image_size=(14, 14), max_len=1000):
"""
Create the bounding boxes for the visual (patch) tokens.
"""
visual_bbox_x = torch.div(
torch.arange(0, max_len * (image_size[1] + 1), max_len), image_size[1], rounding_mode="trunc"
)
visual_bbox_y = torch.div(
torch.arange(0, max_len * (image_size[0] + 1), max_len), image_size[0], rounding_mode="trunc"
)
visual_bbox = torch.stack(
[
visual_bbox_x[:-1].repeat(image_size[0], 1),
visual_bbox_y[:-1].repeat(image_size[1], 1).transpose(0, 1),
visual_bbox_x[1:].repeat(image_size[0], 1),
visual_bbox_y[1:].repeat(image_size[1], 1).transpose(0, 1),
],
dim=-1,
).view(-1, 4)
cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
def calculate_visual_bbox(self, device, dtype, batch_size):
visual_bbox = self.visual_bbox.repeat(batch_size, 1, 1)
visual_bbox = visual_bbox.to(device).type(dtype)
return visual_bbox
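A standalone sketch (not imported from the library, shrunk to a 2x2 grid for readability) of the patch-box grid that `init_visual_bbox` builds: every patch gets the 0-1000 box of the image region it covers, and the method then prepends a near-full-page box `[1, 1, 999, 999]` for the visual [CLS] token.

```
import torch

image_size, max_len = (2, 2), 1000
x = torch.div(torch.arange(0, max_len * (image_size[1] + 1), max_len), image_size[1], rounding_mode="trunc")
y = torch.div(torch.arange(0, max_len * (image_size[0] + 1), max_len), image_size[0], rounding_mode="trunc")
visual_bbox = torch.stack(
    [
        x[:-1].repeat(image_size[0], 1),
        y[:-1].repeat(image_size[1], 1).transpose(0, 1),
        x[1:].repeat(image_size[0], 1),
        y[1:].repeat(image_size[1], 1).transpose(0, 1),
    ],
    dim=-1,
).view(-1, 4)
print(visual_bbox)
# tensor([[   0,    0,  500,  500],
#         [ 500,    0, 1000,  500],
#         [   0,  500,  500, 1000],
#         [ 500,  500, 1000, 1000]])
```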
def forward_image(self, pixel_values):
embeddings = self.patch_embed(pixel_values)
batch_size, seq_len, _ = embeddings.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
if self.pos_embed is not None:
embeddings = embeddings + self.pos_embed
embeddings = self.pos_drop(embeddings)
embeddings = self.norm(embeddings)
return embeddings
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_MODEL_INPUTS_DOCSTRING.format("batch_size, token_sequence_length")
)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Override of the forward method in the parent class with specific
docstrings added for layoutlmv3 model inputs and outputs.
"""
class LayoutLMv3ClassificationHead(nn.Module):
"""
Head for sentence-level classification tasks. Reference: RobertaClassificationHead
"""
def __init__(self, config, pool_feature=False):
super().__init__()
self.pool_feature = pool_feature
if pool_feature:
self.dense = nn.Linear(config.hidden_size * 3, config.hidden_size)
else:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, x):
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.layoutlmv3 = LayoutLMv3Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.num_labels < 10:
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
)
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.LongTensor] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
Examples:
```
>>> from transformers import AutoProcessor, AutoModelForTokenClassification
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> word_labels = example["ner_tags"]
>>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
>>> outputs = model(**encoding)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
pixel_values=pixel_values,
)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
sequence_output = outputs[0][:, :seq_length]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
""",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.layoutlmv3 = LayoutLMv3Model(config)
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.LongTensor] = None,
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.LongTensor] = None,
.\models\layoutlmv3\modeling_tf_layoutlmv3.py
from __future__ import annotations
import collections
import math
from typing import List, Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from .configuration_layoutlmv3 import LayoutLMv3Config
_CONFIG_FOR_DOC = "LayoutLMv3Config"
_DUMMY_INPUT_IDS = [
[7, 6, 1],
[1, 2, 0],
]
_DUMMY_BBOX = [
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]],
]
TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/layoutlmv3-base",
"microsoft/layoutlmv3-large",
]
LARGE_NEGATIVE = -1e8
class TFLayoutLMv3PatchEmbeddings(keras.layers.Layer):
"""LayoutLMv3 图像(patch)嵌入层。"""
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
patch_sizes = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
self.proj = keras.layers.Conv2D(
filters=config.hidden_size,
kernel_size=patch_sizes,
strides=patch_sizes,
padding="valid",
data_format="channels_last",
use_bias=True,
kernel_initializer=get_initializer(config.initializer_range),
name="proj",
)
self.hidden_size = config.hidden_size
self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1])
self.config = config
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
embeddings = self.proj(pixel_values)
embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size))
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.config.num_channels])
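As a quick arithmetic check of `num_patches` above: with the default `input_size` of 224 and `patch_size` of 16, the projection produces a 14x14 grid of patch embeddings.

```
input_size, patch_size = 224, 16
assert (input_size**2) // (patch_size * patch_size) == (input_size // patch_size) ** 2 == 196
```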
class TFLayoutLMv3TextEmbeddings(keras.layers.Layer):
"""
LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
"""
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = keras.layers.Embedding(
config.vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings",
)
self.token_type_embeddings = keras.layers.Embedding(
config.type_vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="token_type_embeddings",
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.padding_token_index = config.pad_token_id
self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="position_embeddings",
)
self.x_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.coordinate_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="x_position_embeddings",
)
self.y_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.coordinate_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="y_position_embeddings",
)
self.h_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.shape_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="h_position_embeddings",
)
self.w_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.shape_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="w_position_embeddings",
)
self.max_2d_positions = config.max_2d_position_embeddings
self.config = config
def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor:
try:
left_position_ids = bbox[:, :, 0]
upper_position_ids = bbox[:, :, 1]
right_position_ids = bbox[:, :, 2]
lower_position_ids = bbox[:, :, 3]
except IndexError as exception:
raise IndexError("Bounding box is not of shape (batch_size, seq_length, 4).") from exception
try:
left_position_embeddings = self.x_position_embeddings(left_position_ids)
upper_position_embeddings = self.y_position_embeddings(upper_position_ids)
right_position_embeddings = self.x_position_embeddings(right_position_ids)
lower_position_embeddings = self.y_position_embeddings(lower_position_ids)
except IndexError as exception:
raise IndexError(
f"The `bbox` coordinate values should be within 0-{self.max_2d_positions} range."
) from exception
max_position_id = self.max_2d_positions - 1
h_position_embeddings = self.h_position_embeddings(
tf.clip_by_value(bbox[:, :, 3] - bbox[:, :, 1], 0, max_position_id)
)
w_position_embeddings = self.w_position_embeddings(
tf.clip_by_value(bbox[:, :, 2] - bbox[:, :, 0], 0, max_position_id)
)
spatial_position_embeddings = tf.concat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
axis=-1,
)
return spatial_position_embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embds: tf.Tensor) -> tf.Tensor:
"""
We are provided embeddings directly. We cannot infer which are padded, so just generate sequential position
ids.
"""
input_shape = tf.shape(inputs_embds)
sequence_length = input_shape[1]
start_index = self.padding_token_index + 1
end_index = self.padding_token_index + sequence_length + 1
position_ids = tf.range(start_index, end_index, dtype=tf.int32)
batch_size = input_shape[0]
position_ids = tf.reshape(position_ids, (1, sequence_length))
position_ids = tf.tile(position_ids, (batch_size, 1))
return position_ids
def create_position_ids_from_input_ids(self, input_ids: tf.Tensor) -> tf.Tensor:
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_token_index + 1.
"""
mask = tf.cast(tf.not_equal(input_ids, self.padding_token_index), input_ids.dtype)
position_ids = tf.cumsum(mask, axis=1) * mask
position_ids = position_ids + self.padding_token_index
return position_ids
def create_position_ids(self, input_ids: tf.Tensor, inputs_embeds: tf.Tensor) -> tf.Tensor:
if input_ids is None:
return self.create_position_ids_from_inputs_embeds(inputs_embeds)
else:
return self.create_position_ids_from_input_ids(input_ids)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
training: bool = False,
) -> tf.Tensor:
if position_ids is None:
position_ids = self.create_position_ids(input_ids, inputs_embeds)
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.zeros(input_shape, dtype=position_ids.dtype)
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.word_embeddings.input_dim)
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
embeddings += spatial_position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
if getattr(self, "token_type_embeddings", None) is not None:
with tf.name_scope(self.token_type_embeddings.name):
self.token_type_embeddings.build(None)
if getattr(self, "LayerNorm", None) is not None:
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
if getattr(self, "x_position_embeddings", None) is not None:
with tf.name_scope(self.x_position_embeddings.name):
self.x_position_embeddings.build(None)
if getattr(self, "y_position_embeddings", None) is not None:
with tf.name_scope(self.y_position_embeddings.name):
self.y_position_embeddings.build(None)
if getattr(self, "h_position_embeddings", None) is not None:
with tf.name_scope(self.h_position_embeddings.name):
self.h_position_embeddings.build(None)
if getattr(self, "w_position_embeddings", None) is not None:
with tf.name_scope(self.w_position_embeddings.name):
self.w_position_embeddings.build(None)
class TFLayoutLMv3SelfAttention(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.attention_score_normaliser = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
self.config = config
def transpose_for_scores(self, x: tf.Tensor):
shape = tf.shape(x)
new_shape = (
shape[0],
shape[1],
self.num_attention_heads,
self.attention_head_size,
)
x = tf.reshape(x, new_shape)
return tf.transpose(x, perm=[0, 2, 1, 3])
def cogview_attention(self, attention_scores: tf.Tensor, alpha: Union[float, int] = 32):
"""
https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
(PB-Relax). A replacement of the original keras.layers.Softmax(axis=-1)(attention_scores). Seems the new
attention_probs will result in a slower speed and a little bias. Can use
tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The
smaller atol (e.g., 1e-08), the better.
"""
scaled_attention_scores = attention_scores / alpha
max_value = tf.expand_dims(tf.reduce_max(scaled_attention_scores, axis=-1), axis=-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return tf.math.softmax(new_attention_scores, axis=-1)
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(self.query(hidden_states))
normalised_query_layer = query_layer / self.attention_score_normaliser
transposed_key_layer = tf.transpose(
key_layer, perm=[0, 1, 3, 2]
)
attention_scores = tf.matmul(normalised_query_layer, transposed_key_layer)
if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / self.attention_score_normaliser
elif self.has_relative_attention_bias:
attention_scores += rel_pos / self.attention_score_normaliser
if attention_mask is not None:
attention_scores += attention_mask
attention_probs = self.cogview_attention(attention_scores)
attention_probs = self.dropout(attention_probs, training=training)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(
context_layer, perm=[0, 2, 1, 3]
)
shape = tf.shape(context_layer)
context_layer = tf.reshape(
context_layer, (shape[0], shape[1], self.all_head_size)
)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFLayoutLMv3SelfOutput(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Attention(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFLayoutLMv3SelfAttention(config, name="self")
self.self_output = TFLayoutLMv3SelfOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
self_outputs = self.self_attention(
hidden_states,
attention_mask,
head_mask,
output_attentions,
rel_pos,
rel_2d_pos,
training=training,
)
attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
class TFLayoutLMv3Intermediate(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Output(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Layer(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.attention = TFLayoutLMv3Attention(config, name="attention")
self.intermediate = TFLayoutLMv3Intermediate(config, name="intermediate")
self.bert_output = TFLayoutLMv3Output(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
training=training,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
intermediate_output = self.intermediate(attention_output)
layer_output = self.bert_output(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + outputs
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFLayoutLMv3Encoder(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.layer = [TFLayoutLMv3Layer(config, name=f"layer.{i}") for i in range(config.num_hidden_layers)]
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_bias",
)
if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_pos_x_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_x_bias",
)
self.rel_pos_y_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_y_bias",
)
def relative_position_bucket(self, relative_positions: tf.Tensor, num_buckets: int, max_distance: int):
num_buckets = num_buckets // 2
buckets = tf.abs(relative_positions)
max_exact_buckets = num_buckets // 2
is_small = buckets < max_exact_buckets
buckets_log_ratio = tf.math.log(tf.cast(buckets, tf.float32) / max_exact_buckets)
distance_log_ratio = math.log(max_distance / max_exact_buckets)
buckets_big_offset = (
buckets_log_ratio / distance_log_ratio * (num_buckets - max_exact_buckets)
)
buckets_big = max_exact_buckets + buckets_big_offset
buckets_big = tf.cast(buckets_big, buckets.dtype)
buckets_big = tf.minimum(buckets_big, num_buckets - 1)
return (tf.cast(relative_positions > 0, buckets.dtype) * num_buckets) + tf.where(
is_small, buckets, buckets_big
)
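To get a feel for the bucketing: with, say, rel_pos_bins=32 and max_rel_pos=128, offsets with absolute value below 8 each get their own bucket, longer ones share logarithmically spaced buckets, and the sign selects the lower or upper half of the range. A hedged sketch; the bucket values in the comments were worked out by hand from the formula above and are illustrative only:
```
import tensorflow as tf

rel = tf.constant([[-64, -3, 0, 3, 64]], dtype=tf.int32)
# encoder.relative_position_bucket(rel, num_buckets=32, max_distance=128) should give
# approximately [[14, 3, 0, 19, 30]]:
#   -3 / +3  -> exact buckets 3 and 16 + 3 = 19 (a positive sign offsets by num_buckets // 2 = 16)
#   -64 / +64 -> log-spaced buckets 14 and 30
```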
def _cal_1d_pos_emb(self, position_ids: tf.Tensor):
return self._cal_pos_emb(self.rel_pos_bias, position_ids, self.rel_pos_bins, self.max_rel_pos)
def _cal_2d_pos_emb(self, bbox: tf.Tensor):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x = self._cal_pos_emb(
self.rel_pos_x_bias,
position_coord_x,
self.rel_2d_pos_bins,
self.max_rel_2d_pos,
)
rel_pos_y = self._cal_pos_emb(
self.rel_pos_y_bias,
position_coord_y,
self.rel_2d_pos_bins,
self.max_rel_2d_pos,
)
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos
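The spatial bias is computed from the left x-coordinate (`bbox[:, :, 0]`) and bottom y-coordinate (`bbox[:, :, 3]`) of every box: pairwise coordinate offsets are bucketized and mapped to per-head biases by the small Dense layers above. A rough sketch of the first step only; `_cal_pos_emb` itself is defined elsewhere in the file and is merely approximated here:
```
import tensorflow as tf

# Three boxes with left x-coordinates 0, 100 and 400 (assumed example values).
coord_x = tf.constant([[0, 100, 400]])
# Pairwise offsets: rel_x[b, i, j] = coord_x[b, j] - coord_x[b, i]
rel_x = tf.expand_dims(coord_x, axis=-2) - tf.expand_dims(coord_x, axis=-1)
print(rel_x.numpy())
# [[[   0  100  400]
#   [-100    0  300]
#   [-400 -300    0]]]
```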
def call(
self,
hidden_states: tf.Tensor,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
position_ids: tf.Tensor | None = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if return_dict:
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
else:
return tuple(
value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_pos_bias", None) is not None:
with tf.name_scope(self.rel_pos_bias.name):
self.rel_pos_bias.build([None, None, self.rel_pos_bins])
if getattr(self, "rel_pos_x_bias", None) is not None:
with tf.name_scope(self.rel_pos_x_bias.name):
self.rel_pos_x_bias.build([None, None, self.rel_2d_pos_bins])
if getattr(self, "rel_pos_y_bias", None) is not None:
with tf.name_scope(self.rel_pos_y_bias.name):
self.rel_pos_y_bias.build([None, None, self.rel_2d_pos_bins])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFLayoutLMv3MainLayer(keras.layers.Layer):
config_class = LayoutLMv3Config
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.config = config
if config.text_embed:
self.embeddings = TFLayoutLMv3TextEmbeddings(config, name="embeddings")
if config.visual_embed:
self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
if config.has_relative_attention_bias or config.has_spatial_attention_bias:
image_size = config.input_size // config.patch_size
self.init_visual_bbox(image_size=(image_size, image_size))
self.norm = keras.layers.LayerNormalization(epsilon=1e-6, name="norm")
self.encoder = TFLayoutLMv3Encoder(config, name="encoder")
def build(self, input_shape=None):
if self.config.visual_embed:
image_size = self.config.input_size // self.config.patch_size
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer="zeros",
trainable=True,
dtype=tf.float32,
name="cls_token",
)
self.pos_embed = self.add_weight(
shape=(1, image_size * image_size + 1, self.config.hidden_size),
initializer="zeros",
trainable=True,
dtype=tf.float32,
name="pos_embed",
)
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "patch_embed", None) is not None:
with tf.name_scope(self.patch_embed.name):
self.patch_embed.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, self.config.hidden_size])
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings.word_embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.word_embeddings.weight = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
def init_visual_bbox(self, image_size: Tuple[int, int], max_len: int = 1000):
height, width = image_size
visual_bbox_x = tf.range(0, max_len * (width + 1), max_len) // width
visual_bbox_x = tf.expand_dims(visual_bbox_x, axis=0)
visual_bbox_x = tf.tile(visual_bbox_x, [width, 1])
visual_bbox_y = tf.range(0, max_len * (height + 1), max_len) // height
visual_bbox_y = tf.expand_dims(visual_bbox_y, axis=1)
visual_bbox_y = tf.tile(visual_bbox_y, [1, height])
visual_bbox = tf.stack(
[visual_bbox_x[:, :-1], visual_bbox_y[:-1], visual_bbox_x[:, 1:], visual_bbox_y[1:]],
axis=-1,
)
visual_bbox = tf.reshape(visual_bbox, [-1, 4])
cls_token_box = tf.constant([[1, 1, max_len - 1, max_len - 1]], dtype=tf.int32)
self.visual_bbox = tf.concat([cls_token_box, visual_bbox], axis=0)
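For intuition, with a hypothetical 2x2 patch grid and max_len=1000 the construction above yields one normalized box per image patch plus a near-full-page box for the [CLS] token. A small sketch with assumed toy values:
```
import tensorflow as tf

height = width = 2          # assumed toy 2x2 patch grid
max_len = 1000
x = tf.range(0, max_len * (width + 1), max_len) // width    # [0, 500, 1000]
y = tf.range(0, max_len * (height + 1), max_len) // height  # [0, 500, 1000]
# Each patch box is [x_left, y_top, x_right, y_bottom] on the 0-1000 grid:
# [0, 0, 500, 500], [500, 0, 1000, 500], [0, 500, 500, 1000], [500, 500, 1000, 1000]
# plus the [CLS] box [1, 1, 999, 999] prepended.
```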
def calculate_visual_bbox(self, batch_size: int, dtype: tf.DType):
visual_bbox = tf.expand_dims(self.visual_bbox, axis=0)
visual_bbox = tf.tile(visual_bbox, [batch_size, 1, 1])
visual_bbox = tf.cast(visual_bbox, dtype=dtype)
return visual_bbox
def embed_image(self, pixel_values: tf.Tensor) -> tf.Tensor:
embeddings = self.patch_embed(pixel_values)
batch_size = tf.shape(embeddings)[0]
cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1])
embeddings = tf.concat([cls_tokens, embeddings], axis=1)
if getattr(self, "pos_embed", None) is not None:
embeddings += self.pos_embed
embeddings = self.norm(embeddings)
return embeddings
def get_extended_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor:
n_dims = len(attention_mask.shape)
if n_dims == 3:
extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
elif n_dims == 2:
extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
extended_attention_mask = tf.expand_dims(extended_attention_mask, axis=1)
else:
raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape}).")
extended_attention_mask = tf.cast(extended_attention_mask, self.compute_dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * LARGE_NEGATIVE
return extended_attention_mask
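In other words, a 2D padding mask of shape (batch, seq_len) becomes a rank-4 additive bias broadcast over heads and query positions. A minimal sketch, where -1e9 stands in for the module's LARGE_NEGATIVE constant:
```
import tensorflow as tf

attention_mask = tf.constant([[1.0, 1.0, 0.0]])            # 1 = attend, 0 = padding
extended = attention_mask[:, tf.newaxis, tf.newaxis, :]    # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -1e9                         # 0 for real tokens, -1e+09 for the padded one
print(extended.shape)                                      # (1, 1, 1, 3)
```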
def get_head_mask(self, head_mask: tf.Tensor | None) -> Union[tf.Tensor, List[tf.Tensor | None]]:
if head_mask is None:
return [None] * self.config.num_hidden_layers
n_dims = tf.rank(head_mask)
if n_dims == 1:
head_mask = tf.expand_dims(head_mask, axis=0)
head_mask = tf.expand_dims(head_mask, axis=0)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.tile(
head_mask, [self.config.num_hidden_layers, 1, 1, 1, 1]
)
elif n_dims == 2:
head_mask = tf.expand_dims(head_mask, axis=1)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.expand_dims(head_mask, axis=-1)
elif n_dims != 5:
raise ValueError(f"Wrong shape for head_mask (shape {head_mask.shape}).")
assert tf.rank(head_mask) == 5, f"Got head_mask rank of {tf.rank(head_mask)}, but require 5."
head_mask = tf.cast(head_mask, self.compute_dtype)
return head_mask
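For a 1D mask with one entry per head, the expansion above produces a rank-5 tensor that the encoder loop can index per layer. A hedged sketch of the same broadcasting, with toy sizes assumed:
```
import tensorflow as tf

num_hidden_layers, num_heads = 12, 3                      # assumed toy sizes
head_mask = tf.constant([1.0, 1.0, 0.0])                  # disable the third head everywhere
mask = head_mask[tf.newaxis, tf.newaxis, :, tf.newaxis, tf.newaxis]   # (1, 1, heads, 1, 1)
mask = tf.tile(mask, [num_hidden_layers, 1, 1, 1, 1])     # one slice per layer
print(mask.shape)                                         # (12, 1, 3, 1, 1)
```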
@unpack_inputs
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
]:
class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"
@property
def input_signature(self):
sig = super().input_signature
sig["bbox"] = tf.TensorSpec((None, None, 4), tf.int32, name="bbox")
return sig
LAYOUTLMV3_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
LAYOUTLMV3_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
]:
r"""
Forward pass for the TFLayoutLMv3Model.
Args:
input_ids (tf.Tensor, optional): The input token IDs.
bbox (tf.Tensor, optional): The bounding boxes of tokens.
attention_mask (tf.Tensor, optional): The attention mask.
token_type_ids (tf.Tensor, optional): The token type IDs.
position_ids (tf.Tensor, optional): The position IDs.
head_mask (tf.Tensor, optional): The mask for attention heads.
inputs_embeds (tf.Tensor, optional): The embedded inputs.
pixel_values (tf.Tensor, optional): The pixel values of images.
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary.
training (bool, optional): Whether in training mode.
Returns:
Union[TFBaseModelOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor]]:
The model outputs.
Examples:
Example usage of the bare TFLayoutLMv3Model to obtain last hidden states:
```
>>> from transformers import AutoProcessor, TFAutoModel
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
>>> outputs = model(**encoding)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
outputs = self.layoutlmv3(
input_ids=input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
class TFLayoutLMv3ClassificationHead(keras.layers.Layer):
"""
Placeholder for the classification head of the TFLayoutLMv3Model.
"""
Head for sentence-level classification tasks. Reference: RobertaClassificationHead
"""
# Constructor: sets up the layers of the classification head
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
# Fully connected layer with config.hidden_size outputs and tanh activation
self.dense = keras.layers.Dense(
config.hidden_size,
activation="tanh",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
# Dropout rate: classifier_dropout from the config if set, otherwise hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(
classifier_dropout,
name="dropout",
)
# Fully connected layer with config.num_labels outputs used as the final projection
self.out_proj = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="out_proj",
)
# Keep a reference to the config
self.config = config
# Forward pass of the head
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
# Apply dropout to the inputs
outputs = self.dropout(inputs, training=training)
# Project through the dense (tanh) layer
outputs = self.dense(outputs)
# Apply dropout again
outputs = self.dropout(outputs, training=training)
# Project to the final logits with out_proj
outputs = self.out_proj(outputs)
return outputs
# Build the sublayers of the head
def build(self, input_shape=None):
if self.built:
return
# Mark the layer as built
self.built = True
# Build the dense layer if it exists
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the dropout layer if it exists
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
# Build the out_proj layer if it exists
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
"""
# TFLayoutLMv3ForSequenceClassification inherits from TFLayoutLMv3PreTrainedModel and TFSequenceClassificationLoss;
# it performs document image classification by placing a linear layer on top of the final hidden state of the [CLS] token.
@add_start_docstrings(
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSequenceClassificationLoss):
# Names of layers that may be unexpected or missing when this TF model is loaded from a PyTorch checkpoint
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.config = config
# LayoutLMv3 backbone, named "layoutlmv3"
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
# Classification head, named "classifier"
self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
# Forward pass: accepts the usual LayoutLMv3 inputs and returns a TFSequenceClassifierOutput or a tuple
def call(
self,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[
TFSequenceClassifierOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
# union of the possible return types
]:
"""
Returns:
Examples:
```
>>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
>>> sequence_label = tf.convert_to_tensor([1])
>>> outputs = model(**encoding, labels=sequence_label)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
# If `return_dict` is not set, fall back to the default from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LayoutLMv3 backbone on the inputs
outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
pixel_values=pixel_values,
training=training,
)
# Take the hidden state of the first ([CLS]) token as the sequence representation
sequence_output = outputs[0][:, 0, :]
# Classify the pooled representation
logits = self.classifier(sequence_output, training=training)
# Compute the loss only when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
# Tuple output: logits followed by the remaining backbone outputs, with the loss prepended when present
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Dict-style output with loss, logits, hidden states and attention weights
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# Return early if the layers have already been built
if self.built:
return
self.built = True
# Build the backbone and the classification head under their own name scopes
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
This class inherits from TFLayoutLMv3PreTrainedModel and TFTokenClassificationLoss. It provides a token
classification model specifically tailored for layout-aware tasks.
Attributes:
_keys_to_ignore_on_load_unexpected (list): Names of layers to ignore when loading a TF model from a PT model.
Args:
config (LayoutLMv3Config): Configuration class instance defining the model architecture and hyperparameters.
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenClassificationLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.num_labels = config.num_labels
# Initialize the main layers of the LayoutLMv3 model
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
# Initialize the classifier layer based on the number of labels in the configuration
if config.num_labels < 10:
self.classifier = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
else:
self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[
TFTokenClassifierOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
# More return types depending on the inputs and configuration
]:
"""
Performs the forward pass of the model for token classification.
Args (depending on the input types):
input_ids (tf.Tensor, optional): Tensor of input token IDs.
bbox (tf.Tensor, optional): Tensor of bounding boxes for each token.
attention_mask (tf.Tensor, optional): Mask indicating which tokens should be attended to.
token_type_ids (tf.Tensor, optional): Type IDs to distinguish different sequences in the input.
position_ids (tf.Tensor, optional): Positional IDs to indicate the position of tokens.
head_mask (tf.Tensor, optional): Mask to hide certain heads in the self-attention layers.
inputs_embeds (tf.Tensor, optional): Embedded inputs if the input tokens are already embedded.
labels (tf.Tensor, optional): Labels for the token classification task.
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary instead of a tuple of outputs.
pixel_values (tf.Tensor, optional): Pixel values for image tokens if images are part of inputs.
training (bool, optional): Whether the model is in training mode.
Returns:
TFTokenClassifierOutput or Tuple of Tensors: Output depending on the configuration and inputs.
Raises:
ValueError: If the configuration is invalid or incompatible with the model.
"""
# If `return_dict` is not set, fall back to the default from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LayoutLMv3 backbone
outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
pixel_values=pixel_values,
training=training,
)
# Take the shape of `input_ids` if provided, otherwise the shape of `inputs_embeds` without the embedding dim
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
seq_length = input_shape[1]
# Keep only the text part of the hidden states (the first `seq_length` positions)
sequence_output = outputs[0][:, :seq_length]
# Apply dropout during training, then classify each token
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
# Compute the loss only when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
# Tuple output: logits followed by the remaining backbone outputs, with the loss prepended when present
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Dict-style output with loss, logits, hidden states and attention weights
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# Return early if the layers have already been built
if self.built:
return
self.built = True
# Build the backbone, the dropout layer and the classifier under their own name scopes
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
# The classifier expects inputs of shape [batch, seq_len, hidden_size]
self.classifier.build([None, None, self.config.hidden_size])
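The token classification head is used in the same way as the sequence classification example above. A hedged usage sketch; the checkpoint, dataset and num_labels below are assumed, mirroring a FUNSD-style setup:
```
>>> from transformers import AutoProcessor, TFAutoModelForTokenClassification
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> encoding = processor(example["image"], example["tokens"], boxes=example["bboxes"], word_labels=example["ner_tags"], return_tensors="tf")
>>> outputs = model(**encoding)
>>> loss, logits = outputs.loss, outputs.logits
```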
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAnsweringLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.num_labels = config.num_labels
# Initialize the main LayoutLMv3 layer with the provided configuration
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
# Initialize the question answering classification head for LayoutLMv3
self.qa_outputs = TFLayoutLMv3ClassificationHead(config, name="qa_outputs")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
start_positions: tf.Tensor | None = None,
end_positions: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
bbox: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFQuestionAnsweringModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
]:
"""
Forward pass of the TFLayoutLMv3ForQuestionAnswering model.
Args:
input_ids: Tensor of input token IDs.
attention_mask: Tensor of attention mask.
token_type_ids: Tensor of token type IDs.
position_ids: Tensor of position IDs.
head_mask: Tensor of head masks.
inputs_embeds: Tensor of input embeddings.
start_positions: Tensor of start positions for QA.
end_positions: Tensor of end positions for QA.
output_attentions: Whether to output attentions.
output_hidden_states: Whether to output hidden states.
bbox: Tensor of bounding boxes.
pixel_values: Tensor of pixel values.
return_dict: Whether to return a dictionary of outputs.
training: Whether the model is in training mode.
Returns:
TFQuestionAnsweringModelOutput or tuple of output tensors.
"""
def build(self, input_shape=None):
"""
Builds the TFLayoutLMv3ForQuestionAnswering model.
Args:
input_shape: Shape of the input tensor.
"""
if self.built:
return
self.built = True
# Build the LayoutLMv3 main layer if it exists
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
# Build the QA classification head if it exists
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build(None)
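For completeness, extractive question answering follows the same calling pattern. A hedged usage sketch; the question text and the answer span positions below are made-up example values:
```
>>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> encoding = processor(example["image"], question, example["tokens"], boxes=example["bboxes"], return_tensors="tf")
>>> start_positions = tf.convert_to_tensor([1])
>>> end_positions = tf.convert_to_tensor([3])
>>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_logits, end_logits = outputs.loss, outputs.start_logits, outputs.end_logits
```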