Transformers 源码解析(六十三)
.\models\layoutlmv3\processing_layoutlmv3.py
"""
Processor class for LayoutLMv3.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutLMv3Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
single processor.
[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
image_processor (`LayoutLMv3ImageProcessor`, *optional*):
An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv3ImageProcessor"
tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
...
def get_overflowing_images(self, images, overflow_to_sample_mapping):
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "bbox", "attention_mask", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
.\models\layoutlmv3\tokenization_layoutlmv3.py
"""Tokenization class for LayoutLMv3. Same as LayoutLMv2, but RoBERTa-like BPE tokenization instead of WordPiece."""
import json
import os
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/vocab.json",
"microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/vocab.json",
},
"merges_file": {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/merges.txt",
"microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv3-base": 512,
"microsoft/layoutlmv3-large": 512,
}
"""
"""
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
单词被表示为符号的元组(符号是可变长度的字符串)。
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class LayoutLMv3Tokenizer(PreTrainedTokenizer):
r"""
Construct a LayoutLMv3 tokenizer. Based on [`RoBERTatokenizer`] (Byte Pair Encoding or BPE).
[`LayoutLMv3Tokenizer`] can be used to turn words, word-level bounding boxes and optional word labels to
token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token
classification).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
[`LayoutLMv3Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It also turns the
word-level bounding boxes into token-level bounding boxes.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "bbox"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=True,
cls_token_box=[0, 0, 0, 0],
sep_token_box=[0, 0, 0, 0],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
**kwargs,
):
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.vocab_file = vocab_file
self.merges_file = merges_file
self.errors = errors
self.cls_token = cls_token
self.sep_token = sep_token
self.unk_token = unk_token
self.pad_token = pad_token
self.mask_token = mask_token
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
self.special_tokens_map_extended = {}
self.unique_no_split_tokens = set()
self._extra_ids = 0
self._additional_special_tokens = []
self.add_special_tokens(
{"bos_token": bos_token, "eos_token": eos_token, "unk_token": unk_token, "pad_token": pad_token}
)
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""对字符串进行分词处理。"""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""使用词汇表将 token(字符串)转换为对应的 id。"""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""使用词汇表将索引(整数)转换为对应的 token(字符串)。"""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""将一系列 token(字符串)转换为单个字符串。"""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
通过连接和添加特殊标记,为序列分类任务构建模型输入。RoBERTa 的序列格式如下:
- 单个序列: `<s> X </s>`
- 序列对: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
将添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的 ID 列表,用于序列对任务。
Returns:
`List[int]`: 包含适当特殊标记的 [输入 ID](../glossary#input-ids) 列表。
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (
(is_split_into_words or add_prefix_space)
and (len(text) > 0 and not text[0].isspace())
and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0
):
text = " " + text
return (text, kwargs)
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
batch_outputs = self._batch_prepare_for_model(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
self,
batch_text_or_text_pairs,
is_pair: bool = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
batch_outputs = {}
for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
batch_text_or_text_pair, boxes_example = example
outputs = self.prepare_for_model(
batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
batch_text_or_text_pair[1] if is_pair else None,
boxes_example,
word_labels=word_labels[idx] if word_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING)
def encode(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> List[int]:
encoded_inputs = self.encode_plus(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
return encoded_inputs["input_ids"]
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
return self.prepare_for_model(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
def truncate_sequences(
self,
ids: List[int],
token_boxes: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_token_boxes: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
.\models\layoutlmv3\tokenization_layoutlmv3_fast.py
import json
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import pre_tokenizers, processors
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PaddingStrategy,
PreTokenizedInput,
TensorType,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import add_end_docstrings, logging
from .tokenization_layoutlmv3 import (
LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING,
LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
LayoutLMv3Tokenizer,
)
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/vocab.json",
"microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/vocab.json",
},
"merges_file": {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/merges.txt",
"microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv3-base": 512,
"microsoft/layoutlmv3-large": 512,
}
class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" LayoutLMv3 tokenizer (backed by HuggingFace's *tokenizers* library). Based on BPE.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = LayoutLMv3Tokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=True,
trim_offsets=True,
cls_token_box=[0, 0, 0, 0],
sep_token_box=[0, 0, 0, 0],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
**kwargs,
)
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
tokenizer_component = "post_processor"
tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
if tokenizer_component_instance:
state = json.loads(tokenizer_component_instance.__getstate__())
if "sep" in state:
state["sep"] = tuple(state["sep"])
if "cls" in state:
state["cls"] = tuple(state["cls"])
changes_to_apply = False
if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
state["add_prefix_space"] = add_prefix_space
changes_to_apply = True
if state.get("trim_offsets", trim_offsets) != trim_offsets:
state["trim_offsets"] = trim_offsets
changes_to_apply = True
if changes_to_apply:
component_class = getattr(processors, state.pop("type"))
new_value = component_class(**state)
setattr(self.backend_tokenizer, tokenizer_component, new_value)
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
batched_input = [(text, pair)] if pair else [text]
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
return encodings[0].tokens
@add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
batched_input = [(text, text_pair)] if text_pair else [text]
batched_boxes = [boxes]
batched_word_labels = [word_labels] if word_labels is not None else None
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),
boxes=batched_boxes,
word_labels=batched_word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Args:
token_ids_0 (`List[int]`):
第一个序列的ID列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个序列的ID列表,用于序列对任务。
Returns:
`List[int]`: 全零列表,用作RoBERTa模型中不使用token类型ID的占位。
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
.\models\layoutlmv3\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_layoutlmv3": [
"LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP",
"LayoutLMv3Config",
"LayoutLMv3OnnxConfig",
],
"processing_layoutlmv3": ["LayoutLMv3Processor"],
"tokenization_layoutlmv3": ["LayoutLMv3Tokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutlmv3_fast"] = ["LayoutLMv3TokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_layoutlmv3"] = [
"LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
"LayoutLMv3ForQuestionAnswering",
"LayoutLMv3ForSequenceClassification",
"LayoutLMv3ForTokenClassification",
"LayoutLMv3Model",
"LayoutLMv3PreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_layoutlmv3"] = [
"TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFLayoutLMv3ForQuestionAnswering",
"TFLayoutLMv3ForSequenceClassification",
"TFLayoutLMv3ForTokenClassification",
"TFLayoutLMv3Model",
"TFLayoutLMv3PreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_layoutlmv3"] = ["LayoutLMv3FeatureExtractor"]
_import_structure["image_processing_layoutlmv3"] = ["LayoutLMv3ImageProcessor"]
if TYPE_CHECKING:
from .configuration_layoutlmv3 import (
LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP,
LayoutLMv3Config,
LayoutLMv3OnnxConfig,
)
from .processing_layoutlmv3 import LayoutLMv3Processor
from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_layoutlmv3 import (
LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
LayoutLMv3ForQuestionAnswering,
LayoutLMv3ForSequenceClassification,
LayoutLMv3ForTokenClassification,
LayoutLMv3Model,
LayoutLMv3PreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_layoutlmv3 import (
TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
TFLayoutLMv3ForQuestionAnswering,
TFLayoutLMv3ForSequenceClassification,
TFLayoutLMv3ForTokenClassification,
TFLayoutLMv3Model,
TFLayoutLMv3PreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_layoutlmv3 import LayoutLMv3FeatureExtractor
from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\layoutxlm\processing_layoutxlm.py
"""
Processor class for LayoutXLM.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutXLMProcessor(ProcessorMixin):
r"""
Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
processor.
[`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
[`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
image_processor (`LayoutLMv2ImageProcessor`, *optional*):
An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv2ImageProcessor"
tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
def get_overflowing_images(self, images, overflow_to_sample_mapping):
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "bbox", "attention_mask", "image"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
.\models\layoutxlm\tokenization_layoutxlm.py
""" Tokenization classes for LayoutXLM model."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
from ..xlm_roberta.tokenization_xlm_roberta import (
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
SPIECE_UNDERLINE,
VOCAB_FILES_NAMES,
)
logger = logging.get_logger(__name__)
class LayoutXLMTokenizer(PreTrainedTokenizer):
"""
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
self.fairseq_offset = 1
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Generate token type IDs from a pair of sequences. Token type IDs distinguish between two sequences in a model input.
For XLM-RoBERTa, token type IDs are:
- single sequence: 0s for all tokens
- pair of sequences: 0s for the tokens from the first sequence, 1s for the tokens from the second sequence
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of IDs for the second sequence in a pair.
Returns:
`List[int]`: List of token type IDs indicating the sequence membership of each token.
"""
if token_ids_1 is None:
return [0] * len(token_ids_0)
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
else:
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset + 1
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
@add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
...
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
...
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
batch_outputs = self._batch_prepare_for_model(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
self,
batch_text_or_text_pairs,
is_pair: bool = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
batch_outputs = {}
for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
batch_text_or_text_pair, boxes_example = example
outputs = self.prepare_for_model(
batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
batch_text_or_text_pair[1] if is_pair else None,
boxes_example,
word_labels=word_labels[idx] if word_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
return self.prepare_for_model(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
def prepare_for_model(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
"""
准备输入以供模型使用的方法。
参数:
- text: 输入文本,可以是未分词或预分词的输入。
- text_pair: 可选的第二个文本输入,用于处理文本对(如句子对任务)。
- boxes: 可选的边界框列表,用于处理与文本相关的图像区域。
- word_labels: 可选的单词级别标签列表。
- add_special_tokens: 是否添加特殊的语言模型令牌(如CLS和SEP)。
- padding: 控制填充输入序列的方式,可以是布尔值、字符串或填充策略对象。
- truncation: 控制截断输入序列的方式,可以是布尔值、字符串或截断策略对象。
- max_length: 输入序列的最大长度限制。
- stride: 截断或填充时的步长。
- pad_to_multiple_of: 如果指定,将输入填充到该数的倍数。
- return_tensors: 控制返回的张量类型。
- return_token_type_ids: 是否返回token_type_ids。
- return_attention_mask: 是否返回attention_mask。
- return_overflowing_tokens: 是否返回溢出的token。
- return_special_tokens_mask: 是否返回特殊token的mask。
- return_offsets_mapping: 是否返回token在原始输入中的偏移映射。
- return_length: 是否返回输入长度。
- verbose: 是否打印详细信息。
- prepend_batch_axis: 是否在返回张量中添加批处理维度。
- **kwargs: 其他未明确列出的参数。
"""
...
def truncate_sequences(
self,
ids: List[int],
token_boxes: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_token_boxes: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
):
"""
截断序列的方法。
参数:
- ids: 输入序列的token IDs。
- token_boxes: 每个token的边界框。
- pair_ids: 可选的第二个序列的token IDs,用于处理序列对。
- pair_token_boxes: 可选的第二个序列的边界框列表。
- labels: 可选的标签列表。
- num_tokens_to_remove: 要移除的token数量。
- truncation_strategy: 截断策略,如"longest_first"等。
- stride: 截断时的步长。
"""
...
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
"""
填充方法,用于将输入编码填充到相同长度。
参数:
- encoded_inputs: 编码后的输入,可以是单个EncodedInput对象或BatchEncoding对象。
- max_length: 填充后的最大长度限制。
- padding_strategy: 填充策略对象,控制如何进行填充。
- pad_to_multiple_of: 如果指定,将填充到该数的倍数。
- return_attention_mask: 是否返回attention_mask。
"""
...
.\models\layoutxlm\tokenization_layoutxlm_fast.py
import os
from shutil import copyfile
from typing import Dict, List, Optional, Tuple, Union
from ...tokenization_utils import AddedToken
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
from ..xlm_roberta.tokenization_xlm_roberta_fast import (
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
VOCAB_FILES_NAMES,
)
if is_sentencepiece_available():
from .tokenization_layoutxlm import LayoutXLMTokenizer
else:
LayoutXLMTokenizer = None
logger = logging.get_logger(__name__)
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
**kwargs,
)
self.vocab_file = vocab_file
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
pass
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
batched_input = [(text, pair)] if pair else [text]
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
return encodings[0].tokens
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
batched_input = [(text, text_pair)] if text_pair else [text]
batched_boxes = [boxes]
batched_word_labels = [word_labels] if word_labels is not None else None
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),
boxes=batched_boxes,
word_labels=batched_word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
...
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
...
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros indicating the token types.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer vocabulary to the specified directory.
Args:
save_directory (str):
Directory where the vocabulary will be saved.
filename_prefix (str, *optional*):
Prefix to prepend to the saved vocabulary filename.
Returns:
Tuple containing the path to the saved vocabulary file.
"""
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\layoutxlm\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {"processing_layoutxlm": ["LayoutXLMProcessor"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]
if TYPE_CHECKING:
from .processing_layoutxlm import LayoutXLMProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutxlm import LayoutXLMTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\led\configuration_led.py
""" LED model configuration"""
from typing import List, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/config.json",
}
class LEDConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LEDModel`]. It is used to instantiate an LED
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LED
[allenai/led-base-16384](https://huggingface.co/allenai/led-base-16384) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import LEDModel, LEDConfig
>>> # Initializing a LED allenai/led-base-16384 style configuration
>>> configuration = LEDConfig()
>>> # Initializing a model from the allenai/led-base-16384 style configuration
>>> model = LEDModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "led"
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model",
"attention_probs_dropout_prob": "attention_dropout",
"initializer_range": "init_std",
}
def __init__(
self,
vocab_size=50265,
max_encoder_position_embeddings=16384,
max_decoder_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
classifier_dropout=0.0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
attention_window: Union[List[int], int] = 512,
**kwargs,
):
self.vocab_size = vocab_size
self.max_encoder_position_embeddings = max_encoder_position_embeddings
self.max_decoder_position_embeddings = max_decoder_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.attention_window = attention_window
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
.\models\led\modeling_led.py
""" PyTorch LED model."""
import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_led import LEDConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"
_CONFIG_FOR_DOC = "LEDConfig"
LED_PRETRAINED_MODEL_ARCHIVE_LIST = [
"allenai/led-base-16384",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
def _prepare_4d_attention_mask_inverted(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
expanded_attention_mask = inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
expanded_attention_mask = expanded_attention_mask * inverted_mask
return expanded_attention_mask
class LEDLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
super().__init__(num_embeddings, embedding_dim)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
class LEDEncoderSelfAttention(nn.Module):
def __init__(self, config, layer_id):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_heads = config.num_attention_heads
self.head_dim = int(config.hidden_size / config.num_attention_heads)
self.embed_dim = config.hidden_size
self.query = nn.Linear(config.hidden_size, self.embed_dim)
self.key = nn.Linear(config.hidden_size, self.embed_dim)
self.value = nn.Linear(config.hidden_size, self.embed_dim)
self.query_global = nn.Linear(config.hidden_size, self.embed_dim)
self.key_global = nn.Linear(config.hidden_size, self.embed_dim)
self.value_global = nn.Linear(config.hidden_size, self.embed_dim)
self.dropout = config.attention_probs_dropout_prob
self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]
assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
self.one_sided_attn_window_size = attention_window // 2
self.config = config
def forward(
self,
hidden_states,
attention_mask=None,
layer_head_mask=None,
is_index_masked=None,
is_index_global_attn=None,
is_global_attn=None,
output_attentions=False,
):
pass
def _pad_and_transpose_last_two_dims(hidden_states_padded, padding):
"""
对最后两个维度进行填充和转置操作。
Args:
hidden_states_padded (torch.Tensor): 填充后的隐藏状态张量
padding (tuple): 填充值,实际数值并不重要,因为它将被覆盖
Returns:
torch.Tensor: 填充和转置后的隐藏状态张量
"""
hidden_states_padded = nn.functional.pad(
hidden_states_padded, padding
)
hidden_states_padded = hidden_states_padded.view(
*hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2)
)
return hidden_states_padded
@staticmethod
def _pad_and_diagonalize(chunked_hidden_states):
"""
将每一行向右移动一步,将列转换为对角线。
Args:
chunked_hidden_states (torch.Tensor): 分块的隐藏状态张量
Returns:
torch.Tensor: 填充和对角化后的隐藏状态张量
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
chunked_hidden_states = nn.functional.pad(
chunked_hidden_states, (0, window_overlap + 1)
)
chunked_hidden_states = chunked_hidden_states.view(
total_num_heads, num_chunks, -1
)
chunked_hidden_states = chunked_hidden_states[
:, :, :-window_overlap
]
chunked_hidden_states = chunked_hidden_states.view(
total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
)
chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
return chunked_hidden_states
def _chunk(hidden_states, window_overlap, onnx_export: bool = False):
"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
if not onnx_export:
hidden_states = hidden_states.view(
hidden_states.size(0),
torch.div(hidden_states.size(1), (window_overlap * 2), rounding_mode="trunc"),
window_overlap * 2,
hidden_states.size(2),
)
chunk_size = list(hidden_states.size())
chunk_size[1] = chunk_size[1] * 2 - 1
chunk_stride = list(hidden_states.stride())
chunk_stride[1] = chunk_stride[1] // 2
return hidden_states.as_strided(size=chunk_size, stride=chunk_stride)
chunk_size = [
hidden_states.size(0),
torch.div(hidden_states.size(1), window_overlap, rounding_mode="trunc") - 1,
window_overlap * 2,
hidden_states.size(2),
]
overlapping_chunks = torch.empty(chunk_size, device=hidden_states.device)
for chunk in range(chunk_size[1]):
overlapping_chunks[:, chunk, :, :] = hidden_states[
:, chunk * window_overlap : chunk * window_overlap + 2 * window_overlap, :
]
return overlapping_chunks
@staticmethod
def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor:
beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0])
beginning_mask = beginning_mask_2d[None, :, None, :]
ending_mask = beginning_mask.flip(dims=(1, 3))
beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1]
beginning_mask = beginning_mask.expand(beginning_input.size())
input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] = torch.full_like(
beginning_input, -float("inf")
).where(beginning_mask.bool(), beginning_input)
ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :]
ending_mask = ending_mask.expand(ending_input.size())
input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] = torch.full_like(
ending_input, -float("inf")
).where(ending_mask.bool(), ending_input)
def _sliding_chunks_matmul_attn_probs_value(
self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
):
"""
Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
same shape as `attn_probs`
"""
batch_size, seq_len, num_heads, head_dim = value.size()
assert seq_len % (window_overlap * 2) == 0
assert attn_probs.size()[:3] == value.size()[:3]
assert attn_probs.size(3) == 2 * window_overlap + 1
chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
chunked_attn_probs = attn_probs.transpose(1, 2).reshape(
batch_size * num_heads,
torch.div(seq_len, window_overlap, rounding_mode="trunc"),
window_overlap,
2 * window_overlap + 1,
)
value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1)
chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim)
chunked_value_stride = padded_value.stride()
chunked_value_stride = (
chunked_value_stride[0],
window_overlap * chunked_value_stride[1],
chunked_value_stride[1],
chunked_value_stride[2],
)
chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)
chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))
return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2)
def _get_global_attn_indices(is_index_global_attn):
"""计算在整个前向传递中需要使用的全局注意力索引"""
num_global_attn_indices = is_index_global_attn.long().sum(dim=1)
max_num_global_attn_indices = num_global_attn_indices.max()
is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)
is_local_index_global_attn = torch.arange(
max_num_global_attn_indices, device=is_index_global_attn.device
) < num_global_attn_indices.unsqueeze(dim=-1)
is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True)
is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True)
return (
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
)
def _concat_with_global_key_attn_probs(
self,
key_vectors,
query_vectors,
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
):
batch_size = key_vectors.shape[0]
key_vectors_only_global = key_vectors.new_zeros(
batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
)
key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero]
attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global))
attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)
attn_probs_from_global_key[
is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, :
] = torch.finfo(attn_probs_from_global_key.dtype).min
attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)
return attn_probs_from_global_key
def _compute_attn_output_with_global_indices(
self,
value_vectors,
attn_probs,
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
):
):
batch_size = attn_probs.shape[0]
attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices)
value_vectors_only_global = value_vectors.new_zeros(
batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
)
value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero]
attn_output_only_global = torch.matmul(
attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone()
).transpose(1, 2)
attn_probs_without_global = attn_probs.narrow(
-1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices
).contiguous()
attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
)
return attn_output_only_global + attn_output_without_global
def _compute_global_attn_output_from_hidden(
self,
hidden_states,
max_num_global_attn_indices,
layer_head_mask,
is_local_index_global_attn_nonzero,
is_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
is_index_masked,
class LEDEncoderAttention(nn.Module):
def __init__(self, config, layer_id):
super().__init__()
self.longformer_self_attn = LEDEncoderSelfAttention(config, layer_id=layer_id)
self.output = nn.Linear(config.d_model, config.d_model)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
is_index_masked: Optional[torch.Tensor] = None,
is_index_global_attn: Optional[torch.Tensor] = None,
is_global_attn: Optional[bool] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
self_outputs = self.longformer_self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
is_global_attn=is_global_attn,
output_attentions=output_attentions,
)
attn_output = self.output(self_outputs[0])
outputs = (attn_output,) + self_outputs[1:]
return outputs
class LEDDecoderAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
"""Forward pass of the decoder attention module."""
def __init__(self, config: LEDConfig, layer_id: int):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = LEDEncoderAttention(config, layer_id)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
is_index_masked=None,
is_index_global_attn=None,
is_global_attn=None,
output_attentions=False,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
attention_mask (`torch.FloatTensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
*(encoder_attention_heads,)*.
"""
residual = hidden_states
attn_outputs = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
is_global_attn=is_global_attn,
output_attentions=output_attentions,
)
hidden_states = attn_outputs[0]
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
return (hidden_states,) + attn_outputs[1:]
class LEDDecoderLayer(nn.Module):
def __init__(self, config: LEDConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = LEDDecoderAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = LEDDecoderAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class LEDClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class LEDPreTrainedModel(PreTrainedModel):
config_class = LEDConfig
base_model_prefix = "led"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_property(self):
pass
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
@dataclass
class LEDEncoderBaseModelOutput(ModelOutput):
"""
LEDEncoder 的输出基类,可能包含隐藏状态、局部和全局注意力等信息。
"""
last_hidden_state: torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LEDSeq2SeqModelOutput(ModelOutput):
"""
LEDSeq2SeqModelOutput 类,继承自 ModelOutput,用于表示模型编码器的输出,
同时包含预先计算的隐藏状态,可加快顺序解码过程。
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LEDSeq2SeqLMOutput(ModelOutput):
"""
LEDSeq2SeqLMOutput 类,继承自 ModelOutput,用于表示序列到序列语言模型的输出。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
LEDSeq2SeqSequenceClassifierOutput 类,继承自 ModelOutput,用于表示序列到序列句子分类模型的输出。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
LEDSeq2SeqQuestionAnsweringModelOutput 类,继承自 ModelOutput,用于表示序列到序列问答模型的输出。
"""
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
LED_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. See the superclass documentation for the generic methods the library
implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.
Parameters:
config ([`LEDConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LED_GENERATION_EXAMPLE = r"""
Summarization example:
```
>>> import torch
>>> from transformers import AutoTokenizer, LEDForConditionalGeneration
>>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
>>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
>>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
... results in a wide range of natural language tasks including generative language modeling
... (Dai et al., 2019; Radford et al., 2019) and discriminative ... language understanding (Devlin et al., 2019).
... This success is partly due to the self-attention component which enables the network to capture contextual
... information from the entire sequence. While powerful, the memory and computational requirements of
... self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
... process long sequences. To address this limitation, we present Longformer, a modified Transformer
... architecture with a self-attention operation that scales linearly with the sequence length, making it
... versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
... long document classification, question answering (QA), and coreference resolution, where existing approaches
... partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
... of BERT-style pretrained models. Such partitioning could potentially result in loss of important
... cross-partition information, and to mitigate this problem, existing methods often rely on complex
... architectures to address such interactions. On the other hand, our proposed Longformer is able to build
... contextual representations of the entire context using multiple layers of attention, reducing the need for
... task-specific architectures.'''
>>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
# 使用给定的文章进行分词编码,返回PyTorch张量表示的输入
>>> global_attention_mask = torch.zeros_like(inputs)
# 创建一个与输入张量相同大小的全零张量,用于全局注意力掩码
>>> global_attention_mask[:, 0] = 1
# 将全局注意力掩码的第一个位置设置为1,以指示模型应该全局关注输入的第一个token
>>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
# 使用模型生成摘要,传入输入张量、全局注意力掩码、束搜索数量和最大长度限制
>>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
# 解码生成的摘要,跳过特殊标记并清理标记化空格后打印输出
"""
LED_INPUTS_DOCSTRING = r"""
"""
class LEDEncoder(LEDPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
[`LEDEncoderLayer`].
Args:
config: LEDConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_encoder_position_embeddings
if isinstance(config.attention_window, int):
if config.attention_window % 2 != 0:
raise ValueError("`config.attention_window` has to be an even value")
if config.attention_window <= 0:
raise ValueError("`config.attention_window` has to be positive")
config.attention_window = [config.attention_window] * config.num_hidden_layers
else:
if len(config.attention_window) != config.num_hidden_layers:
raise ValueError(
"`len(config.attention_window)` should equal `config.num_hidden_layers`. "
f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
self.embed_positions = LEDLearnedPositionalEmbedding(
self.max_source_positions,
embed_dim,
)
self.layers = nn.ModuleList([LEDEncoderLayer(config, i) for i in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.gradient_checkpointing = False
self.post_init()
def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
if attention_mask is not None:
attention_mask = attention_mask * (global_attention_mask + 1)
else:
attention_mask = global_attention_mask + 1
return attention_mask
def _pad_to_window_size(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
inputs_embeds: torch.Tensor,
pad_token_id: int,
):
"""
A helper function to pad tokens and mask to work with implementation of Longformer self-attention.
"""
attention_window = (
self.config.attention_window
if isinstance(self.config.attention_window, int)
else max(self.config.attention_window)
)
if attention_window % 2 != 0:
raise ValueError(f"`attention_window` should be an even value. Given {attention_window}")
input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape
batch_size, seq_len = input_shape[:2]
padding_len = (attention_window - seq_len % attention_window) % attention_window
if padding_len > 0:
logger.warning_once(
f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
f"`config.attention_window`: {attention_window}"
)
if input_ids is not None:
input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
if inputs_embeds is not None:
input_ids_padding = inputs_embeds.new_full(
(batch_size, padding_len),
self.config.pad_token_id,
dtype=torch.long,
)
inputs_embeds_padding = self.embed_tokens(input_ids_padding)
inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
attention_mask = nn.functional.pad(
attention_mask, (0, padding_len), value=False
)
return padding_len, input_ids, attention_mask, inputs_embeds
def forward(
self,
input_ids=None,
attention_mask=None,
global_attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
class LEDDecoder(LEDPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LEDDecoderLayer`]
Args:
config: LEDConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_decoder_position_embeddings
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = LEDLearnedPositionalEmbedding(
self.max_target_positions,
config.d_model,
)
self.layers = nn.ModuleList([LEDDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_ids=None,
attention_mask=None,
global_attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
"""
Forward pass of the LEDDecoder model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs, shape (batch_size, seq_length).
attention_mask (torch.Tensor, optional): Attention mask for input_ids, shape (batch_size, seq_length).
global_attention_mask (torch.Tensor, optional): Global attention mask, shape (batch_size, seq_length).
encoder_hidden_states (torch.FloatTensor, optional): Hidden states from the encoder.
encoder_attention_mask (torch.FloatTensor, optional): Attention mask for encoder_hidden_states.
head_mask (torch.FloatTensor, optional): Mask to nullify heads, shape (num_heads).
cross_attn_head_mask (torch.FloatTensor, optional): Mask for cross-attention heads, shape (num_decoder_heads, num_encoder_heads).
past_key_values (tuple, optional): Cached key/values for faster autoregressive decoding.
inputs_embeds (torch.FloatTensor, optional): Embedded inputs if input_ids is not provided, shape (batch_size, seq_length, embed_dim).
use_cache (bool, optional): Whether to use cached key/values for autoregressive decoding.
output_attentions (bool, optional): Whether to return attentions weights.
output_hidden_states (bool, optional): Whether to return hidden states.
return_dict (bool, optional): Whether to return a dictionary as output.
Returns:
Sequence of output tensors depending on flags (return_dict, output_attentions, output_hidden_states).
"""
@add_start_docstrings(
"The bare LED Model outputting raw hidden-states without any specific head on top.",
LED_START_DOCSTRING,
)
class LEDModel(LEDPreTrainedModel):
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: LEDConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = LEDEncoder(config, self.shared)
self.decoder = LEDDecoder(config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids=None,
attention_mask=None,
global_attention_mask=None,
encoder_outputs=None,
decoder_input_ids=None,
decoder_attention_mask=None,
decoder_past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
"""
Forward pass of the LEDModel.
Args:
input_ids (torch.LongTensor, optional): Input token IDs, shape (batch_size, seq_length).
attention_mask (torch.Tensor, optional): Attention mask for input_ids, shape (batch_size, seq_length).
global_attention_mask (torch.Tensor, optional): Global attention mask, shape (batch_size, seq_length).
encoder_outputs (tuple, optional): Outputs of the encoder.
decoder_input_ids (torch.LongTensor, optional): Decoder input token IDs, shape (batch_size, seq_length).
decoder_attention_mask (torch.Tensor, optional): Attention mask for decoder_input_ids, shape (batch_size, seq_length).
decoder_past_key_values (tuple, optional): Cached key/values for faster autoregressive decoding.
use_cache (bool, optional): Whether to use cached key/values for autoregressive decoding.
output_attentions (bool, optional): Whether to return attentions weights.
output_hidden_states (bool, optional): Whether to return hidden states.
return_dict (bool, optional): Whether to return a dictionary as output.
Returns:
Sequence of output tensors depending on flags (return_dict, output_attentions, output_hidden_states).
"""
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
global_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The LED Model with a language modeling head. Can be used for summarization.", LED_START_DOCSTRING
)
class LEDForConditionalGeneration(LEDPreTrainedModel):
base_model_prefix = "led"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: LEDConfig):
super().__init__(config)
self.led = LEDModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.led.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.led.shared.num_embeddings, bias=False)
self.post_init()
def get_encoder(self):
return self.led.get_encoder()
def get_decoder(self):
return self.led.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(LED_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
global_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"global_attention_mask": global_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
global_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"global_attention_mask": global_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def prepare_decoder_input_ids_from_labels(labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"""
LED model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
LED_START_DOCSTRING,
)
class LEDForSequenceClassification(LEDPreTrainedModel):
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: LEDConfig, **kwargs):
warnings.warn(
"The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5 of"
" Transformers. No actual method were provided in the original paper on how to perfom"
" sequence classification.",
FutureWarning,
)
super().__init__(config, **kwargs)
self.led = LEDModel(config)
self.classification_head = LEDClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
global_attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
LED Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
LED_START_DOCSTRING,
)
class LEDForQuestionAnswering(LEDPreTrainedModel):
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.led = LEDModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
global_attention_mask: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\led\modeling_tf_led.py
""" TF 2.0 LED 模型。"""
from __future__ import annotations
import random
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_led import LEDConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"
_CONFIG_FOR_DOC = "LEDConfig"
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
start_tokens = tf.fill(
(shape_list(input_ids)[0], 1),
tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
)
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
shifted_input_ids,
)
assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
with tf.control_dependencies([assert_gte0]):
shifted_input_ids = tf.identity(shifted_input_ids)
return shifted_input_ids
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
# 创建用于双向自注意力的因果掩码。
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
将注意力掩码从`[bsz, seq_len]`扩展到`[bsz, 1, tgt_seq_len, src_seq_len]`。
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFLEDLearnedPositionalEmbedding(keras.layers.Embedding):
"""
该模块学习固定最大大小的位置嵌入。
"""
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
super().__init__(num_embeddings, embedding_dim, **kwargs)
def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
输入预期为大小为[bsz x seqlen]的张量。
"""
seq_len = input_shape[1]
position_ids = tf.range(seq_len, delta=1, name="range")
position_ids += past_key_values_length
return super().call(tf.cast(position_ids, dtype=tf.int32))
class TFLEDEncoderSelfAttention(keras.layers.Layer):
def __init__(self, config, layer_id, **kwargs):
super().__init__(**kwargs)
self.config = config
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads}"
)
self.num_heads = config.num_attention_heads
self.head_dim = int(config.hidden_size / config.num_attention_heads)
self.embed_dim = config.hidden_size
self.query = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.query_global = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="query_global",
)
self.key_global = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="key_global",
)
self.value_global = keras.layers.Dense(
self.embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="value_global",
)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.global_dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]
assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
self.one_sided_attn_window_size = attention_window // 2
def build(self, input_shape=None):
if not self.built:
with tf.name_scope("query_global"):
self.query_global.build((self.config.hidden_size,))
with tf.name_scope("key_global"):
self.key_global.build((self.config.hidden_size,))
with tf.name_scope("value_global"):
self.value_global.build((self.config.hidden_size,))
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "query_global", None) is not None:
with tf.name_scope(self.query_global.name):
self.query_global.build([None, None, self.config.hidden_size])
if getattr(self, "key_global", None) is not None:
with tf.name_scope(self.key_global.name):
self.key_global.build([None, None, self.config.hidden_size])
if getattr(self, "value_global", None) is not None:
with tf.name_scope(self.value_global.name):
self.value_global.build([None, None, self.config.hidden_size])
@staticmethod
def _mask_invalid_locations(input_tensor, window_overlap):
mask_2d_upper = tf.reverse(
tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), -1, 0),
axis=[0],
)
padding = tf.convert_to_tensor(
[[0, shape_list(input_tensor)[1] - window_overlap], [0, shape_list(input_tensor)[3] - window_overlap - 1]]
)
mask_2d = tf.pad(mask_2d_upper, padding)
mask_2d = mask_2d + tf.reverse(mask_2d, axis=[0, 1])
mask_4d = tf.tile(mask_2d[None, :, None, :], (shape_list(input_tensor)[0], 1, 1, 1))
inf_tensor = -float("inf") * tf.ones_like(input_tensor)
input_tensor = tf.where(tf.math.greater(mask_4d, 0), inf_tensor, input_tensor)
return input_tensor
def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
"""
Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
same shape as `attn_probs`
"""
batch_size, seq_len, num_heads, head_dim = shape_list(value)
tf.debugging.assert_equal(
seq_len % (window_overlap * 2), 0, message="Seq_len has to be multiple of 2 * window_overlap"
)
tf.debugging.assert_equal(
shape_list(attn_probs)[:3],
shape_list(value)[:3],
message="value and attn_probs must have same dims (except head_dim)",
)
tf.debugging.assert_equal(
shape_list(attn_probs)[3],
2 * window_overlap + 1,
message="attn_probs last dim has to be 2 * window_overlap + 1",
)
chunks_count = seq_len // window_overlap - 1
chunked_attn_probs = tf.reshape(
tf.transpose(attn_probs, (0, 2, 1, 3)),
(
batch_size * num_heads,
seq_len // window_overlap,
window_overlap,
2 * window_overlap + 1,
),
)
value = tf.reshape(
tf.transpose(value, (0, 2, 1, 3)),
(batch_size * num_heads, seq_len, head_dim),
)
paddings = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]])
padded_value = tf.pad(value, paddings, constant_values=-1)
frame_size = 3 * window_overlap * head_dim
frame_hop_size = (shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count
chunked_value = tf.signal.frame(
tf.reshape(padded_value, (batch_size * num_heads, -1)),
frame_size,
frame_hop_size,
)
chunked_value = tf.reshape(
chunked_value,
(batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim),
)
tf.debugging.assert_equal(
shape_list(chunked_value),
[batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim],
message="Chunked value has the wrong shape",
)
chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
context = tf.einsum("bcwd,bcdh->bcwh", chunked_attn_probs, chunked_value)
context = tf.transpose(
tf.reshape(context, (batch_size, num_heads, seq_len, head_dim)),
(0, 2, 1, 3),
)
return context
def _pad_and_transpose_last_two_dims(hidden_states_padded, paddings):
"""
对最后两个维度进行填充和转置操作。
Args:
- hidden_states_padded: 填充后的隐藏状态张量
- paddings: 填充的尺寸,用于指定在各维度上的填充数量
Returns:
- hidden_states_padded: 转置后的隐藏状态张量
"""
hidden_states_padded = tf.pad(
hidden_states_padded, paddings
)
batch_size, chunk_size, seq_length, hidden_dim = shape_list(hidden_states_padded)
hidden_states_padded = tf.reshape(hidden_states_padded, (batch_size, chunk_size, hidden_dim, seq_length))
return hidden_states_padded
@staticmethod
def _pad_and_diagonalize(chunked_hidden_states):
"""
将每一行向右移动一个步长,将列转换为对角线。
Args:
- chunked_hidden_states: 分块的隐藏状态张量,每个块的形状为 (total_num_heads, num_chunks, window_overlap, hidden_dim)
Returns:
- chunked_hidden_states: 填充并对角化后的隐藏状态张量
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
chunked_hidden_states = tf.pad(
chunked_hidden_states, paddings
)
chunked_hidden_states = tf.reshape(
chunked_hidden_states, (total_num_heads, num_chunks, -1)
)
chunked_hidden_states = chunked_hidden_states[
:, :, :-window_overlap
]
chunked_hidden_states = tf.reshape(
chunked_hidden_states,
(total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim),
)
chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
return chunked_hidden_states
@staticmethod
def _chunk(hidden_states, window_overlap):
"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
batch_size, seq_length, hidden_dim = shape_list(hidden_states)
num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1
frame_hop_size = window_overlap * hidden_dim
frame_size = 2 * frame_hop_size
hidden_states = tf.reshape(hidden_states, (batch_size, seq_length * hidden_dim))
chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, frame_hop_size)
tf.debugging.assert_equal(
shape_list(chunked_hidden_states),
[batch_size, num_output_chunks, frame_size],
message=(
"Make sure chunking is correctly applied. `Chunked hidden states should have output dimension"
f" {[batch_size, frame_size, num_output_chunks]}, but got {shape_list(chunked_hidden_states)}."
),
)
chunked_hidden_states = tf.reshape(
chunked_hidden_states,
(batch_size, num_output_chunks, 2 * window_overlap, hidden_dim),
)
return chunked_hidden_states
@staticmethod
def _get_global_attn_indices(is_index_global_attn):
"""compute global attn indices required throughout forward pass"""
num_global_attn_indices = tf.math.count_nonzero(is_index_global_attn, axis=1)
num_global_attn_indices = tf.cast(num_global_attn_indices, dtype=tf.constant(1).dtype)
max_num_global_attn_indices = tf.reduce_max(num_global_attn_indices)
is_index_global_attn_nonzero = tf.where(is_index_global_attn)
is_local_index_global_attn = tf.range(max_num_global_attn_indices) < tf.expand_dims(
num_global_attn_indices, axis=-1
)
is_local_index_global_attn_nonzero = tf.where(is_local_index_global_attn)
is_local_index_no_global_attn_nonzero = tf.where(tf.math.logical_not(is_local_index_global_attn))
return (
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
)
def _concat_with_global_key_attn_probs(
self,
attn_scores,
key_vectors,
query_vectors,
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
):
batch_size = shape_list(key_vectors)[0]
global_key_vectors = tf.gather_nd(key_vectors, is_index_global_attn_nonzero)
key_vectors_only_global = tf.scatter_nd(
is_local_index_global_attn_nonzero,
global_key_vectors,
shape=(
batch_size,
max_num_global_attn_indices,
self.num_heads,
self.head_dim,
),
)
attn_probs_from_global_key = tf.einsum("blhd,bshd->blhs", query_vectors, key_vectors_only_global)
attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, (0, 3, 1, 2))
mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple(
shape_list(attn_probs_from_global_key_trans)[-2:]
)
mask = tf.ones(mask_shape) * -10000.0
mask = tf.cast(mask, dtype=attn_probs_from_global_key_trans.dtype)
attn_probs_from_global_key_trans = tf.tensor_scatter_nd_update(
attn_probs_from_global_key_trans,
is_local_index_no_global_attn_nonzero,
mask,
)
attn_probs_from_global_key = tf.transpose(attn_probs_from_global_key_trans, (0, 2, 3, 1))
attn_scores = tf.concat((attn_probs_from_global_key, attn_scores), axis=-1)
return attn_scores
def _compute_attn_output_with_global_indices(
self,
value_vectors,
attn_probs,
max_num_global_attn_indices,
is_index_global_attn_nonzero,
is_local_index_global_attn_nonzero,
):
batch_size = shape_list(attn_probs)[0]
attn_probs_only_global = attn_probs[:, :, :, :max_num_global_attn_indices]
global_value_vectors = tf.gather_nd(value_vectors, is_index_global_attn_nonzero)
value_vectors_only_global = tf.scatter_nd(
is_local_index_global_attn_nonzero,
global_value_vectors,
shape=(
batch_size,
max_num_global_attn_indices,
self.num_heads,
self.head_dim,
),
)
attn_output_only_global = tf.einsum("blhs,bshd->blhd", attn_probs_only_global, value_vectors_only_global)
attn_probs_without_global = attn_probs[:, :, :, max_num_global_attn_indices:]
attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
)
return attn_output_only_global + attn_output_without_global
def _compute_global_attn_output_from_hidden(
self,
attn_output,
hidden_states,
max_num_global_attn_indices,
layer_head_mask,
is_local_index_global_attn_nonzero,
is_index_global_attn_nonzero,
is_local_index_no_global_attn_nonzero,
is_index_masked,
training,
def reshape_and_transpose(self, vector, batch_size):
return tf.reshape(
tf.transpose(
tf.reshape(vector, (batch_size, -1, self.num_heads, self.head_dim)),
(0, 2, 1, 3),
),
(batch_size * self.num_heads, -1, self.head_dim),
)
class TFLEDEncoderAttention(keras.layers.Layer):
def __init__(self, config, layer_id, **kwargs):
super().__init__(**kwargs)
self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn")
self.output_dense = keras.layers.Dense(config.d_model, use_bias=True, name="output")
self.config = config
def call(self, inputs, training=False):
(
hidden_states,
attention_mask,
layer_head_mask,
is_index_masked,
is_index_global_attn,
is_global_attn,
) = inputs
self_outputs = self.longformer_self_attn(
[hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn],
training=training,
)
attention_output = self.output_dense(self_outputs[0], training=training)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "longformer_self_attn", None) is not None:
with tf.name_scope(self.longformer_self_attn.name):
self.longformer_self_attn.build(None)
if getattr(self, "output_dense", None) is not None:
with tf.name_scope(self.output_dense.name):
self.output_dense.build([None, None, self.config.d_model])
class TFLEDDecoderAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""""
# 初始化解码器注意力层
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim # 嵌入维度
self.num_heads = num_heads # 注意力头数
self.dropout = keras.layers.Dropout(dropout) # Dropout层
self.head_dim = embed_dim // num_heads # 每个注意力头的维度
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5 # 缩放因子
self.is_decoder = is_decoder # 是否为解码器
# 线性变换层,用于计算K、Q、V以及输出
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
# 对张量进行形状变换
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
# 调用函数,用于前向传播
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training=False,
# 构建函数,用于构建模型的层次结构
def build(self, input_shape=None):
# 如果已经构建过,直接返回,避免重复构建
if self.built:
return
# 设置标志位表示已经构建
self.built = True
# 如果存在 k_proj 属性,则构建 k_proj 层
if getattr(self, "k_proj", None) is not None:
# 在命名空间下构建 k_proj 层
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
# 如果存在 q_proj 属性,则构建 q_proj 层
if getattr(self, "q_proj", None) is not None:
# 在命名空间下构建 q_proj 层
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
# 如果存在 v_proj 属性,则构建 v_proj 层
if getattr(self, "v_proj", None) is not None:
# 在命名空间下构建 v_proj 层
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
# 如果存在 out_proj 属性,则构建 out_proj 层
if getattr(self, "out_proj", None) is not None:
# 在命名空间下构建 out_proj 层
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFLEDEncoderLayer(keras.layers.Layer):
# 初始化编码器层,接受配置参数和层编号
def __init__(self, config: LEDConfig, layer_id: int, **kwargs):
super().__init__(**kwargs)
# 设置嵌入维度为模型配置中的维度
self.embed_dim = config.d_model
# 初始化自注意力机制
self.self_attn = TFLEDEncoderAttention(config, layer_id, name="self_attn")
# 初始化自注意力层规范化
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
# 设置dropout层
self.dropout = keras.layers.Dropout(config.dropout)
# 获取激活函数
self.activation_fn = get_tf_activation(config.activation_function)
# 设置激活函数的dropout层
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
# 第一个全连接层,输出维度为编码器FFN的维度
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
# 第二个全连接层,输出维度为嵌入维度
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
# 最终层规范化
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
# 保存配置参数
self.config = config
# 定义调用方法,处理输入数据和各种掩码
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
layer_head_mask: tf.Tensor,
is_index_masked: tf.Tensor,
is_index_global_attn: tf.Tensor,
is_global_attn: bool,
training=False,
):
"""
Args:
hidden_states (`tf.Tensor`): 输入层的张量形状为 *(batch, seq_len, embed_dim)*
attention_mask (`tf.Tensor`): 注意力掩码的形状为 *(batch, 1, tgt_len, src_len)*,
其中填充元素由极大负值表示。
layer_head_mask (`tf.Tensor`): 给定层中注意力头的掩码形状为 *(config.encoder_attention_heads,)*。
"""
# 保留输入的残差连接
residual = hidden_states
# 进行自注意力计算
layer_outputs = self.self_attn(
[hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn],
training=training,
)
# 获取自注意力层的输出作为新的隐藏状态
hidden_states = layer_outputs[0]
# 断言自注意力是否修改了查询的形状
tf.debugging.assert_equal(
shape_list(hidden_states),
shape_list(residual),
message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
)
# 应用dropout层到隐藏状态
hidden_states = self.dropout(hidden_states, training=training)
# 添加残差连接到dropout后的隐藏状态
hidden_states = residual + hidden_states
# 应用自注意力层规范化
hidden_states = self.self_attn_layer_norm(hidden_states)
# 保留更新后的残差连接
residual = hidden_states
# 应用激活函数到第一个全连接层
hidden_states = self.activation_fn(self.fc1(hidden_states))
# 应用激活函数的dropout层
hidden_states = self.activation_dropout(hidden_states, training=training)
# 应用第二个全连接层
hidden_states = self.fc2(hidden_states)
# 应用dropout层到第二个全连接层
hidden_states = self.dropout(hidden_states, training=training)
# 添加残差连接到dropout后的第二个全连接层
hidden_states = residual + hidden_states
# 应用最终层规范化
hidden_states = self.final_layer_norm(hidden_states)
# 返回更新后的隐藏状态和其他层输出(如果有)
return (hidden_states,) + layer_outputs[1:]
# 构建函数,用于构建神经网络层的结构
def build(self, input_shape=None):
# 如果已经构建过,直接返回,避免重复构建
if self.built:
return
# 将构建状态标记为已构建
self.built = True
# 如果存在 self_attn 属性,构建 self attention 层
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
# 如果存在 self_attn_layer_norm 属性,构建 self attention 层的 Layer Normalization
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
# 如果存在 fc1 属性,构建第一个全连接层
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
# 如果存在 fc2 属性,构建第二个全连接层
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
# 如果存在 final_layer_norm 属性,构建最终的 Layer Normalization 层
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFLEDDecoderLayer(keras.layers.Layer):
# 定义 TFLED 解码器层,继承自 keras.layers.Layer
def __init__(self, config: LEDConfig, **kwargs):
# 初始化函数,接受 LEDConfig 类型的配置参数和其他关键字参数
super().__init__(**kwargs)
# 调用父类的初始化方法
self.embed_dim = config.d_model
# 设置嵌入维度为配置中的模型维度
self.self_attn = TFLEDDecoderAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
# 创建自注意力机制对象,用于解码器自注意力层
self.dropout = keras.layers.Dropout(config.dropout)
# 创建 dropout 层,用于整个层的 dropout 操作
self.activation_fn = get_tf_activation(config.activation_function)
# 获取 TensorFlow 激活函数对象,根据配置中的激活函数类型
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
# 创建 dropout 层,用于激活函数的 dropout 操作
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
# 创建层归一化层,用于自注意力层的归一化
self.encoder_attn = TFLEDDecoderAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
# 创建编码器注意力对象,用于解码器与编码器之间的注意力
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
# 创建层归一化层,用于编码器注意力层的归一化
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
# 创建全连接层,用于解码器的前馈神经网络
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
# 创建全连接层,用于解码器的前馈神经网络的输出层
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
# 创建层归一化层,用于最终输出的归一化
self.config = config
# 保存配置对象
def call(
self,
hidden_states,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
encoder_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
training=False,
**kwargs
):
# 定义层的调用函数,实现解码器层的前向传播逻辑
# 构建方法用于构造模型的各个层,如果模型已经构建过则直接返回
def build(self, input_shape=None):
if self.built:
return
# 设置标志表示模型已经构建完成
self.built = True
# 如果存在 self_attn 层,则构建 self_attn 层,并使用其名称作为命名空间
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
# 如果存在 self_attn_layer_norm 层,则构建 self_attn_layer_norm 层,
# 传入的形状是 [None, None, self.embed_dim]
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
# 如果存在 encoder_attn 层,则构建 encoder_attn 层,并使用其名称作为命名空间
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
# 如果存在 encoder_attn_layer_norm 层,则构建 encoder_attn_layer_norm 层,
# 传入的形状是 [None, None, self.embed_dim]
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
# 如果存在 fc1 层,则构建 fc1 层,传入的形状是 [None, None, self.embed_dim]
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
# 如果存在 fc2 层,则构建 fc2 层,传入的形状是 [None, None, self.config.decoder_ffn_dim]
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
# 如果存在 final_layer_norm 层,则构建 final_layer_norm 层,
# 传入的形状是 [None, None, self.embed_dim]
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# 定义 TFLEDPreTrainedModel 类,继承自 TFPreTrainedModel
class TFLEDPreTrainedModel(TFPreTrainedModel):
# 设置配置类为 LEDConfig
config_class = LEDConfig
# 指定基础模型前缀为 "led"
base_model_prefix = "led"
# 定义 input_signature 属性,用于指定输入的签名
@property
def input_signature(self):
# 调用父类的 input_signature 方法获取默认签名
sig = super().input_signature
# 添加全局注意力掩码的 TensorSpec 到签名中,形状为 (None, None),数据类型为 tf.int32
sig["global_attention_mask"] = tf.TensorSpec((None, None), tf.int32, name="global_attention_mask")
# 返回更新后的签名
return sig
# 使用 dataclass 装饰器定义 TFLEDEncoderBaseModelOutput 类
@dataclass
# 类的注释被省略,这里是 TFLongformerBaseModelOutput 类的修改版本,用于 TFLEDEncoder
class TFLEDEncoderBaseModelOutput(ModelOutput):
"""
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
"""
# 定义函数参数 `last_hidden_state`,类型为 `tf.Tensor`,默认为 None
last_hidden_state: tf.Tensor = None
# 定义函数参数 `hidden_states`,类型为元组 `Tuple[tf.Tensor, ...]` 或者 None,当 `output_hidden_states=True` 时返回
# 表示模型在每个层输出的隐藏状态以及初始嵌入输出
hidden_states: Tuple[tf.Tensor, ...] | None = None
# 定义函数参数 `attentions`,类型为元组 `Tuple[tf.Tensor, ...]` 或者 None,当 `output_attentions=True` 时返回
# 表示每个层的本地注意力权重,形状为 `(batch_size, num_heads, sequence_length, x + attention_window + 1)`
# 这些是经过注意力 softmax 后的本地注意力权重,用于计算自注意力头中的加权平均值
attentions: Tuple[tf.Tensor, ...] | None = None
# 定义函数参数 `global_attentions`,类型为元组 `Tuple[tf.Tensor, ...]` 或者 None,当 `output_attentions=True` 时返回
# 表示每个层的全局注意力权重,形状为 `(batch_size, num_heads, sequence_length, x)`
# 这些是经过注意力 softmax 后的全局注意力权重,用于计算自注意力头中的加权平均值
global_attentions: Tuple[tf.Tensor, ...] | None = None
# 定义一个 TFLEDSeq2SeqModelOutput 类,继承自 ModelOutput 类,用于存储序列到序列模型的输出
@dataclass
class TFLEDSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
"""
# 最后一个隐藏状态,类型为 tf.Tensor,默认为 None
last_hidden_state: tf.Tensor = None
# 存储过去关键值的列表,类型为 List[tf.Tensor] 或者 None
past_key_values: List[tf.Tensor] | None = None
# 解码器的隐藏状态元组,类型为 Tuple[tf.Tensor, ...] 或者 None
decoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
# 解码器的注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
decoder_attentions: Tuple[tf.Tensor, ...] | None = None
# 交叉注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
cross_attentions: Tuple[tf.Tensor, ...] | None = None
# 编码器最后一个隐藏状态,类型为 tf.Tensor 或者 None
encoder_last_hidden_state: tf.Tensor | None = None
# 编码器的隐藏状态元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
# 编码器的注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_attentions: Tuple[tf.Tensor, ...] | None = None
# 编码器的全局注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_global_attentions: Tuple[tf.Tensor, ...] | None = None
# 定义一个 TFLEDSeq2SeqLMOutput 类,继承自 ModelOutput 类,用于存储序列到序列语言模型的输出
@dataclass
class TFLEDSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
"""
# 损失张量,类型为 tf.Tensor 或者 None
loss: tf.Tensor | None = None
# 预测的 logits 张量,类型为 tf.Tensor,默认为 None
logits: tf.Tensor = None
# 存储过去关键值的列表,类型为 List[tf.Tensor] 或者 None
past_key_values: List[tf.Tensor] | None = None
# 解码器的隐藏状态元组,类型为 Tuple[tf.Tensor, ...] 或者 None
decoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
# 解码器的注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
decoder_attentions: Tuple[tf.Tensor, ...] | None = None
# 交叉注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
cross_attentions: Tuple[tf.Tensor, ...] | None = None
# 编码器最后一个隐藏状态,类型为 tf.Tensor 或者 None
encoder_last_hidden_state: tf.Tensor | None = None
# 编码器的隐藏状态元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
# 编码器的注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_attentions: Tuple[tf.Tensor, ...] | None = None
# 编码器的全局注意力权重元组,类型为 Tuple[tf.Tensor, ...] 或者 None
encoder_global_attentions: Tuple[tf.Tensor, ...] | None = None
# LED_START_DOCSTRING 为一个原始字符串,用于描述 TFPreTrainedModel 类的文档字符串
LED_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- 在调用模型时,可以使用这种形式传入输入张量。如果模型有不同的输入名称(比如input_ids、attention_mask、token_type_ids),则需要按照相应的输入名称传递张量。
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
- 如果模型的输入需要按照名称显式传递,则可以使用这种字典形式传递输入张量,其中键对应于模型的输入名称,值对应于输入张量本身。
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
- 如果使用子类化的方式创建模型和层,那么可以像调用任何其他Python函数一样传递输入张量,无需担心输入的名称和形式。
Args:
config ([`LEDConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
- 参数说明部分,config参数接受一个`LEDConfig`类型的对象,该对象包含模型的所有参数配置。使用配置文件初始化时,并不会加载模型的权重,只会加载配置信息。可以查阅[`~TFPreTrainedModel.from_pretrained`]方法来加载模型的权重。
"""
LED_INPUTS_DOCSTRING = r"""
"""
@keras_serializable
class TFLEDEncoder(keras.layers.Layer):
# 设置配置类为LEDConfig
config_class = LEDConfig
"""
Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
[`TFLEDEncoderLayer`].
Args:
config: LEDConfig
"""
def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
# 初始化配置参数
self.config = config
# 设置dropout层
self.dropout = keras.layers.Dropout(config.dropout)
# 如果启用了encoder_layerdrop,则记录警告信息
if config.encoder_layerdrop > 0:
logger.warning("Layerdrop is currently disabled in TFLED models.")
# 设置layerdrop为0.0
self.layerdrop = 0.0
# 设置padding索引为config.pad_token_id
self.padding_idx = config.pad_token_id
# 如果config.attention_window为整数,则确认其为偶数且为正数,并复制给每个层
if isinstance(config.attention_window, int):
assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
assert config.attention_window > 0, "`config.attention_window` has to be positive"
config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer
else:
# 否则确认其长度与num_hidden_layers相等
assert len(config.attention_window) == config.num_hidden_layers, (
"`len(config.attention_window)` should equal `config.num_hidden_layers`. "
f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
)
# 设置attention_window为config.attention_window
self.attention_window = config.attention_window
# 设置embed_tokens为输入的embed_tokens
self.embed_tokens = embed_tokens
# 初始化位置编码层TFLEDLearnedPositionalEmbedding
self.embed_positions = TFLEDLearnedPositionalEmbedding(
config.max_encoder_position_embeddings,
config.d_model,
name="embed_positions",
)
# 创建transformer encoder层列表,每一层为TFLEDEncoderLayer
self.layers = [TFLEDEncoderLayer(config, i, name=f"layers.{i}") for i in range(config.encoder_layers)]
# 创建layernorm层
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
# 设置embed_dim为config.d_model
self.embed_dim = config.d_model
# 获取embed_tokens方法
def get_embed_tokens(self):
return self.embed_tokens
# 设置embed_tokens方法
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
# 解包输入的装饰器函数
@unpack_inputs
# 定义call函数,处理Transformer编码器的前向传播
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
global_attention_mask=None,
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
# 实际前向传播逻辑在TFLEDEncoderLayer中实现,这里仅定义函数签名和参数
# 计算隐藏状态的函数,截取hidden_states以适配padding长度
@tf.function
def compute_hidden_states(self, hidden_states, padding_len):
return hidden_states[:, :-padding_len] if padding_len > 0 else hidden_states
# 填充到指定窗口大小的函数,处理输入以匹配指定的注意力窗口大小
def _pad_to_window_size(
self,
input_ids,
attention_mask,
inputs_embeds,
pad_token_id,
):
"""A helper function to pad tokens and mask to work with implementation of Longformer selfattention."""
# padding
attention_window = (
self.attention_window if isinstance(self.attention_window, int) else max(self.attention_window)
)
assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"
input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)
batch_size, seq_len = input_shape[:2]
padding_len = (attention_window - seq_len % attention_window) % attention_window
if padding_len > 0:
logger.warning_once(
f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
f"`config.attention_window`: {attention_window}"
)
paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]])
if input_ids is not None:
# Pad input_ids with pad_token_id according to calculated padding_len
input_ids = tf.pad(input_ids, paddings, constant_values=pad_token_id)
if inputs_embeds is not None:
if padding_len > 0:
# Create padding for input_ids and embed them to get inputs_embeds_padding
input_ids_padding = tf.fill((batch_size, padding_len), pad_token_id)
inputs_embeds_padding = self.embed_tokens(input_ids_padding)
# Concatenate original inputs_embeds with inputs_embeds_padding along the sequence dimension
inputs_embeds = tf.concat([inputs_embeds, inputs_embeds_padding], axis=-2)
# Pad attention_mask with False (indicating no attention on padding tokens)
attention_mask = tf.pad(attention_mask, paddings, constant_values=False)
return (
padding_len, # Amount of padding added to input_ids and attention_mask
input_ids, # Padded input_ids
attention_mask, # Padded attention_mask
inputs_embeds, # Padded inputs_embeds
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# 使用 @keras_serializable 装饰器,将 TFLEDDecoder 类标记为可序列化的 Keras 层
@keras_serializable
class TFLEDDecoder(keras.layers.Layer):
# 指定配置类为 LEDConfig
config_class = LEDConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFLEDDecoderLayer`]
Args:
config: LEDConfig
embed_tokens: output embedding
"""
# 初始化方法,接受 LEDConfig 和可选的嵌入标记作为参数
def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
# 设置配置
self.config = config
# 设置填充索引为配置中的 pad_token_id
self.padding_idx = config.pad_token_id
# 设置嵌入标记
self.embed_tokens = embed_tokens
# 如果配置中启用了 layerdrop,则发出警告(当前未启用)
if config.decoder_layerdrop > 0:
logger.warning("Layerdrop is currently disabled in TFLED models.")
# 设置 layerdrop 为 0.0
self.layerdrop = 0.0
# 创建位置嵌入层对象 TFLEDLearnedPositionalEmbedding
self.embed_positions = TFLEDLearnedPositionalEmbedding(
config.max_decoder_position_embeddings,
config.d_model,
name="embed_positions",
)
# 创建多个 TFLEDDecoderLayer 层对象组成的列表
self.layers = [TFLEDDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
# 创建层标准化层对象,用于嵌入层标准化处理
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
# 创建丢弃层,使用配置中的 dropout 比率
self.dropout = keras.layers.Dropout(config.dropout)
# 设置嵌入标记的方法
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
# 使用 @unpack_inputs 装饰器定义的调用方法,接受多个输入参数并返回结果
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
encoder_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
# 方法实现略过,未提供具体实现
# 构建方法,用于构建层的内部组件
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果存在 embed_positions 属性,则构建该对象
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# 如果存在 layernorm_embedding 属性,则构建该对象
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
# 遍历每个层并构建它们
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# 初始化函数,用于创建一个新的LED模型实例
def __init__(self, config: LEDConfig, **kwargs):
# 调用父类的初始化方法,继承父类的属性和方法
super().__init__(**kwargs)
# 将传入的配置参数保存到实例属性中
self.config = config
# 创建一个共享的嵌入层,用于编码和解码器共享词嵌入
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, # 词汇表大小,词嵌入的输入维度
output_dim=config.d_model, # 嵌入向量的输出维度
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), # 初始化嵌入矩阵的方式
name="led.shared", # 嵌入层的名称
)
# 为共享的嵌入层添加额外属性,指定层的预期名称范围(用于加载/存储权重)
self.shared.load_weight_prefix = "led.shared"
# 创建LED模型的编码器,传入配置和共享的词嵌入层
self.encoder = TFLEDEncoder(config, self.shared, name="encoder")
# 创建LED模型的解码器,传入配置和共享的词嵌入层
self.decoder = TFLEDDecoder(config, self.shared, name="decoder")
# 返回模型的输入嵌入层(共享的词嵌入层)
def get_input_embeddings(self):
return self.shared
# 设置模型的输入嵌入层为新的词嵌入层
def set_input_embeddings(self, new_embeddings):
# 更新共享的词嵌入层
self.shared = new_embeddings
# 更新编码器的词嵌入层
self.encoder.embed_tokens = self.shared
# 更新解码器的词嵌入层
self.decoder.embed_tokens = self.shared
# 使用装饰器将输入参数解包,处理模型的前向传播
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
decoder_input_ids=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
encoder_outputs: Optional[Union[Tuple, TFLEDEncoderBaseModelOutput]] = None,
global_attention_mask=None,
past_key_values=None,
inputs_embeds=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
**kwargs,
):
# 如果没有提供解码器的输入 ID 和嵌入向量,则不使用缓存
if decoder_input_ids is None and decoder_inputs_embeds is None:
use_cache = False
# 如果没有提供编码器的输出,则调用编码器来生成编码器的输出
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
global_attention_mask=global_attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 如果用户传入了元组形式的编码器输出,并且设置了 return_dict=True,则将其包装在 TFLEDEncoderBaseModelOutput 中
elif return_dict and not isinstance(encoder_outputs, TFLEDEncoderBaseModelOutput):
encoder_outputs = TFLEDEncoderBaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
# 如果用户传入了 TFLEDEncoderBaseModelOutput 形式的编码器输出,并且设置了 return_dict=False,则将其转换为元组形式
elif not return_dict and not isinstance(encoder_outputs, tuple):
encoder_outputs = encoder_outputs.to_tuple()
# 调用解码器生成解码器的输出
decoder_outputs = self.decoder(
decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
encoder_head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 如果 return_dict=False,则返回解码器和编码器的输出作为元组
if not return_dict:
return decoder_outputs + encoder_outputs
# 如果 return_dict=True,则将解码器和编码器的输出包装在 TFLEDSeq2SeqModelOutput 中并返回
return TFLEDSeq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
encoder_global_attentions=encoder_outputs.global_attentions,
)
# 如果模型已经构建完成,则直接返回,不重复构建
if self.built:
return
# 设置模型已经构建标志为True
self.built = True
# 共享/绑定的权重期望位于模型基础命名空间中
# 将"/"添加到tf.name_scope的末尾(而不是开头!)将其放置在根命名空间而不是当前命名空间中
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
# 构建共享模型,无输入形状
self.shared.build(None)
# 如果存在编码器对象
if getattr(self, "encoder", None) is not None:
# 使用编码器名称创建命名空间
with tf.name_scope(self.encoder.name):
# 构建编码器,无输入形状
self.encoder.build(None)
# 如果存在解码器对象
if getattr(self, "decoder", None) is not None:
# 使用解码器名称创建命名空间
with tf.name_scope(self.decoder.name):
# 构建解码器,无输入形状
self.decoder.build(None)
# 添加文档字符串,说明这是一个不带顶部特定头的裸 LED 模型输出原始隐藏状态。
# 使用 TFLEDPreTrainedModel 的子类化来定义 TFLEDModel 类
class TFLEDModel(TFLEDPreTrainedModel):
# 初始化方法,接受配置和其他参数
def __init__(self, config, *inputs, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *inputs, **kwargs)
# 创建 TFLEDMainLayer 实例,并命名为 "led"
self.led = TFLEDMainLayer(config, name="led")
# 返回编码器的方法
def get_encoder(self):
return self.led.encoder
# 返回解码器的方法
def get_decoder(self):
return self.led.decoder
# call 方法,定义模型的前向传播过程
@unpack_inputs
# 添加文档字符串,描述输入的格式要求
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 添加代码示例的文档字符串,指向预训练模型的检查点、输出类型和配置类
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFLEDSeq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# 方法签名和参数描述,指定了输入和输出的类型及格式
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
encoder_outputs: tf.Tensor | None = None,
global_attention_mask: tf.Tensor | None = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
**kwargs,
) -> Tuple[tf.Tensor] | TFLEDSeq2SeqModelOutput:
# 调用 TFLEDMainLayer 实例的__call__方法,传递参数并接收输出
outputs = self.led(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
encoder_outputs=encoder_outputs,
global_attention_mask=global_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回模型的输出
return outputs
# 定义一个方法用于处理模型的输出,接受一个output对象作为参数
def serving_output(self, output):
# 如果配置中设置使用缓存,则从output的过去键值对中获取第二个元素作为pkv,否则为None
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# 如果配置中设置输出隐藏状态,则将output的解码器隐藏状态转换为张量dec_hs,否则为None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# 如果配置中设置输出注意力,则将output的解码器注意力转换为张量dec_attns,否则为None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# 如果配置中设置输出注意力,则将output的交叉注意力转换为张量cross_attns,否则为None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# 如果配置中设置输出隐藏状态,则将output的编码器隐藏状态转换为张量enc_hs,否则为None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# 如果配置中设置输出注意力,则将output的编码器注意力转换为张量enc_attns,否则为None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# 如果配置中设置输出注意力,则将output的全局编码器注意力转换为张量enc_g_attns,否则为None
enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if self.config.output_attentions else None
# 返回一个TFLEDSeq2SeqModelOutput对象,包含处理后的各种张量
return TFLEDSeq2SeqModelOutput(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
encoder_global_attentions=enc_g_attns,
)
# 定义一个方法用于构建模型,接受一个输入形状参数,默认为None
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果对象中存在led属性
if getattr(self, "led", None) is not None:
# 在led的名字作用域内构建led对象,传入None作为构建参数
with tf.name_scope(self.led.name):
self.led.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
# BiasLayer 类的定义,用于添加偏置作为一个层。用于序列化目的:`keras.Model.save_weights` 按层存储权重,
# 因此所有权重都必须在一个层中注册。
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
def __init__(self, shape, initializer, trainable, name, **kwargs):
super().__init__(name=name, **kwargs)
# 注:当序列化时,此变量的名称不会被作用域化,即不会以“outer_layer/inner_layer/.../name:0”的格式。
# 而是“name:0”。更多细节见:
# https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
# 添加一个权重作为偏置,具有给定的形状、初始化器和是否可训练的参数。
self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
def call(self, x):
# 在调用时,返回输入张量 x 加上偏置 self.bias
return x + self.bias
@add_start_docstrings(
"The LED Model with a language modeling head. Can be used for summarization.",
LED_START_DOCSTRING,
)
# TFLEDForConditionalGeneration 类继承自 TFLEDPreTrainedModel,表示带有语言建模头部的 LED 模型,可用于摘要生成。
class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
# 在加载时忽略的键列表,用于不期望的项
_keys_to_ignore_on_load_unexpected = [
r"led.encoder.embed_tokens.weight",
r"led.decoder.embed_tokens.weight",
]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化 LED 主层,用给定的配置,并命名为 "led"
self.led = TFLEDMainLayer(config, name="led")
# 是否使用缓存,从配置中获取
self.use_cache = config.use_cache
# final_bias_logits 在 PyTorch 中作为缓冲区注册,为保持一致性,设为不可训练。
# 创建一个 BiasLayer 实例作为 final_logits_bias,形状为 [1, vocab_size],初始化为零。
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
)
# TODO (Joao): investigate why LED has numerical issues in XLA generate
# 是否支持 XLA 生成,默认为 False,需进一步调查为何在 XLA 生成中 LED 存在数值问题。
self.supports_xla_generation = False
# 获取解码器
def get_decoder(self):
return self.led.decoder
# 获取编码器
def get_encoder(self):
return self.led.encoder
# 获取偏置信息,返回包含 final_logits_bias 偏置的字典
def get_bias(self):
return {"final_logits_bias": self.bias_layer.bias}
# 设置偏置,替换包含偏置的现有层以正确(反)序列化
def set_bias(self, value):
# 获取词汇表大小
vocab_size = value["final_logits_bias"].shape[-1]
# 创建一个 BiasLayer 实例作为 final_logits_bias,形状为 [1, vocab_size],初始化为零,且不可训练。
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
)
# 将给定的偏置值赋给 self.bias_layer.bias
self.bias_layer.bias.assign(value["final_logits_bias"])
# 获取输出的嵌入层
def get_output_embeddings(self):
return self.get_input_embeddings()
# 设置输出的嵌入层
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
# 对模型前向方法添加开始文档字符串,详见 LED_INPUTS_DOCSTRING,并替换返回值的文档字符串为 TFLEDSeq2SeqLMOutput
# 用于 API 文档生成。
@unpack_inputs
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFLEDSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# 定义一个方法,用于调用模型
def call(
# 输入序列的 token IDs,可以是 TFModelInputType 类型或者 None
self,
input_ids: TFModelInputType | None = None,
# 注意力掩码,可以是 numpy 数组、张量或者 None
attention_mask: np.ndarray | tf.Tensor | None = None,
# 解码器输入的 token IDs,可以是 numpy 数组、张量或者 None
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
# 解码器的注意力掩码,可以是 numpy 数组、张量或者 None
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
# 头部掩码,可以是 numpy 数组、张量或者 None
head_mask: np.ndarray | tf.Tensor | None = None,
# 解码器的头部掩码,可以是 numpy 数组、张量或者 None
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
# 编码器的输出,可以是 TFLEDEncoderBaseModelOutput 类型或者 None
encoder_outputs: TFLEDEncoderBaseModelOutput | None = None,
# 全局注意力掩码,可以是 numpy 数组、张量或者 None
global_attention_mask: np.ndarray | tf.Tensor | None = None,
# 历史键值对,类型为 Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] 或者 None
past_key_values: Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] | None = None,
# 输入嵌入,可以是 numpy 数组、张量或者 None
inputs_embeds: np.ndarray | tf.Tensor | None = None,
# 解码器的输入嵌入,可以是 numpy 数组、张量或者 None
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
# 是否使用缓存,布尔类型或者 None
use_cache: bool | None = None,
# 是否输出注意力,布尔类型或者 None
output_attentions: bool | None = None,
# 是否输出隐藏状态,布尔类型或者 None
output_hidden_states: bool | None = None,
# 是否返回字典形式的结果,布尔类型或者 None
return_dict: bool | None = None,
# 标签,张量类型或者 None
labels: tf.Tensor | None = None,
# 是否处于训练模式,布尔类型,默认为 False
training: bool = False,
) -> Tuple[tf.Tensor] | TFLEDSeq2SeqLMOutput:
"""
返回一个元组,包含 tf.Tensor 和 TFLEDSeq2SeqLMOutput 类型的对象。
如果 labels 不为 None:
设置 use_cache 为 False
如果 decoder_input_ids 和 decoder_inputs_embeds 都为 None:
使用 shift_tokens_right 函数将 labels 右移,并设置填充和解码起始令牌的 ID
使用 self.led 方法处理以下参数:
input_ids: 输入的 token IDs
attention_mask: 注意力掩码
decoder_input_ids: 解码器输入的 token IDs
decoder_attention_mask: 解码器注意力掩码
encoder_outputs: 编码器输出
global_attention_mask: 全局注意力掩码
head_mask: 头部注意力掩码
decoder_head_mask: 解码器头部注意力掩码
past_key_values: 过去的键值对
inputs_embeds: 输入的嵌入向量
decoder_inputs_embeds: 解码器输入的嵌入向量
use_cache: 是否使用缓存
output_attentions: 是否输出注意力权重
output_hidden_states: 是否输出隐藏状态
return_dict: 是否返回字典
training: 是否训练模式
计算 lm_logits:
使用 self.led.shared.weights 对 outputs[0] 进行矩阵乘法,转置部分
将 lm_logits 传递给 self.bias_layer 进行处理
计算 masked_lm_loss:
如果 labels 为 None,则 masked_lm_loss 为 None,否则调用 self.hf_compute_loss 计算损失
如果 return_dict 为 False:
组装输出元组 output,包括 lm_logits 和 outputs 的其余部分
如果 masked_lm_loss 不为 None,则将其包含在输出中
返回 output
否则,以 TFLEDSeq2SeqLMOutput 对象的形式返回:
loss: masked_lm_loss
logits: lm_logits
past_key_values: outputs 中的 past_key_values(索引为 1)
decoder_hidden_states: outputs 中的 decoder_hidden_states(索引为 2)
decoder_attentions: outputs 中的 decoder_attentions(索引为 3)
cross_attentions: outputs 中的 cross_attentions(索引为 4)
encoder_last_hidden_state: encoder_outputs 中的 encoder_last_hidden_state(索引为 0)
encoder_hidden_states: encoder_outputs 中的 encoder_hidden_states(索引为 1)
encoder_attentions: encoder_outputs 中的 encoder_attentions(索引为 2)
encoder_global_attentions: encoder_global_attentions
"""
# 定义一个方法用于生成模型的输出
def serving_output(self, output):
# 如果配置要求使用缓存,则提取输出中的过去键值(past_key_values)
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# 如果配置要求输出隐藏状态,则将输出的解码器隐藏状态转换为张量
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# 如果配置要求输出注意力权重,则将输出的解码器注意力权重转换为张量
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# 如果配置要求输出交叉注意力权重,则将输出的交叉注意力权重转换为张量
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# 如果配置要求输出隐藏状态,则将输出的编码器隐藏状态转换为张量
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# 如果配置要求输出注意力权重,则将输出的编码器注意力权重转换为张量
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# 如果配置要求输出全局编码器注意力权重,则将输出的全局编码器注意力权重转换为张量
enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if self.config.output_attentions else None
# 返回一个包含输出结果的 TFLEDSeq2SeqLMOutput 对象
return TFLEDSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
encoder_global_attentions=enc_g_attns,
)
# 定义一个方法,准备生成时的输入参数
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# 如果 past_key_values 不为 None,则截断 decoder_input_ids,只保留最后一个 token
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
# 返回一个字典,包含生成时需要的输入参数
return {
"input_ids": None, # encoder_outputs 已经定义,不需要 input_ids
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"use_cache": use_cache, # 更改此参数以避免缓存(推测是为了调试)
}
# 定义一个方法,从标签生成解码器的输入 token ids
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def hf_compute_loss(self, labels, logits):
"""计算跨熵损失,忽略填充标记"""
# 使用稀疏分类交叉熵损失函数,设置为从 logits 计算,不进行损失值缩减
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
# 如果配置为使用旧版本 TensorFlow 的损失计算方式
if self.config.tf_legacy_loss:
# 将标签展平为一维张量
melted_labels = tf.reshape(labels, (-1,))
# 创建活跃损失掩码,排除填充标记
active_loss = tf.not_equal(melted_labels, self.config.pad_token_id)
# 使用掩码从 logits 中提取有效值
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
# 使用掩码从标签中提取有效标签
labels = tf.boolean_mask(melted_labels, active_loss)
return loss_fn(labels, reduced_logits)
# 在此处将负标签裁剪为零,以避免 NaN 和错误 - 这些位置将在后续被掩码处理
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
# 确保只有非填充标签影响损失
loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
# 应用损失掩码到未掩码的损失上
masked_loss = unmasked_loss * loss_mask
# 计算掩码后的损失的均值
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
# 返回形状为 (1,) 的降维后的损失张量
return tf.reshape(reduced_masked_loss, (1,))
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记已构建
self.built = True
# 如果存在 LED 层,则构建 LED 层
if getattr(self, "led", None) is not None:
with tf.name_scope(self.led.name):
self.led.build(None)
# 如果存在偏置层,则构建偏置层
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)