Transformers 源码解析(七十)
.\models\marian\tokenization_marian.py
import json
import os
import re
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"source_spm": "source.spm",
"target_spm": "target.spm",
"vocab": "vocab.json",
"target_vocab_file": "target_vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"source_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spm"
},
"target_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spm"
},
"vocab": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"Helsinki-NLP/opus-mt-en-de": (
"https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json"
)
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512}
PRETRAINED_INIT_CONFIGURATION = {}
SPIECE_UNDERLINE = "▁"
class MarianTokenizer(PreTrainedTokenizer):
r"""
Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
language_code_re = re.compile(">>.+<<")
def __init__(
self,
source_spm,
target_spm,
vocab,
target_vocab_file=None,
source_lang=None,
target_lang=None,
unk_token="<unk>",
eos_token="</s>",
pad_token="<pad>",
model_max_length=512,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
separate_vocabs=False,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"
self.separate_vocabs = separate_vocabs
self.encoder = load_json(vocab)
if str(unk_token) not in self.encoder:
raise KeyError("<unk> token must be in the vocab")
assert str(pad_token) in self.encoder
if separate_vocabs:
self.target_encoder = load_json(target_vocab_file)
self.decoder = {v: k for k, v in self.target_encoder.items()}
self.supported_language_codes = []
else:
self.decoder = {v: k for k, v in self.encoder.items()}
self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]
self.source_lang = source_lang
self.target_lang = target_lang
self.spm_files = [source_spm, target_spm]
self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
self.current_spm = self.spm_source
self.current_encoder = self.encoder
self._setup_normalizer()
super().__init__(
source_lang=source_lang,
target_lang=target_lang,
unk_token=unk_token,
eos_token=eos_token,
pad_token=pad_token,
model_max_length=model_max_length,
sp_model_kwargs=self.sp_model_kwargs,
target_vocab_file=target_vocab_file,
separate_vocabs=separate_vocabs,
**kwargs,
)
def _setup_normalizer(self):
try:
from sacremoses import MosesPunctNormalizer
self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
except (ImportError, FileNotFoundError):
warnings.warn("Recommended: pip install sacremoses.")
self.punc_normalizer = lambda x: x
def normalize(self, x: str) -> str:
"""规范化输入字符串,处理空字符串的情况。如果输入为空字符串,则返回空字符串。"""
return self.punc_normalizer(x) if x else ""
def _convert_token_to_id(self, token):
"""将给定的 token 转换为对应的 ID。如果 token 不在当前编码器中,则使用未知 token 的 ID。"""
return self.current_encoder.get(token, self.current_encoder[self.unk_token])
def remove_language_code(self, text: str):
"""移除文本中的语言代码,例如 >>fr<<,以便后续处理使用 SentencePiece。"""
match = self.language_code_re.match(text)
code: list = [match.group(0)] if match else []
return code, self.language_code_re.sub("", text)
def _tokenize(self, text: str) -> List[str]:
"""对文本进行标记化处理,返回标记化后的列表。"""
code, text = self.remove_language_code(text)
pieces = self.current_spm.encode(text, out_type=str)
return code + pieces
def _convert_id_to_token(self, index: int) -> str:
"""使用解码器将索引(整数)转换为对应的 token(字符串)。"""
return self.decoder.get(index, self.unk_token)
def batch_decode(self, sequences, **kwargs):
"""
将一组 token ID 列表转换为字符串列表,通过调用 decode 方法实现。
Args:
sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
tokenized input ids 的列表。可以使用 `__call__` 方法获得。
skip_special_tokens (`bool`, *optional*, defaults to `False`):
是否在解码时移除特殊 token。
clean_up_tokenization_spaces (`bool`, *optional*):
是否清理 tokenization 空格。如果为 `None`,则默认使用 `self.clean_up_tokenization_spaces`。
use_source_tokenizer (`bool`, *optional*, defaults to `False`):
是否使用源 tokenizer 解码序列(仅适用于序列到序列问题)。
kwargs (additional keyword arguments, *optional*):
将传递给底层模型特定的解码方法。
Returns:
`List[str]`: 解码后的句子列表。
"""
return super().batch_decode(sequences, **kwargs)
def decode(self, token_ids, **kwargs):
"""
Converts a sequence of ids into a string using the tokenizer and vocabulary,
with options to remove special tokens and clean up tokenization spaces.
Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
use_source_tokenizer (`bool`, *optional*, defaults to `False`):
Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
problems).
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`str`: The decoded sentence.
"""
return super().decode(token_ids, **kwargs)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Uses the source SentencePiece model (`spm_source`) if `_decode_use_source_tokenizer` is True,
otherwise uses the target model (`spm_target`) to convert tokens back into a string.
Args:
tokens (List[str]): List of tokens to be converted into a string.
Returns:
str: The reconstructed string from tokens.
"""
sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
current_sub_tokens = []
out_string = ""
for token in tokens:
if token in self.all_special_tokens:
out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += sp_model.decode_pieces(current_sub_tokens)
out_string = out_string.replace(SPIECE_UNDERLINE, " ")
return out_string.strip()
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""
Builds model inputs from token ids by appending `eos_token_id` to token_ids_0 or to both token_ids_0 and token_ids_1.
Args:
token_ids_0 (List[int]): List of token ids for the first sequence.
token_ids_1 (List[int], optional): List of token ids for the second sequence, if processing pairs.
Returns:
List[int]: Concatenated list of token ids with `eos_token_id` appended.
"""
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
else:
return token_ids_0 + token_ids_1 + [self.eos_token_id]
def _switch_to_input_mode(self):
"""
Sets current SentencePiece model (`current_spm`) and encoder (`current_encoder`) to use the source versions.
"""
self.current_spm = self.spm_source
self.current_encoder = self.encoder
def _switch_to_target_mode(self):
"""
Sets current SentencePiece model (`current_spm`) to use the target version (`spm_target`).
Sets current encoder (`current_encoder`) to use `target_encoder` if `separate_vocabs` is True.
"""
self.current_spm = self.spm_target
if self.separate_vocabs:
self.current_encoder = self.target_encoder
def vocab_size(self) -> int:
return len(self.encoder)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
saved_files = []
if self.separate_vocabs:
out_src_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"],
)
out_tgt_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
)
save_json(self.encoder, out_src_vocab_file)
save_json(self.target_encoder, out_tgt_vocab_file)
saved_files.append(out_src_vocab_file)
saved_files.append(out_tgt_vocab_file)
else:
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
)
save_json(self.encoder, out_vocab_file)
saved_files.append(out_vocab_file)
for spm_save_filename, spm_orig_path, spm_model in zip(
[VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
self.spm_files,
[self.spm_source, self.spm_target],
):
spm_save_path = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
)
if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
copyfile(spm_orig_path, spm_save_path)
saved_files.append(spm_save_path)
elif not os.path.isfile(spm_orig_path):
with open(spm_save_path, "wb") as fi:
content_spiece_model = spm_model.serialized_model_proto()
fi.write(content_spiece_model)
saved_files.append(spm_save_path)
return tuple(saved_files)
def get_vocab(self) -> Dict:
return self.get_src_vocab()
def get_src_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def get_tgt_vocab(self):
return dict(self.target_encoder, **self.added_tokens_decoder)
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state.update(
{k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"]}
)
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
self.current_spm = self.spm_source
self._setup_normalizer()
def num_special_tokens_to_add(self, *args, **kwargs):
"""Just EOS"""
return 1
def _special_token_mask(self, seq):
all_special_ids = set(self.all_special_ids)
all_special_ids.remove(self.unk_token_id)
return [1 if x in all_special_ids else 0 for x in seq]
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""Get list where entries are [1] if a token is [eos] or [pad] else 0."""
if already_has_special_tokens:
return self._special_token_mask(token_ids_0)
elif token_ids_1 is None:
return self._special_token_mask(token_ids_0) + [1]
else:
return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
spm.Load(path)
return spm
def save_json(data, path: str) -> None:
with open(path, "w") as f:
json.dump(data, f, indent=2)
def load_json(path: str) -> Union[Dict, List]:
with open(path, "r") as f:
return json.load(f)
.\models\marian\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_marian": ["MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarianConfig", "MarianOnnxConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_marian"] = ["MarianTokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_marian"] = [
"MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST",
"MarianForCausalLM",
"MarianModel",
"MarianMTModel",
"MarianPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_marian"] = ["TFMarianModel", "TFMarianMTModel", "TFMarianPreTrainedModel"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_marian"] = ["FlaxMarianModel", "FlaxMarianMTModel", "FlaxMarianPreTrainedModel"]
if TYPE_CHECKING:
from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig, MarianOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_marian import MarianTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_marian import (
MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST,
MarianForCausalLM,
MarianModel,
MarianMTModel,
MarianPreTrainedModel,
)
else:
from .modeling_tf_marian import TFMarianModel, TFMarianMTModel, TFMarianPreTrainedModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\markuplm\configuration_markuplm.py
"""
MarkupLM model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/config.json",
"microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/config.json",
}
class MarkupLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MarkupLMModel`]. It is used to instantiate a
MarkupLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the MarkupLM
[microsoft/markuplm-base](https://huggingface.co/microsoft/markuplm-base) architecture.
Configuration objects inherit from [`BertConfig`] and can be used to control the model outputs. Read the
documentation from [`BertConfig`] for more information.
Examples:
```
>>> from transformers import MarkupLMModel, MarkupLMConfig
>>> # Initializing a MarkupLM microsoft/markuplm-base style configuration
>>> configuration = MarkupLMConfig()
>>> # Initializing a model from the microsoft/markuplm-base style configuration
>>> model = MarkupLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "markuplm"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
bos_token_id=0,
eos_token_id=2,
max_xpath_tag_unit_embeddings=256,
max_xpath_subs_unit_embeddings=1024,
tag_pad_id=216,
subs_pad_id=1001,
xpath_unit_hidden_size=32,
max_depth=50,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.max_depth = max_depth
self.max_xpath_tag_unit_embeddings = max_xpath_tag_unit_embeddings
self.max_xpath_subs_unit_embeddings = max_xpath_subs_unit_embeddings
self.tag_pad_id = tag_pad_id
self.subs_pad_id = subs_pad_id
self.xpath_unit_hidden_size = xpath_unit_hidden_size
.\models\markuplm\feature_extraction_markuplm.py
"""
Feature extractor class for MarkupLM.
"""
import html
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...utils import is_bs4_available, logging, requires_backends
if is_bs4_available():
import bs4
from bs4 import BeautifulSoup
logger = logging.get_logger(__name__)
class MarkupLMFeatureExtractor(FeatureExtractionMixin):
r"""
Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
strings.
This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
of the main methods. Users should refer to this superclass for more information regarding those methods.
"""
def __init__(self, **kwargs):
requires_backends(self, ["bs4"])
super().__init__(**kwargs)
def xpath_soup(self, element):
xpath_tags = []
xpath_subscripts = []
child = element if element.name else element.parent
for parent in child.parents:
siblings = parent.find_all(child.name, recursive=False)
xpath_tags.append(child.name)
xpath_subscripts.append(
0 if 1 == len(siblings) else next(i for i, s in enumerate(siblings, 1) if s is child)
)
child = parent
xpath_tags.reverse()
xpath_subscripts.reverse()
return xpath_tags, xpath_subscripts
def get_three_from_single(self, html_string):
html_code = BeautifulSoup(html_string, "html.parser")
all_doc_strings = []
string2xtag_seq = []
string2xsubs_seq = []
for element in html_code.descendants:
if isinstance(element, bs4.element.NavigableString):
if type(element.parent) != bs4.element.Tag:
continue
text_in_this_tag = html.unescape(element).strip()
if not text_in_this_tag:
continue
all_doc_strings.append(text_in_this_tag)
xpath_tags, xpath_subscripts = self.xpath_soup(element)
string2xtag_seq.append(xpath_tags)
string2xsubs_seq.append(xpath_subscripts)
if len(all_doc_strings) != len(string2xtag_seq):
raise ValueError("Number of doc strings and xtags does not correspond")
if len(all_doc_strings) != len(string2xsubs_seq):
raise ValueError("Number of doc strings and xsubs does not correspond")
return all_doc_strings, string2xtag_seq, string2xsubs_seq
def construct_xpath(self, xpath_tags, xpath_subscripts):
xpath = ""
for tagname, subs in zip(xpath_tags, xpath_subscripts):
xpath += f"/{tagname}"
if subs != 0:
xpath += f"[{subs}]"
return xpath
def __call__(self, html_strings) -> BatchFeature:
"""
Main method to prepare for the model one or several HTML strings.
Args:
html_strings (`str`, `List[str]`):
The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **nodes** -- Nodes.
- **xpaths** -- Corresponding xpaths.
Examples:
```
>>> from transformers import MarkupLMFeatureExtractor
>>> page_name_1 = "page1.html"
>>> page_name_2 = "page2.html"
>>> page_name_3 = "page3.html"
>>> with open(page_name_1) as f:
... single_html_string = f.read()
>>> feature_extractor = MarkupLMFeatureExtractor()
>>> # single example
>>> encoding = feature_extractor(single_html_string)
>>> print(encoding.keys())
>>> # dict_keys(['nodes', 'xpaths'])
>>> # batched example
>>> multi_html_strings = []
>>> with open(page_name_2) as f:
... multi_html_strings.append(f.read())
>>> with open(page_name_3) as f:
... multi_html_strings.append(f.read())
>>> encoding = feature_extractor(multi_html_strings)
>>> print(encoding.keys())
>>> # dict_keys(['nodes', 'xpaths'])
```"""
valid_strings = False
if isinstance(html_strings, str):
valid_strings = True
elif isinstance(html_strings, (list, tuple)):
if len(html_strings) == 0 or isinstance(html_strings[0], str):
valid_strings = True
if not valid_strings:
raise ValueError(
"HTML strings must of type `str`, `List[str]` (batch of examples), "
f"but is of type {type(html_strings)}."
)
is_batched = bool(isinstance(html_strings, (list, tuple)) and (isinstance(html_strings[0], str)))
if not is_batched:
html_strings = [html_strings]
nodes = []
xpaths = []
for html_string in html_strings:
all_doc_strings, string2xtag_seq, string2xsubs_seq = self.get_three_from_single(html_string)
nodes.append(all_doc_strings)
xpath_strings = []
for node, tag_list, sub_list in zip(all_doc_strings, string2xtag_seq, string2xsubs_seq):
xpath_string = self.construct_xpath(tag_list, sub_list)
xpath_strings.append(xpath_string)
xpaths.append(xpath_strings)
data = {"nodes": nodes, "xpaths": xpaths}
encoded_inputs = BatchFeature(data=data, tensor_type=None)
return encoded_inputs
.\models\markuplm\modeling_markuplm.py
""" PyTorch MarkupLM 模型。"""
import math
import os
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...file_utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import logging
from .configuration_markuplm import MarkupLMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/markuplm-base"
_CONFIG_FOR_DOC = "MarkupLMConfig"
MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/markuplm-base",
"microsoft/markuplm-large",
]
class XPathEmbeddings(nn.Module):
"""Construct the embeddings from xpath tags and subscripts.
We drop tree-id in this version, as its info can be covered by xpath.
"""
def __init__(self, config):
super(XPathEmbeddings, self).__init__()
self.max_depth = config.max_depth
self.xpath_unitseq2_embeddings = nn.Linear(config.xpath_unit_hidden_size * self.max_depth, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.activation = nn.ReLU()
self.xpath_unitseq2_inner = nn.Linear(config.xpath_unit_hidden_size * self.max_depth, 4 * config.hidden_size)
self.inner2emb = nn.Linear(4 * config.hidden_size, config.hidden_size)
self.xpath_tag_sub_embeddings = nn.ModuleList(
[
nn.Embedding(config.max_xpath_tag_unit_embeddings, config.xpath_unit_hidden_size)
for _ in range(self.max_depth)
]
)
self.xpath_subs_sub_embeddings = nn.ModuleList(
[
nn.Embedding(config.max_xpath_subs_unit_embeddings, config.xpath_unit_hidden_size)
for _ in range(self.max_depth)
]
)
python
def forward(self, xpath_tags_seq=None, xpath_subs_seq=None):
xpath_tags_embeddings = []
xpath_subs_embeddings = []
for i in range(self.max_depth):
xpath_tags_embeddings.append(self.xpath_tag_sub_embeddings[i](xpath_tags_seq[:, :, i]))
xpath_subs_embeddings.append(self.xpath_subs_sub_embeddings[i](xpath_subs_seq[:, :, i]))
xpath_tags_embeddings = torch.cat(xpath_tags_embeddings, dim=-1)
xpath_subs_embeddings = torch.cat(xpath_subs_embeddings, dim=-1)
xpath_embeddings = xpath_tags_embeddings + xpath_subs_embeddings
xpath_embeddings = self.inner2emb(self.dropout(self.activation(self.xpath_unitseq2_inner(xpath_embeddings))))
return xpath_embeddings
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
class MarkupLMEmbeddings(nn.Module):
"""从词嵌入、位置嵌入和标记类型嵌入构建嵌入向量。"""
def __init__(self, config):
super(MarkupLMEmbeddings, self).__init__()
self.config = config
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.max_depth = config.max_depth
self.xpath_embeddings = XPathEmbeddings(config)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
由于我们直接提供了嵌入向量,无法推断哪些是填充的,因此直接生成顺序的位置编号。
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
def forward(
self,
input_ids=None,
xpath_tags_seq=None,
xpath_subs_seq=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
past_key_values_length=0,
):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
device = input_ids.device if input_ids is not None else inputs_embeds.device
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if xpath_tags_seq is None:
xpath_tags_seq = self.config.tag_pad_id * torch.ones(
tuple(list(input_shape) + [self.max_depth]), dtype=torch.long, device=device
)
if xpath_subs_seq is None:
xpath_subs_seq = self.config.subs_pad_id * torch.ones(
tuple(list(input_shape) + [self.max_depth]), dtype=torch.long, device=device
)
words_embeddings = inputs_embeds
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
xpath_embeddings = self.xpath_embeddings(xpath_tags_seq, xpath_subs_seq)
embeddings = words_embeddings + position_embeddings + token_type_embeddings + xpath_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MarkupLMSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MarkupLMIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MarkupLMOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MarkupLMPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class MarkupLMPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class MarkupLMLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = MarkupLMPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class MarkupLMOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = MarkupLMLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class MarkupLMSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
class MarkupLMAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = MarkupLMSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = MarkupLMSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MarkupLMLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = MarkupLMAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = MarkupLMAttention(config, position_embedding_type="absolute")
self.intermediate = MarkupLMIntermediate(config)
self.output = MarkupLMOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class MarkupLMEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([MarkupLMLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
"""
This model is a PyTorch `torch.nn.Module` sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MarkupLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
xpath_tags_seq (`torch.LongTensor` of shape `({0}, config.max_depth)`, *optional*):
xpath_subs_seq (`torch.LongTensor` of shape `({0}, config.max_depth)`, *optional*):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
[What are attention masks?](../glossary
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are token type IDs?](../glossary
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are position IDs?](../glossary
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
@add_start_docstrings(
"The bare MarkupLM Model transformer outputting raw hidden-states without any specific head on top.",
MARKUPLM_START_DOCSTRING,
)
class MarkupLMModel(MarkupLMPreTrainedModel):
# 从transformers.models.bert.modeling_bert.BertModel.__init__复制而来,将Bert改为MarkupLM
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
# 初始化模型的嵌入层
self.embeddings = MarkupLMEmbeddings(config)
# 初始化模型的编码器层
self.encoder = MarkupLMEncoder(config)
# 如果指定要添加池化层,则初始化池化层;否则设为None
self.pooler = MarkupLMPooler(config) if add_pooling_layer else None
# 初始化权重并进行最终处理
self.post_init()
# 返回输入嵌入层
def get_input_embeddings(self):
return self.embeddings.word_embeddings
# 设置输入嵌入层
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
# 剪枝模型中的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
# 从transformers.models.bert.modeling_bert.BertModel.forward复制而来
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
xpath_tags_seq: Optional[torch.LongTensor] = None,
xpath_subs_seq: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Model forward pass. See BaseModelOutputWithPoolingAndCrossAttentions for specific outputs.
"""
# 从transformers.models.bert.modeling_bert.BertModel.prepare_inputs_for_generation复制而来
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs
):
):
# 获取输入的形状
input_shape = input_ids.shape
# 如果没有提供注意力遮罩(mask),则创建全为1的注意力遮罩
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# 如果存在过去的键值对(past_key_values),则根据它进行修剪输入的decoder_input_ids
if past_key_values is not None:
# 获取过去键值对的长度
past_length = past_key_values[0][0].shape[2]
# 一些生成方法已经只传递了最后一个输入ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认保留旧行为:仅保留最后一个ID
remove_prefix_length = input_ids.shape[1] - 1
# 对输入进行修剪,仅保留后缀部分
input_ids = input_ids[:, remove_prefix_length:]
# 返回一个包含重排后的缓存信息的字典
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
# 从transformers.models.bert.modeling_bert.BertModel._reorder_cache复制而来
def _reorder_cache(self, past_key_values, beam_idx):
# 重新排序过去的键值对
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
# 对每一层的过去状态根据beam_idx重新排序并添加到元组中
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""
MarkupLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MARKUPLM_START_DOCSTRING,
)
class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel):
# 从 transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ 复制而来,将 bert->markuplm, Bert->MarkupLM
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.markuplm = MarkupLMModel(config, add_pooling_layer=False) # 初始化 MarkupLMModel 模型
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) # 初始化用于 QA 输出的线性层
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
xpath_tags_seq: Optional[torch.Tensor] = None,
xpath_subs_seq: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""MarkupLM Model with a `token_classification` head on top.""",
MARKUPLM_START_DOCSTRING
)
class MarkupLMForTokenClassification(MarkupLMPreTrainedModel):
# 从 transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ 复制而来,将 bert->markuplm, Bert->MarkupLM
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.markuplm = MarkupLMModel(config, add_pooling_layer=False) # 初始化 MarkupLMModel 模型
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout) # 初始化 dropout 层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 初始化用于分类的线性层
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
# 定义一个方法 `forward`,用于模型的前向传播
def forward(
# 输入的词索引序列,可以是张量或者空值
self,
input_ids: Optional[torch.Tensor] = None,
# XPath 标签序列,可以是张量或者空值
xpath_tags_seq: Optional[torch.Tensor] = None,
# XPath 子句序列,可以是张量或者空值
xpath_subs_seq: Optional[torch.Tensor] = None,
# 注意力掩码,可以是张量或者空值
attention_mask: Optional[torch.Tensor] = None,
# 标记类型 ID,可以是张量或者空值
token_type_ids: Optional[torch.Tensor] = None,
# 位置 ID,可以是张量或者空值
position_ids: Optional[torch.Tensor] = None,
# 头部掩码,可以是张量或者空值
head_mask: Optional[torch.Tensor] = None,
# 输入的嵌入表示,可以是张量或者空值
inputs_embeds: Optional[torch.Tensor] = None,
# 标签,可以是张量或者空值
labels: Optional[torch.Tensor] = None,
# 是否输出注意力权重,默认为 None
output_attentions: Optional[bool] = None,
# 是否输出隐藏状态,默认为 None
output_hidden_states: Optional[bool] = None,
# 是否以字典形式返回输出,默认为 None
return_dict: Optional[bool] = None,
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 return_dict 参数不为 None,则使用该参数;否则使用 self.config.use_return_dict
outputs = self.markuplm(
input_ids,
xpath_tags_seq=xpath_tags_seq,
xpath_subs_seq=xpath_subs_seq,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 调用 self.markuplm 进行模型推理,传入各种参数如 input_ids, xpath_tags_seq 等,根据 return_dict 返回不同的结果格式
sequence_output = outputs[0]
# 获取模型输出中的序列输出
prediction_scores = self.classifier(sequence_output) # (batch_size, seq_length, node_type_size)
# 使用 self.classifier 对序列输出进行分类预测,得到预测分数,形状为 (batch_size, seq_length, node_type_size)
loss = None
if labels is not None:
# 如果提供了标签信息
loss_fct = CrossEntropyLoss()
# 使用交叉熵损失函数
loss = loss_fct(
prediction_scores.view(-1, self.config.num_labels),
labels.view(-1),
)
# 计算预测分数和标签之间的损失值
if not return_dict:
# 如果 return_dict 为 False
output = (prediction_scores,) + outputs[2:]
# 构建输出元组,包含预测分数和额外的输出内容
return ((loss,) + output) if loss is not None else output
# 如果有损失值,返回损失和输出内容;否则只返回输出内容
return TokenClassifierOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 如果 return_dict 为 True,以 TokenClassifierOutput 形式返回结果,包括损失、预测分数、隐藏状态和注意力权重
"""
在标记LM模型的基础上增加一个顶部的序列分类/回归头部(即在池化输出之上的线性层),例如用于GLUE任务。
"""
@add_start_docstrings(
"""
在标记LM模型的基础上增加一个顶部的序列分类/回归头部(即在池化输出之上的线性层),例如用于GLUE任务。
""",
MARKUPLM_START_DOCSTRING,
)
class MarkupLMForSequenceClassification(MarkupLMPreTrainedModel):
"""
从transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__复制而来,将bert->markuplm, Bert->MarkupLM。
"""
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels # 从配置中获取标签数量
self.config = config # 存储配置信息
self.markuplm = MarkupLMModel(config) # 初始化标记LM模型
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout) # 定义dropout层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 定义线性分类器
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
xpath_tags_seq: Optional[torch.Tensor] = None,
xpath_subs_seq: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
将输入的多个张量传递给MarkupLM模型以进行前向传播,支持多种输入和输出设置。
"""
**kwargs,
) -> SequenceClassifierOutput:
pass # 实际的前向传播逻辑在这里未展示,需要根据实际情况填充
.\models\markuplm\processing_markuplm.py
"""
Processor class for MarkupLM.
"""
from typing import Optional, Union
from ...file_utils import TensorType
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy
class MarkupLMProcessor(ProcessorMixin):
r"""
Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
processor.
[`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.
It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
`input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.
Args:
feature_extractor (`MarkupLMFeatureExtractor`):
An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
parse_html (`bool`, *optional*, defaults to `True`):
Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
"""
feature_extractor_class = "MarkupLMFeatureExtractor"
tokenizer_class = ("MarkupLMTokenizer", "MarkupLMTokenizerFast")
parse_html = True
def __call__(
self,
html_strings=None,
nodes=None,
xpaths=None,
node_labels=None,
questions=None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
returns the output.
Optionally, one can also provide a `text` argument which is passed along as first sequence.
Please refer to the docstring of the above two methods for more information.
"""
if self.parse_html:
if html_strings is None:
raise ValueError("Make sure to pass HTML strings in case `parse_html` is set to `True`")
if nodes is not None or xpaths is not None or node_labels is not None:
raise ValueError(
"Please don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`"
)
features = self.feature_extractor(html_strings)
nodes = features["nodes"]
xpaths = features["xpaths"]
else:
if html_strings is not None:
raise ValueError("You have passed HTML strings but `parse_html` is set to `False`.")
if nodes is None or xpaths is None:
raise ValueError("Make sure to pass nodes and xpaths in case `parse_html` is set to `False`")
if questions is not None and self.parse_html:
if isinstance(questions, str):
questions = [questions]
encoded_inputs = self.tokenizer(
text=questions if questions is not None else nodes,
text_pair=nodes if questions is not None else None,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return encoded_inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
return tokenizer_input_names
.\models\markuplm\tokenization_markuplm.py
"""MarkupLM 的标记化类。"""
import json
import os
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
import regex as re
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/vocab.json",
"microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/vocab.json",
},
"merges_file": {
"microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
"microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/markuplm-base": 512,
"microsoft/markuplm-large": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其对应的 Unicode 字符映射。
避免映射到空白字符和控制字符,以免在 bpe 编码中出错。
可逆的 bpe 编码适用于 Unicode 字符串,因此如果要避免 UNKs,
则需要在词汇表中包含大量的 Unicode 字符。
例如,对于大约 100 亿个令牌的数据集,您需要大约 5000 个 Unicode 字符以确保良好的覆盖率。
这相当于您正常使用的 32K bpe 词汇表的显著百分比。
为了避免这种情况,我们需要 utf-8 字节和 Unicode 字符串之间的查找表。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
# 返回单词中所有相邻字符对组成的集合。这里单词被表示为由符号元组构成(符号是长度可变的字符串)。
def get_symbol_pairs(word):
# 初始化一个空集合来存放符号对
pairs = set()
# 从单词的第一个符号开始迭代到最后一个符号
prev_char = word[0] # 获取单词的第一个符号作为前一个符号
for char in word[1:]: # 从第二个符号开始迭代到最后一个符号
# 将前一个符号和当前符号组成一个符号对,并添加到集合中
pairs.add((prev_char, char))
# 更新前一个符号为当前符号,以便下一次迭代使用
prev_char = char
# 返回包含所有符号对的集合
return pairs
class MarkupLMTokenizer(PreTrainedTokenizer):
r"""
Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). [`MarkupLMTokenizer`] can be used to
turn HTML strings into to token-level `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and
`xpath_tags_seq`. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
tags_dict,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
max_depth=50,
max_width=1000,
pad_width=1001,
pad_token_label=-100,
only_label_first_subword=True,
**kwargs,
):
...
def get_xpath_seq(self, xpath):
"""
根据给定的 xpath 表达式(如 "/html/body/div/li[1]/div/span[2]"),返回标签 ID 和对应的下标列表,考虑最大深度限制。
"""
xpath_tags_list = []
xpath_subs_list = []
xpath_units = xpath.split("/")
for unit in xpath_units:
if not unit.strip():
continue
name_subs = unit.strip().split("[")
tag_name = name_subs[0]
sub = 0 if len(name_subs) == 1 else int(name_subs[1][:-1])
xpath_tags_list.append(self.tags_dict.get(tag_name, self.unk_tag_id))
xpath_subs_list.append(min(self.max_width, sub))
xpath_tags_list = xpath_tags_list[: self.max_depth]
xpath_subs_list = xpath_subs_list[: self.max_depth]
xpath_tags_list += [self.pad_tag_id] * (self.max_depth - len(xpath_tags_list))
xpath_subs_list += [self.pad_width] * (self.max_depth - len(xpath_subs_list))
return xpath_tags_list, xpath_subs_list
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
logger.warning(
"MarkupLM now does not support generative tasks, decoding is experimental and subject to change."
)
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def build_xpath_tags_with_special_tokens(
self, xpath_tags_0: List[int], xpath_tags_1: Optional[List[int]] = None
):
) -> List[int]:
pad = [self.pad_xpath_tags_seq]
if len(xpath_tags_1) == 0:
return pad + xpath_tags_0 + pad
return pad + xpath_tags_0 + pad + xpath_tags_1 + pad
def build_xpath_subs_with_special_tokens(
self, xpath_subs_0: List[int], xpath_subs_1: Optional[List[int]] = None
) -> List[int]:
pad = [self.pad_xpath_subs_seq]
if len(xpath_subs_1) == 0:
return pad + xpath_subs_0 + pad
return pad + xpath_subs_0 + pad + xpath_subs_1 + pad
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Args:
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
xpaths: Union[List[List[int]], List[List[List[int]]]] = None,
node_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
xpaths: Optional[List[List[List[int]]]] = None,
node_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
xpaths: Optional[List[List[List[int]]]] = None,
node_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
batch_outputs = self._batch_prepare_for_model(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
self,
batch_text_or_text_pairs,
is_pair: bool = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
batch_outputs = {}
for idx, example in enumerate(zip(batch_text_or_text_pairs, xpaths)):
batch_text_or_text_pair, xpaths_example = example
outputs = self.prepare_for_model(
batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
batch_text_or_text_pair[1] if is_pair else None,
xpaths_example,
node_labels=node_labels[idx] if node_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None,
return_attention_mask=False,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None,
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> List[int]:
encoded_inputs = self.encode_plus(
text=text,
text_pair=text_pair,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
return encoded_inputs["input_ids"]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (nodes of a single example) or a
list of list of strings (nodes of a batch of examples).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
text=text,
xpaths=xpaths,
text_pair=text_pair,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
return self.prepare_for_model(
text=text,
text_pair=text_pair,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
def truncate_sequences(
self,
ids: List[int],
xpath_tags_seq: List[List[int]],
xpath_subs_seq: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_xpath_tags_seq: Optional[List[List[int]]] = None,
pair_xpath_subs_seq: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
def _pad(
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
.\models\markuplm\tokenization_markuplm_fast.py
"""
MarkupLM 的快速标记类。它重写了慢速标记器类的两个方法,即 _batch_encode_plus 和 _encode_plus,
在这些方法中使用了 Rust 标记器。
"""
import json
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import pre_tokenizers, processors
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
AddedToken,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, MarkupLMTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/vocab.json",
"microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/vocab.json",
},
"merges_file": {
"microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
"microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/markuplm-base": 512,
"microsoft/markuplm-large": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其映射到 unicode 字符串的映射表。我们特别避免映射到空格或控制字符,以免在 bpe 编码时出错。
可逆的 bpe 编码适用于 unicode 字符串。这意味着如果您希望避免 UNKs,您需要在词汇中包含大量的 unicode 字符。
当您处理类似 10B 令牌数据集时,您可能需要约 5K 个 unicode 字符以获得良好的覆盖率。
这相当于正常 32K bpe 词汇表的显著比例。为了避免这种情况,我们希望在 utf-8 字节和 unicode 字符串之间建立查找表。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
[`MarkupLMTokenizerFast`] can be used to turn HTML strings into to token-level `input_ids`, `attention_mask`,
`token_type_ids`, `xpath_tags_seq` and `xpath_tags_seq`. This tokenizer inherits from [`PreTrainedTokenizer`] which
contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
merges_file (`str`):
errors (`str`, *optional*, defaults to `"replace"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
eos_token (`str`, *optional*, defaults to `"</s>"`):
sep_token (`str`, *optional*, defaults to `"</s>"`):
cls_token (`str`, *optional*, defaults to `"<s>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
add_prefix_space (`bool`, *optional*, defaults to `False`):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = MarkupLMTokenizer
def __init__(
self,
vocab_file,
merges_file,
tags_dict,
tokenizer_file=None,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
max_depth=50,
max_width=1000,
pad_width=1001,
pad_token_label=-100,
only_label_first_subword=True,
trim_offsets=False,
**kwargs,
):
"""
Initialize the class with required and optional parameters for tokenization and tagging.
Args:
vocab_file (str): Path to vocabulary file.
merges_file (str): Path to merges file for tokenization.
tags_dict (dict): Dictionary mapping tag names to IDs.
tokenizer_file (str, optional): Path to tokenizer file. Defaults to None.
errors (str, optional): Error handling method during tokenization. Defaults to "replace".
bos_token (str, optional): Beginning of sequence token. Defaults to "<s>".
eos_token (str, optional): End of sequence token. Defaults to "</s>".
sep_token (str, optional): Separator token. Defaults to "</s>".
cls_token (str, optional): Classification token. Defaults to "<s>".
unk_token (str, optional): Unknown token. Defaults to "<unk>".
pad_token (str, optional): Padding token. Defaults to "<pad>".
mask_token (str, optional): Mask token. Defaults to "<mask>".
add_prefix_space (bool, optional): Whether to add prefix space during tokenization. Defaults to False.
max_depth (int, optional): Maximum depth for XPath processing. Defaults to 50.
max_width (int, optional): Maximum width for XPath processing. Defaults to 1000.
pad_width (int, optional): Padding width for XPath processing. Defaults to 1001.
pad_token_label (int, optional): Padding token label for subword tagging. Defaults to -100.
only_label_first_subword (bool, optional): Whether to label only the first subword. Defaults to True.
trim_offsets (bool, optional): Whether to trim offsets. Defaults to False.
**kwargs: Additional keyword arguments.
"""
pass
def get_xpath_seq(self, xpath):
"""
Given the xpath expression of one particular node (like "/html/body/div/li[1]/div/span[2]"), return a list of
tag IDs and corresponding subscripts, taking into account max depth.
"""
xpath_tags_list = []
xpath_subs_list = []
xpath_units = xpath.split("/")
for unit in xpath_units:
if not unit.strip():
continue
name_subs = unit.strip().split("[")
tag_name = name_subs[0]
sub = 0 if len(name_subs) == 1 else int(name_subs[1][:-1])
xpath_tags_list.append(self.tags_dict.get(tag_name, self.unk_tag_id))
xpath_subs_list.append(min(self.max_width, sub))
xpath_tags_list = xpath_tags_list[: self.max_depth]
xpath_subs_list = xpath_subs_list[: self.max_depth]
xpath_tags_list += [self.pad_tag_id] * (self.max_depth - len(xpath_tags_list))
xpath_subs_list += [self.pad_width] * (self.max_depth - len(xpath_subs_list))
return xpath_tags_list, xpath_subs_list
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
xpaths: Union[List[List[int]], List[List[List[int]]]] = None,
node_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
"""
Encode the input text(s) along with associated parameters into token IDs, token type IDs, and attention masks.
Args:
text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]): Input text or texts.
text_pair (Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]], optional): Second input text or texts. Defaults to None.
xpaths (Union[List[List[int]], List[List[List[int]]]], optional): List of XPath sequences. Defaults to None.
node_labels (Optional[Union[List[int], List[List[int]]]], optional): Node labels corresponding to XPaths. Defaults to None.
add_special_tokens (bool, optional): Whether to add special tokens. Defaults to True.
padding (Union[bool, str, PaddingStrategy], optional): Padding strategy or maximum length for padding. Defaults to False.
truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy or maximum length for truncation. Defaults to None.
max_length (Optional[int], optional): Maximum length of the returned sequences. Defaults to None.
stride (int, optional): Stride for overflowing tokens. Defaults to 0.
pad_to_multiple_of (Optional[int], optional): Pad to a multiple of specified value. Defaults to None.
return_tensors (Optional[Union[str, TensorType]], optional): Type of tensors to return. Defaults to None.
return_token_type_ids (Optional[bool], optional): Whether to return token type IDs. Defaults to None.
return_attention_mask (Optional[bool], optional): Whether to return attention mask. Defaults to None.
return_overflowing_tokens (bool, optional): Whether to return overflowing tokens. Defaults to False.
return_special_tokens_mask (bool, optional): Whether to return special tokens mask. Defaults to False.
return_offsets_mapping (bool, optional): Whether to return offsets mapping. Defaults to False.
return_length (bool, optional): Whether to return length of the encoded sequence. Defaults to False.
verbose (bool, optional): Whether to output verbose information. Defaults to True.
**kwargs: Additional keyword arguments.
"""
pass
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
xpaths: Optional[List[List[List[int]]]] = None,
node_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
xpaths=xpaths,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
batched_input = [(text, pair)] if pair else [text]
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
return encodings[0].tokens
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
text=text,
xpaths=xpaths,
text_pair=text_pair,
node_labels=node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
xpaths: Optional[List[List[List[int]]]] = None,
node_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
):
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
xpaths: Optional[List[List[int]]] = None,
node_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
) -> BatchEncoding:
batched_input = [(text, text_pair)] if text_pair else [text]
batched_xpaths = [xpaths]
batched_node_labels = [node_labels] if node_labels is not None else None
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),
xpaths=batched_xpaths,
node_labels=batched_node_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
pass
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros representing token type ids (not used in RoBERTa).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to a directory.
Args:
save_directory (str):
Directory where the vocabulary files will be saved.
filename_prefix (str, *optional*):
Optional prefix for the saved files.
Returns:
Tuple[str]: Tuple containing the saved file paths.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\markuplm\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_markuplm": ["MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarkupLMConfig"],
"feature_extraction_markuplm": ["MarkupLMFeatureExtractor"],
"processing_markuplm": ["MarkupLMProcessor"],
"tokenization_markuplm": ["MarkupLMTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_markuplm_fast"] = ["MarkupLMTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_markuplm"] = [
"MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST",
"MarkupLMForQuestionAnswering",
"MarkupLMForSequenceClassification",
"MarkupLMForTokenClassification",
"MarkupLMModel",
"MarkupLMPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_markuplm import MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP, MarkupLMConfig
from .feature_extraction_markuplm import MarkupLMFeatureExtractor
from .processing_markuplm import MarkupLMProcessor
from .tokenization_markuplm import MarkupLMTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_markuplm_fast import MarkupLMTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_markuplm import (
MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST,
MarkupLMForQuestionAnswering,
MarkupLMForSequenceClassification,
MarkupLMForTokenClassification,
MarkupLMModel,
MarkupLMPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\mask2former\configuration_mask2former.py
""" Mask2Former 模型配置"""
from typing import Dict, List, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/mask2former-swin-small-coco-instance": (
"https://huggingface.co/facebook/mask2former-swin-small-coco-instance/blob/main/config.json"
)
}
logger = logging.get_logger(__name__)
class Mask2FormerConfig(PretrainedConfig):
r"""
这是用于存储 [`Mask2FormerModel`] 配置的配置类。根据指定的参数实例化 Mask2Former 模型,定义模型架构。
使用默认参数实例化配置将生成类似于 Mask2Former [facebook/mask2former-swin-small-coco-instance]
(https://huggingface.co/facebook/mask2former-swin-small-coco-instance) 架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。
当前,Mask2Former 仅支持 [Swin Transformer](swin) 作为主干。
示例:
```
>>> from transformers import Mask2FormerConfig, Mask2FormerModel
>>> # 初始化 Mask2Former facebook/mask2former-swin-small-coco-instance 配置
>>> configuration = Mask2FormerConfig()
>>> # 使用配置初始化模型(带有随机权重),使用 facebook/mask2former-swin-small-coco-instance 风格的配置
>>> model = Mask2FormerModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "mask2former"
backbones_supported = ["swin"]
attribute_map = {"hidden_size": "hidden_dim"}
@classmethod
def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
"""从预训练的骨干模型配置中实例化一个 [`Mask2FormerConfig`](或其派生类)对象。
Args:
backbone_config ([`PretrainedConfig`]):
骨干模型的配置对象。
Returns:
[`Mask2FormerConfig`]: 返回一个配置对象的实例
"""
return cls(
backbone_config=backbone_config,
**kwargs,
)
.\models\mask2former\convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
import json
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from pathlib import Path
from pprint import pformat
from typing import Any, Dict, Iterator, List, Set, Tuple
import requests
import torch
import torchvision.transforms as T
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from huggingface_hub import hf_hub_download
from PIL import Image
from torch import Tensor, nn
from transformers import (
Mask2FormerConfig,
Mask2FormerForUniversalSegmentation,
Mask2FormerImageProcessor,
Mask2FormerModel,
SwinConfig,
)
from transformers.models.mask2former.modeling_mask2former import (
Mask2FormerForUniversalSegmentationOutput,
Mask2FormerModelOutput,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger()
torch.manual_seed(0)
class TrackedStateDict:
def __init__(self, to_track: Dict):
"""This class "tracks" a python dictionary by keeping track of which item is accessed.
Args:
to_track (Dict): The dictionary we wish to track
"""
self.to_track = to_track
self._seen: Set[str] = set()
def __getitem__(self, key: str) -> Any:
return self.to_track[key]
def __setitem__(self, key: str, item: Any):
self._seen.add(key)
self.to_track[key] = item
def diff(self) -> List[str]:
"""This method returns a set difference between the keys in the tracked state dict and the one we have access so far.
This is an effective method to check if we have update all the keys
Returns:
List[str]: List of keys not yet updated
"""
return set(self.to_track.keys()) - self._seen
def copy(self) -> Dict:
return self.to_track.copy()
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
img_data = requests.get(url, stream=True).raw
im = Image.open(img_data)
return im
@dataclass
class Args:
"""Fake command line arguments needed by mask2former/detectron implementation"""
config_file: str
def setup_cfg(args: Args):
cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.freeze()
return cfg
class OriginalMask2FormerConfigToOursConverter:
class OriginalMask2FormerConfigToImageProcessorConverter:
def __call__(self, original_config: object) -> Mask2FormerImageProcessor:
model = original_config.MODEL
model_input = original_config.INPUT
return Mask2FormerImageProcessor(
image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(),
image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(),
size=model_input.MIN_SIZE_TEST,
max_size=model_input.MAX_SIZE_TEST,
num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
ignore_index=model.SEM_SEG_HEAD.IGNORE_VALUE,
size_divisibility=32,
)
class OriginalMask2FormerCheckpointToOursConverter:
def __init__(self, original_model: nn.Module, config: Mask2FormerConfig):
self.original_model = original_model
self.config = config
def pop_all(self, renamed_keys: List[Tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict):
for src_key, dst_key in renamed_keys:
dst_state_dict[dst_key] = src_state_dict.pop(src_key)
def replace_maskformer_swin_backbone(
self, dst_state_dict: StateDict, src_state_dict: StateDict, config: Mask2FormerConfig
):
dst_prefix: str = "transformer_module.decoder"
src_prefix: str = "sem_seg_head.predictor"
renamed_keys = self.rename_keys_in_masked_attention_decoder(dst_state_dict, src_state_dict)
renamed_keys.extend(
[
(f"{src_prefix}.decoder_norm.weight", f"{dst_prefix}.layernorm.weight"),
(f"{src_prefix}.decoder_norm.bias", f"{dst_prefix}.layernorm.bias"),
]
)
mlp_len = 3
for i in range(mlp_len):
renamed_keys.extend(
[
(
f"{src_prefix}.mask_embed.layers.{i}.weight",
f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.weight",
),
(
f"{src_prefix}.mask_embed.layers.{i}.bias",
f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.bias",
),
]
)
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def replace_keys_qkv_transformer_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "transformer_module.decoder.layers"
src_prefix: str = "sem_seg_head.predictor"
for i in range(self.config.decoder_layers - 1):
in_proj_weight = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_weight"
)
in_proj_bias = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_bias"
)
dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
def replace_transformer_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "transformer_module"
src_prefix: str = "sem_seg_head.predictor"
self.replace_masked_attention_decoder(dst_state_dict, src_state_dict)
renamed_keys = [
(f"{src_prefix}.query_embed.weight", f"{dst_prefix}.queries_embedder.weight"),
(f"{src_prefix}.query_feat.weight", f"{dst_prefix}.queries_features.weight"),
(f"{src_prefix}.level_embed.weight", f"{dst_prefix}.level_embed.weight"),
]
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
self.replace_keys_qkv_transformer_decoder(dst_state_dict, src_state_dict)
def replace_universal_segmentation_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = ""
src_prefix: str = "sem_seg_head.predictor"
renamed_keys = [
(f"{src_prefix}.class_embed.weight", f"{dst_prefix}class_predictor.weight"),
(f"{src_prefix}.class_embed.bias", f"{dst_prefix}class_predictor.bias"),
]
logger.info(f"Replacing keys {pformat(renamed_keys)}")
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
dst_state_dict = TrackedStateDict(mask2former.state_dict())
src_state_dict = self.original_model.state_dict()
self.replace_pixel_module(dst_state_dict, src_state_dict)
self.replace_transformer_module(dst_state_dict, src_state_dict)
logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}")
logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
logger.info("🙌 Done")
state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
mask2former.load_state_dict(state_dict)
return mask2former
def convert_universal_segmentation(
self, mask2former: Mask2FormerForUniversalSegmentation
) -> Mask2FormerForUniversalSegmentation:
dst_state_dict = TrackedStateDict(mask2former.state_dict())
src_state_dict = self.original_model.state_dict()
self.replace_universal_segmentation_module(dst_state_dict, src_state_dict)
state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
mask2former.load_state_dict(state_dict)
return mask2former
@staticmethod
def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]:
checkpoints: List[Path] = checkpoints_dir.glob("**/*.pkl")
for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
dataset_name = checkpoint.parents[2].stem
if dataset_name == "ade":
dataset_name = dataset_name.replace("ade", "ade20k")
segmentation_task = checkpoint.parents[1].stem
config_file_name = f"{checkpoint.parents[0].stem}.yaml"
config: Path = config_dir / dataset_name / segmentation_task / "swin" / config_file_name
yield config, checkpoint
def test(
original_model,
our_model: Mask2FormerForUniversalSegmentation,
image_processor: Mask2FormerImageProcessor,
tolerance: float,
):
with torch.no_grad():
original_model = original_model.eval()
our_model = our_model.eval()
im = prepare_img()
x = image_processor(images=im, return_tensors="pt")["pixel_values"]
original_model_backbone_features = original_model.backbone(x.clone())
our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=tolerance
), "The backbone features are not the same."
mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features(
original_model_backbone_features
)
for original_model_feature, our_model_feature in zip(
multi_scale_features, our_model_output.pixel_decoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=tolerance
), "The pixel decoder feature are not the same"
tr_complete = T.Compose(
[T.Resize((384, 384)), T.ToTensor()],
)
y = (tr_complete(im) * 255.0).to(torch.int).float()
original_class_logits, original_mask_logits = original_model([{"image": y.clone().squeeze(0)}])
our_model_out: Mask2FormerForUniversalSegmentationOutput = our_model(x.clone())
our_mask_logits = our_model_out.masks_queries_logits
our_class_logits = our_model_out.class_queries_logits
assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching."
assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching."
assert torch.allclose(
original_class_logits, our_class_logits, atol=tolerance
), "The class logits are not the same."
assert torch.allclose(
original_mask_logits, our_mask_logits, atol=tolerance
), "The predicted masks are not the same."
logger.info("✅ Test passed!")
def get_model_name(checkpoint_file: Path):
model_name_raw: str = checkpoint_file.parents[0].stem
segmentation_task_name: str = checkpoint_file.parents[1].stem
if segmentation_task_name not in ["instance-segmentation", "panoptic-segmentation", "semantic-segmentation"]:
raise ValueError(
f"{segmentation_task_name} must be wrong since acceptable values are: instance-segmentation,"
" panoptic-segmentation, semantic-segmentation."
)
dataset_name: str = checkpoint_file.parents[2].stem
if dataset_name not in ["coco", "ade", "cityscapes", "mapillary-vistas"]:
raise ValueError(
f"{dataset_name} must be wrong since we didn't find 'coco' or 'ade' or 'cityscapes' or 'mapillary-vistas'"
" in it "
)
backbone = "swin"
backbone_types = ["tiny", "small", "base_IN21k", "base", "large"]
backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0].replace("_", "-")
model_name = f"mask2former-{backbone}-{backbone_type}-{dataset_name}-{segmentation_task_name.split('-')[0]}"
return model_name
if __name__ == "__main__":
parser = ArgumentParser(
description="Command line to convert the original mask2formers (with swin backbone) to our implementations."
)
parser.add_argument(
"--checkpoints_dir",
type=Path,
help=(
"A directory containing the model's checkpoints. The directory has to have the following structure:"
" <DIR_NAME>/<DATASET_NAME>/<SEGMENTATION_TASK_NAME>/<CONFIG_NAME>.pkl"
),
)
parser.add_argument(
"--configs_dir",
type=Path,
help=(
"A directory containing the model's configs, see detectron2 doc. The directory has to have the following"
" structure: <DIR_NAME>/<DATASET_NAME>/<SEGMENTATION_TASK_NAME>/<CONFIG_NAME>.yaml"
),
)
parser.add_argument(
"--mask2former_dir",
required=True,
type=Path,
help=(
"A path to Mask2Former's original implementation directory. You can download from here:"
" https://github.com/facebookresearch/Mask2Former"
),
)
args = parser.parse_args()
checkpoints_dir: Path = args.checkpoints_dir
config_dir: Path = args.configs_dir
mask2former_dir: Path = args.mask2former_dir
sys.path.append(str(mask2former_dir.parent))
from Mask2Former.mask2former.config import add_maskformer2_config
from Mask2Former.mask2former.maskformer_model import MaskFormer as OriginalMask2Former
for config_file, checkpoint_file in OriginalMask2FormerCheckpointToOursConverter.using_dirs(
checkpoints_dir, config_dir
):
model_name = get_model_name(checkpoint_file)
image_processor = OriginalMask2FormerConfigToImageProcessorConverter()(
setup_cfg(Args(config_file=config_file))
)
image_processor.size = {"height": 384, "width": 384}
original_config = setup_cfg(Args(config_file=config_file))
mask2former_kwargs = OriginalMask2Former.from_config(original_config)
original_model = OriginalMask2Former(**mask2former_kwargs).eval()
DetectionCheckpointer(original_model).load(str(checkpoint_file))
config: Mask2FormerConfig = OriginalMask2FormerConfigToOursConverter()(original_config)
mask2former = Mask2FormerModel(config=config).eval()
converter = OriginalMask2FormerCheckpointToOursConverter(original_model, config)
mask2former = converter.convert(mask2former)
mask2former_for_segmentation = Mask2FormerForUniversalSegmentation(config=config).eval()
mask2former_for_segmentation.model = mask2former
mask2former_for_segmentation = converter.convert_universal_segmentation(mask2former_for_segmentation)
tolerance = 3e-1
high_tolerance_models = [
"mask2former-swin-base-IN21k-coco-instance",
"mask2former-swin-base-coco-instance",
"mask2former-swin-small-cityscapes-semantic",
]
if model_name in high_tolerance_models:
tolerance = 3e-1
logger.info(f"🪄 Testing {model_name}...")
test(original_model, mask2former_for_segmentation, image_processor, tolerance)
logger.info(f"🪄 Pushing {model_name} to hub...")
image_processor.push_to_hub(model_name)
mask2former_for_segmentation.push_to_hub(model_name)
.\models\mask2former\image_processing_mask2former.py
"""Mask2Former 的图像处理器类。"""
import math
import warnings
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
PaddingMode,
get_resize_output_image_size,
pad,
rescale,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_batched,
is_scaled_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
TensorType,
is_torch_available,
is_torch_tensor,
logging,
)
logger = logging.get_logger(__name__)
if is_torch_available():
import torch
from torch import nn
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
返回可迭代值的所有索引的最大值。
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
获取批处理中所有图像的最大高度和宽度。
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
创建像素掩码。
"""
Args:
image (`np.ndarray`):
要创建像素掩码的图像。
output_size (`Tuple[int, int]`):
掩码的输出尺寸。
"""
# 获取图像的高度和宽度,根据输入数据格式中的通道维度确定。
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# 创建一个指定输出尺寸的零矩阵作为掩码,数据类型为64位整数。
mask = np.zeros(output_size, dtype=np.int64)
# 将掩码中图像实际像素部分(未填充部分)置为1。
mask[:input_height, :input_width] = 1
# 返回生成的像素掩码。
return mask
# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
def check_segment_validity(segmentation, area_threshold=2):
"""
Checks the validity of each segment in the segmentation map based on area threshold.
Args:
segmentation (`torch.Tensor` or `numpy.array`):
A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
area_threshold (`int`, optional):
Minimum area threshold for valid segments. Defaults to 2.
Returns:
`List[int]`: List of valid segment indices.
"""
segment_ids = torch.unique(segmentation)
valid_segments = []
for idx in segment_ids:
mask = torch.where(segmentation == idx, 1, 0)
area = mask.sum().item()
if area >= area_threshold:
valid_segments.append(idx.item())
return valid_segments
# 将分割地图转换为二进制掩码集合
def convert_segmentation_map_to_binary_masks(
segmentation_map,
segments,
num_classes,
):
# 初始化二进制掩码列表,每个类别一个列表
binary_masks = [[] for _ in range(num_classes)]
# 遍历所有分割对象
for segment in segments:
segment_id = segment["id"]
label_id = segment["label_id"]
# 创建当前对象的二进制掩码
binary_mask = (segmentation_map == segment_id).int()
binary_masks[label_id].append(binary_mask)
# 将每个类别的掩码列表转换为张量
binary_masks = [torch.stack(masks) if masks else torch.empty(0, dtype=torch.int32) for masks in binary_masks]
return binary_masks
# segmentation_map: "np.ndarray",声明了一个变量 segmentation_map,类型为 np.ndarray
# instance_id_to_semantic_id: Optional[Dict[int, int]] = None,声明了一个可选参数 instance_id_to_semantic_id,类型为 Optional[Dict[int, int]],默认为 None
# ignore_index: Optional[int] = None,声明了一个可选参数 ignore_index,类型为 Optional[int],默认为 None
# reduce_labels: bool = False,声明了一个布尔型参数 reduce_labels,初始化为 False
):
# 如果要减少标签并且没有提供 ignore_index,则抛出值错误异常
if reduce_labels and ignore_index is None:
raise ValueError("If `reduce_labels` is True, `ignore_index` must be provided.")
# 如果要减少标签,则将标记地图中的所有零值替换为 ignore_index,其他值减一
if reduce_labels:
segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1)
# 获取标记地图中的所有唯一标签
all_labels = np.unique(segmentation_map)
# 如果指定了 ignore_index,则从所有标签中移除背景标签
if ignore_index is not None:
all_labels = all_labels[all_labels != ignore_index]
# 为每个对象实例生成二进制掩码
binary_masks = [(segmentation_map == i) for i in all_labels]
binary_masks = np.stack(binary_masks, axis=0) # (num_labels, height, width)
# 如果存在 instance_id_to_semantic_id 映射,则将实例 ID 转换为类别 ID
if instance_id_to_semantic_id is not None:
labels = np.zeros(all_labels.shape[0])
# 遍历所有标签,根据映射表将实例 ID 转换为类别 ID
for label in all_labels:
class_id = instance_id_to_semantic_id[label + 1 if reduce_labels else label]
labels[all_labels == label] = class_id - 1 if reduce_labels else class_id
else:
# 否则直接使用所有标签作为类别标签
labels = all_labels
# 返回二进制掩码和类别标签,分别转换为 float32 和 int64 类型
return binary_masks.astype(np.float32), labels.astype(np.int64)
# 从 transformers.models.maskformer.image_processing_maskformer.get_maskformer_resize_output_image_size 复制,并将 maskformer 改为 mask2former
def get_mask2former_resize_output_image_size(
image: np.ndarray,
size: Union[int, Tuple[int, int], List[int], Tuple[int]],
max_size: Optional[int] = None,
size_divisor: int = 0,
default_to_square: bool = True,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
计算给定所需大小的输出图像大小。
Args:
image (`np.ndarray`):
输入图像。
size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
输出图像的大小。
max_size (`int`, *optional*):
输出图像的最大大小。
size_divisor (`int`, *optional*, defaults to 0):
如果给定了 size_divisor,则输出图像大小将可以被该数整除。
default_to_square (`bool`, *optional*, defaults to `True`):
如果未提供大小,是否默认为正方形。
input_data_format (`ChannelDimension` or `str`, *optional*):
输入图像的通道维度格式。如果未设置,将使用输入图像的推断格式。
Returns:
`Tuple[int, int]`: 输出图像的大小。
"""
# 调用 get_resize_output_image_size 函数计算输出大小
output_size = get_resize_output_image_size(
input_image=image,
size=size,
default_to_square=default_to_square,
max_size=max_size,
input_data_format=input_data_format,
)
# 如果 size_divisor 大于 0,则确保输出大小可以被 size_divisor 整除
if size_divisor > 0:
height, width = output_size
height = int(math.ceil(height / size_divisor) * size_divisor)
width = int(math.ceil(width / size_divisor) * size_divisor)
output_size = (height, width)
return output_size
class Mask2FormerImageProcessor(BaseImageProcessor):
r"""
Constructs a Mask2Former image processor. The image processor can be used to prepare image(s) and optional targets
for the model.
This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int`, *optional*, defaults to 800):
Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
height / width, size)`.
size_divisor (`int`, *optional*, defaults to 32):
Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
Swin Transformer.
resample (`int`, *optional*, defaults to `Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the input to a certain `scale`.
rescale_factor (`float`, *optional*, defaults to `1/ 255`):
Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
ImageNet std.
ignore_index (`int`, *optional*):
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
denoted with 0 (background) will be replaced with `ignore_index`.
reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
The background label will be replaced by `ignore_index`.
model_input_names = ["pixel_values", "pixel_mask"]
定义一个列表 `model_input_names`,包含两个字符串元素 "pixel_values" 和 "pixel_mask",这些名称用于标识模型的输入。
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
size_divisor: int = 32,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
ignore_index: Optional[int] = None,
reduce_labels: bool = False,
**kwargs,
):
# 如果传入的参数 kwargs 中包含 "size_divisibility",发出警告并使用新的参数名称
if "size_divisibility" in kwargs:
warnings.warn(
"The `size_divisibility` argument is deprecated and will be removed in v4.27. Please use "
"`size_divisor` instead.",
FutureWarning,
)
size_divisor = kwargs.pop("size_divisibility")
# 如果传入的参数 kwargs 中包含 "max_size",发出警告并设置私有属性 _max_size 为传入的值
if "max_size" in kwargs:
warnings.warn(
"The `max_size` argument is deprecated and will be removed in v4.27. Please use size['longest_edge']"
" instead.",
FutureWarning,
)
# 将 max_size 设置为私有属性,以便在预处理方法中将其作为默认值传递,同时仍然可以将 size 作为整数传递
self._max_size = kwargs.pop("max_size")
else:
# 否则设置默认的 _max_size 值为 1333
self._max_size = 1333
# 如果 size 参数为 None,则设置默认的 size 字典,包含 "shortest_edge" 和 "longest_edge" 键
size = size if size is not None else {"shortest_edge": 800, "longest_edge": self._max_size}
# 根据传入的 size 和 _max_size 参数,获取最终的 size 字典
size = get_size_dict(size, max_size=self._max_size, default_to_square=False)
# 调用父类的初始化方法,传入其余的关键字参数
super().__init__(**kwargs)
# 初始化对象的各个属性
self.do_resize = do_resize
self.size = size
self.resample = resample
self.size_divisor = size_divisor
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.ignore_index = ignore_index
self.reduce_labels = reduce_labels
# 设置一个包含有效处理器键的列表,用于后续处理
self._valid_processor_keys = [
"images",
"segmentation_maps",
"instance_id_to_semantic_id",
"do_resize",
"size",
"size_divisor",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"ignore_index",
"reduce_labels",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
# 重写了基类中的 `from_dict` 方法,用于确保当通过 `from_dict` 和 kwargs 创建图像处理器时参数可以更新,
# 例如 `Mask2FormerImageProcessor.from_pretrained(checkpoint, max_size=800)`
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
# 复制输入的图像处理器字典,以免修改原始输入
image_processor_dict = image_processor_dict.copy()
# 如果 `kwargs` 中包含 "max_size",则更新图像处理器字典中的 "max_size" 参数,并从 `kwargs` 中删除
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
# 如果 `kwargs` 中包含 "size_divisibility",则更新图像处理器字典中的 "size_divisibility" 参数,并从 `kwargs` 中删除
if "size_divisibility" in kwargs:
image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility")
# 调用基类的 `from_dict` 方法,传递更新后的图像处理器字典和剩余的 `kwargs`
return super().from_dict(image_processor_dict, **kwargs)
# 从 transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.resize 处复制,
# 将函数名从 `get_maskformer_resize_output_image_size` 修改为 `get_mask2former_resize_output_image_size`
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
size_divisor: int = 0,
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format=None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
def resize_image(
image: np.ndarray,
size: Dict[str, int],
size_divisor: int = 0,
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[ChannelDimension, str]] = None,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
The size of the output image.
size_divisor (`int`, *optional*, defaults to 0):
If `size_divisor` is given, the output image size will be divisible by the number.
resample (`PILImageResampling` resampling filter, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Check if deprecated `max_size` parameter is present and issue a warning
if "max_size" in kwargs:
warnings.warn(
"The `max_size` parameter is deprecated and will be removed in v4.27. "
"Please specify in `size['longest_edge'] instead`.",
FutureWarning,
)
max_size = kwargs.pop("max_size")
else:
max_size = None
# Adjust `size` using utility function `get_size_dict`
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Determine whether `size` contains `shortest_edge` and `longest_edge` or `height` and `width`
if "shortest_edge" in size and "longest_edge" in size:
size, max_size = size["shortest_edge"], size["longest_edge"]
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
max_size = None
else:
# Raise ValueError if `size` does not contain necessary keys
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
# Adjust `size` using another utility function `get_mask2former_resize_output_image_size`
size = get_mask2former_resize_output_image_size(
image=image,
size=size,
max_size=max_size,
size_divisor=size_divisor,
default_to_square=False,
input_data_format=input_data_format,
)
# Resize the `image` using specified parameters and return the resized image
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks
def convert_segmentation_map_to_binary_masks(
self,
segmentation_map: "np.ndarray",
instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
ignore_index: Optional[int] = None,
reduce_labels: bool = False,
):
"""
Convert a segmentation map to binary masks based on instance IDs and semantic IDs.
Args:
segmentation_map (`np.ndarray`):
The input segmentation map to be converted.
instance_id_to_semantic_id (Optional[Dict[int, int]], *optional*):
Mapping from instance IDs to semantic IDs.
ignore_index (Optional[int], *optional*):
Index to ignore in the segmentation map.
reduce_labels (`bool`, *optional*):
Whether to reduce the number of unique labels in the masks.
Returns:
`np.ndarray`: Binary masks corresponding to the segmentation map.
"""
reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels
ignore_index = ignore_index if ignore_index is not None else self.ignore_index
return convert_segmentation_map_to_binary_masks(
segmentation_map=segmentation_map,
instance_id_to_semantic_id=instance_id_to_semantic_id,
ignore_index=ignore_index,
reduce_labels=reduce_labels,
)
def __call__(self, images, segmentation_maps=None, **kwargs) -> BatchFeature:
"""
Perform preprocessing on images and segmentation maps.
Args:
images (ImageInput):
Input images to preprocess.
segmentation_maps (Optional[np.ndarray], *optional*):
Optional segmentation maps corresponding to the images.
**kwargs:
Additional keyword arguments for preprocessing.
Returns:
BatchFeature: Preprocessed batch of images and segmentation maps.
"""
return self.preprocess(images, segmentation_maps=segmentation_maps, **kwargs)
def _preprocess(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
size_divisor: int = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
# 如果需要进行图片尺寸调整,则调用 resize 方法
if do_resize:
image = self.resize(
image, size=size, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format
)
# 如果需要进行图片缩放,则调用 rescale 方法
if do_rescale:
image = self.rescale(image, rescale_factor=rescale_factor, input_data_format=input_data_format)
# 如果需要进行图片标准化,则调用 normalize 方法
if do_normalize:
image = self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
# 返回预处理后的图片数据
return image
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
size_divisor: int = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single image."""
# 将输入的图像转换为 numpy 数组
image = to_numpy_array(image)
# 如果图像已经是缩放过的并且需要进行再次缩放,则记录警告信息
if is_scaled_image(image) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
# 推断输入数据的通道格式
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
# 调用 _preprocess 方法进行实际的图像预处理
image = self._preprocess(
image=image,
do_resize=do_resize,
size=size,
size_divisor=size_divisor,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
input_data_format=input_data_format,
)
# 如果指定了输出数据的通道格式,则进行格式转换
if data_format is not None:
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
# 返回预处理后的图像数据
return image
def _preprocess_mask(
self,
segmentation_map: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
size_divisor: int = 0,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single mask."""
# 将分割地图转换为 NumPy 数组
segmentation_map = to_numpy_array(segmentation_map)
# 如果分割地图的维度是 2,则添加一个通道维度,这在某些转换中是必要的
if segmentation_map.ndim == 2:
added_channel_dim = True
segmentation_map = segmentation_map[None, ...] # 在第一维度上添加一个维度
input_data_format = ChannelDimension.FIRST # 设置数据格式为第一通道优先
else:
added_channel_dim = False
if input_data_format is None:
# 推断通道维度的格式
input_data_format = infer_channel_dimension_format(segmentation_map)
# TODO: (Amy)
# 重构分割地图处理流程,包括减少标签和调整大小,以便不丢弃大于 255 的段ID。
segmentation_map = self._preprocess(
image=segmentation_map,
do_resize=do_resize,
resample=PILImageResampling.NEAREST,
size=size,
size_divisor=size_divisor,
do_rescale=False,
do_normalize=False,
input_data_format=input_data_format,
)
# 如果为了处理而添加了额外的通道维度,则移除它
if added_channel_dim:
segmentation_map = segmentation_map.squeeze(0)
# 返回预处理后的分割地图
return segmentation_map
def preprocess(
self,
images: ImageInput,
segmentation_maps: Optional[ImageInput] = None,
instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
size_divisor: Optional[int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
ignore_index: Optional[int] = None,
reduce_labels: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
# 该方法未提供完整代码,仅作为示例代码的一部分,后续部分应继续补充
# 以下是从 transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image 复制的
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
# 获取输入图像的高度和宽度,根据输入数据格式确定通道维度
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# 获取输出的目标高度和宽度
output_height, output_width = output_size
# 计算需要填充的底部和右侧像素数
pad_bottom = output_height - input_height
pad_right = output_width - input_width
# 定义填充的方式,上部和左部都不需要填充,底部和右侧填充零值
padding = ((0, pad_bottom), (0, pad_right))
# 对图像进行填充操作,使用常量值填充,可以指定数据格式和输入数据格式
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
# 返回填充后的图像
return padded_image
# 从 transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad 复制过来的函数
def pad(
self,
images: List[np.ndarray],
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether to return a pixel mask.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# 获取批量图像中的最大高度和宽度,用于填充大小的确定
pad_size = get_max_height_width(images, input_data_format=input_data_format)
# 对每张图像进行填充操作,使它们的尺寸都扩展到批量中最大的高度和宽度
padded_images = [
self._pad_image(
image,
pad_size,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
for image in images
]
# 将填充后的图像数据放入字典中
data = {"pixel_values": padded_images}
# 如果需要返回像素掩码
if return_pixel_mask:
# 生成每张图像的像素掩码,并放入列表中
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
for image in images
]
# 将像素掩码列表放入数据字典中
data["pixel_mask"] = masks
# 返回填充后的批量特征对象,包括数据字典和返回的张量类型
return BatchFeature(data=data, tensor_type=return_tensors)
def encode_inputs(
self,
pixel_values_list: List[ImageInput],
segmentation_maps: ImageInput = None,
instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
ignore_index: Optional[int] = None,
reduce_labels: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""
Encodes input images and optional segmentation maps into a format suitable for model input.
Args:
pixel_values_list (`List[ImageInput]`):
List of input images to encode.
segmentation_maps (`ImageInput`, *optional*):
Optional segmentation maps associated with input images.
instance_id_to_semantic_id (`Optional[Union[List[Dict[int, int]], Dict[int, int]]]`, *optional*):
Mapping from instance IDs to semantic IDs, if applicable.
ignore_index (`Optional[int]`, *optional*):
Index to ignore in the encoding process.
reduce_labels (`bool`, *optional*, defaults to `False`):
Whether to reduce the number of unique labels in the segmentation maps.
return_tensors (`Optional[Union[str, TensorType]]`, *optional*):
The type of tensors to return.
input_data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
The channel dimension format of the input images.
Returns:
Encoded inputs suitable for model processing.
"""
# 编码输入图像及其相关信息,返回适合模型输入的格式
...
def post_process_semantic_segmentation(
self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None
):
"""
Post-processes semantic segmentation model outputs.
Args:
outputs: Model outputs to be post-processed.
target_sizes (`Optional[List[Tuple[int, int]]]`, *optional*):
Target sizes for resizing outputs.
Returns:
Post-processed outputs.
"""
# 对语义分割模型的输出进行后处理
...
) -> "torch.Tensor":
"""
Converts the output of [`Mask2FormerForUniversalSegmentation`] into semantic segmentation maps. Only supports
PyTorch.
Args:
outputs ([`Mask2FormerForUniversalSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested
final size (height, width) of each prediction. If left to None, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# Extract logits for class queries and mask queries from model outputs
class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1]
masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width]
# Scale masks_queries_logits back to preprocessed image size (384, 384)
masks_queries_logits = torch.nn.functional.interpolate(
masks_queries_logits, size=(384, 384), mode="bilinear", align_corners=False
)
# Remove the null class from class_queries_logits (`[..., :-1]`)
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width]
# Compute semantic segmentation logits of shape (batch_size, num_classes, height, width)
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
batch_size = class_queries_logits.shape[0]
# Resize logits and compute semantic segmentation maps if target_sizes is provided
if target_sizes is not None:
if batch_size != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
semantic_segmentation = []
for idx in range(batch_size):
resized_logits = torch.nn.functional.interpolate(
segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
# If target_sizes is None, compute semantic segmentation maps for each batch item
semantic_segmentation = segmentation.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
# 对实例分割模型输出进行后处理,包括阈值化、合并重叠的实例掩码区域等
def post_process_instance_segmentation(
self,
outputs,
threshold: float = 0.5, # 置信度阈值,用于筛选实例
mask_threshold: float = 0.5, # 掩码阈值,用于阈值化掩码
overlap_mask_area_threshold: float = 0.8, # 重叠掩码区域的阈值,用于合并重叠实例的掩码
target_sizes: Optional[List[Tuple[int, int]]] = None, # 目标大小列表,用于尺寸调整
return_coco_annotation: Optional[bool] = False, # 是否返回 COCO 格式的注释
return_binary_maps: Optional[bool] = False, # 是否返回二进制地图
):
# 对全景分割模型输出进行后处理,包括阈值化、合并重叠的分割标签等
def post_process_panoptic_segmentation(
self,
outputs,
threshold: float = 0.5, # 置信度阈值,用于筛选分割标签
mask_threshold: float = 0.5, # 掩码阈值,用于阈值化掩码
overlap_mask_area_threshold: float = 0.8, # 重叠掩码区域的阈值,用于合并重叠分割标签的掩码
label_ids_to_fuse: Optional[Set[int]] = None, # 要融合的标签 ID 集合
target_sizes: Optional[List[Tuple[int, int]]] = None, # 目标大小列表,用于尺寸调整
):