Transformers Source Code Analysis (41)
.\models\distilbert\tokenization_distilbert.py
"""Tokenization classes for DistilBERT."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"distilbert-base-uncased": 512,
"distilbert-base-uncased-distilled-squad": 512,
"distilbert-base-cased": 512,
"distilbert-base-cased-distilled-squad": 512,
"distilbert-base-german-cased": 512,
"distilbert-base-multilingual-cased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"distilbert-base-uncased": {"do_lower_case": True},
"distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
"distilbert-base-cased": {"do_lower_case": False},
"distilbert-base-cased-distilled-squad": {"do_lower_case": False},
"distilbert-base-german-cased": {"do_lower_case": False},
"distilbert-base-multilingual-cased": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class DistilBertTokenizer(PreTrainedTokenizer):
r"""
Construct a DistilBERT tokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
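
To make the greedy longest-match-first behaviour of `WordpieceTokenizer.tokenize` concrete, here is a minimal sketch that feeds a toy vocabulary (hypothetical, chosen only for illustration) into the class defined above:

```python
import collections

# Assumes WordpieceTokenizer from tokenization_distilbert.py above is in scope.
# Hypothetical toy vocabulary, in the same {token: index} format that load_vocab produces.
toy_vocab = collections.OrderedDict(
    (tok, idx) for idx, tok in enumerate(["[UNK]", "un", "##aff", "##able", "runn", "##ing"])
)
wordpiece = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")

print(wordpiece.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wordpiece.tokenize("running"))    # ['runn', '##ing']
print(wordpiece.tokenize("xyz"))        # ['[UNK]'] -- no prefix of "xyz" is in the vocabulary
```

Each word is scanned from the left, the longest vocabulary match is taken, and the remainder is retried with the `##` continuation prefix; if any position fails to match, the whole word collapses to the unknown token.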
.\models\distilbert\tokenization_distilbert_fast.py
"""Tokenization classes for DistilBERT."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_distilbert import DistilBertTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json"
),
"distilbert-base-german-cased": (
"https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json"
),
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"distilbert-base-uncased": 512,
"distilbert-base-uncased-distilled-squad": 512,
"distilbert-base-cased": 512,
"distilbert-base-cased-distilled-squad": 512,
    "distilbert-base-german-cased": 512,
    "distilbert-base-multilingual-cased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"distilbert-base-uncased": {"do_lower_case": True},
"distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
"distilbert-base-cased": {"do_lower_case": False},
"distilbert-base-cased-distilled-squad": {"do_lower_case": False},
"distilbert-base-german-cased": {"do_lower_case": False},
"distilbert-base-multilingual-cased": {"do_lower_case": False},
}
class DistilBertTokenizerFast(PreTrainedTokenizerFast):
r"""
构建一个“快速”的 DistilBERT 分词器(基于 HuggingFace 的 *tokenizers* 库)。基于 WordPiece。
此分词器继承自 [`PreTrainedTokenizerFast`],其中包含大多数主要方法。用户应参考此超类以获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
    # Constants mapping the class to its vocabulary files, pretrained vocabularies, input sizes and init configurations
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Names of the inputs expected by the model
    model_input_names = ["input_ids", "attention_mask"]
    # The corresponding "slow" tokenizer class, here DistilBertTokenizer
    slow_tokenizer_class = DistilBertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
        # Call the parent constructor with the vocabulary file, tokenizer file, casing behaviour,
        # special tokens and Chinese-character/accent handling options
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
        # Read the normalizer state of the backend tokenizer and parse it into a dict
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # If the stored normalizer options do not match the arguments passed here, rebuild the normalizer
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
            # Look up the normalizer class by its type name and update the relevant attributes
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
        # Store the lowercasing behaviour on the tokenizer object
self.do_lower_case = do_lower_case
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
        # Build the sequence with special tokens; if a second sequence is provided, append it followed by another separator
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Define special tokens for separation and classification
sep = [self.sep_token_id] # List containing the separator token ID
cls = [self.cls_token_id] # List containing the classification token ID
# If only one sequence is provided (token_ids_1 is None), return a mask with 0s
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0] # Return a list of zeros representing token type IDs
# If two sequences are provided, concatenate their lengths and return a mask with 0s for the first sequence and 1s for the second sequence
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary files to the specified directory.
Args:
save_directory (str):
Directory path where the vocabulary files will be saved.
filename_prefix (Optional[str]):
Optional prefix for the vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the paths of the saved vocabulary files.
"""
# Call the internal tokenizer's model save method to save vocabulary files
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
# Return the paths of the saved files as a tuple
return tuple(files)
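
Both `build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` implement the standard BERT layout `[CLS] A [SEP] B [SEP]`. A minimal sketch of that layout with placeholder token ids (hypothetical values, not taken from a real vocabulary):

```python
# Hypothetical special-token ids for illustration
cls_id, sep_id = 101, 102
token_ids_0 = [7, 8, 9]   # first sequence
token_ids_1 = [20, 21]    # second sequence

input_ids = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]
token_type_ids = [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)

print(input_ids)       # [101, 7, 8, 9, 102, 20, 21, 102]
print(token_type_ids)  # [0, 0, 0, 0, 0, 1, 1, 1]
```

Note that DistilBERT itself does not consume `token_type_ids` (`model_input_names` only lists `input_ids` and `attention_mask`); the method is kept for API compatibility with BERT-style tokenizers.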
.\models\distilbert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_distilbert": [
"DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"DistilBertConfig",
"DistilBertOnnxConfig",
],
"tokenization_distilbert": ["DistilBertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_distilbert_fast"] = ["DistilBertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_distilbert"] = [
"DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DistilBertForMaskedLM",
"DistilBertForMultipleChoice",
"DistilBertForQuestionAnswering",
"DistilBertForSequenceClassification",
"DistilBertForTokenClassification",
"DistilBertModel",
"DistilBertPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_distilbert"] = [
"TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFDistilBertForMaskedLM",
"TFDistilBertForMultipleChoice",
"TFDistilBertForQuestionAnswering",
"TFDistilBertForSequenceClassification",
"TFDistilBertForTokenClassification",
"TFDistilBertMainLayer",
"TFDistilBertModel",
"TFDistilBertPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_distilbert"] = [
"FlaxDistilBertForMaskedLM",
"FlaxDistilBertForMultipleChoice",
"FlaxDistilBertForQuestionAnswering",
"FlaxDistilBertForSequenceClassification",
"FlaxDistilBertForTokenClassification",
"FlaxDistilBertModel",
"FlaxDistilBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_distilbert import (
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
DistilBertConfig,
DistilBertOnnxConfig,
)
from .tokenization_distilbert import DistilBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_distilbert_fast import DistilBertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
DistilBertForMaskedLM,
DistilBertForMultipleChoice,
DistilBertForQuestionAnswering,
DistilBertForSequenceClassification,
DistilBertForTokenClassification,
DistilBertModel,
DistilBertPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_distilbert import (
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFDistilBertForMaskedLM,
TFDistilBertForMultipleChoice,
TFDistilBertForQuestionAnswering,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertMainLayer,
TFDistilBertModel,
TFDistilBertPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_distilbert import (
FlaxDistilBertForMaskedLM,
FlaxDistilBertForMultipleChoice,
FlaxDistilBertForQuestionAnswering,
FlaxDistilBertForSequenceClassification,
FlaxDistilBertForTokenClassification,
FlaxDistilBertModel,
FlaxDistilBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
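
The `_LazyModule` registered above resolves the names in `_import_structure` only when they are first accessed, so `import transformers.models.distilbert` stays cheap even when the torch/TF/Flax backends are installed. A stripped-down sketch of that idea (not the real `_LazyModule` implementation, just an illustration of lazy attribute resolution):

```python
import importlib
import types


class LazyModuleSketch(types.ModuleType):
    """Resolve attributes lazily from a {submodule: [exported names]} map."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._class_to_module = {
            cls: module for module, classes in import_structure.items() for cls in classes
        }

    def __getattr__(self, item):
        if item not in self._class_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")
        # Import the submodule only when one of its names is actually requested
        submodule = importlib.import_module(f"{self.__name__}.{self._class_to_module[item]}")
        return getattr(submodule, item)
```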
.\models\dit\convert_dit_unilm_to_pytorch.py
"""Convert DiT checkpoints from the unilm repository."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor
from transformers.image_utils import PILImageResampling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config, has_lm_head=False, is_semantic=False):
prefix = "backbone." if is_semantic else ""
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight"))
rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight"))
rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
(f"{prefix}cls_token", "beit.embeddings.cls_token"),
(f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"),
(f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"),
(f"{prefix}pos_embed", "beit.embeddings.position_embeddings"),
]
)
if has_lm_head:
rename_keys.extend(
[
("mask_token", "beit.embeddings.mask_token"),
("norm.weight", "layernorm.weight"),
("norm.bias", "layernorm.bias"),
]
)
else:
rename_keys.extend(
[
("fc_norm.weight", "beit.pooler.layernorm.weight"),
("fc_norm.bias", "beit.pooler.layernorm.bias"),
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False):
for i in range(config.num_hidden_layers):
prefix = "backbone." if is_semantic else ""
in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")
state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: config.hidden_size, :
]
state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias
state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-config.hidden_size :, :
]
state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias
gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")
state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1
state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our BEiT structure.
"""
has_lm_head = False if "rvlcdip" in checkpoint_url else True
config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head)
if "large" in checkpoint_url or "dit-l" in checkpoint_url:
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
if "rvlcdip" in checkpoint_url:
config.num_labels = 16
repo_id = "huggingface/label-files"
filename = "rvlcdip-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
rename_keys = create_rename_keys(config, has_lm_head=has_lm_head)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head)
model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config)
model.eval()
model.load_state_dict(state_dict)
image_processor = BeitImageProcessor(
size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False
)
image = prepare_img()
encoding = image_processor(images=image, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
logits = outputs.logits
expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192]
assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected"
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
if has_lm_head:
model_name = "dit-base" if "base" in checkpoint_url else "dit-large"
else:
model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip"
image_processor.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add image processor",
use_temp_dir=True,
)
model.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add model",
use_temp_dir=True,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth",
type=str,
help="URL to the original PyTorch checkpoint (.pth file).",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
)
args = parser.parse_args()
convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
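
`read_in_q_k_v` above splits the fused `qkv` projection of the original checkpoint into separate query/key/value matrices by slicing along the output dimension. A standalone sketch of that slicing with a toy hidden size (DiT-base actually uses 768):

```python
import torch

hidden_size = 4  # toy value for illustration
qkv_weight = torch.randn(3 * hidden_size, hidden_size)

query_w = qkv_weight[:hidden_size, :]                  # first third  -> query
key_w = qkv_weight[hidden_size : hidden_size * 2, :]   # middle third -> key
value_w = qkv_weight[-hidden_size:, :]                 # last third   -> value

# The three slices reassemble exactly into the original fused matrix
assert torch.equal(torch.cat([query_w, key_w, value_w], dim=0), qkv_weight)
```

Note that the original checkpoint only stores `q_bias` and `v_bias`; the key projection is bias-free, which is why no key bias is written into the converted state dict.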
.\models\dit\__init__.py
import datetime
def format_date(dt):
return dt.strftime("%Y-%m-%d")
current_date = datetime.datetime.now()
formatted_date = format_date(current_date)
print(formatted_date)
.\models\donut\configuration_donut_swin.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"naver-clova-ix/donut-base": "https://huggingface.co/naver-clova-ix/donut-base/resolve/main/config.json",
}
class DonutSwinConfig(PretrainedConfig):
r"""
这是用于存储 [`DonutSwinModel`] 配置信息的配置类。它用于根据指定的参数实例化 Donut 模型,定义模型架构。
使用默认配置实例化将产生类似于 Donut [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base)
架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。
"""
pass
model_type = "donut-swin"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
image_size=224,
patch_size=4,
num_channels=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
use_absolute_embeddings=False,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.window_size = window_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
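
The last line derives the exported `hidden_size` from the Swin hierarchy: the channel dimension doubles at every stage, so the final stage has `embed_dim * 2 ** (len(depths) - 1)` channels. With the defaults above that is 96 * 2**3 = 768, which matches the `[1, 49, 768]` expected output shape used in the modeling file further down. A quick check, assuming `transformers` is installed:

```python
from transformers import DonutSwinConfig

config = DonutSwinConfig()   # embed_dim=96, depths=[2, 2, 6, 2]
print(config.num_layers)     # 4
print(config.hidden_size)    # 768 == 96 * 2 ** 3
```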
.\models\donut\convert_donut_to_pytorch.py
"""Convert Donut checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut"""
import argparse
import torch
from datasets import load_dataset
from donut import DonutModel
from transformers import (
DonutImageProcessor,
DonutProcessor,
DonutSwinConfig,
DonutSwinModel,
MBartConfig,
MBartForCausalLM,
VisionEncoderDecoderModel,
XLMRobertaTokenizerFast,
)
def get_configs(model):
original_config = model.config
encoder_config = DonutSwinConfig(
image_size=original_config.input_size,
patch_size=4,
depths=original_config.encoder_layer,
num_heads=[4, 8, 16, 32],
window_size=original_config.window_size,
embed_dim=128,
)
decoder_config = MBartConfig(
is_decoder=True,
is_encoder_decoder=False,
add_cross_attention=True,
decoder_layers=original_config.decoder_layer,
max_position_embeddings=original_config.max_position_embeddings,
vocab_size=len(
model.decoder.tokenizer
),
scale_embedding=True,
add_final_layer_norm=True,
)
return encoder_config, decoder_config
def rename_key(name):
if "encoder.model" in name:
name = name.replace("encoder.model", "encoder")
if "decoder.model" in name:
name = name.replace("decoder.model", "decoder")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.norm")
if name.startswith("encoder"):
if "layers" in name:
name = "encoder." + name
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name and "mask" not in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if name == "encoder.norm.weight":
name = "encoder.layernorm.weight"
if name == "encoder.norm.bias":
name = "encoder.layernorm.bias"
return name
def convert_state_dict(orig_state_dict, model):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[3])
block_num = int(key_split[5])
dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size
if "weight" in key:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
] = val[:dim, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
] = val[dim : dim * 2, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
] = val[-dim:, :]
else:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
] = val[:dim]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
] = val[dim : dim * 2]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
] = val[-dim:]
elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]:
pass
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
original_model = DonutModel.from_pretrained(model_name).eval()
encoder_config, decoder_config = get_configs(original_model)
encoder = DonutSwinModel(encoder_config)
decoder = MBartForCausalLM(decoder_config)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
state_dict = original_model.state_dict()
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
dataset = load_dataset("hf-internal-testing/example-documents")
image = dataset["test"][0]["image"].convert("RGB")
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
image_processor = DonutImageProcessor(
do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
)
processor = DonutProcessor(image_processor, tokenizer)
pixel_values = processor(image, return_tensors="pt").pixel_values
if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
question = "When is the coffee break?"
task_prompt = task_prompt.replace("{user_input}", question)
elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
task_prompt = "<s_rvlcdip>"
elif model_name in [
"naver-clova-ix/donut-base-finetuned-cord-v1",
"naver-clova-ix/donut-base-finetuned-cord-v1-2560",
]:
task_prompt = "<s_cord>"
elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
task_prompt = "s_cord-v2>"
elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
task_prompt = "<s_zhtrainticket>"
elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]:
task_prompt = "hello world"
else:
raise ValueError("Model name not supported")
prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
patch_embeddings, _ = model.encoder.embeddings(pixel_values)
assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)
original_last_hidden_state = original_model.encoder(pixel_values)
last_hidden_state = model.encoder(pixel_values).last_hidden_state
assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)
original_logits = original_model(pixel_values, prompt_tensors, None).logits
logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
assert torch.allclose(original_logits, logits, atol=1e-3)
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="naver-clova-ix/donut-base-finetuned-docvqa",
required=False,
type=str,
help="Name of the original model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
required=False,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub.",
)
args = parser.parse_args()
convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
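
`rename_key` maps the original Swin/MBart-style parameter names onto the Transformers naming scheme through a chain of string replacements. Tracing a single key through the function defined above shows the effect:

```python
# Uses rename_key from the conversion script above
old = "encoder.model.layers.0.blocks.0.attn.proj.weight"
print(rename_key(old))
# encoder.encoder.layers.0.blocks.0.attention.output.dense.weight
```

The fused `qkv` weights are handled separately in `convert_state_dict`, which slices them into query/key/value along the output dimension, exactly as in the DiT conversion script above.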
.\models\donut\feature_extraction_donut.py
"""Feature extractor class for Donut."""
import warnings
from ...utils import logging
from .image_processing_donut import DonutImageProcessor
logger = logging.get_logger(__name__)
class DonutFeatureExtractor(DonutImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class DonutFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use DonutImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\donut\image_processing_donut.py
if is_vision_available():
import PIL
class DonutImageProcessor(BaseImageProcessor):
r"""
Constructs a Donut image processor.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `False`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
random amount of padding on each side, up to the largest image size in the batch. Otherwise, all images are
padded to the largest image size in the batch.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Image standard deviation.
"""
    # Names of the inputs expected by the model; only pixel values here
    model_input_names = ["pixel_values"]
    # Constructor storing the image-processing options and their default values
def __init__(
self,
        do_resize: bool = True,  # whether to resize images, defaults to True
        size: Dict[str, int] = None,  # target size as a dict with height and width, defaults to None
        resample: PILImageResampling = PILImageResampling.BILINEAR,  # resampling filter used when resizing, defaults to bilinear
        do_thumbnail: bool = True,  # whether to downscale with the thumbnail method, defaults to True
        do_align_long_axis: bool = False,  # whether to rotate images so their long axis matches the target size, defaults to False
        do_pad: bool = True,  # whether to pad images, defaults to True
        do_rescale: bool = True,  # whether to rescale pixel values, defaults to True
        rescale_factor: Union[int, float] = 1 / 255,  # rescaling factor, defaults to 1/255
        do_normalize: bool = True,  # whether to normalize images, defaults to True
        image_mean: Optional[Union[float, List[float]]] = None,  # mean used for normalization, defaults to None
        image_std: Optional[Union[float, List[float]]] = None,  # standard deviation used for normalization, defaults to None
        **kwargs,  # additional keyword arguments
) -> None:
        # Call the parent constructor with the remaining keyword arguments
        super().__init__(**kwargs)
        # If size is None, fall back to the default height and width
        size = size if size is not None else {"height": 2560, "width": 1920}
        # If size is given as a tuple or list, convert it into a height/width dict
        if isinstance(size, (tuple, list)):
            # The previous feature extractor size parameter was in (width, height) format
            size = size[::-1]
        # Normalize size into a standard size dictionary via get_size_dict
        size = get_size_dict(size)
        # Store the constructor arguments as attributes on the processor
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_thumbnail = do_thumbnail
self.do_align_long_axis = do_align_long_axis
self.do_pad = do_pad
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN  # fall back to IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD  # fall back to IMAGENET_STANDARD_STD
        # Keyword arguments accepted by the preprocess method, used for validation
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_thumbnail",
"do_align_long_axis",
"do_pad",
"random_padding",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def align_long_axis(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Align the long axis of the image to the longest axis of the specified size.
Args:
image (`np.ndarray`):
The image to be aligned.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to align the long axis to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The aligned image.
"""
        # Get the height and width of the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Get the target height and width
        output_height, output_width = size["height"], size["width"]
        # Rotate the image if its long axis does not match the long axis of the target size
        if (output_width < output_height and input_width > input_height) or (
            output_width > output_height and input_width < input_height
        ):
            image = np.rot90(image, 3)
        # Convert to the requested channel dimension format if one was given
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        # Return the aligned image
return image
def pad_image(
self,
image: np.ndarray,
size: Dict[str, int],
random_padding: bool = False,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad the image to the specified size.
Args:
image (`np.ndarray`):
The image to be padded.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to pad the image to.
random_padding (`bool`, *optional*, defaults to `False`):
Whether to use random padding or not.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Extract output height and width from the size dictionary
output_height, output_width = size["height"], size["width"]
# Obtain input height and width from the input image
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Calculate the difference between output and input dimensions
delta_width = output_width - input_width
delta_height = output_height - input_height
# Determine padding amounts based on random_padding flag
if random_padding:
pad_top = np.random.randint(low=0, high=delta_height + 1)
pad_left = np.random.randint(low=0, high=delta_width + 1)
else:
pad_top = delta_height // 2
pad_left = delta_width // 2
# Calculate remaining padding amounts to complete the pad
pad_bottom = delta_height - pad_top
pad_right = delta_width - pad_left
# Construct the padding tuple for np.pad function
padding = ((pad_top, pad_bottom), (pad_left, pad_right))
# Apply padding to the image using np.pad
return pad(image, padding, data_format=data_format, input_data_format=input_data_format)
def pad(self, *args, **kwargs):
# Log a deprecation warning for the `pad` method
logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
# Redirect to `pad_image` method
return self.pad_image(*args, **kwargs)
def thumbnail(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
corresponding dimension of the specified size.
Args:
image (`np.ndarray`):
The image to be resized.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to resize the image to.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
The resampling filter to use.
data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
        # Get the height and width of the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Get the target height and width
        output_height, output_width = size["height"], size["width"]
        # Never upscale: take the smaller of the input and target size in each dimension
        height = min(input_height, output_height)
        width = min(input_width, output_width)
        # If the image already fits, return it unchanged
        if height == input_height and width == input_width:
            return image
        # Preserve the aspect ratio by shrinking the longer side accordingly
        if input_height > input_width:
            width = int(input_width * height / input_height)
        elif input_width > input_height:
            height = int(input_height * width / input_width)
        # Delegate the actual resizing to the resize function
return resize(
image,
size=(height, width),
resample=resample,
reducing_gap=2.0,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
    ) -> np.ndarray:
"""
Resizes `image` to `(height, width)` specified by `size` using the PIL library.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Normalize the `size` argument into a size dictionary
size = get_size_dict(size)
# Take the shorter of the two target edges
shortest_edge = min(size["height"], size["width"])
# Compute the output size of the resized image
output_size = get_resize_output_image_size(
image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
)
# Resize the image and return the result
resized_image = resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return resized_image
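To a first approximation, `get_resize_output_image_size(..., default_to_square=False)` scales the image so its shorter edge matches the requested value while keeping the aspect ratio; a small sketch of that arithmetic (hypothetical sizes, ignoring any rounding differences in the real helper):
def shortest_edge_output_size(input_height, input_width, shortest_edge):
    short, long = min(input_height, input_width), max(input_height, input_width)
    new_short, new_long = shortest_edge, int(shortest_edge * long / short)
    # Put the scaled values back on the correct axes.
    return (new_short, new_long) if input_height <= input_width else (new_long, new_short)

assert shortest_edge_output_size(480, 640, 960) == (960, 1280)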
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
random_padding: bool = False,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\donut\modeling_donut_swin.py
""" PyTorch Donut Swin Transformer model.
This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_donut_swin import DonutSwinConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DonutSwinConfig"
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]
DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"naver-clova-ix/donut-base",
]
@dataclass
class DonutSwinEncoderOutput(ModelOutput):
"""
DonutSwin encoder's outputs, with potential hidden states and attentions.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DonutSwinModelOutput(ModelOutput):
"""
DonutSwin model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
def window_partition(input_feature, window_size):
"""
Partitions the given input into windows.
"""
batch_size, height, width, num_channels = input_feature.shape
input_feature = input_feature.view(
batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
)
windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
return windows
def window_reverse(windows, window_size, height, width):
num_channels = windows.shape[-1]
windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
return windows
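A tiny round-trip check of the two helpers above, using a hypothetical 4x4 feature map with window size 2 and 3 channels; `window_reverse` exactly undoes `window_partition`:
import torch
features = torch.arange(1 * 4 * 4 * 3, dtype=torch.float32).reshape(1, 4, 4, 3)
windows = window_partition(features, window_size=2)          # (num_windows * batch, 2, 2, 3)
assert windows.shape == (4, 2, 2, 3)
restored = window_reverse(windows, window_size=2, height=4, width=4)
assert torch.equal(restored, features)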
class DonutSwinEmbeddings(nn.Module):
"""
Construct the patch and position embeddings. Optionally, also add the mask token.
"""
def __init__(self, config, use_mask_token=False):
super().__init__()
self.patch_embeddings = DonutSwinPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.patch_grid = self.patch_embeddings.grid_size
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
if config.use_absolute_embeddings:
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
else:
self.position_embeddings = None
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
) -> Tuple[torch.Tensor]:
embeddings, output_dimensions = self.patch_embeddings(pixel_values)
embeddings = self.norm(embeddings)
batch_size, seq_len, _ = embeddings.size()
if bool_masked_pos is not None:
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
if self.position_embeddings is not None:
embeddings = embeddings + self.position_embeddings
embeddings = self.dropout(embeddings)
return embeddings, output_dimensions
class DonutSwinPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def maybe_pad(self, pixel_values, height, width):
if width % self.patch_size[1] != 0:
pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
pixel_values = nn.functional.pad(pixel_values, pad_values)
if height % self.patch_size[0] != 0:
pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
pixel_values = nn.functional.pad(pixel_values, pad_values)
return pixel_values
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
_, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
pixel_values = self.maybe_pad(pixel_values, height, width)
embeddings = self.projection(pixel_values)
_, _, height, width = embeddings.shape
output_dimensions = (height, width)
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings, output_dimensions
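A shape sketch for the patch embedding, assuming the default DonutSwinConfig values (image_size=224, patch_size=4, embed_dim=96); the strided convolution turns each 4x4 patch into one token:
import torch
from transformers import DonutSwinConfig

config = DonutSwinConfig()  # assumed defaults: image_size=224, patch_size=4, embed_dim=96
patch_embed = DonutSwinPatchEmbeddings(config)
pixel_values = torch.randn(2, config.num_channels, config.image_size, config.image_size)
embeddings, (grid_h, grid_w) = patch_embed(pixel_values)
# 224 / 4 = 56 patches per side -> 56 * 56 = 3136 tokens of width embed_dim.
assert (grid_h, grid_w) == (56, 56) and embeddings.shape == (2, 3136, config.embed_dim)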
class DonutSwinPatchMerging(nn.Module):
"""
Patch Merging Layer.
Args:
input_resolution (`Tuple[int]`):
Resolution of input feature.
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(4 * dim)
def maybe_pad(self, input_feature, height, width):
should_pad = (height % 2 == 1) or (width % 2 == 1)
if should_pad:
pad_values = (0, 0, 0, width % 2, 0, height % 2)
input_feature = nn.functional.pad(input_feature, pad_values)
return input_feature
def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
height, width = input_dimensions
batch_size, dim, num_channels = input_feature.shape
input_feature = input_feature.view(batch_size, height, width, num_channels)
input_feature = self.maybe_pad(input_feature, height, width)
input_feature_0 = input_feature[:, 0::2, 0::2, :]
input_feature_1 = input_feature[:, 1::2, 0::2, :]
input_feature_2 = input_feature[:, 0::2, 1::2, :]
input_feature_3 = input_feature[:, 1::2, 1::2, :]
input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
input_feature = input_feature.view(batch_size, -1, 4 * num_channels)
input_feature = self.norm(input_feature)
input_feature = self.reduction(input_feature)
return input_feature
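A shape sketch for patch merging with hypothetical sizes: an 8x8 token grid with dim=96 is reduced to a 4x4 grid, and the concatenated 4*dim channels are projected down to 2*dim:
import torch
merge = DonutSwinPatchMerging(input_resolution=(8, 8), dim=96)
tokens = torch.randn(2, 8 * 8, 96)
merged = merge(tokens, input_dimensions=(8, 8))
assert merged.shape == (2, 4 * 4, 2 * 96)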
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
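A behaviour sketch for drop_path: it is the identity in eval mode, and in training mode it zeroes whole samples while rescaling the survivors by 1/keep_prob so the expected value is unchanged (hypothetical tensor below):
import torch
x = torch.ones(4, 3)
assert torch.equal(drop_path(x, drop_prob=0.2, training=False), x)
out = drop_path(x, drop_prob=0.2, training=True)
scaled = x / (1 - 0.2)  # surviving samples are rescaled by 1 / keep_prob
for row_out, row_scaled in zip(out, scaled):
    assert torch.equal(row_out, row_scaled) or torch.equal(row_out, torch.zeros_like(row_out))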
class DonutSwinDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class DonutSwinSelfAttention(nn.Module):
def __init__(self, config, dim, num_heads, window_size):
super().__init__()
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.window_size = (
window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
)
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
)
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
coords_flatten = torch.flatten(coords, 1)
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += self.window_size[0] - 1
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1)
self.register_buffer("relative_position_index", relative_position_index)
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
batch_size, dim, num_channels = hidden_states.shape
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
relative_position_bias = relative_position_bias.view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
)
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
if attention_mask is not None:
mask_shape = attention_mask.shape[0]
attention_scores = attention_scores.view(
batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
)
attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class DonutSwinSelfOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, dim)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class DonutSwinAttention(nn.Module):
def __init__(self, config, dim, num_heads, window_size):
super().__init__()
self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size)
self.output = DonutSwinSelfOutput(config, dim)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class DonutSwinIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class DonutSwinOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class DonutSwinLayer(nn.Module):
def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.shift_size = shift_size
self.window_size = config.window_size
self.input_resolution = input_resolution
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.intermediate = DonutSwinIntermediate(config, dim)
self.output = DonutSwinOutput(config, dim)
def set_shift_and_window_size(self, input_resolution):
if min(input_resolution) <= self.window_size:
self.shift_size = 0
self.window_size = min(input_resolution)
def get_attn_mask(self, height, width, dtype):
if self.shift_size > 0:
img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
height_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
width_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
count = 0
for height_slice in height_slices:
for width_slice in width_slices:
img_mask[:, height_slice, width_slice, :] = count
count += 1
mask_windows = window_partition(img_mask, self.window_size)
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
attn_mask = None
return attn_mask
def maybe_pad(self, hidden_states, height, width):
pad_right = (self.window_size - width % self.window_size) % self.window_size
pad_bottom = (self.window_size - height % self.window_size) % self.window_size
pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
always_partition: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
if not always_partition:
self.set_shift_and_window_size(input_dimensions)
else:
pass
height, width = input_dimensions
batch_size, _, channels = hidden_states.size()
shortcut = hidden_states
hidden_states = self.layernorm_before(hidden_states)
hidden_states = hidden_states.view(batch_size, height, width, channels)
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
_, height_pad, width_pad, _ = hidden_states.shape
if self.shift_size > 0:
shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_hidden_states = hidden_states
hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
if attn_mask is not None:
attn_mask = attn_mask.to(hidden_states_windows.device)
attention_outputs = self.attention(
hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
)
attention_output = attention_outputs[0]
attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
if self.shift_size > 0:
attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
attention_windows = shifted_windows
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
attention_windows = attention_windows[:, :height, :width, :].contiguous()
attention_windows = attention_windows.view(batch_size, height * width, channels)
hidden_states = shortcut + self.drop_path(attention_windows)
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = hidden_states + self.output(layer_output)
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
class DonutSwinStage(nn.Module):
def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
super().__init__()
self.config = config
self.dim = dim
self.blocks = nn.ModuleList(
[
DonutSwinLayer(
config=config,
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
)
for i in range(depth)
]
)
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
always_partition: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
height, width = input_dimensions
for i, layer_module in enumerate(self.blocks):
layer_head_mask = head_mask[i] if head_mask is not None else None
layer_outputs = layer_module(
hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
)
hidden_states = layer_outputs[0]
hidden_states_before_downsampling = hidden_states
if self.downsample is not None:
height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
output_dimensions = (height, width, height_downsampled, width_downsampled)
hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
else:
output_dimensions = (height, width, height, width)
stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs
class DonutSwinEncoder(nn.Module):
def __init__(self, config, grid_size):
super().__init__()
self.num_layers = len(config.depths)
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
self.layers = nn.ModuleList(
[
DonutSwinStage(
config=config,
dim=int(config.embed_dim * 2**i_layer),
input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
depth=config.depths[i_layer],
num_heads=config.num_heads[i_layer],
drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
)
for i_layer in range(self.num_layers)
]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
always_partition: Optional[bool] = False,
return_dict: Optional[bool] = True,
class DonutSwinPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DonutSwinConfig
base_model_prefix = "swin"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SWIN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DonutImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.",
SWIN_START_DOCSTRING,
)
class DonutSwinModel(DonutSwinPreTrainedModel):
def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
super().__init__(config)
self.config = config
self.num_layers = len(config.depths)
self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DonutSwinModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DonutSwinModelOutput]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, len(self.config.depths))
embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
input_dimensions,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = None
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
return DonutSwinModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
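A usage sketch with randomly initialised weights (no download), assuming the default DonutSwinConfig (image_size=224, patch_size=4, embed_dim=96, depths=[2, 2, 6, 2]); the shapes match the `_EXPECTED_OUTPUT_SHAPE` quoted above:
import torch
from transformers import DonutSwinConfig, DonutSwinModel

config = DonutSwinConfig()
model = DonutSwinModel(config).eval()
pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)
# 224/4 = 56 patches per side, halved by three patch-merging stages -> 7*7 = 49 tokens,
# each of width num_features = embed_dim * 2**3 = 768.
print(outputs.last_hidden_state.shape)  # torch.Size([1, 49, 768])
print(outputs.pooler_output.shape)      # torch.Size([1, 768])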
.\models\donut\processing_donut.py
"""
Processor class for Donut.
"""
import re
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class DonutProcessor(ProcessorMixin):
r"""
Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single processor.
[`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`].
See [`~DonutProcessor.__call__`] and [`~DonutProcessor.decode`] for more information.
Args:
image_processor ([`DonutImageProcessor`], *optional*):
An instance of [`DonutImageProcessor`]. The image processor is a required input.
tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's [`~AutoImageProcessor.__call__`] and returns its output.
If used in the context [`~DonutProcessor.as_target_processor`], it forwards all its arguments to DonutTokenizer's [`~DonutTokenizer.__call__`].
Please refer to the docstrings of the two methods above for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
images = kwargs.pop("images", None)
text = kwargs.pop("text", None)
if len(args) > 0:
images = args[0]
args = args[1:]
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(images, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
Forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of that method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
Forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of that method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer as the processor for the inputs. Useful for encoding the labels when fine-tuning TrOCR.
"""
warnings.warn(
"`as_target_processor` 已弃用,并将在 Transformers 的 v5 中移除。您可以通过在常规 `__call__` 方法的参数 `text` 中处理您的标签(在与图像输入相同的调用中或在单独的调用中)。"
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.image_processor
self._in_target_context_manager = False
def token2json(self, tokens, is_inner_value=False, added_vocab=None):
"""
Convert a (generated) token sequence into an ordered JSON format.
Args:
tokens (str): The token sequence to convert into JSON format.
is_inner_value (bool, optional): Indicates if the function is processing inner values. Defaults to False.
added_vocab (list, optional): List of added vocabulary tokens. Defaults to None.
Returns:
dict or list: Ordered JSON format representing the token sequence.
Converts a sequence of tokens into a structured JSON format. Handles both leaf and non-leaf nodes
in the token sequence recursively.
"""
if added_vocab is None:
added_vocab = self.tokenizer.get_added_vocab()
output = {}
while tokens:
start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
if start_token is None:
break
key = start_token.group(1)
key_escaped = re.escape(key)
end_token = re.search(rf"</s_{key_escaped}>", tokens, re.IGNORECASE)
start_token = start_token.group()
if end_token is None:
tokens = tokens.replace(start_token, "")
else:
end_token = end_token.group()
start_token_escaped = re.escape(start_token)
end_token_escaped = re.escape(end_token)
content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
if content is not None:
content = content.group(1).strip()
if r"<s_" in content and r"</s_" in content:
value = self.token2json(content, is_inner_value=True, added_vocab=added_vocab)
if value:
if len(value) == 1:
value = value[0]
output[key] = value
else:
output[key] = []
for leaf in content.split(r"<sep/>"):
leaf = leaf.strip()
if leaf in added_vocab and leaf[0] == "<" and leaf[-2:] == "/>":
leaf = leaf[1:-2]
output[key].append(leaf)
if len(output[key]) == 1:
output[key] = output[key][0]
tokens = tokens[tokens.find(end_token) + len(end_token):].strip()
if tokens[:6] == r"<sep/>":
return [output] + self.token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)
if len(output):
return [output] if is_inner_value else output
else:
return [] if is_inner_value else {"text_sequence": tokens}
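A sketch of what token2json produces for a Donut-style tag sequence (hypothetical tag names; shown as comments because the method needs a loaded tokenizer for `get_added_vocab`):
# processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
# sequence = "<s_menu><s_nm>Latte</s_nm><sep/><s_nm>Bagel</s_nm></s_menu>"
# processor.token2json(sequence)
# -> {'menu': [{'nm': 'Latte'}, {'nm': 'Bagel'}]}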
@property
def feature_extractor_class(self):
"""
Property accessor for deprecated feature_extractor_class.
Returns:
class: The image processor class.
Warns:
FutureWarning: This property is deprecated and will be removed in v5.
Use `image_processor_class` instead.
"""
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
"""
Property accessor for deprecated feature_extractor.
Returns:
object: The image processor instance.
Warns:
FutureWarning: This property is deprecated and will be removed in v5.
Use `image_processor` instead.
"""
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
.\models\donut\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_donut_swin": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutSwinConfig"],
"processing_donut": ["DonutProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_donut_swin"] = [
"DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
"DonutSwinModel",
"DonutSwinPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_donut"] = ["DonutFeatureExtractor"]
_import_structure["image_processing_donut"] = ["DonutImageProcessor"]
if TYPE_CHECKING:
from .configuration_donut_swin import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutSwinConfig
from .processing_donut import DonutProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_donut_swin import (
DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
DonutSwinModel,
DonutSwinPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_donut import DonutFeatureExtractor
from .image_processing_donut import DonutImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dpr\configuration_dpr.py
""" DPR 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/dpr-ctx_encoder-single-nq-base": (
"https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/config.json"
),
"facebook/dpr-question_encoder-single-nq-base": (
"https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/config.json"
),
"facebook/dpr-reader-single-nq-base": (
"https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/config.json"
),
"facebook/dpr-ctx_encoder-multiset-base": (
"https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/config.json"
),
"facebook/dpr-question_encoder-multiset-base": (
"https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/config.json"
),
"facebook/dpr-reader-multiset-base": (
"https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/config.json"
),
}
class DPRConfig(PretrainedConfig):
r"""
[`DPRConfig`] is the configuration class to store the configuration of a *DPRModel*.
This is the configuration class to store the configuration of a [`DPRContextEncoder`], [`DPRQuestionEncoder`], or a
[`DPRReader`]. It is used to instantiate the components of the DPR model according to the specified arguments,
defining the architecture of the model components. Instantiating a configuration with the defaults will yield a
configuration similar to that of the DPRContextEncoder
[facebook/dpr-ctx_encoder-single-nq-base](https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base)
architecture.
This class is a subclass of [`BertConfig`]. Please check the superclass for the documentation of all kwargs.
Example:
```
>>> from transformers import DPRConfig, DPRContextEncoder
>>> # Initializing a DPR facebook/dpr-ctx_encoder-single-nq-base style configuration
>>> configuration = DPRConfig()
>>> # Initializing a model (with random weights) from the facebook/dpr-ctx_encoder-single-nq-base style configuration
>>> model = DPRContextEncoder(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
# Set the model type to "dpr"
model_type = "dpr"
# Initializer defining the model hyper-parameters and configuration
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
projection_dim: int = 0,
**kwargs,
):
# Call the parent class initializer, passing pad_token_id and the remaining keyword arguments
super().__init__(pad_token_id=pad_token_id, **kwargs)
# Store the model hyper-parameters
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.projection_dim = projection_dim
self.position_embedding_type = position_embedding_type
.\models\dpr\convert_dpr_original_checkpoint_to_pytorch.py
import argparse
import collections
from pathlib import Path
import torch
from torch.serialization import default_restore_location
from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader
CheckpointState = collections.namedtuple(
"CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"]
)
def load_states_from_checkpoint(model_file: str) -> CheckpointState:
print(f"Reading saved model from {model_file}")
state_dict = torch.load(model_file, map_location=lambda s, l: default_restore_location(s, "cpu"))
return CheckpointState(**state_dict)
class DPRState:
def __init__(self, src_file: Path):
self.src_file = src_file
def load_dpr_model(self):
raise NotImplementedError
@staticmethod
def from_type(comp_type: str, *args, **kwargs) -> "DPRState":
if comp_type.startswith("c"):
return DPRContextEncoderState(*args, **kwargs)
if comp_type.startswith("q"):
return DPRQuestionEncoderState(*args, **kwargs)
if comp_type.startswith("r"):
return DPRReaderState(*args, **kwargs)
else:
raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.")
class DPRContextEncoderState(DPRState):
def load_dpr_model(self):
model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
print(f"Loading DPR biencoder from {self.src_file}")
saved_state = load_states_from_checkpoint(self.src_file)
encoder, prefix = model.ctx_encoder, "ctx_model."
state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids}
for key, value in saved_state.model_dict.items():
if key.startswith(prefix):
key = key[len(prefix) :]
if not key.startswith("encode_proj."):
key = "bert_model." + key
state_dict[key] = value
encoder.load_state_dict(state_dict)
return model
class DPRQuestionEncoderState(DPRState):
def load_dpr_model(self):
model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
print(f"Loading DPR biencoder from {self.src_file}")
saved_state = load_states_from_checkpoint(self.src_file)
encoder, prefix = model.question_encoder, "question_model."
state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids}
for key, value in saved_state.model_dict.items():
if key.startswith(prefix):
key = key[len(prefix) :]
if not key.startswith("encode_proj."):
key = "bert_model." + key
state_dict[key] = value
encoder.load_state_dict(state_dict)
return model
class DPRReaderState(DPRState):
def load_dpr_model(self):
model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
print(f"Loading DPR reader from {self.src_file}")
saved_state = load_states_from_checkpoint(self.src_file)
state_dict = {
"encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids
}
for key, value in saved_state.model_dict.items():
if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"):
key = "encoder.bert_model." + key[len("encoder.") :]
state_dict[key] = value
model.span_predictor.load_state_dict(state_dict)
return model
def convert(comp_type: str, src_file: Path, dest_dir: Path):
dest_dir = Path(dest_dir)
dest_dir.mkdir(exist_ok=True)
dpr_state = DPRState.from_type(comp_type, src_file=src_file)
model = dpr_state.load_dpr_model()
model.save_pretrained(dest_dir)
model.from_pretrained(dest_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
)
parser.add_argument(
"--src",
type=str,
help=(
"Path to the dpr checkpoint file. They can be downloaded from the official DPR repo"
" https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the"
" 'retriever' checkpoints."
),
)
parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.")
args = parser.parse_args()
src_file = Path(args.src)
dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest
dest_dir = Path(dest_dir)
assert src_file.exists()
assert (
args.type is not None
), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
convert(args.type, src_file, dest_dir)
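An example invocation of the conversion script (the flags match the argparse definitions above; the checkpoint path is hypothetical):
#   python convert_dpr_original_checkpoint_to_pytorch.py \
#       --type ctx_encoder \
#       --src ./path/to/dpr_biencoder_checkpoint.cp \
#       --dest ./converted-dpr-ctx_encoder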
.\models\dpr\modeling_dpr.py
""" PyTorch DPR model for Open Domain Question Answering. """
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..bert.modeling_bert import BertModel
from .configuration_dpr import DPRConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DPRConfig"
_CHECKPOINT_FOR_DOC = "facebook/dpr-ctx_encoder-single-nq-base"
DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-ctx_encoder-single-nq-base",
"facebook/dpr-ctx_encoder-multiset-base",
]
DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-question_encoder-single-nq-base",
"facebook/dpr-question_encoder-multiset-base",
]
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-reader-single-nq-base",
"facebook/dpr-reader-multiset-base",
]
@dataclass
class DPRContextEncoderOutput(ModelOutput):
"""
Class for outputs of [`DPRContextEncoder`].
"""
pooler_output: torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DPRQuestionEncoderOutput(ModelOutput):
"""
Class for outputs of [`DPRQuestionEncoder`].
Args:
pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
pooler_output: torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DPRReaderOutput(ModelOutput):
"""
Class for outputs of [`DPRReader`].
"""
start_logits: torch.FloatTensor
end_logits: torch.FloatTensor = None
relevance_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class DPRPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class DPREncoder(DPRPreTrainedModel):
base_model_prefix = "bert_model"
def __init__(self, config: DPRConfig):
super().__init__(config)
self.bert_model = BertModel(config, add_pooling_layer=False)
if self.bert_model.config.hidden_size <= 0:
raise ValueError("Encoder hidden_size can't be zero")
self.projection_dim = config.projection_dim
if self.projection_dim > 0:
self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
self.post_init()
def forward(
self,
input_ids: Tensor,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
outputs = self.bert_model(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
pooled_output = sequence_output[:, 0, :]
if self.projection_dim > 0:
pooled_output = self.encode_proj(pooled_output)
if not return_dict:
return (sequence_output, pooled_output) + outputs[2:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@property
def embeddings_size(self) -> int:
if self.projection_dim > 0:
return self.encode_proj.out_features
return self.bert_model.config.hidden_size
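A small sketch (hypothetical sizes) of the pooling performed by DPREncoder: the hidden state of the first ([CLS]) token is taken and, when projection_dim > 0, passed through the extra linear projection:
import torch
from torch import nn

sequence_output = torch.randn(2, 10, 768)   # (batch, seq_len, hidden_size)
pooled_output = sequence_output[:, 0, :]    # (2, 768), the [CLS] position
encode_proj = nn.Linear(768, 128)           # would correspond to projection_dim=128
assert encode_proj(pooled_output).shape == (2, 128)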
class DPRSpanPredictor(DPRPreTrainedModel):
base_model_prefix = "encoder"
def __init__(self, config: DPRConfig):
super().__init__(config)
self.encoder = DPREncoder(config)
self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
self.post_init()
def forward(
self,
input_ids: Tensor,
attention_mask: Tensor,
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
outputs = self.encoder(
input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
relevance_logits = self.qa_classifier(sequence_output[:, 0, :])
start_logits = start_logits.view(n_passages, sequence_length)
end_logits = end_logits.view(n_passages, sequence_length)
relevance_logits = relevance_logits.view(n_passages)
if not return_dict:
return (start_logits, end_logits, relevance_logits) + outputs[2:]
return DPRReaderOutput(
start_logits=start_logits,
end_logits=end_logits,
relevance_logits=relevance_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
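A shape sketch of the reader head (hypothetical sizes): one linear layer yields per-token start/end logits, another scores passage relevance from the [CLS] position, and everything is reshaped per passage:
import torch
from torch import nn

n_passages, seq_len, hidden = 3, 16, 768
sequence_output = torch.randn(n_passages, seq_len, hidden)
qa_outputs, qa_classifier = nn.Linear(hidden, 2), nn.Linear(hidden, 1)
start_logits, end_logits = qa_outputs(sequence_output).split(1, dim=-1)
start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)  # (n_passages, seq_len)
relevance_logits = qa_classifier(sequence_output[:, 0, :]).view(n_passages)  # (n_passages,)
assert start_logits.shape == (3, 16) and relevance_logits.shape == (3,)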
"""
Contains the docstring for input specifications to DPR encoders.
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
(a) 对于序列对(例如标题+文本对):
```
tokens: [CLS] is this jack
token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
```
(b) 对于单个序列(例如问题):
```
tokens: [CLS] the dog is hairy . [SEP]
token_type_ids: 0 0 0 0 0 0 0
```
DPR是一个具有绝对位置嵌入的模型,因此通常建议在右侧而不是左侧填充输入。
可以使用[`AutoTokenizer`]获取索引。详见[`PreTrainedTokenizer.encode`]和[`PreTrainedTokenizer.__call__`]的详细说明。
[什么是输入ID?](../glossary
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
- 1 表示**未掩码**的标记,
- 0 表示**已掩码**的标记。
[什么是注意力掩码?](../glossary
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
- 0 对应于*句子A*标记,
- 1 对应于*句子B*标记。
[什么是标记类型ID?](../glossary
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
DPR_READER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`Tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
be formatted with [CLS] and [SEP] with the format:
`[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `(n_passages, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
DPR_START_DOCSTRING,
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
"""
DPRContextEncoder extends DPRPretrainedContextEncoder to encode context using DPR models.
Args:
config (DPRConfig): Configuration object specifying the model configuration.
Attributes:
config (DPRConfig): The configuration object used to initialize the model.
ctx_encoder (DPREncoder): Encoder instance responsible for encoding contexts.
Methods:
post_init(): Initializes weights and applies final processing after initialization.
"""
def __init__(self, config: DPRConfig):
"""
Initializes a DPRContextEncoder instance.
Args:
config (DPRConfig): Configuration object specifying the model configuration.
"""
super().__init__(config)
self.config = config
self.ctx_encoder = DPREncoder(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
r"""
此方法定义了模型的前向传播逻辑,接受多个输入参数并返回相应的输出。
Return:
返回一个包含池化输出、隐藏状态、注意力权重的对象,具体取决于return_dict参数设置。
Examples:
```
>>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
>>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
>>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```"""
# Whether to return attention weights; fall back to the config default
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Whether to return hidden states; fall back to the config default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Whether to return a ModelOutput instead of a tuple; fall back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Validate that exactly one of input_ids and inputs_embeds is provided
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Pick the device from whichever input was provided
device = input_ids.device if input_ids is not None else inputs_embeds.device
# If no attention mask is given, build one by masking out padding tokens
if attention_mask is None:
attention_mask = (
torch.ones(input_shape, device=device)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
# Default token_type_ids to all zeros if not provided
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Run the context encoder with the prepared inputs
outputs = self.ctx_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If a dict output was not requested, return a tuple (everything after the sequence output)
if not return_dict:
return outputs[1:]
# Otherwise wrap pooler output, hidden states and attentions in a DPRContextEncoderOutput
return DPRContextEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
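The default-mask logic in the forward pass above can be reproduced in isolation. The snippet below is a minimal sketch assuming a `pad_token_id` of 0; the actual value comes from the model config.

```python
import torch

# Minimal sketch of the default attention_mask / token_type_ids built in forward() above.
# Assumption: pad_token_id is 0; the real value is read from self.config.pad_token_id.
pad_token_id = 0
input_ids = torch.tensor([[101, 7592, 1010, 2003, 102, 0, 0]])  # one right-padded sequence
attention_mask = input_ids != pad_token_id        # True for real tokens, False for padding
token_type_ids = torch.zeros_like(input_ids)      # everything treated as "sentence A"
print(attention_mask.long())                      # tensor([[1, 1, 1, 1, 1, 0, 0]])
```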
# Add a docstring describing DPRQuestionEncoder as outputting pooler outputs as question representations
@add_start_docstrings(
"The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
DPR_START_DOCSTRING,
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
def __init__(self, config: DPRConfig):
# Call the parent class initializer with the config object
super().__init__(config)
# Keep a reference to the configuration on the instance
self.config = config
# Create a DPREncoder instance as the question encoder
self.question_encoder = DPREncoder(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
r"""
Returns either a `DPRQuestionEncoderOutput` or a tuple of `Tensor`s, depending on the `return_dict` setting.
Examples:
```
>>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
>>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
>>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```
"""
# Whether to return attention weights; fall back to the config default
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Whether to return hidden states; fall back to the config default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Whether to return a ModelOutput instead of a tuple; fall back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Derive input_shape from input_ids or inputs_embeds; error out if both or neither are given
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Pick the device from whichever input was provided
device = input_ids.device if input_ids is not None else inputs_embeds.device
# If no attention mask is given, build one by masking out padding tokens
if attention_mask is None:
attention_mask = (
torch.ones(input_shape, device=device)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
# Default token_type_ids to all zeros if not provided
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Run the question encoder with the prepared inputs
outputs = self.question_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If a dict output was not requested, return a tuple (everything after the sequence output)
if not return_dict:
return outputs[1:]
# Otherwise wrap pooler output, hidden states and attentions in a DPRQuestionEncoderOutput
return DPRQuestionEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
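The `pooler_output` returned here is meant to be compared against context embeddings. A hedged sketch of the typical dot-product scoring, with random tensors standing in for real encoder outputs, looks like this:

```python
import torch

# Hedged sketch of DPR-style retrieval scoring: dot products between one question embedding
# and a batch of context embeddings. Random tensors stand in for real pooler_output values.
question_emb = torch.randn(1, 768)      # (1, embeddings_size) from DPRQuestionEncoder
context_embs = torch.randn(10, 768)     # (n_contexts, embeddings_size) from DPRContextEncoder
scores = question_emb @ context_embs.T  # (1, n_contexts) similarity scores
best_context = scores.argmax(dim=-1)    # index of the highest-scoring context
```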
# The decorator below adds a docstring to the class describing its basic purpose
@add_start_docstrings(
"The bare DPRReader transformer outputting span predictions.",
DPR_START_DOCSTRING,
)
# Define the DPRReader class, inheriting from DPRPretrainedReader
class DPRReader(DPRPretrainedReader):
# Constructor takes a DPRConfig instance
def __init__(self, config: DPRConfig):
# Call the parent class initializer with the config
super().__init__(config)
# Keep a reference to the configuration on the instance
self.config = config
# Create a DPRSpanPredictor instance as the span_predictor attribute
self.span_predictor = DPRSpanPredictor(config)
# Call post_init to initialize weights and apply final processing
self.post_init()
# Forward pass: accepts several inputs and returns the span predictions
@add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# The decorators above document the accepted inputs in detail
# and replace the return-type docs with DPRReaderOutput, using the _CONFIG_FOR_DOC config class
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
r"""
Returns the span predictions, or a tuple of tensors when `return_dict=False`.
Examples:
```
>>> from transformers import DPRReader, DPRReaderTokenizer
>>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
>>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
>>> encoded_inputs = tokenizer(
... questions=["What is love ?"],
... titles=["Haddaway"],
... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
... return_tensors="pt",
... )
>>> outputs = model(**encoded_inputs)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> relevance_logits = outputs.relevance_logits
```
"""
# Whether to return attention weights; fall back to the config default
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Whether to return hidden states; fall back to the config default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Whether to return a ModelOutput instead of a tuple; fall back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Raise an error if both input_ids and inputs_embeds are specified
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
# If only input_ids is given, warn when padding is present without an attention_mask
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
# If only inputs_embeds is given, derive the input shape from it
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
# Raise an error if neither input_ids nor inputs_embeds is specified
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Pick the device from whichever input was provided
device = input_ids.device if input_ids is not None else inputs_embeds.device
# Default to an all-ones attention mask if none was provided
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
# Run the span predictor and return its result
return self.span_predictor(
input_ids,
attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
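For completeness, a naive way to turn the reader's logits into a single span is sketched below. The library also provides `DPRReaderTokenizer.decode_best_spans` for the full procedure, so this is only an illustration with random logits.

```python
import torch

# Naive, illustrative span decoding from DPRReader-style outputs (random logits used here).
start_logits = torch.randn(3, 20)       # (n_passages, sequence_length)
end_logits = torch.randn(3, 20)
relevance_logits = torch.randn(3)       # (n_passages,)
passage = relevance_logits.argmax().item()                  # most relevant passage
start = start_logits[passage].argmax().item()               # best start position in that passage
end = start + end_logits[passage, start:].argmax().item()   # best end position, constrained to end >= start
print(passage, start, end)
```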
.\models\dpr\modeling_tf_dpr.py
from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple, Union
import tensorflow as tf
from ...modeling_tf_outputs import TFBaseModelOutputWithPooling
from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, keras, shape_list, unpack_inputs
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..bert.modeling_tf_bert import TFBertMainLayer
from .configuration_dpr import DPRConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DPRConfig"
TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-ctx_encoder-single-nq-base",
"facebook/dpr-ctx_encoder-multiset-base",
]
TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-question_encoder-single-nq-base",
"facebook/dpr-question_encoder-multiset-base",
]
TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dpr-reader-single-nq-base",
"facebook/dpr-reader-multiset-base",
]
@dataclass
class TFDPRContextEncoderOutput(ModelOutput):
r"""
Class for outputs of [`TFDPRContextEncoder`].
"""
"""
Args:
pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
DPR编码器的输出,对应于上下文表示。是序列中第一个标记(分类标记)的最后一层隐藏状态,
进一步由线性层处理。此输出用于嵌入上下文,以便使用问题嵌入进行最近邻查询。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`tf.Tensor`元组(当传递`output_hidden_states=True`或`config.output_hidden_states=True`时返回),
包含形状为`(batch_size, sequence_length, hidden_size)`的张量。
模型在每个层的输出隐藏状态,以及初始嵌入输出。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
`tf.Tensor`元组(当传递`output_attentions=True`或`config.output_attentions=True`时返回),
包含形状为`(batch_size, num_heads, sequence_length, sequence_length)`的张量。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
pooler_output: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFDPRQuestionEncoderOutput(ModelOutput):
"""
Class for outputs of [`TFDPRQuestionEncoder`].
Args:
pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
pooler_output: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFDPRReaderOutput(ModelOutput):
"""
Class for outputs of [`TFDPRReaderEncoder`].
"""
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
relevance_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
class TFDPREncoderLayer(keras.layers.Layer):
base_model_prefix = "bert_model"
def __init__(self, config: DPRConfig, **kwargs):
super().__init__(**kwargs)
self.bert_model = TFBertMainLayer(config, add_pooling_layer=False, name="bert_model")
self.config = config
if self.config.hidden_size <= 0:
raise ValueError("Encoder hidden_size can't be zero")
self.projection_dim = config.projection_dim
if self.projection_dim > 0:
self.encode_proj = keras.layers.Dense(
config.projection_dim, kernel_initializer=get_initializer(config.initializer_range), name="encode_proj"
)
@unpack_inputs
def call(
self,
input_ids: tf.Tensor = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool = None,
output_hidden_states: bool = None,
return_dict: bool = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]:
outputs = self.bert_model(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
pooled_output = sequence_output[:, 0, :]
if self.projection_dim > 0:
pooled_output = self.encode_proj(pooled_output)
if not return_dict:
return (sequence_output, pooled_output) + outputs[1:]
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@property
def embeddings_size(self) -> int:
if self.projection_dim > 0:
return self.projection_dim
return self.bert_model.config.hidden_size
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert_model", None) is not None:
with tf.name_scope(self.bert_model.name):
self.bert_model.build(None)
if getattr(self, "encode_proj", None) is not None:
with tf.name_scope(self.encode_proj.name):
self.encode_proj.build(None)
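The pooling performed in `TFDPREncoderLayer.call` boils down to taking the first ([CLS]) hidden state and, when `projection_dim > 0`, projecting it. A minimal standalone sketch follows; the shapes and projection dim are illustrative assumptions, not values read from a real `DPRConfig`.

```python
import tensorflow as tf

# Standalone sketch of the pooling in TFDPREncoderLayer.call. Shapes and projection_dim are
# illustrative assumptions.
sequence_output = tf.random.normal((2, 16, 768))   # (batch, sequence_length, hidden_size)
pooled_output = sequence_output[:, 0, :]            # hidden state of the first ([CLS]) token
projection_dim = 128                                 # assume config.projection_dim > 0
encode_proj = tf.keras.layers.Dense(projection_dim)
pooled_output = encode_proj(pooled_output)           # (batch, projection_dim)
```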
class TFDPRSpanPredictorLayer(keras.layers.Layer):
base_model_prefix = "encoder"
def __init__(self, config: DPRConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.encoder = TFDPREncoderLayer(config, name="encoder")
self.qa_outputs = keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.qa_classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="qa_classifier"
)
def call(
self,
input_ids: tf.Tensor = None,
attention_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
training: bool = False,
) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
n_passages, sequence_length = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:2]
outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
relevance_logits = self.qa_classifier(sequence_output[:, 0, :])
start_logits = tf.reshape(start_logits, [n_passages, sequence_length])
end_logits = tf.reshape(end_logits, [n_passages, sequence_length])
relevance_logits = tf.reshape(relevance_logits, [n_passages])
if not return_dict:
return (start_logits, end_logits, relevance_logits) + outputs[2:]
return TFDPRReaderOutput(
start_logits=start_logits,
end_logits=end_logits,
relevance_logits=relevance_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.encoder.embeddings_size])
if getattr(self, "qa_classifier", None) is not None:
with tf.name_scope(self.qa_classifier.name):
self.qa_classifier.build([None, None, self.encoder.embeddings_size])
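The span-prediction head above can also be exercised in isolation. The sketch below mirrors the `Dense(2)` split into start/end logits and the `Dense(1)` relevance score, using random tensors and assumed shapes.

```python
import tensorflow as tf

# Isolated sketch of the TFDPRSpanPredictorLayer head: Dense(2) -> start/end logits per token,
# Dense(1) on the [CLS] state -> one relevance logit per passage. Shapes are assumptions.
n_passages, sequence_length, hidden_size = 3, 16, 768
sequence_output = tf.random.normal((n_passages, sequence_length, hidden_size))
qa_outputs = tf.keras.layers.Dense(2)
qa_classifier = tf.keras.layers.Dense(1)
logits = qa_outputs(sequence_output)                     # (n_passages, sequence_length, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.reshape(tf.squeeze(start_logits, axis=-1), [n_passages, sequence_length])
end_logits = tf.reshape(tf.squeeze(end_logits, axis=-1), [n_passages, sequence_length])
relevance_logits = tf.reshape(qa_classifier(sequence_output[:, 0, :]), [n_passages])
```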
class TFDPRSpanPredictor(TFPreTrainedModel):
base_model_prefix = "encoder"
def __init__(self, config: DPRConfig, **kwargs):
super().__init__(config, **kwargs)
self.encoder = TFDPRSpanPredictorLayer(config)
@unpack_inputs
def call(
self,
input_ids: tf.Tensor = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
training: bool = False,
) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
class TFDPREncoder(TFPreTrainedModel):
base_model_prefix = "encoder"
def __init__(self, config: DPRConfig, **kwargs):
super().__init__(config, **kwargs)
self.encoder = TFDPREncoderLayer(config)
@unpack_inputs
def call(
self,
input_ids: tf.Tensor = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
training: bool = False,
) -> Union[TFDPRReaderOutput, Tuple[tf.Tensor, ...]]:
outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
class TFDPRPretrainedContextEncoder(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DPRConfig
base_model_prefix = "ctx_encoder"
class TFDPRPretrainedQuestionEncoder(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DPRConfig
base_model_prefix = "question_encoder"
class TFDPRPretrainedReader(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DPRConfig
base_model_prefix = "reader"
TF_DPR_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements, such as downloading or saving a model, resizing the input embeddings, and pruning heads.
This model is also a TensorFlow `keras.Model` subclass. Use it as a regular TF 2.0 Keras model and refer to the
TF 2.0 documentation for everything related to general usage and behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two input formats:
- all inputs as keyword arguments (like PyTorch models), or
- all inputs as a list, tuple or dict in the first positional argument.
The second format is supported because Keras methods prefer it when passing inputs to models and layers. Thanks to
this support, when using methods like `model.fit()` you can simply pass your inputs and labels in any format that
`model.fit()` supports. If, however, you want to use the second format yourself, for example when creating your own
layers or models with the Keras `Functional` API, there are three ways to gather all the input tensors in the first
positional argument:
- a single tensor with `input_ids` only: `model(input_ids)`
- a list of varying length with one or several input tensors, in the order given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input tensors associated with the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) you don't need to worry about
any of this, since you can pass inputs just like you would to any other Python function!
</Tip>
Parameters:
config ([`DPRConfig`]): Model configuration class with all the parameters of the model.
Initializing with a configuration file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
"""
TF_DPR_READER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`Numpy array` or `tf.Tensor` of shapes `(n_passages, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
and 2) the passages titles and 3) the passages texts. To match pretraining, the DPR `input_ids` sequence should
be formatted with [CLS] and [SEP] with the format:
`[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
attention_mask (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
TF_DPR_START_DOCSTRING,
)
class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
def __init__(self, config: DPRConfig, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.ctx_encoder = TFDPREncoderLayer(config, name="ctx_encoder")
def get_input_embeddings(self):
try:
return self.ctx_encoder.bert_model.get_input_embeddings()
except AttributeError:
self.build()
return self.ctx_encoder.bert_model.get_input_embeddings()
@unpack_inputs
@add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFDPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFDPRContextEncoderOutput | Tuple[tf.Tensor, ...]:
r"""
Returns the model outputs.
Examples:
```
>>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
>>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
>>> model = TFDPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", from_pt=True)
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```
"""
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = (
tf.ones(input_shape, dtype=tf.dtypes.int32)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
if token_type_ids is None:
token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)
outputs = self.ctx_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return outputs[1:]
return TFDPRContextEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "ctx_encoder", None) is not None:
with tf.name_scope(self.ctx_encoder.name):
self.ctx_encoder.build(None)
@add_start_docstrings(
"The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
TF_DPR_START_DOCSTRING,
)
class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
def __init__(self, config: DPRConfig, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.question_encoder = TFDPREncoderLayer(config, name="question_encoder")
def get_input_embeddings(self):
try:
return self.question_encoder.bert_model.get_input_embeddings()
except AttributeError:
self.build()
return self.question_encoder.bert_model.get_input_embeddings()
@unpack_inputs
@add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFDPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFDPRQuestionEncoderOutput | Tuple[tf.Tensor, ...]:
r"""
Returns a `TFDPRQuestionEncoderOutput` or a tuple of `tf.Tensor`, depending on the `return_dict` setting.
Examples:
```
>>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
>>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
>>> model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", from_pt=True)
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```
"""
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = (
tf.ones(input_shape, dtype=tf.dtypes.int32)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
if token_type_ids is None:
token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)
outputs = self.question_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return outputs[1:]
return TFDPRQuestionEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "question_encoder", None) is not None:
with tf.name_scope(self.question_encoder.name):
self.question_encoder.build(None)
@add_start_docstrings(
"The bare DPRReader transformer outputting span predictions.",
TF_DPR_START_DOCSTRING,
)
class TFDPRReader(TFDPRPretrainedReader):
def __init__(self, config: DPRConfig, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.span_predictor = TFDPRSpanPredictorLayer(config, name="span_predictor")
def get_input_embeddings(self):
try:
return self.span_predictor.encoder.bert_model.get_input_embeddings()
except AttributeError:
self.build()
return self.span_predictor.encoder.bert_model.get_input_embeddings()
@unpack_inputs
@add_start_docstrings_to_model_forward(TF_DPR_READER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFDPRReaderOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFDPRReaderOutput | Tuple[tf.Tensor, ...]:
r"""
Forward pass of the model; it accepts several inputs and returns the span predictions.
Return:
A `TFDPRReaderOutput` or a tuple of `tf.Tensor`.
Examples:
```
>>> from transformers import TFDPRReader, DPRReaderTokenizer
>>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
>>> model = TFDPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True)
>>> encoded_inputs = tokenizer(
... questions=["What is love ?"],
... titles=["Haddaway"],
... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
... return_tensors="tf",
... )
>>> outputs = model(encoded_inputs)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> relevance_logits = outputs.relevance_logits
```
"""
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)
return self.span_predictor(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "span_predictor", None) is not None:
with tf.name_scope(self.span_predictor.name):
self.span_predictor.build(None)