Transformers 源码解析(三十四)
.\models\deberta_v2\tokenization_deberta_v2.py
"""
Tokenization class for model DeBERTa.
"""
import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as sp
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
# Module-level logger obtained through the library's logging wrapper.
logger = logging.get_logger(__name__)

# For each known checkpoint name, the URL of its SentencePiece model file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
        ),
        "microsoft/deberta-v2-xxlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
        ),
    }
}

# Per-checkpoint size value; exposed as `max_model_input_sizes` on the tokenizer class below.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-v2-xlarge": 512,
    "microsoft/deberta-v2-xxlarge": 512,
    "microsoft/deberta-v2-xlarge-mnli": 512,
    "microsoft/deberta-v2-xxlarge-mnli": 512,
}

# Per-checkpoint default constructor arguments (all checkpoints keep the original casing).
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
}

# File name under which the SentencePiece model is stored (also used by `save_pretrained`).
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
class DebertaV2Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    The actual SentencePiece handling is delegated to an `SPMTokenizer` instance stored in `self._tokenizer`.
    """

    # Class-level tables wired to the module constants defined above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
    self,
    vocab_file,
    do_lower_case=False,
    split_by_punct=False,
    bos_token="[CLS]",
    eos_token="[SEP]",
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> None:
    """
    Initialize a DebertaV2Tokenizer.

    Args:
        vocab_file (str): Path to the SentencePiece model file.
        do_lower_case (bool): Whether to lowercase the text before tokenizing.
        split_by_punct (bool): Whether to split the text on punctuation before SentencePiece encoding.
        bos_token (str): Beginning-of-sequence token.
        eos_token (str): End-of-sequence token.
        unk_token (str): Token used for unknown pieces.
        sep_token (str): Separator token.
        pad_token (str): Padding token.
        cls_token (str): Classification token.
        mask_token (str): Mask token.
        sp_model_kwargs (Optional[Dict[str, Any]]): Keyword arguments forwarded to the SentencePiece processor.
        **kwargs: Additional keyword arguments forwarded to the base class.

    Raises:
        ValueError: If `vocab_file` is not an existing file.
    """
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

    if not os.path.isfile(vocab_file):
        raise ValueError(
            f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
            " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )

    self.do_lower_case = do_lower_case
    self.split_by_punct = split_by_punct
    self.vocab_file = vocab_file

    # The SentencePiece wrapper is built before the base-class initializer runs; the special-token list is
    # not known yet, so `None` is passed here and filled in at the end.
    self._tokenizer = SPMTokenizer(
        vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
    )

    # A plain-string unknown token is wrapped so it is registered as a normalized special token.
    unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token

    # Single base-class initialization (the source previously contained two fused constructor bodies).
    super().__init__(
        do_lower_case=do_lower_case,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        split_by_punct=split_by_punct,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )
    self._tokenizer.special_tokens = self.all_special_tokens
@property
def vocab_size(self):
    """Number of entries in the underlying SentencePiece vocabulary."""
    entries = self.vocab
    return len(entries)
@property
def vocab(self):
    """Piece-to-id mapping held by the wrapped SentencePiece tokenizer."""
    backend = self._tokenizer
    return backend.vocab
def get_vocab(self):
    """Return a new dict combining the base vocabulary with the added tokens (added tokens win on clashes)."""
    return {**self.vocab, **self.get_added_vocab()}
def _tokenize(self, text: str) -> List[str]:
    """Split `text` into sub-word tokens, lowercasing it first when `do_lower_case` is set."""
    prepared = text.lower() if self.do_lower_case else text
    return self._tokenizer.tokenize(prepared)
def _convert_token_to_id(self, token):
    """Look up the id of `token` (str) in the SentencePiece model."""
    processor = self._tokenizer.spm
    return processor.PieceToId(token)
def _convert_id_to_token(self, index):
    """Map an integer id to its piece; ids at or beyond `vocab_size` yield the unknown token."""
    if index < self.vocab_size:
        return self._tokenizer.spm.IdToPiece(index)
    return self.unk_token
def convert_tokens_to_string(self, tokens):
    """Join a sequence of token strings back into one string via the wrapped tokenizer's `decode`."""
    backend = self._tokenizer
    return backend.decode(tokens)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Build model inputs from a sequence or a pair of sequences by concatenating them and adding special tokens.
    A DeBERTa sequence has the following format:

    - single sequence: [CLS] X [SEP]
    - pair of sequences: [CLS] A [SEP] B [SEP]

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of input IDs with the appropriate special tokens.
    """
    start = [self.cls_token_id]
    single = start + token_ids_0 + [self.sep_token_id]
    if token_ids_1 is None:
        return single
    return single + token_ids_1 + [self.sep_token_id]
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """
    Retrieve the special-token mask for a token list that has no special tokens added yet. Called when special
    tokens are added through the tokenizer's `prepare_for_model` or `encode_plus` methods.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether the token list is already formatted with the model's special tokens.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        # Already-formatted input is handled entirely by the base class.
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    # [CLS] A [SEP], optionally followed by B [SEP].
    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is not None:
        mask = mask + [0] * len(token_ids_1) + [1]
    return mask
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """
    Create the token-type mask for a sequence or a sequence pair. A DeBERTa sequence pair mask has the
    following format:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    separator = [self.sep_token_id]
    classifier = [self.cls_token_id]

    # [CLS] A [SEP] is segment 0; B [SEP] (when present) is segment 1.
    first_segment = [0] * len(classifier + token_ids_0 + separator)
    if token_ids_1 is None:
        return first_segment
    second_segment = [1] * len(token_ids_1 + separator)
    return first_segment + second_segment
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
    """
    Optionally prepend a space to `text` before tokenization.

    `add_prefix_space` is always removed from `kwargs`; a leading space is added when either it or
    `is_split_into_words` is truthy. Returns the `(text, remaining_kwargs)` tuple.
    """
    wants_prefix = kwargs.pop("add_prefix_space", False)
    if not (is_split_into_words or wants_prefix):
        return (text, kwargs)
    return (" " + text, kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """Write the SentencePiece model into `save_directory` by delegating to the wrapped tokenizer."""
    backend = self._tokenizer
    return backend.save_pretrained(save_directory, filename_prefix=filename_prefix)
class SPMTokenizer:
    r"""
    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """
def __init__(
    self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None
):
    """
    Load the SentencePiece model at `vocab_file` and build the piece/id lookup tables.

    Args:
        vocab_file: Path of the SentencePiece model file.
        special_tokens: Collection of tokens that `decode` must not pass through SentencePiece.
        split_by_punct: Whether text is split on punctuation before encoding.
        sp_model_kwargs: Keyword arguments for `SentencePieceProcessor`; `None` means no extra arguments.

    Raises:
        FileNotFoundError: If `vocab_file` does not exist.
    """
    self.split_by_punct = split_by_punct
    self.vocab_file = vocab_file
    self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}

    processor = sp.SentencePieceProcessor(**self.sp_model_kwargs)
    if not os.path.exists(vocab_file):
        raise FileNotFoundError(f"{vocab_file} does not exist!")
    processor.load(vocab_file)

    # Index -> piece list first, then the inverse mapping derived from it.
    piece_count = processor.GetPieceSize()
    self.ids_to_tokens = [processor.IdToPiece(i) for i in range(piece_count)]
    self.vocab = {piece: i for i, piece in enumerate(self.ids_to_tokens)}

    self.spm = processor
    self.special_tokens = special_tokens
def __getstate__(self):
    """Return a shallow copy of the instance dict with the SentencePiece processor (`spm`) blanked out."""
    return {**self.__dict__, "spm": None}
def __setstate__(self, d):
    """Restore the instance from `d` and rebuild the SentencePiece processor from `vocab_file`."""
    self.__dict__ = d

    # States saved without `sp_model_kwargs` fall back to an empty kwargs dict.
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    processor = sp.SentencePieceProcessor(**self.sp_model_kwargs)
    self.spm = processor
    processor.Load(self.vocab_file)
def tokenize(self, text):
    """Encode `text` into SentencePiece pieces."""
    pieces = self._encode_as_pieces(text)
    return pieces
def convert_ids_to_tokens(self, ids):
    """Translate an iterable of ids into the corresponding list of pieces."""
    table = self.ids_to_tokens
    return [table[i] for i in ids]
def decode(self, tokens, start=-1, end=-1, raw_text=None):
    """
    Turn `tokens` back into text.

    Without `raw_text`, runs of ordinary pieces are decoded by SentencePiece while special tokens are copied
    verbatim, and the stripped result is returned. With `raw_text`, the text is split into words and the words
    covering token positions `start` to `end` are joined and returned.
    """
    if raw_text is not None:
        words = self.split_to_words(raw_text)
        pieces_per_word = [self.tokenize(word) for word in words]

        # token2words[t] is the index of the word that token position t belongs to.
        token2words = [0] * len(tokens)
        position = 0
        for word_index, word_pieces in enumerate(pieces_per_word):
            for _ in word_pieces:
                token2words[position] = word_index
                position += 1

        first_word = token2words[start]
        last_word = token2words[end] if end < len(tokens) else len(words)
        return "".join(words[first_word:last_word])

    pending = []
    decoded = ""
    last_was_special = False
    for token in tokens:
        if token not in self.special_tokens:
            pending.append(token)
            last_was_special = False
            continue
        # Special tokens bypass SentencePiece; flush the pending pieces in front of them.
        if not last_was_special:
            decoded += " "
        decoded += self.spm.decode_pieces(pending) + token
        last_was_special = True
        pending = []
    decoded += self.spm.decode_pieces(pending)
    return decoded.strip()
def add_special_token(self, token):
    """
    Register `token` as a special token and return its id.

    A token that is not yet special is appended to `special_tokens`; if it is also missing from the vocabulary
    it is appended to `ids_to_tokens` and mapped to that new index, so `vocab` and `ids_to_tokens` stay
    consistent. (The previous `len(self.vocab) - 1` reused the id of the last existing piece.)

    Args:
        token: The token string to register.

    Returns:
        The id reported by `self.id(token)`.
    """
    if token not in self.special_tokens:
        self.special_tokens.append(token)
        if token not in self.vocab:
            # The next free id is the position the token takes in `ids_to_tokens`.
            self.vocab[token] = len(self.ids_to_tokens)
            self.ids_to_tokens.append(token)
    return self.id(token)
def part_of_whole_word(self, token, is_bos=False):
    """
    Deprecated. Tell whether `token` continues a word rather than starting one.

    Returns `True` when `is_bos` is set; `False` for a lone whitespace/control/punctuation character or a
    special token; otherwise `True` exactly when the token does not start with the "▁" word marker.
    """
    logger.warning_once(
        "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
    )
    if is_bos:
        return True

    is_lone_separator = len(token) == 1 and (
        _is_whitespace(token[0]) or _is_control(token[0]) or _is_punctuation(token[0])
    )
    if is_lone_separator or token in self.special_tokens:
        return False

    marker = b"\xe2\x96\x81".decode("utf-8")
    return not token.startswith(marker)
def pad(self):
    """Return the literal padding token string."""
    return "[PAD]"
def bos(self):
    """Return the literal beginning-of-sequence token string."""
    return "[CLS]"
def eos(self):
    """Return the literal end-of-sequence token string."""
    return "[SEP]"
def unk(self):
    """Return the literal unknown token string."""
    return "[UNK]"
def mask(self):
    """Return the literal mask token string."""
    return "[MASK]"
def sym(self, id):
    """Return the piece stored at index `id` of `ids_to_tokens`."""
    table = self.ids_to_tokens
    return table[id]
def id(self, sym):
    """Deprecated. Return the vocabulary id of `sym`, or 1 when `sym` is not in the vocabulary."""
    logger.warning_once(
        "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
    )
    return self.vocab.get(sym, 1)
def _encode_as_pieces(self, text):
    """
    Encode `text` into a flat list of SentencePiece pieces.

    When `split_by_punct` is set, the text is first split on punctuation and each chunk is encoded separately.
    """
    text = convert_to_unicode(text)
    if not self.split_by_punct:
        return self.spm.encode(text, out_type=str)

    chunks = self._run_split_on_punc(text)
    per_chunk = [self.spm.encode(chunk, out_type=str) for chunk in chunks]
    flat = []
    for chunk_pieces in per_chunk:
        flat.extend(chunk_pieces)
    return flat
def split_to_words(self, text):
    """
    Split `text` into consecutive substrings ("words") guided by its SentencePiece pieces.

    The text is encoded into pieces; the pieces are then located in `text` one after another, and a new word
    boundary is emitted each time a piece starting with the "▁" marker is reached.

    Args:
        text: The raw text to split.

    Returns:
        A list of slices of `text`, in order.
    """
    pieces = self._encode_as_pieces(text)
    word_start = b"\xe2\x96\x81".decode("utf-8")  # the "▁" marker
    words = []
    offset = 0  # current scan position in `text`
    prev_end = 0  # start of the word currently being accumulated
    for i, p in enumerate(pieces):
        if p.startswith(word_start):
            # A marker piece opens a new word: flush whatever was scanned since the last boundary.
            if offset > prev_end:
                words.append(text[prev_end:offset])
            prev_end = offset
            w = p.replace(word_start, "")
        else:
            w = p
        try:
            s = text.index(w, offset)
            # Look ahead for the next piece that is non-empty once the marker is removed.
            pn = ""
            k = i + 1
            while k < len(pieces):
                pn = pieces[k].replace(word_start, "")
                if len(pn) > 0:
                    break
                k += 1

            if len(pn) > 0 and pn in text[offset:s]:
                # The following piece already occurs before the match found for this one,
                # so only step one character forward instead of jumping to the match.
                offset = offset + 1
            else:
                offset = s + len(w)
        except Exception:
            # `text.index` raises when the piece text is not found from `offset`; advance one character.
            offset = offset + 1

    # Flush the trailing word, if any.
    if prev_end < offset:
        words.append(text[prev_end:offset])

    return words
def _run_split_on_punc(self, text):
    """Split `text` so that every punctuation character becomes its own chunk."""
    groups = []
    open_new_group = True
    for char in list(text):
        if _is_punctuation(char):
            # Punctuation stands alone and forces the next character to start a fresh chunk.
            groups.append([char])
            open_new_group = True
            continue
        if open_new_group:
            groups.append([])
        open_new_group = False
        groups[-1].append(char)
    return ["".join(group) for group in groups]
def save_pretrained(self, path: str, filename_prefix: str = None):
    """
    Write the serialized SentencePiece model into directory `path`.

    The file name is the first value of `VOCAB_FILES_NAMES`, prefixed with `filename_prefix` and a dash when a
    prefix is given. Returns a one-element tuple holding the written file's path.
    """
    base_name = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
    filename = base_name if filename_prefix is None else filename_prefix + "-" + base_name
    full_path = os.path.join(path, filename)
    with open(full_path, "wb") as model_file:
        model_file.write(self.spm.serialized_model_proto())
    return (full_path,)
def _is_whitespace(char):
    """Return True for space, tab, newline, carriage return, or any character of Unicode category Zs."""
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"
def _is_control(char):
    """Return True for characters in a Unicode "C*" category, except tab, newline and carriage return."""
    if char in ("\t", "\n", "\r"):
        return False
    category = unicodedata.category(char)
    return category.startswith("C")
def _is_punctuation(char):
    """Return True for ASCII symbol characters (33-47, 58-64, 91-96, 123-126) or any Unicode "P*" category."""
    code_point = ord(char)
    ascii_symbol = (
        33 <= code_point <= 47 or 58 <= code_point <= 64 or 91 <= code_point <= 96 or 123 <= code_point <= 126
    )
    if ascii_symbol:
        return True
    return unicodedata.category(char).startswith("P")
def convert_to_unicode(text):
    """
    Return `text` as `str`.

    `str` input is returned unchanged, `bytes` input is decoded as UTF-8 with undecodable bytes dropped, and
    any other type raises `ValueError`.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError(f"Unsupported string type: {type(text)}")
.\models\deberta_v2\tokenization_deberta_v2_fast.py
"""Fast Tokenization class for model DeBERTa."""
import os
from shutil import copyfile
from typing import Optional, Tuple
from ...file_utils import is_sentencepiece_available
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
# The slow tokenizer is imported only when sentencepiece is available; otherwise the name is bound to None.
if is_sentencepiece_available():
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
else:
    DebertaV2Tokenizer = None

# Module-level logger obtained through the library's logging wrapper.
logger = logging.get_logger(__name__)

# File names for the SentencePiece model and for the serialized fast tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}

# For each known checkpoint name, the URL of its SentencePiece model file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
        ),
        "microsoft/deberta-v2-xxlarge-mnli": (
            "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
        ),
    }
}

# Per-checkpoint size value; exposed as `max_model_input_sizes` on the tokenizer class below.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-v2-xlarge": 512,
    "microsoft/deberta-v2-xxlarge": 512,
    "microsoft/deberta-v2-xlarge-mnli": 512,
    "microsoft/deberta-v2-xxlarge-mnli": 512,
}

# Per-checkpoint default constructor arguments.
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
}
class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
    r"""
    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
    """

    # Class-level tables wired to the module constants defined above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # None when sentencepiece is not installed (see the guarded import above).
    slow_tokenizer_class = DebertaV2Tokenizer
def __init__(
    self,
    vocab_file=None,
    tokenizer_file=None,
    do_lower_case=False,
    split_by_punct=False,
    bos_token="[CLS]",
    eos_token="[SEP]",
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    **kwargs,
) -> None:
    """
    Initialize the fast tokenizer.

    All arguments are forwarded to the base class; `do_lower_case`, `split_by_punct` and `vocab_file` are
    additionally kept as instance attributes.
    """
    super().__init__(
        vocab_file,
        tokenizer_file=tokenizer_file,
        do_lower_case=do_lower_case,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        split_by_punct=split_by_punct,
        **kwargs,
    )

    # Kept for later use, e.g. `vocab_file` is needed to export the slow tokenizer's vocabulary.
    self.vocab_file = vocab_file
    self.split_by_punct = split_by_punct
    self.do_lower_case = do_lower_case
@property
def can_save_slow_tokenizer(self) -> bool:
    """Whether `vocab_file` is set and points to an existing file."""
    if not self.vocab_file:
        return False
    return os.path.isfile(self.vocab_file)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Build model inputs from a sequence or a pair of sequences by concatenating them and adding special tokens.
    A DeBERTa sequence has the following format:

    - single sequence: [CLS] X [SEP]
    - pair of sequences: [CLS] A [SEP] B [SEP]

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of input IDs with the appropriate special tokens.
    """
    start = [self.cls_token_id]
    single = start + token_ids_0 + [self.sep_token_id]
    if token_ids_1 is None:
        return single
    return single + token_ids_1 + [self.sep_token_id]
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """
    Retrieve the special-token mask for a token list that has no special tokens added yet. Called when special
    tokens are added through the tokenizer's `prepare_for_model` or `encode_plus` methods.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether the token list is already formatted with the model's special tokens.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        # Already-formatted input is handled entirely by the base class.
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    # [CLS] A [SEP], optionally followed by B [SEP].
    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is not None:
        mask = mask + [0] * len(token_ids_1) + [1]
    return mask
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """
    Create the token-type mask for a sequence or a sequence pair. A DeBERTa sequence pair mask has the
    following format:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    separator = [self.sep_token_id]
    classifier = [self.cls_token_id]

    # [CLS] A [SEP] is segment 0; B [SEP] (when present) is segment 1.
    first_segment = [0] * len(classifier + token_ids_0 + separator)
    if token_ids_1 is None:
        return first_segment
    second_segment = [1] * len(token_ids_1 + separator)
    return first_segment + second_segment
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Copy the SentencePiece model file into `save_directory`.

    Returns a one-element tuple with the destination path, or `None` (after logging an error) when
    `save_directory` is not a directory.

    Raises:
        ValueError: If the original vocabulary file is not available (`can_save_slow_tokenizer` is false).
    """
    if not self.can_save_slow_tokenizer:
        raise ValueError(
            "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
            "tokenizer."
        )

    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    # Skip the copy when source and destination are the same file.
    source_path = os.path.abspath(self.vocab_file)
    if source_path != os.path.abspath(out_vocab_file):
        copyfile(self.vocab_file, out_vocab_file)

    return (out_vocab_file,)
.\models\deberta_v2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Maps each submodule to the public names it provides; consumed by `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config", "DebertaV2OnnxConfig"],
    "tokenization_deberta_v2": ["DebertaV2Tokenizer"],
}

# Fast tokenizer: registered only when the `tokenizers` backend is available.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_deberta_v2_fast"] = ["DebertaV2TokenizerFast"]

# TensorFlow models: registered only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_deberta_v2"] = [
        "TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDebertaV2ForMaskedLM",
        "TFDebertaV2ForQuestionAnswering",
        "TFDebertaV2ForMultipleChoice",
        "TFDebertaV2ForSequenceClassification",
        "TFDebertaV2ForTokenClassification",
        "TFDebertaV2Model",
        "TFDebertaV2PreTrainedModel",
    ]

# PyTorch models: registered only when PyTorch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_deberta_v2"] = [
        "DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DebertaV2ForMaskedLM",
        "DebertaV2ForMultipleChoice",
        "DebertaV2ForQuestionAnswering",
        "DebertaV2ForSequenceClassification",
        "DebertaV2ForTokenClassification",
        "DebertaV2Model",
        "DebertaV2PreTrainedModel",
    ]

if TYPE_CHECKING:
    # Eager imports for static type checkers, mirroring the availability guards above.
    from .configuration_deberta_v2 import (
        DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DebertaV2Config,
        DebertaV2OnnxConfig,
    )
    from .tokenization_deberta_v2 import DebertaV2Tokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # The `else:` keeps this import guarded, like the tokenizers and torch branches.
        from .modeling_tf_deberta_v2 import (
            TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDebertaV2ForMaskedLM,
            TFDebertaV2ForMultipleChoice,
            TFDebertaV2ForQuestionAnswering,
            TFDebertaV2ForSequenceClassification,
            TFDebertaV2ForTokenClassification,
            TFDebertaV2Model,
            TFDebertaV2PreTrainedModel,
        )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_deberta_v2 import (
            DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            DebertaV2ForMaskedLM,
            DebertaV2ForMultipleChoice,
            DebertaV2ForQuestionAnswering,
            DebertaV2ForSequenceClassification,
            DebertaV2ForTokenClassification,
            DebertaV2Model,
            DebertaV2PreTrainedModel,
        )
else:
    import sys

    # At runtime the module object is replaced by a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\decision_transformer\configuration_decision_transformer.py
""" Decision Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Module-level logger obtained through the library's logging wrapper.
logger = logging.get_logger(__name__)

# For each known checkpoint name, the URL of its `config.json`.
DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "edbeeching/decision-transformer-gym-hopper-medium": (
        "https://huggingface.co/edbeeching/decision-transformer-gym-hopper-medium/resolve/main/config.json"
    ),
}
class DecisionTransformerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to
    instantiate a Decision Transformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the standard
    DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used
    as part of the architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored on the instance under the same name; `bos_token_id` and `eos_token_id`
    are also forwarded to the base class together with any extra keyword arguments.

    Example:

    ```
    >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel

    >>> # Initializing a DecisionTransformer configuration
    >>> configuration = DecisionTransformerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DecisionTransformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "decision_transformer"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute names resolved to the GPT2-style names stored on the config.
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        state_dim=17,
        act_dim=4,
        hidden_size=128,
        max_ep_len=4096,
        action_tanh=True,
        vocab_size=1,
        n_positions=1024,
        n_layer=3,
        n_head=1,
        n_inner=None,
        activation_function="relu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        **kwargs,
    ):
        # Decision-Transformer-specific settings.
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.max_ep_len = max_ep_len
        self.action_tanh = action_tanh

        # Transformer size settings.
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function

        # Dropout, normalization and initialization.
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        # Attention behaviour and caching.
        self.scale_attn_weights = scale_attn_weights
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn
        self.use_cache = use_cache

        # Special token ids are stored here and handed to the base class as well.
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
.\models\decision_transformer\modeling_decision_transformer.py
""" PyTorch DecisionTransformer model."""
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_decision_transformer import DecisionTransformerConfig
# Module-level logger obtained through the library's logging wrapper.
logger = logging.get_logger(__name__)

# Names inserted into generated docstrings (judging by the `_FOR_DOC` suffix — confirm against their users).
_CHECKPOINT_FOR_DOC = "edbeeching/decision-transformer-gym-hopper-medium"
_CONFIG_FOR_DOC = "DecisionTransformerConfig"

# Known pretrained checkpoint names for this model.
DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "edbeeching/decision-transformer-gym-hopper-medium",
]
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """
    Load the variables of a TensorFlow checkpoint into a PyTorch model.

    Args:
        model: The PyTorch model whose attributes are overwritten in place.
        config: Not used by this function's body.
        gpt2_checkpoint_path: Path of the TensorFlow checkpoint.

    Returns:
        The same `model` object, with its weights replaced.

    Raises:
        ImportError: If TensorFlow cannot be imported (re-raised after logging an error).
        ValueError: If a checkpoint array's shape differs from the target tensor's shape.
    """
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load every variable of the checkpoint into memory first.
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        # Drops the first six characters — presumably a leading "model/" scope; confirm against the checkpoint.
        name = name[6:]
        name = name.split("/")
        # Walk from the model root down to the tensor addressed by the variable's path.
        pointer = model
        for m_name in name:
            # A component such as "h3" is split into its letters and its number ("h", "3", "").
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            # A numeric suffix indexes into the attribute just reached.
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            # Attach both shapes to the exception before re-raising.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model
class DecisionTransformerGPT2Attention(nn.Module):
    # Attention layer supporting self-attention and (optionally) cross-attention.

    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        """
        Build the attention layer.

        Args:
            config: Configuration providing `max_position_embeddings`, `hidden_size`, `num_attention_heads`,
                `scale_attn_weights`, `scale_attn_by_inverse_layer_idx`, `reorder_and_upcast_attn`, `attn_pdrop`
                and `resid_pdrop`.
            is_cross_attention: When true, a separate query projection (`q_attn`) is created.
            layer_idx: Stored as-is; used when scaling attention by the inverse layer index.

        Raises:
            ValueError: If `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__()

        max_positions = config.max_position_embeddings
        # Lower-triangular boolean mask of shape (1, 1, max_positions, max_positions); not saved in the state dict.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        # Layer-wise attention scaling, reordering, and upcasting
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            # Cross-attention: `c_attn` is sized 2 * embed_dim and the query gets its own projection.
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            # Self-attention: a single projection sized 3 * embed_dim.
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # Indices of heads removed so far by `prune_heads`.
        self.pruned_heads = set()
def prune_heads(self, heads):
    """
    Remove the given attention heads from the projection layers.

    Does nothing for an empty `heads`. Otherwise `c_attn` and `c_proj` are replaced by pruned layers and
    `split_size`, `num_heads` and `pruned_heads` are updated to match.
    """
    if len(heads) == 0:
        return

    heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
    # The same index set is repeated at offsets 0, split_size and 2 * split_size inside `c_attn`.
    qkv_index = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

    # Prune conv1d layers
    self.c_attn = prune_conv1d_layer(self.c_attn, qkv_index, dim=1)
    self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

    # Update hyper params
    remaining_heads = self.num_heads - len(heads)
    self.split_size = (self.split_size // self.num_heads) * remaining_heads
    self.num_heads = remaining_heads
    self.pruned_heads = self.pruned_heads.union(heads)
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
    """
    Compute attention weights and the attended output.

    Args:
        query, key, value: Tensors whose last two dimensions are (sequence, head_dim) — layout of the leading
            dimensions assumed to be (batch, heads); confirm against the caller.
        attention_mask: Optional tensor added to the raw attention scores.
        head_mask: Optional tensor multiplied into the attention weights after dropout.

    Returns:
        A tuple `(attn_output, attn_weights)`.
    """
    attn_weights = torch.matmul(query, key.transpose(-1, -2))

    if self.scale_attn_weights:
        attn_weights = attn_weights / torch.full(
            [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
        )

    # Layer-wise attention scaling
    if self.scale_attn_by_inverse_layer_idx:
        attn_weights = attn_weights / float(self.layer_idx + 1)

    if not self.is_cross_attention:
        # Only self-attention applies the causal mask: positions outside the lower triangle get the dtype minimum.
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
        mask_value = torch.finfo(attn_weights.dtype).min
        # Built as a tensor with matching dtype and device so `torch.where` gets consistent operands.
        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
        attn_weights = torch.where(causal_mask, attn_weights, mask_value)

    if attention_mask is not None:
        # Apply the attention mask
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    # Cast back to the value dtype (a no-op unless mixed precision changed it).
    attn_weights = attn_weights.type(value.dtype)
    attn_weights = self.attn_dropout(attn_weights)

    # Mask heads if requested
    if head_mask is not None:
        attn_weights = attn_weights * head_mask

    attn_output = torch.matmul(attn_weights, value)

    return attn_output, attn_weights
def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
    """
    Compute attention with the score matmul and softmax carried out in float32.

    The scaling factor is folded into the `baddbmm` call (as `alpha`) instead of being
    applied to the scores afterwards, and autocast is disabled around the matmul.

    Args:
        query: 4-D tensor unpacked as (batch, heads, query_len, head_dim).
        key: 4-D tensor unpacked as (batch, heads, key_len, head_dim).
        value: Tensor multiplied by the attention probabilities; its last-dim size
            drives the sqrt scaling and its dtype is the dtype of the returned weights.
        attention_mask: Optional tensor added to the scores before the softmax.
        head_mask: Optional tensor multiplied into the probabilities after dropout.

    Returns:
        Tuple `(attn_output, attn_weights)`.

    Raises:
        RuntimeError: If the softmax output is not float32.
    """
    bsz, num_heads, q_seq_len, dk = query.size()
    _, _, k_seq_len, _ = key.size()

    # Output buffer for baddbmm; with beta=0 its (uninitialised) contents are ignored.
    attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

    scale_factor = 1.0
    if self.scale_attn_weights:
        scale_factor /= float(value.size(-1)) ** 0.5
    if self.scale_attn_by_inverse_layer_idx:
        scale_factor /= float(self.layer_idx + 1)

    # Autocast is switched off so the explicit .float() upcast is not undone.
    with autocast(enabled=False):
        q = query.reshape(-1, q_seq_len, dk)
        k = key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
        attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
        attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

    if not self.is_cross_attention:
        # Window of the stored mask: last `q_seq_len` rows, first `k_seq_len` columns.
        causal_mask = self.bias[:, :, k_seq_len - q_seq_len : k_seq_len, :k_seq_len]
        mask_value = torch.finfo(attn_weights.dtype).min
        # Build the fill value directly on the target device (no CPU tensor followed by a
        # host-to-device copy), matching how the non-upcast attention path creates it.
        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
        attn_weights = torch.where(causal_mask, attn_weights, mask_value)

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    # Guard: everything up to here must have stayed in float32.
    if attn_weights.dtype != torch.float32:
        raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
    attn_weights = attn_weights.type(value.dtype)
    attn_weights = self.attn_dropout(attn_weights)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask

    attn_output = torch.matmul(attn_weights, value)
    return attn_output, attn_weights
def _split_heads(self, tensor, num_heads, attn_head_size):
    """
    Split the last dimension into (num_heads, attn_head_size) and move the head axis to position 1.
    """
    leading_dims = tensor.size()[:-1]
    per_head = tensor.view(leading_dims + (num_heads, attn_head_size))
    # Swap axes 1 and 2 so the head axis comes before the axis that was originally at position 1.
    return per_head.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
    """
    Undo the head split: swap axes 1 and 2 back and flatten the last two dims into one
    of size num_heads * attn_head_size.
    """
    # contiguous() is required before view() because permute() only changes strides.
    swapped = tensor.permute(0, 2, 1, 3).contiguous()
    leading_dims = swapped.size()[:-2]
    return swapped.view(leading_dims + (num_heads * attn_head_size,))
def forward(
    self,
    hidden_states: Optional[Tuple[torch.FloatTensor]],
    layer_past: Optional[Tuple[torch.Tensor]] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = False,
    output_attentions: Optional[bool] = False,
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
    """
    Run self-attention, or cross-attention when `encoder_hidden_states` is given.

    Returns:
        `(attn_output, present)` and, when `output_attentions` is truthy, a third item with
        the attention weights. `present` is `(key, value)` when `use_cache is True`, else `None`.

    Raises:
        ValueError: If `encoder_hidden_states` is passed but the module has no `q_attn`.
    """
    cross_attention = encoder_hidden_states is not None
    if cross_attention and not hasattr(self, "q_attn"):
        raise ValueError(
            "If class is used as cross attention, the weights `q_attn` have to be defined. "
            "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`."
        )

    if cross_attention:
        # Queries come from the decoder states; keys and values from the encoder states.
        query = self.q_attn(hidden_states)
        key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
        attention_mask = encoder_attention_mask
    else:
        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

    query, key, value = (
        self._split_heads(projection, self.num_heads, self.head_dim) for projection in (query, key, value)
    )

    if layer_past is not None:
        # Prepend the cached keys/values along the second-to-last axis.
        past_key, past_value = layer_past
        key = torch.cat((past_key, key), dim=-2)
        value = torch.cat((past_value, value), dim=-2)

    present = (key, value) if use_cache is True else None

    attend = self._upcast_and_reordered_attn if self.reorder_and_upcast_attn else self._attn
    attn_output, attn_weights = attend(query, key, value, attention_mask, head_mask)

    attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
    attn_output = self.resid_dropout(self.c_proj(attn_output))

    outputs = (attn_output, present)
    if output_attentions:
        outputs += (attn_weights,)
    return outputs
class DecisionTransformerGPT2MLP(nn.Module):
    """Feed-forward sub-layer: `c_fc`, activation, `c_proj`, then dropout."""

    def __init__(self, intermediate_size, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, hidden_size)
        self.c_proj = Conv1D(hidden_size, intermediate_size)
        # Activation is looked up by name from the config.
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        """Apply the two projections with the activation in between, followed by dropout."""
        activated = self.act(self.c_fc(hidden_states))
        return self.dropout(self.c_proj(activated))
class DecisionTransformerGPT2Block(nn.Module):
    """
    One transformer layer: layer-norm + self-attention, optional layer-norm + cross-attention,
    then layer-norm + MLP, each wrapped in a residual connection.
    """

    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        # Width of the MLP's hidden layer. `inner_dim` was previously referenced without ever
        # being defined; use `config.n_inner` when set and fall back to 4 * hidden_size otherwise.
        n_inner = getattr(config, "n_inner", None)
        inner_dim = n_inner if n_inner is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        # Cross-attention modules exist only when the config asks for them; `forward` checks
        # for their presence with `hasattr`.
        if config.add_cross_attention:
            self.crossattention = DecisionTransformerGPT2Attention(
                config, is_cross_attention=True, layer_idx=layer_idx
            )
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = DecisionTransformerGPT2MLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        """
        Apply the layer to `hidden_states`.

        Returns:
            A tuple starting with the new hidden states. When `use_cache` is truthy the
            attention module's `present` entry follows; attention weights (self, then cross)
            are appended when the attention modules return them.

        Raises:
            ValueError: If `encoder_hidden_states` is passed but the block was built without
                cross-attention.
        """
        # Self-attention with residual connection.
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]
        outputs = attn_outputs[1:]  # (present, [attn_weights])
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            # Cross-attention with residual connection.
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            hidden_states = residual + attn_output
            # Skip the cross-attention output and its `present` slot; keep only the weights, if any.
            outputs = outputs + cross_attn_outputs[2:]

        # Feed-forward with residual connection.
        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            # Drop the leading `present` entry when caching is off.
            outputs = (hidden_states,) + outputs[1:]

        return outputs
class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
    """
    Abstract base class that supplies weight initialisation and the shared class-level
    settings used for downloading and loading pretrained models.
    """

    config_class = DecisionTransformerConfig
    load_tf_weights = load_tf_weights_in_gpt2
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialise the weights of `module` according to its type."""
        init_std = self.config.initializer_range

        if isinstance(module, (nn.Linear, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                # Keep the padding row at zero.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Parameters whose name contains both "c_proj" and "weight" are re-drawn with a
        # standard deviation scaled down by sqrt(2 * n_layer).
        for param_name, param in module.named_parameters():
            if "c_proj" not in param_name or "weight" not in param_name:
                continue
            param.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
def __init__(self, config):
    """Build the token/position embeddings, the stack of blocks and the final layer norm."""
    super().__init__(config)

    hidden_size = config.hidden_size
    self.embed_dim = hidden_size

    self.wte = nn.Embedding(config.vocab_size, hidden_size)
    self.wpe = nn.Embedding(config.max_position_embeddings, hidden_size)
    self.drop = nn.Dropout(config.embd_pdrop)

    # One block per hidden layer; each block is told its own index.
    blocks = [DecisionTransformerGPT2Block(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)]
    self.h = nn.ModuleList(blocks)
    self.ln_f = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

    # Flags initialised to their "off" state.
    self.model_parallel = False
    self.device_map = None
    self.gradient_checkpointing = False

    self.post_init()
def get_input_embeddings(self):
    """Return the module stored in `self.wte`."""
    embeddings = self.wte
    return embeddings
def set_input_embeddings(self, new_embeddings):
    """Replace the module stored in `self.wte` with `new_embeddings`."""
    setattr(self, "wte", new_embeddings)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@dataclass
class DecisionTransformerOutput(ModelOutput):
"""
Output container for the Decision Transformer: the last hidden state together with the state, action and return
predictions, plus the optional per-layer hidden states and attention weights.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
Environment state predictions
action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
Model action predictions
return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
Predicted returns for each state
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# NOTE(review): the declaration order below differs from the order documented in Args
# (`last_hidden_state` is declared last). If consumers index this output positionally, the
# declaration order is the one that matters — confirm before reordering either list.
state_preds: torch.FloatTensor = None
action_preds: torch.FloatTensor = None
return_preds: torch.FloatTensor = None
hidden_states: torch.FloatTensor = None
attentions: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None
class DecisionTransformerPreTrainedModel(PreTrainedModel):
    """
    Abstract base class that supplies weight initialisation and the shared class-level
    settings used for downloading and loading pretrained models.
    """

    config_class = DecisionTransformerConfig
    base_model_prefix = "decision_transformer"
    main_input_name = "states"
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialise the weights of `module` according to its type."""
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                # Keep the padding row at zero.
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
# Shared class-level documentation text; passed to `add_start_docstrings` on the model class.
DECISION_TRANSFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~DecisionTransformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Documentation text for the inputs of the model's `forward`.
# NOTE(review): this template contains no `{}` placeholders, so calling `.format(...)` on it
# (as the `forward` decorator does) returns it unchanged.
DECISION_TRANSFORMER_INPUTS_DOCSTRING = r"""
Args:
states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`):
The states for each step in the trajectory
actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`):
The actions taken by the "expert" policy for the current state, these are masked for auto regressive
prediction
rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
The rewards for each state, action
returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
The returns for each state in the trajectory
timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`):
The timestep for each step in the trajectory
attention_mask (`torch.FloatTensor` of shape `(batch_size, episode_length)`):
Masking, used to mask the actions when performing autoregressive prediction
"""
@add_start_docstrings("The Decision Transformer Model", DECISION_TRANSFORMER_START_DOCSTRING)
class DecisionTransformerModel(DecisionTransformerPreTrainedModel):
"""
The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL
setting. Refer to the paper for more details: https://arxiv.org/abs/2106.01345
"""
def __init__(self, config):
    """Build the GPT-2 style encoder, the input embeddings and the three prediction heads."""
    super().__init__(config)
    self.config = config

    hidden_size = config.hidden_size
    self.hidden_size = hidden_size

    self.encoder = DecisionTransformerGPT2Model(config)

    # Input embeddings: one per modality, plus a layer norm applied to the embedded inputs.
    self.embed_timestep = nn.Embedding(config.max_ep_len, hidden_size)
    self.embed_return = nn.Linear(1, hidden_size)
    self.embed_state = nn.Linear(config.state_dim, hidden_size)
    self.embed_action = nn.Linear(config.act_dim, hidden_size)
    self.embed_ln = nn.LayerNorm(hidden_size)

    # Prediction heads. The action head gets a trailing Tanh only when the config asks for it.
    self.predict_state = nn.Linear(hidden_size, config.state_dim)
    action_layers = [nn.Linear(hidden_size, config.act_dim)]
    if config.action_tanh:
        action_layers.append(nn.Tanh())
    self.predict_action = nn.Sequential(*action_layers)
    self.predict_return = nn.Linear(hidden_size, 1)

    self.post_init()
@add_start_docstrings_to_model_forward(DECISION_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=DecisionTransformerOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
states: Optional[torch.FloatTensor] = None,
actions: Optional[torch.FloatTensor] = None,
rewards: Optional[torch.FloatTensor] = None,
returns_to_go: Optional[torch.FloatTensor] = None,
timesteps: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\decision_transformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom.
_import_structure = {
"configuration_decision_transformer": [
"DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"DecisionTransformerConfig",
],
}
# The modeling names are registered only when torch is available; otherwise they are silently left out.
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_decision_transformer"] = [
"DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DecisionTransformerGPT2Model",
"DecisionTransformerGPT2PreTrainedModel",
"DecisionTransformerModel",
"DecisionTransformerPreTrainedModel",
]
# Under static type checking the same names are imported directly; this branch mirrors the
# structure above and must list the same names.
if TYPE_CHECKING:
from .configuration_decision_transformer import (
DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
DecisionTransformerConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_decision_transformer import (
DECISION_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
DecisionTransformerGPT2Model,
DecisionTransformerGPT2PreTrainedModel,
DecisionTransformerModel,
DecisionTransformerPreTrainedModel,
)
else:
import sys
# At runtime this module's entry in `sys.modules` is replaced by a `_LazyModule` built from `_import_structure`.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deformable_detr\configuration_deformable_detr.py
""" Deformable DETR model configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
# Maps a pretrained checkpoint identifier to the URL of its hosted config.json.
DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"SenseTime/deformable-detr": "https://huggingface.co/sensetime/deformable-detr/resolve/main/config.json",
}
class DeformableDetrConfig(PretrainedConfig):
    r"""
    Configuration class for [`DeformableDetrModel`]. It is used to instantiate a Deformable DETR model according to
    the specified arguments, defining the model architecture. Instantiating a configuration with the defaults yields
    a configuration similar to the Deformable DETR
    [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored on the instance under the same name. In addition:

    - `is_encoder_decoder` is forwarded to [`PretrainedConfig`].
    - `num_hidden_layers` is set to `encoder_layers`.
    - When `use_timm_backbone` is `False`, `backbone_config` is resolved to a config object: `None` becomes a
      default ResNet config, and a `dict` is converted through `CONFIG_MAPPING` using its `"model_type"` entry.

    Argument groups (names as in the signature):

    - Backbone: `use_timm_backbone`, `backbone_config`, `backbone`, `use_pretrained_backbone`, `backbone_kwargs`,
      `dilation`, `num_channels`.
    - Transformer: `d_model`, `encoder_layers`, `encoder_ffn_dim`, `encoder_attention_heads`, `decoder_layers`,
      `decoder_ffn_dim`, `decoder_attention_heads`, `activation_function`, `dropout`, `attention_dropout`,
      `activation_dropout`, `encoder_layerdrop`, `max_position_embeddings`, `position_embedding_type`,
      `init_std`, `init_xavier_std`, `return_intermediate`, `auxiliary_loss`, `num_queries`.
    - Deformable attention: `num_feature_levels`, `encoder_n_points`, `decoder_n_points`, `two_stage`,
      `two_stage_num_proposals`, `with_box_refine`, `disable_custom_kernels`.
    - Matching costs: `class_cost`, `bbox_cost`, `giou_cost`.
    - Loss coefficients: `mask_loss_coefficient`, `dice_loss_coefficient`, `bbox_loss_coefficient`,
      `giou_loss_coefficient`, `eos_coefficient`, `focal_alpha`.

    Examples:

    ```
    >>> from transformers import DeformableDetrConfig, DeformableDetrModel

    >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration
    >>> configuration = DeformableDetrConfig()

    >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
    >>> model = DeformableDetrModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "deformable_detr"
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }

    def __init__(
        self,
        use_timm_backbone=True,
        backbone_config=None,
        num_channels=3,
        num_queries=300,
        max_position_embeddings=1024,
        encoder_layers=6,
        encoder_ffn_dim=1024,
        encoder_attention_heads=8,
        decoder_layers=6,
        decoder_ffn_dim=1024,
        decoder_attention_heads=8,
        encoder_layerdrop=0.0,
        is_encoder_decoder=True,
        activation_function="relu",
        d_model=256,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        init_xavier_std=1.0,
        return_intermediate=True,
        auxiliary_loss=False,
        position_embedding_type="sine",
        backbone="resnet50",
        use_pretrained_backbone=True,
        backbone_kwargs=None,
        dilation=False,
        num_feature_levels=4,
        encoder_n_points=4,
        decoder_n_points=4,
        two_stage=False,
        two_stage_num_proposals=300,
        with_box_refine=False,
        class_cost=1,
        bbox_cost=5,
        giou_cost=2,
        mask_loss_coefficient=1,
        dice_loss_coefficient=1,
        bbox_loss_coefficient=5,
        giou_loss_coefficient=2,
        eos_coefficient=0.1,
        focal_alpha=0.25,
        disable_custom_kernels=False,
        **kwargs,
    ):
        # Without timm, the backbone is described by a config object: fill in the default or
        # turn a plain dict into the matching config class.
        if not use_timm_backbone:
            if backbone_config is None:
                logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
                backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
            elif isinstance(backbone_config, dict):
                backbone_model_type = backbone_config.get("model_type")
                config_class = CONFIG_MAPPING[backbone_model_type]
                backbone_config = config_class.from_dict(backbone_config)

        # Previously none of the arguments were stored, so the `hidden_size` and
        # `num_attention_heads` properties below raised AttributeError on access.
        self.use_timm_backbone = use_timm_backbone
        self.backbone_config = backbone_config
        self.num_channels = num_channels
        self.num_queries = num_queries
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.init_xavier_std = init_xavier_std
        self.encoder_layerdrop = encoder_layerdrop
        self.return_intermediate = return_intermediate
        self.auxiliary_loss = auxiliary_loss
        self.position_embedding_type = position_embedding_type
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.backbone_kwargs = backbone_kwargs
        self.dilation = dilation
        # Deformable attention settings.
        self.num_feature_levels = num_feature_levels
        self.encoder_n_points = encoder_n_points
        self.decoder_n_points = decoder_n_points
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals
        self.with_box_refine = with_box_refine
        # Matching costs.
        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost
        # Loss coefficients.
        self.mask_loss_coefficient = mask_loss_coefficient
        self.dice_loss_coefficient = dice_loss_coefficient
        self.bbox_loss_coefficient = bbox_loss_coefficient
        self.giou_loss_coefficient = giou_loss_coefficient
        self.eos_coefficient = eos_coefficient
        self.focal_alpha = focal_alpha
        self.disable_custom_kernels = disable_custom_kernels
        # Generic alias for code that reads `num_hidden_layers`.
        self.num_hidden_layers = encoder_layers
        # `is_encoder_decoder` must reach the base class; it was previously dropped.
        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    @property
    def num_attention_heads(self) -> int:
        """Number of attention heads, taken from `encoder_attention_heads`."""
        return self.encoder_attention_heads

    @property
    def hidden_size(self) -> int:
        """Hidden size, taken from `d_model`."""
        return self.d_model
.\models\deformable_detr\convert_deformable_detr_to_pytorch.py
def rename_key(orig_key):
    """
    Translate one parameter name from the original checkpoint into this library's naming.

    Substitutions are applied in a fixed order; `norm1` and `norm2` map to different targets
    depending on whether the (already partially renamed) key contains "encoder".
    """
    key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model")
    key = key.replace("transformer.", "")

    # Layer norms: the target name depends on whether the key mentions "encoder".
    in_encoder = "encoder" in key
    key = key.replace("norm1", "self_attn_layer_norm" if in_encoder else "encoder_attn_layer_norm")
    in_encoder = "encoder" in key
    key = key.replace("norm2", "final_layer_norm" if in_encoder else "self_attn_layer_norm")

    # Unconditional one-to-one substitutions.
    simple_renames = (
        ("norm3", "final_layer_norm"),
        ("linear1", "fc1"),
        ("linear2", "fc2"),
        ("query_embed", "query_position_embeddings"),
        ("cross_attn", "encoder_attn"),
    )
    for old, new in simple_renames:
        key = key.replace(old, new)
    return key
def read_in_q_k_v(state_dict, num_layers=6, hidden_size=256):
    """
    Split each decoder layer's fused self-attention input projection into q/k/v entries.

    For every layer index, the `in_proj_weight` / `in_proj_bias` entries are removed from
    `state_dict` and replaced, in place, by separate `q_proj`, `k_proj` and `v_proj`
    weight and bias entries.

    Args:
        state_dict: Mutable mapping of parameter names to arrays/tensors; modified in place.
        num_layers: Number of decoder layers to process. Defaults to 6, the previously
            hard-coded value.
        hidden_size: Number of rows in each of the q, k and v slices. Defaults to 256, the
            previously hard-coded value.

    Raises:
        KeyError: If a fused projection entry is missing for one of the layers.
    """
    query_rows = slice(None, hidden_size)
    key_rows = slice(hidden_size, 2 * hidden_size)
    # The value slice is taken from the end, exactly as the original `[-256:]` did.
    value_rows = slice(-hidden_size, None)

    for i in range(num_layers):
        prefix = f"decoder.layers.{i}.self_attn"
        in_proj_weight = state_dict.pop(f"{prefix}.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}.in_proj_bias")

        state_dict[f"{prefix}.q_proj.weight"] = in_proj_weight[query_rows, :]
        state_dict[f"{prefix}.q_proj.bias"] = in_proj_bias[query_rows]
        state_dict[f"{prefix}.k_proj.weight"] = in_proj_weight[key_rows, :]
        state_dict[f"{prefix}.k_proj.bias"] = in_proj_bias[key_rows]
        state_dict[f"{prefix}.v_proj.weight"] = in_proj_weight[value_rows, :]
        state_dict[f"{prefix}.v_proj.bias"] = in_proj_bias[value_rows]
def prepare_img():
    """Download a fixed COCO val2017 image and return it opened as an image object."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # stream=True so the raw response body can be handed straight to the image opener.
    response = requests.get(url, stream=True)
    return Image.open(response.raw)
@torch.no_grad()
def convert_deformable_detr_checkpoint(
    checkpoint_path,
    single_scale,
    dilation,
    with_box_refine,
    two_stage,
    pytorch_dump_folder_path,
    push_to_hub,
):
    """
    Copy/paste/tweak the original model's weights into our Deformable DETR structure.

    Steps: build the config (including the COCO label mapping), load and rename the original
    state dict, run the converted model on a test image and compare the first logits/boxes
    with reference values, then save the model and image processor (and optionally push).

    Args:
        checkpoint_path: Path to the original checkpoint; its "model" entry is the state dict.
        single_scale: If truthy, set `config.num_feature_levels = 1`.
        dilation: Value for `config.dilation`.
        with_box_refine: Value for `config.with_box_refine`.
        two_stage: Value for `config.two_stage`.
        pytorch_dump_folder_path: Output directory for the converted model.
        push_to_hub: If truthy, push the converted model to the hub.

    Raises:
        AssertionError: If the converted model's outputs differ from the reference values.
    """
    # Configuration.
    config = DeformableDetrConfig()
    if single_scale:
        config.num_feature_levels = 1
    config.dilation = dilation
    config.with_box_refine = with_box_refine
    config.two_stage = two_stage

    # Label mapping.
    config.num_labels = 91
    repo_id = "huggingface/label-files"
    filename = "coco-detection-id2label.json"
    label_file = cached_download(hf_hub_url(repo_id, filename, repo_type="dataset"))
    # Use a context manager so the label file handle is closed (it was previously leaked).
    with open(label_file, "r") as f:
        id2label = json.load(f)
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    # Test inputs.
    image_processor = DeformableDetrImageProcessor(format="coco_detection")
    img = prepare_img()
    encoding = image_processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    logger.info("Converting model...")

    # NOTE(review): torch.load unpickles the file — only run this on trusted checkpoints.
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    # Rename keys; iterate over a snapshot of the keys because the dict is mutated in the loop.
    for key in list(state_dict.keys()):
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # Split the fused decoder self-attention projections.
    read_in_q_k_v(state_dict)
    # Everything except the detection heads lives under the "model." prefix.
    prefix = "model."
    for key in list(state_dict.keys()):
        if not key.startswith("class_embed") and not key.startswith("bbox_embed"):
            val = state_dict.pop(key)
            state_dict[prefix + key] = val

    model = DeformableDetrForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    outputs = model(pixel_values.to(device))

    # Reference values; later, more specific flag combinations override earlier ones.
    expected_logits = torch.tensor(
        [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]]
    )
    expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]])

    if single_scale:
        expected_logits = torch.tensor(
            [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]]
        )
        expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]])

    if single_scale and dilation:
        expected_logits = torch.tensor(
            [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]]
        )
        expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]])

    if with_box_refine:
        expected_logits = torch.tensor(
            [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]]
        )
        expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]])

    if with_box_refine and two_stage:
        expected_logits = torch.tensor(
            [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]]
        )
        expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]])

    print("Logits:", outputs.logits[0, :3, :3])

    assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)

    print("Everything ok!")

    # Save model and image processor.
    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # The repository name encodes which variant was converted.
        model_name = "deformable-detr"
        model_name += "-single-scale" if single_scale else ""
        model_name += "-dc5" if dilation else ""
        model_name += "-with-box-refine" if with_box_refine else ""
        model_name += "-two-stage" if two_stage else ""
        print("Pushing model to hub...")
        model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model")
if __name__ == "__main__":
    # Command-line entry point: parse the conversion options and run the conversion.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth",
        help="Path to Pytorch checkpoint (.pth file) you'd like to convert.",
    )
    # Help text fixed: the attribute the converter sets is `num_feature_levels`, not `num_features_levels`.
    parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_feature_levels = 1.")
    parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.")
    parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.")
    parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.")
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=True,
        help="Path to the folder to output PyTorch model.",
    )
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    args = parser.parse_args()
    convert_deformable_detr_checkpoint(
        args.checkpoint_path,
        args.single_scale,
        args.dilation,
        args.with_box_refine,
        args.two_stage,
        args.pytorch_dump_folder_path,
        args.push_to_hub,
    )
.\models\deformable_detr\feature_extraction_deformable_detr.py
"""Feature extractor class for Deformable DETR."""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_deformable_detr import DeformableDetrImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
    """Deprecated alias: emit a FutureWarning, then delegate to `image_transforms.rgb_to_id`."""
    deprecation_message = (
        "rgb_to_id has moved and will not be importable from this module from v5. "
        "Please import from transformers.image_transforms instead."
    )
    warnings.warn(deprecation_message, FutureWarning)
    return _rgb_to_id(x)
class DeformableDetrFeatureExtractor(DeformableDetrImageProcessor):
    """Deprecated name for `DeformableDetrImageProcessor`; warns on construction and otherwise behaves the same."""

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class DeformableDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use DeformableDetrImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(*args, **kwargs)
.\models\deformable_detr\image_processing_deformable_detr.py
"""Image processor class for Deformable DETR."""
import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
id_to_rgb,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_annotations,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_flax_available,
is_jax_tensor,
is_scipy_available,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_vision_available,
logging,
)
if is_torch_available():
import torch
from torch import nn
if is_vision_available():
import PIL
if is_scipy_available():
import scipy.special
import scipy.stats
logger = logging.get_logger(__name__)
# The two COCO annotation formats named here (detection and panoptic).
# NOTE(review): presumably checked when annotations are passed in; the check itself is outside this chunk.
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
    """
    Compute the output `(height, width)` that scales the shorter image side to `size`
    while keeping the aspect ratio.

    Args:
        image_size (`Tuple[int, int]`):
            The input image size as `(height, width)`.
        size (`int`):
            The desired length of the shorter side.
        max_size (`int`, *optional*):
            If given and scaling the shorter side to `size` would push the longer side past
            `max_size`, `size` is reduced first.
    """
    height, width = image_size

    if max_size is not None:
        short_side = float(min((height, width)))
        long_side = float(max((height, width)))
        # Shrink the target so the longer side stays around max_size.
        if long_side / short_side * size > max_size:
            size = int(round(max_size * short_side / long_side))

    # The shorter side already has the target length: keep the image size unchanged.
    height_matches = height <= width and height == size
    width_matches = width <= height and width == size
    if height_matches or width_matches:
        return height, width

    if width < height:
        return (int(size * height / width), size)
    return (size, int(size * width / height))
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int]],
    max_size: Optional[int] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Decide the output size for resizing `input_image`.

    A list or tuple `size` is returned untouched. An integer `size` is treated as the target
    for the shorter side, and the result is computed with the aspect ratio preserved.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `Tuple[int, int]` or `List[int]`):
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image, passed on to `get_image_size`.
    """
    # The image size is read first, even when `size` is already explicit.
    image_size = get_image_size(input_image, input_data_format)
    size_is_explicit = isinstance(size, (list, tuple))
    return size if size_is_explicit else get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
    """
    Pick the array constructor that matches the framework `arr` belongs to.

    Args:
        arr: A numpy array, or a TensorFlow / PyTorch / JAX tensor.

    Returns:
        `np.array`, `tf.convert_to_tensor`, `torch.tensor` or `jnp.array`.

    Raises:
        ValueError: If `arr` belongs to none of the supported frameworks.
    """
    if isinstance(arr, np.ndarray):
        return np.array

    # Frameworks are imported lazily, and only once the input is known to belong to them.
    if is_tf_available() and is_tf_tensor(arr):
        import tensorflow as tf

        converter = tf.convert_to_tensor
    elif is_torch_available() and is_torch_tensor(arr):
        import torch

        converter = torch.tensor
    elif is_flax_available() and is_jax_tensor(arr):
        import jax.numpy as jnp

        converter = jnp.array
    else:
        raise ValueError(f"Cannot convert arrays of type {type(arr)}")
    return converter
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
    """
    Squeeze `arr`; when `axis` is given and cannot be squeezed, return `arr` unchanged.
    """
    if axis is not None:
        try:
            return arr.squeeze(axis=axis)
        except ValueError:
            # The requested axis cannot be squeezed: hand the array back untouched.
            return arr
    return arr.squeeze()
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Return a copy of `annotation` whose "boxes" entry is converted with
    `corners_to_center_format` and divided by the image width/height; all other entries are
    copied as they are.

    Args:
        annotation: Mapping of annotation fields.
        image_size: `(height, width)` of the image the boxes refer to.
    """
    image_height, image_width = image_size
    normalized = {}
    for key, value in annotation.items():
        if key != "boxes":
            normalized[key] = value
            continue
        boxes = corners_to_center_format(value)
        # Divisor order is (width, height, width, height), applied in place.
        boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
        normalized[key] = boxes
    return normalized
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return, for each position, the maximum over all the sequences in `values`.
    """
    per_position = zip(*values)
    return list(map(max, per_position))
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Return the largest height and the largest width found across all images in the batch.

    When `input_data_format` is not given, it is inferred from the first image.

    Raises:
        ValueError: If the channel dimension format is neither channels-first nor channels-last.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    channels_first = input_data_format == ChannelDimension.FIRST
    channels_last = (not channels_first) and input_data_format == ChannelDimension.LAST
    if not (channels_first or channels_last):
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")

    per_axis_max = max_across_indices([img.shape for img in images])
    if channels_first:
        _, max_height, max_width = per_axis_max
    else:
        max_height, max_width, _ = per_axis_max
    return (max_height, max_width)
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 marks a valid pixel and 0 marks padding.

    Args:
        image (`np.ndarray`):
            Image to make the pixel mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Channel layout forwarded to `get_image_size`.

    Returns:
        `np.ndarray`: An `int64` array of shape `output_size` whose top-left
        `(image_height, image_width)` region is set to 1.
    """
    valid_height, valid_width = get_image_size(image, channel_dim=input_data_format)
    pixel_mask = np.zeros(output_size, dtype=np.int64)
    valid_region = (slice(None, valid_height), slice(None, valid_width))
    pixel_mask[valid_region] = 1
    return pixel_mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
    """
    Convert COCO polygon annotations to masks.

    Args:
        segmentations (`List[List[float]]`):
            List of polygons, each polygon represented by a list of x-y coordinates.
        height (`int`):
            Height of the mask.
        width (`int`):
            Width of the mask.

    Returns:
        `np.ndarray`: The stacked per-object masks with the object index on axis 0. When
        `segmentations` is empty, a `uint8` array of shape `(0, height, width)`.

    Raises:
        ImportError: If `pycocotools` cannot be imported.
    """
    try:
        from pycocotools import mask as coco_mask
    except ImportError:
        raise ImportError("Pycocotools is not installed in your environment.")

    def _rasterize(polygons):
        # Decode the polygons of one object and merge them along the last axis.
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        if decoded.ndim < 3:
            decoded = decoded[..., None]
        return np.any(np.asarray(decoded, dtype=np.uint8), axis=2)

    per_object = [_rasterize(polygons) for polygons in segmentations]
    if not per_object:
        return np.zeros((0, height, width), dtype=np.uint8)
    return np.stack(per_object, axis=0)
def prepare_coco_detection_annotation(
    image,
    target,
    return_segmentation_masks: bool = False,
    input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
    """
    Convert a target in COCO format into the format expected by DeformableDetr.

    Args:
        image: The image the annotation belongs to; only its size is read.
        target: Dictionary holding `"image_id"` and a list under `"annotations"`.
        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to also convert each object's `"segmentation"` polygons into masks.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel layout forwarded to `get_image_size`.

    Returns:
        A dictionary with `image_id`, `class_labels`, `boxes`, `area`, `iscrowd` and `orig_size`,
        plus `keypoints` and `masks` when available / requested.
    """
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    image_id = np.asarray([target["image_id"]], dtype=np.int64)

    # Objects flagged as crowd are discarded; a missing "iscrowd" key counts as not-crowd.
    objects = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]

    class_labels = np.asarray([obj["category_id"] for obj in objects], dtype=np.int64)
    areas = np.asarray([obj["area"] for obj in objects], dtype=np.float32)
    crowd_flags = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in objects], dtype=np.int64)

    # Turn (x, y, w, h) into (x0, y0, x1, y1) and clip the corners to the image.
    corners = np.asarray([obj["bbox"] for obj in objects], dtype=np.float32).reshape(-1, 4)
    corners[:, 2:] += corners[:, :2]
    corners[:, 0::2] = corners[:, 0::2].clip(min=0, max=image_width)
    corners[:, 1::2] = corners[:, 1::2].clip(min=0, max=image_height)

    # Only boxes with strictly positive height and width survive.
    keep = (corners[:, 3] > corners[:, 1]) & (corners[:, 2] > corners[:, 0])

    new_target = {
        "image_id": image_id,
        "class_labels": class_labels[keep],
        "boxes": corners[keep],
        "area": areas[keep],
        "iscrowd": crowd_flags[keep],
        "orig_size": np.asarray([int(image_height), int(image_width)], dtype=np.int64),
    }

    if objects and "keypoints" in objects[0]:
        keypoints = np.asarray([obj["keypoints"] for obj in objects], dtype=np.float32)
        keypoints = keypoints[keep]
        if keypoints.shape[0]:
            keypoints = keypoints.reshape((-1, 3))
        new_target["keypoints"] = keypoints

    if return_segmentation_masks:
        polygons = [obj["segmentation"] for obj in objects]
        object_masks = convert_coco_poly_to_mask(polygons, image_height, image_width)
        new_target["masks"] = object_masks[keep]

    return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    """
    Compute the bounding boxes around the provided panoptic segmentation masks.

    Args:
        masks: masks in format `[number_masks, height, width]`.

    Returns:
        boxes: bounding boxes in format `[number_masks, 4]`, in xyxy format. For an empty `masks`
        array the result has shape `(0, 4)`.
    """
    if masks.size == 0:
        return np.zeros((0, 4))

    height, width = masks.shape[-2:]
    rows, cols = np.meshgrid(
        np.arange(0, height, dtype=np.float32),
        np.arange(0, width, dtype=np.float32),
        indexing="ij",
    )

    def _extent(coords):
        # Weight every pixel by its coordinate; the max over a mask is its far edge.
        weighted = masks * np.expand_dims(coords, axis=0)
        upper = weighted.reshape(weighted.shape[0], -1).max(-1)
        # Background pixels are replaced by a huge value so they never win the min.
        lower = np.ma.array(weighted, mask=~(np.array(masks, dtype=bool))).filled(fill_value=1e8)
        lower = lower.reshape(lower.shape[0], -1).min(-1)
        return lower, upper

    x_min, x_max = _extent(cols)
    y_min, y_max = _extent(rows)
    return np.stack([x_min, y_min, x_max, y_max], 1)
def prepare_coco_panoptic_annotation(
    image: np.ndarray,
    target: Dict,
    masks_path: Union[str, pathlib.Path],
    return_masks: bool = True,
    input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
    """
    Prepare a COCO panoptic annotation for DeformableDetr.

    Args:
        image (`np.ndarray`): The image the annotation belongs to; only its size is read.
        target (`Dict`): Annotation holding `"file_name"`, `"image_id"` (or `"id"`) and,
            optionally, `"segments_info"`.
        masks_path (`str` or `pathlib.Path`): Directory containing the file named by
            `target["file_name"]`.
        return_masks (`bool`, *optional*, defaults to `True`): Whether to include the per-segment
            masks under `"masks"`.
        input_data_format (`ChannelDimension` or `str`, *optional*): Channel layout forwarded to
            `get_image_size`.

    Returns:
        `Dict`: `image_id`, `size` and `orig_size`; when `"segments_info"` is present also
        `boxes`, `class_labels`, `iscrowd`, `area` and (if requested) `masks`.
    """
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
    annotation_path = pathlib.Path(masks_path) / target["file_name"]

    image_id = target["image_id"] if "image_id" in target else target["id"]
    new_target = {
        "image_id": np.asarray([image_id], dtype=np.int64),
        "size": np.asarray([image_height, image_width], dtype=np.int64),
        "orig_size": np.asarray([image_height, image_width], dtype=np.int64),
    }

    if "segments_info" not in target:
        return new_target

    # Decode the annotation image into segment ids, then build one binary mask per segment.
    id_map = rgb_to_id(np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32))
    segment_ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
    masks = (id_map == segment_ids[:, None, None]).astype(np.uint8)

    if return_masks:
        new_target["masks"] = masks
    new_target["boxes"] = masks_to_boxes(masks)
    new_target["class_labels"] = np.array(
        [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
    )
    new_target["iscrowd"] = np.asarray(
        [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
    )
    new_target["area"] = np.asarray(
        [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
    )
    return new_target
def get_segmentation_image(
    masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
    """
    Build an RGB-encoded segmentation image from per-mask scores.

    Args:
        masks (`np.ndarray`): Flattened mask scores with one row per mask, i.e. shape
            `(num_masks, height * width)`.
        input_size (`Tuple`): `(height, width)` the flattened masks are reshaped to.
        target_size (`Tuple`): `(height, width)` of the returned image.
        stuff_equiv_classes: Mapping whose values are lists of mask indices that must be merged;
            every index in a list is rewritten to the first index of that list.
        deduplicate (`bool`, *optional*, defaults to `False`): Whether to apply the merging
            described above.

    Returns:
        The segmentation image produced by `id_to_rgb` and resized to `target_size` with
        nearest-neighbour resampling.
    """
    h, w = input_size
    final_h, final_w = target_size

    if masks.shape[0] == 0:
        # No mask was kept: every pixel gets id 0.
        m_id = np.zeros((h, w), dtype=np.int64)
    else:
        # `ndarray.transpose(0, 1)` is the identity permutation on a 2-D array. The axes have to
        # be swapped explicitly so that softmax/argmax run across masks for each pixel.
        m_id = scipy.special.softmax(masks.transpose(1, 0), axis=-1)
        m_id = m_id.argmax(-1).reshape(h, w)

    if deduplicate:
        # Merge the masks corresponding to the same stuff class.
        for equiv in stuff_equiv_classes.values():
            for eq_id in equiv:
                m_id[m_id == eq_id] = equiv[0]

    seg_img = id_to_rgb(m_id)
    # `resize` is called with `(height, width)` everywhere else in this file, so the target size
    # is passed in that order here as well.
    seg_img = resize(seg_img, (final_h, final_w), resample=PILImageResampling.NEAREST)
    return seg_img
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
    """
    Count, for each id in `range(n_classes)`, how many pixels of the segmentation image carry it.

    Args:
        seg_img (`np.ndarray`): RGB-encoded segmentation image; it is cast to `uint8` and reshaped
            to `(height, width, 3)`.
        target_size (`Tuple[int, int]`): `(height, width)` of the segmentation image.
        n_classes (`int`): Number of ids to count.

    Returns:
        A Python list with one pixel count per id (a list, despite the `np.ndarray` annotation).
    """
    final_h, final_w = target_size
    rgb = seg_img.astype(np.uint8).reshape(final_h, final_w, 3)
    id_map = rgb_to_id(rgb)
    return [(id_map == class_id).sum() for class_id in range(n_classes)]
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn class logits into the best label per entry and the probability of that label.

    Args:
        logits (`np.ndarray`): Class logits with the classes on the last axis.

    Returns:
        `Tuple[np.ndarray, np.ndarray]`: `(scores, labels)`, both with the last axis of `logits`
        removed. `scores` holds the softmax probability of the arg-max class.
    """
    probabilities = scipy.special.softmax(logits, axis=-1)
    best = probabilities.argmax(-1, keepdims=True)
    best_scores = np.take_along_axis(probabilities, best, axis=-1)
    return best_scores.squeeze(-1), best.squeeze(-1)
def post_process_panoptic_sample(
    out_logits: np.ndarray,
    masks: np.ndarray,
    boxes: np.ndarray,
    processed_size: Tuple[int, int],
    target_size: Tuple[int, int],
    is_thing_map: Dict,
    threshold=0.85,
) -> Dict:
    """
    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.

    Args:
        out_logits (`np.ndarray`):
            The logits for this sample.
        masks (`np.ndarray`):
            The predicted segmentation masks for this sample.
        boxes (`np.ndarray`):
            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
            width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
        processed_size (`Tuple[int, int]`):
            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
            after data augmentation but before batching.
        target_size (`Tuple[int, int]`):
            The target size of the image, `(height, width)` corresponding to the requested final size of the
            prediction.
        is_thing_map (`Dict`):
            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
        threshold (`float`, *optional*, defaults to 0.85):
            Minimum class probability a query must reach to be kept.

    Returns:
        `Dict`: `"png_string"` holds the PNG-encoded bytes of the segmentation image and `"segments_info"` a list
        with one dictionary (`id`, `isthing`, `category_id`, `area`) per remaining segment.

    Raises:
        ValueError: If the number of kept boxes differs from the number of kept classes.
    """
    # Keep queries that are confident enough and whose best class is not the last index
    # (presumably the "no object" class - confirm against the model head).
    scores, labels = score_labels_from_class_probabilities(out_logits)
    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)

    cur_scores = scores[keep]
    cur_classes = labels[keep]
    cur_boxes = center_to_corners_format(boxes[keep])

    if len(cur_boxes) != len(cur_classes):
        raise ValueError("Not as many boxes as there are classes")

    cur_masks = masks[keep]
    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
    cur_masks = safe_squeeze(cur_masks, 1)
    b, h, w = cur_masks.shape

    # Flatten each mask so that downstream code sees one row per mask.
    cur_masks = cur_masks.reshape(b, -1)

    # Group the indices of all masks that predict the same "stuff" class.
    stuff_equiv_classes = defaultdict(list)
    for k, label in enumerate(cur_classes):
        if not is_thing_map[label]:
            stuff_equiv_classes[label].append(k)

    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
    # Areas are measured on the rendered segmentation image at `target_size`, exactly like the
    # recomputation inside the loop below (the flattened score masks are not an RGB image).
    area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))

    # `ndarray.size` is an attribute, not a method.
    if cur_classes.size > 0:
        # Drop masks covering at most 4 pixels, repeating until none is left.
        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
        while filtered_small.any():
            cur_masks = cur_masks[~filtered_small]
            cur_scores = cur_scores[~filtered_small]
            cur_classes = cur_classes[~filtered_small]
            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
    else:
        cur_classes = np.ones((1, 1), dtype=np.int64)

    segments_info = [
        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
        for i, (cat, a) in enumerate(zip(cur_classes, area))
    ]
    del cur_classes

    with io.BytesIO() as out:
        PIL.Image.fromarray(seg_img).save(out, format="PNG")
        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}

    return predictions
def resize_annotation(
    annotation: Dict[str, Any],
    orig_size: Tuple[int, int],
    target_size: Tuple[int, int],
    threshold: float = 0.5,
    resample: PILImageResampling = PILImageResampling.NEAREST,
):
    """
    Rescale an annotation so that it matches an image resized from `orig_size` to `target_size`.

    Args:
        annotation (`Dict[str, Any]`):
            The annotation dictionary.
        orig_size (`Tuple[int, int]`):
            The original size of the input image.
        target_size (`Tuple[int, int]`):
            The target size of the image, as returned by the preprocessing `resize` step.
        threshold (`float`, *optional*, defaults to 0.5):
            The threshold used to binarize the segmentation masks.
        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
            The resampling filter to use when resizing the masks.

    Returns:
        A new dictionary: `boxes` and `area` are scaled, `masks` are resized and binarized,
        `size` is set to `target_size`, and every other key is copied unchanged.
    """
    ratio_height, ratio_width = (float(new) / float(old) for new, old in zip(target_size, orig_size))

    new_annotation = {"size": target_size}
    for key, value in annotation.items():
        if key == "boxes":
            box_scale = np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
            new_annotation["boxes"] = value * box_scale
        elif key == "area":
            new_annotation["area"] = value * (ratio_width * ratio_height)
        elif key == "masks":
            # Each mask gets a leading singleton axis for `resize`, which is removed again below.
            resized = np.array([resize(mask, target_size, resample=resample) for mask in value[:, None]])
            resized = resized.astype(np.float32)
            new_annotation["masks"] = resized[:, 0] > threshold
        elif key == "size":
            new_annotation["size"] = target_size
        else:
            new_annotation[key] = value

    return new_annotation
def binary_mask_to_rle(mask):
    """
    Run-length encode a binary mask of shape `(height, width)`.

    Args:
        mask (`torch.Tensor` or `numpy.array`):
            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
            segment_id or class_id.

    Returns:
        `List`: Run-length encoded list of the binary mask, as alternating (1-based start, run length) values over
        the flattened mask. Refer to COCO API for more information about the RLE format.
    """
    if is_torch_tensor(mask):
        mask = mask.numpy()

    # Pad with zeros on both ends so runs touching the borders produce a transition too.
    padded = np.concatenate([[0], mask.flatten(), [0]])
    transitions = np.where(padded[1:] != padded[:-1])[0] + 1
    # Odd positions hold run ends; turn them into run lengths.
    transitions[1::2] -= transitions[::2]
    return list(transitions)
def convert_segmentation_to_rle(segmentation):
    """
    Run-length encode every segment of a segmentation map of shape `(height, width)`.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.

    Returns:
        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id, in the
        order returned by `torch.unique`.
    """
    return [
        binary_mask_to_rle(torch.where(segmentation == segment_id, 1, 0))
        for segment_id in torch.unique(segmentation)
    ]
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
    """
    Drop every query whose label equals `num_labels` or whose score is not above `object_mask_threshold`.

    Args:
        masks (`torch.Tensor`):
            A tensor of shape `(num_queries, height, width)`.
        scores (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        labels (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        object_mask_threshold (`float`):
            Score a query must exceed to be kept.
        num_labels (`int`):
            Label value that marks a query to be discarded.

    Raises:
        `ValueError`: Raised when the first dimension doesn't match in all input tensors.

    Returns:
        `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` of the queries
        that were kept.
    """
    num_masks, num_scores, num_label_entries = masks.shape[0], scores.shape[0], labels.shape[0]
    if num_masks != num_scores or num_scores != num_label_entries:
        raise ValueError("mask, scores and labels must have the same shape!")

    is_object = labels.ne(num_labels)
    is_confident = scores > object_mask_threshold
    selected = is_object & is_confident
    return masks[selected], scores[selected], labels[selected]
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
    """
    Decide whether query `k` yields a usable segment.

    A segment is valid when at least one pixel is assigned to `k` in `mask_labels`, at least one pixel of
    `mask_probs[k]` reaches `mask_threshold`, and the ratio between those two pixel counts is strictly greater than
    `overlap_mask_area_threshold`.

    Args:
        mask_labels (`torch.Tensor`):
            Tensor holding, per pixel, the index of the query it is assigned to.
        mask_probs (`torch.Tensor`):
            Tensor of mask probabilities, indexed by query on the first axis.
        k (`int`):
            Query index to evaluate.
        mask_threshold (`float`, optional):
            Threshold value for binarizing masks. Default is 0.5.
        overlap_mask_area_threshold (`float`, optional):
            Threshold for determining valid segment based on area overlap. Default is 0.8.

    Returns:
        `Tuple[bool, torch.Tensor]`: The validity flag (a 0-dim boolean tensor, or the Python `False` when the area
        ratio test fails) and the boolean mask of the pixels assigned to `k`.
    """
    assigned = mask_labels == k
    assigned_area = assigned.sum()
    thresholded_area = (mask_probs[k] >= mask_threshold).sum()

    is_valid = assigned_area > 0 and thresholded_area > 0
    if not is_valid:
        return is_valid, assigned

    # Discard segments that lost too much of their thresholded area to other queries.
    overlap = (assigned_area / thresholded_area).item()
    if not overlap > overlap_mask_area_threshold:
        return False, assigned
    return is_valid, assigned
def compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    label_ids_to_fuse: Optional[Set[int]] = None,
    target_size: Tuple[int, int] = None,
):
    """
    Compute segments based on mask probabilities, prediction scores, and labels.

    Args:
        mask_probs (`torch.Tensor`):
            Tensor of mask probabilities, one mask per query on the first axis. When `target_size` is `None` this
            tensor is scaled by `pred_scores` in place.
        pred_scores (`torch.Tensor`):
            Tensor of prediction scores, one per query.
        pred_labels (`torch.Tensor`):
            Tensor of prediction labels, one per query.
        mask_threshold (`float`, optional):
            Threshold value for binarizing masks. Default is 0.5.
        overlap_mask_area_threshold (`float`, optional):
            Threshold for determining valid segment based on area overlap. Default is 0.8.
        label_ids_to_fuse (`Optional[Set[int]]`, optional):
            Label ids whose segments all share a single segment id. `None` (the default) means no label is fused.
        target_size (`Tuple[int, int]`, optional):
            `(height, width)` the masks are bilinearly resized to before segments are computed. Default is None.

    Returns:
        `Tuple[torch.Tensor, List[Dict]]`: The `(height, width)` `int32` segmentation map (0 where no segment was
        assigned) and one dictionary per segment with the keys `id`, `label_id`, `was_fused` and `score`.
    """
    # The default `None` cannot be used with `in`; treat it as "fuse nothing".
    if label_ids_to_fuse is None:
        label_ids_to_fuse = set()

    height = mask_probs.shape[1] if target_size is None else target_size[0]
    width = mask_probs.shape[2] if target_size is None else target_size[1]

    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
    segments: List[Dict] = []

    if target_size is not None:
        mask_probs = nn.functional.interpolate(
            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
        )[0]

    current_segment_id = 0

    # Weigh each mask by its prediction score, then assign every pixel to its best query.
    mask_probs *= pred_scores.view(-1, 1, 1)
    mask_labels = mask_probs.argmax(0)

    # Remembers the segment id already handed out for each fused label.
    stuff_memory_list: Dict[int, int] = {}
    for k in range(pred_labels.shape[0]):
        pred_class = pred_labels[k].item()
        should_fuse = pred_class in label_ids_to_fuse

        mask_exists, mask_k = check_segment_validity(
            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
        )

        if mask_exists:
            if pred_class in stuff_memory_list:
                current_segment_id = stuff_memory_list[pred_class]
            else:
                current_segment_id += 1

            # Write the segment into the final segmentation map.
            segmentation[mask_k] = current_segment_id
            segment_score = round(pred_scores[k].item(), 6)
            segments.append(
                {
                    "id": current_segment_id,
                    "label_id": pred_class,
                    "was_fused": should_fuse,
                    "score": segment_score,
                }
            )
            if should_fuse:
                stuff_memory_list[pred_class] = current_segment_id

    return segmentation, segments
class DeformableDetrImageProcessor(BaseImageProcessor):
"""
Constructs a Deformable DETR image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
    self,
    format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
    do_resize: bool = True,
    size: Dict[str, int] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Union[float, List[float]] = None,
    image_std: Union[float, List[float]] = None,
    do_convert_annotations: Optional[bool] = None,
    do_pad: bool = True,
    **kwargs,
) -> None:
    """
    Store the preprocessing configuration, resolving legacy keyword arguments first.

    Two deprecated keywords are still honoured: `pad_and_return_pixel_mask` overrides `do_pad`, and `max_size` is
    forwarded to `get_size_dict` (with a deprecation warning). When `do_convert_annotations` is left unset it
    follows `do_normalize`.
    """
    # Legacy alias for `do_pad`.
    if "pad_and_return_pixel_mask" in kwargs:
        do_pad = kwargs.pop("pad_and_return_pixel_mask")

    if "max_size" not in kwargs:
        max_size = None if size is None else 1333
    else:
        logger.warning_once(
            "The `max_size` parameter is deprecated and will be removed in v4.26. "
            "Please specify in `size['longest_edge'] instead`.",
        )
        max_size = kwargs.pop("max_size")

    if size is None:
        size = {"shortest_edge": 800, "longest_edge": 1333}
    size = get_size_dict(size, max_size=max_size, default_to_square=False)

    # Annotation conversion follows `do_normalize` unless it is set explicitly.
    if do_convert_annotations is None:
        do_convert_annotations = do_normalize

    super().__init__(**kwargs)

    self.format = format
    self.do_resize = do_resize
    self.size = size
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    self.do_convert_annotations = do_convert_annotations
    self.image_mean = IMAGENET_DEFAULT_MEAN if image_mean is None else image_mean
    self.image_std = IMAGENET_DEFAULT_STD if image_std is None else image_std
    self.do_pad = do_pad
    self._valid_processor_keys = [
        "images",
        "annotations",
        "return_segmentation_masks",
        "masks_path",
        "do_resize",
        "size",
        "resample",
        "do_rescale",
        "rescale_factor",
        "do_normalize",
        "do_convert_annotations",
        "image_mean",
        "image_std",
        "do_pad",
        "format",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
    """
    Build the image processor from a dictionary, letting the legacy keyword arguments `max_size` and
    `pad_and_return_pixel_mask` override the dictionary, e.g.
    `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`.

    The input dictionary is copied before it is updated, so the caller's dictionary is left untouched.
    """
    image_processor_dict = image_processor_dict.copy()
    for legacy_key in ("max_size", "pad_and_return_pixel_mask"):
        if legacy_key in kwargs:
            image_processor_dict[legacy_key] = kwargs.pop(legacy_key)
    return super().from_dict(image_processor_dict, **kwargs)
def prepare_annotation(
    self,
    image: np.ndarray,
    target: Dict,
    format: Optional[AnnotationFormat] = None,
    return_segmentation_masks: bool = None,
    masks_path: Optional[Union[str, pathlib.Path]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
    """
    Prepare an annotation for feeding into DeformableDetr model.

    `format` falls back to `self.format`. When `return_segmentation_masks` is unset it defaults to `False` for
    COCO detection and to `True` for COCO panoptic annotations.

    Raises:
        ValueError: If the annotation format is neither COCO detection nor COCO panoptic.
    """
    if format is None:
        format = self.format

    if format == AnnotationFormat.COCO_DETECTION:
        if return_segmentation_masks is None:
            return_segmentation_masks = False
        return prepare_coco_detection_annotation(
            image, target, return_segmentation_masks, input_data_format=input_data_format
        )

    if format == AnnotationFormat.COCO_PANOPTIC:
        if return_segmentation_masks is None:
            return_segmentation_masks = True
        return prepare_coco_panoptic_annotation(
            image,
            target,
            masks_path=masks_path,
            return_masks=return_segmentation_masks,
            input_data_format=input_data_format,
        )

    raise ValueError(f"Format {format} is not supported.")
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
    """
    Deprecated wrapper around `prepare_annotation` that also hands the image back.

    Args:
        image: The image the annotation belongs to.
        target: The annotation to prepare.
        return_segmentation_masks: Forwarded to `prepare_annotation`.
        masks_path: Forwarded to `prepare_annotation`.

    Returns:
        The tuple `(image, prepared_target)`; `image` is returned unchanged.
    """
    logger.warning_once(
        "The `prepare` method is deprecated and will be removed in a v4.33. "
        "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
        "does not return the image anymore.",
    )
    # `prepare_annotation` takes `format` as its third positional parameter, so the arguments are
    # passed by keyword to keep each value bound to the parameter it is meant for.
    target = self.prepare_annotation(
        image,
        target,
        format=self.format,
        return_segmentation_masks=return_segmentation_masks,
        masks_path=masks_path,
    )
    return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
    """Deprecated: warn once, then forward every argument to the module-level `convert_coco_poly_to_mask`."""
    deprecation_message = "The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. "
    logger.warning_once(deprecation_message)
    masks = convert_coco_poly_to_mask(*args, **kwargs)
    return masks
def prepare_coco_detection(self, *args, **kwargs):
    """Deprecated: warn once, then forward every argument to `prepare_coco_detection_annotation`."""
    deprecation_message = "The `prepare_coco_detection` method is deprecated and will be removed in v4.33. "
    logger.warning_once(deprecation_message)
    prepared = prepare_coco_detection_annotation(*args, **kwargs)
    return prepared
def prepare_coco_panoptic(self, *args, **kwargs):
    """Deprecated: warn once, then forward every argument to `prepare_coco_panoptic_annotation`."""
    deprecation_message = "The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. "
    logger.warning_once(deprecation_message)
    prepared = prepare_coco_panoptic_annotation(*args, **kwargs)
    return prepared
def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    data_format: Optional[ChannelDimension] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Resize the image to the size described by `size`.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Target size. After normalization through `get_size_dict` it must contain either the keys
            `"shortest_edge"` and `"longest_edge"` (the output size is then computed by
            `get_resize_output_image_size`) or the keys `"height"` and `"width"` (used as-is).
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the output image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format of the input image.
        **kwargs:
            The deprecated `max_size` is consumed here; anything else is forwarded to the module-level `resize`.

    Returns:
        `np.ndarray`: The resized image.

    Raises:
        ValueError: If `size` contains neither of the two supported key pairs.
    """
    if "max_size" in kwargs:
        logger.warning_once(
            "The `max_size` parameter is deprecated and will be removed in v4.26. "
            "Please specify in `size['longest_edge'] instead`.",
        )
        max_size = kwargs.pop("max_size")
    else:
        max_size = None
    size = get_size_dict(size, max_size=max_size, default_to_square=False)
    if "shortest_edge" in size and "longest_edge" in size:
        size = get_resize_output_image_size(
            image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
        )
    elif "height" in size and "width" in size:
        size = (size["height"], size["width"])
    else:
        raise ValueError(
            "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
            f" {size.keys()}."
        )
    # Inside the method body the bare name `resize` resolves to the module-level function.
    image = resize(
        image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
    )
    return image
def resize_annotation(
    self,
    annotation,
    orig_size,
    size,
    resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
    """
    Resize the annotation so that it matches the resized image.

    Thin wrapper around the module-level `resize_annotation`: `size` is passed on as its `target_size`.
    """
    resized = resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
    return resized
def rescale(
    self,
    image: np.ndarray,
    rescale_factor: float,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Rescale the image by the given factor by delegating to the module-level `rescale`
    (image = image * rescale_factor).

    Args:
        image (`np.ndarray`):
            Image to rescale.
        rescale_factor (`float`):
            The value to use for rescaling.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. If unset, the channel dimension format of the input
            image is used. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. If unset, is inferred from the input image. Can be
            one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
    """
    rescaled = rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
    return rescaled
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Convert the boxes of the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values, by delegating to the
    module-level `normalize_annotation`.
    """
    normalized = normalize_annotation(annotation, image_size=image_size)
    return normalized
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation to reflect changes made due to image padding.

    Args:
        annotation (`Dict`):
            The annotation dictionary to update.
        input_image_size (`Tuple[int, int]`):
            The size of the original input image (height, width).
        output_image_size (`Tuple[int, int]`):
            The size of the padded output image (height, width).
        padding:
            The padding applied to the image; the masks are padded with the same amounts.
        update_bboxes:
            Boolean flag indicating whether to update bounding boxes in the annotation.

    Returns:
        `Dict`: A new annotation in which `masks` are zero-padded, `boxes` are rescaled by the input/output size
        ratio (only when `update_bboxes` is true; otherwise they are copied as-is), `size` is set to
        `output_image_size` and every other key is copied unchanged. Note that the `boxes` array is scaled in
        place, so the array held by the input annotation changes too.
    """
    new_annotation = {}
    new_annotation["size"] = output_image_size

    for key, value in annotation.items():
        if key == "masks":
            masks = value
            masks = pad(
                masks,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            masks = safe_squeeze(masks, 1)
            new_annotation["masks"] = masks
        elif key == "boxes" and update_bboxes:
            boxes = value
            # Boxes scale with the ratio between the unpadded and the padded image size.
            boxes *= np.asarray(
                [
                    input_image_size[1] / output_image_size[1],
                    input_image_size[0] / output_image_size[0],
                    input_image_size[1] / output_image_size[1],
                    input_image_size[0] / output_image_size[0],
                ]
            )
            new_annotation["boxes"] = boxes
        elif key == "size":
            new_annotation["size"] = output_image_size
        else:
            new_annotation[key] = value
    return new_annotation
def _pad_image(
    self,
    image: np.ndarray,
    output_size: Tuple[int, int],
    annotation: Optional[Dict[str, Any]] = None,
    constant_values: Union[float, Iterable[float]] = 0,
    data_format: Optional[ChannelDimension] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    update_bboxes: bool = True,
) -> np.ndarray:
    """
    Pad an image on the bottom and right up to `output_size`, using `constant_values` as fill.

    When an annotation is given it is updated through `_update_annotation_for_padded_image`.

    Returns:
        The pair `(padded_image, annotation)`; `annotation` stays `None` when none was passed. Note that a tuple
        is returned despite the `np.ndarray` annotation.
    """
    in_height, in_width = get_image_size(image, channel_dim=input_data_format)
    out_height, out_width = output_size

    # Padding is only added after the existing content: (top, bottom), (left, right).
    padding = ((0, out_height - in_height), (0, out_width - in_width))
    padded_image = pad(
        image,
        padding,
        mode=PaddingMode.CONSTANT,
        constant_values=constant_values,
        data_format=data_format,
        input_data_format=input_data_format,
    )

    if annotation is None:
        return padded_image, annotation

    annotation = self._update_annotation_for_padded_image(
        annotation, (in_height, in_width), (out_height, out_width), padding, update_bboxes
    )
    return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
"""
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DeformableDetrObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation). For visualization, this should be the image size after
data augmentation, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# One target size is required per batch element, and each must hold exactly two values.
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
prob = out_logits.sigmoid()
# Flatten everything after the batch dimension and keep the 100 highest probabilities per image.
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
scores = topk_values
# A flat index decomposes into (box index, label) using the size of the last logits dimension.
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
labels = topk_indexes % out_logits.shape[2]
boxes = center_to_corners_format(out_bbox)
# Pick, for every kept score, the four coordinates of the box it belongs to.
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# Multiply the corner coordinates by (width, height, width, height) of each target size.
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
):
"""
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DeformableDetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
top_k (`int`, *optional*, defaults to 100):
Keep only top k bounding boxes before filtering by thresholding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
prob = out_logits.sigmoid()
# Flatten everything after the batch dimension so the top-k runs over all scores of an image at once.
prob = prob.view(out_logits.shape[0], -1)
# Never ask for more entries than there are scores.
k_value = min(top_k, prob.size(1))
topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
scores = topk_values
# A flat index decomposes into (box index, label) using the size of the last logits dimension.
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
labels = topk_indexes % out_logits.shape[2]
boxes = center_to_corners_format(out_bbox)
# Pick, for every kept score, the four coordinates of the box it belongs to.
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
if target_sizes is not None:
# Accept either a list of (height, width) tuples or a tensor with one row per image.
if isinstance(target_sizes, list):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
# The scale factors are moved to the device of the boxes before the multiplication.
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = []
# Apply the score threshold image by image; the same mask filters scores, labels and boxes.
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
.\models\deformable_detr\load_custom.py
""" Loading of Deformable DETR's CUDA kernels"""
import os
from pathlib import Path
def load_cuda_kernels():
    """
    Build the `MultiScaleDeformableAttention` extension from the sources under `kernels/deformable_detr`
    (three directory levels above this file) and return the imported module.
    """
    from torch.utils.cpp_extension import load

    kernel_dir = Path(__file__).resolve().parents[2] / "kernels" / "deformable_detr"
    relative_sources = (
        "vision.cpp",
        os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
        os.path.join("cuda", "ms_deform_attn_cuda.cu"),
    )
    sources = [kernel_dir / relative for relative in relative_sources]

    cuda_flags = [
        "-DCUDA_HAS_FP16=1",
        "-D__CUDA_NO_HALF_OPERATORS__",
        "-D__CUDA_NO_HALF_CONVERSIONS__",
        "-D__CUDA_NO_HALF2_OPERATORS__",
    ]
    load(
        "MultiScaleDeformableAttention",
        sources,
        with_cuda=True,
        extra_include_paths=[str(kernel_dir)],
        extra_cflags=["-DWITH_CUDA=1"],
        extra_cuda_cflags=cuda_flags,
    )

    # The extension is imported by name after the build and handed back to the caller.
    import MultiScaleDeformableAttention as MSDA

    return MSDA
.\models\deformable_detr\modeling_deformable_detr.py
""" PyTorch Deformable DETR model."""
import copy
import math
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ...activations import ACT2FN
from ...file_utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_scipy_available,
is_timm_available,
is_torch_cuda_available,
is_vision_available,
replace_return_docstrings,
requires_backends,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_accelerate_available, is_ninja_available, logging
from ...utils.backbone_utils import load_backbone
from .configuration_deformable_detr import DeformableDetrConfig
logger = logging.get_logger(__name__)
MultiScaleDeformableAttention = None
def load_cuda_kernels():
    """
    Build the custom attention extension and store the result in the module-level
    `MultiScaleDeformableAttention` name. Nothing is returned.
    """
    from torch.utils.cpp_extension import load

    global MultiScaleDeformableAttention

    root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr"
    vision_source = root / "vision.cpp"
    cpu_source = root / os.path.join("cpu", "ms_deform_attn_cpu.cpp")
    cuda_source = root / os.path.join("cuda", "ms_deform_attn_cuda.cu")

    build_options = {
        "with_cuda": True,
        "extra_include_paths": [str(root)],
        "extra_cflags": ["-DWITH_CUDA=1"],
        "extra_cuda_cflags": [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    }
    MultiScaleDeformableAttention = load(
        "MultiScaleDeformableAttention",
        [vision_source, cpu_source, cuda_source],
        **build_options,
    )
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
class MultiScaleDeformableAttentionFunction(Function):
    """
    Autograd wrapper around the `ms_deform_attn_forward` / `ms_deform_attn_backward` entry points of the
    module-level `MultiScaleDeformableAttention` extension.
    """

    @staticmethod
    def forward(
        context,
        value,
        value_spatial_shapes,
        value_level_start_index,
        sampling_locations,
        attention_weights,
        im2col_step,
    ):
        # The step is not a tensor, so it is kept as a plain attribute on the context.
        context.im2col_step = im2col_step
        tensor_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = MultiScaleDeformableAttention.ms_deform_attn_forward(*tensor_inputs, context.im2col_step)
        context.save_for_backward(*tensor_inputs)
        return output

    @staticmethod
    @once_differentiable
    def backward(context, grad_output):
        value, spatial_shapes, level_start_index, locations, weights = context.saved_tensors
        gradients = MultiScaleDeformableAttention.ms_deform_attn_backward(
            value,
            spatial_shapes,
            level_start_index,
            locations,
            weights,
            grad_output,
            context.im2col_step,
        )
        grad_value, grad_sampling_loc, grad_attn_weight = gradients

        # One entry per forward input; the shapes, start indices and im2col step receive no gradient.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
# NOTE(review): `logger` is already bound with the same call earlier in this module; this re-binds it.
logger = logging.get_logger(__name__)
# Names of the configuration class and of the reference checkpoint
# (presumably consumed by the docstring helpers - TODO confirm).
_CONFIG_FOR_DOC = "DeformableDetrConfig"
_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr"
# Checkpoint identifiers listed for this architecture.
DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"sensetime/deformable-detr",
]
@dataclass
class DeformableDetrDecoderOutput(ModelOutput):
"""
Base class for outputs of the DeformableDetrDecoder. This class adds two attributes to
BaseModelOutputWithCrossAttentions, namely:
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
- a stacked tensor of intermediate reference points.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
intermediate_hidden_states: torch.FloatTensor = None
intermediate_reference_points: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# `ModelOutput` subclasses have to be dataclasses (the sibling `DeformableDetrDecoderOutput` is declared the
# same way); without the decorator the annotated names below are plain class attributes, not fields.
@dataclass
class DeformableDetrModelOutput(ModelOutput):
    """
    Base class for outputs of the Deformable DETR encoder-decoder model.

    Every field defaults to `None`. The fields group into the initial reference points, the decoder results
    (last/intermediate hidden states, intermediate reference points, hidden states and attentions), the
    encoder results, and the `enc_outputs_*` tensors.
    """

    init_reference_points: torch.FloatTensor = None
    last_hidden_state: torch.FloatTensor = None
    intermediate_hidden_states: torch.FloatTensor = None
    intermediate_reference_points: torch.FloatTensor = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    enc_outputs_class: Optional[torch.FloatTensor] = None
    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
# `ModelOutput` subclasses have to be dataclasses (the sibling `DeformableDetrDecoderOutput` is declared the
# same way); without the decorator the annotated names below are plain class attributes, not fields.
@dataclass
class DeformableDetrObjectDetectionOutput(ModelOutput):
    """
    Output type of [`DeformableDetrForObjectDetection`].

    Every field defaults to `None`. On top of the encoder/decoder fields it carries the detection results
    (`logits`, `pred_boxes`), the optional `loss` / `loss_dict` and the optional `auxiliary_outputs`.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    auxiliary_outputs: Optional[List[Dict]] = None
    init_reference_points: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    intermediate_hidden_states: Optional[torch.FloatTensor] = None
    intermediate_reference_points: Optional[torch.FloatTensor] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Typed like the fields of the same name on `DeformableDetrModelOutput` instead of a bare `Optional`.
    enc_outputs_class: Optional[torch.FloatTensor] = None
    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
def _get_clones(module, N):
    """Return an `nn.ModuleList` holding `N` independent deep copies of `module`."""
    clones = []
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return nn.ModuleList(clones)
def inverse_sigmoid(x, eps=1e-5):
    """
    Compute `log(x / (1 - x))` after clamping `x` to `[0, 1]`.

    Numerator and denominator are both bounded below by `eps`, so the result stays finite at 0 and 1.
    """
    clipped = x.clamp(min=0, max=1)
    numerator = clipped.clamp(min=eps)
    denominator = (1 - clipped).clamp(min=eps)
    return torch.log(numerator / denominator)
class DeformableDetrFrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d whose statistics and affine parameters are constant buffers.

    Adapted from torchvision.misc.ops with an epsilon added before the rsqrt; without it, models other than
    torchvision.models.resnet[18,34,50,101] produce nans.
    """

    def __init__(self, n):
        super().__init__()
        # Everything is registered as a buffer, so none of it shows up as a trainable parameter.
        initial_buffers = (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        )
        for buffer_name, factory in initial_buffers:
            self.register_buffer(buffer_name, factory(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # This module has no `num_batches_tracked` buffer, so the entry is removed before the default loading.
        state_dict.pop(prefix + "num_batches_tracked", None)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        def per_channel(buffer):
            # Reshape to (1, C, 1, 1) so the buffer broadcasts against a 4D input.
            return buffer.reshape(1, -1, 1, 1)

        epsilon = 1e-5
        scale = per_channel(self.weight) * (per_channel(self.running_var) + epsilon).rsqrt()
        shift = per_channel(self.bias) - per_channel(self.running_mean) * scale
        return x * scale + shift
def replace_batch_norm(model):
    """
    Recursively replace all `torch.nn.BatchNorm2d` with `DeformableDetrFrozenBatchNorm2d`.

    Args:
        model (torch.nn.Module):
            input model
    """
    for name, module in model.named_children():
        if isinstance(module, nn.BatchNorm2d):
            frozen = DeformableDetrFrozenBatchNorm2d(module.num_features)

            # Values are only copied when the source layer does not live on the meta device.
            if module.weight.device != torch.device("meta"):
                for attribute in ("weight", "bias", "running_mean", "running_var"):
                    getattr(frozen, attribute).data.copy_(getattr(module, attribute))

            model._modules[name] = frozen

        # Descend into every child that has children of its own.
        if any(True for _ in module.children()):
            replace_batch_norm(module)
class DeformableDetrConvEncoder(nn.Module):
    """
    Convolutional backbone, using either the AutoBackbone API or one from the timm library.

    nn.BatchNorm2d layers are replaced by DeformableDetrFrozenBatchNorm2d as defined above.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        uses_timm = config.use_timm_backbone

        if uses_timm:
            requires_backends(self, ["timm"])
            timm_kwargs = {}
            if config.dilation:
                timm_kwargs["output_stride"] = 16
            # Three feature maps when several feature levels are requested, otherwise only the last one.
            selected_stages = (2, 3, 4) if config.num_feature_levels > 1 else (4,)
            backbone = create_model(
                config.backbone,
                pretrained=config.use_pretrained_backbone,
                features_only=True,
                out_indices=selected_stages,
                in_chans=config.num_channels,
                **timm_kwargs,
            )
        else:
            backbone = load_backbone(config)

        # Swap the batch norm layers for their frozen counterpart.
        with torch.no_grad():
            replace_batch_norm(backbone)
        self.model = backbone

        if uses_timm:
            self.intermediate_channel_sizes = self.model.feature_info.channels()
            backbone_model_type = config.backbone
        else:
            self.intermediate_channel_sizes = self.model.channels
            backbone_model_type = config.backbone_config.model_type

        if "resnet" in backbone_model_type:
            # Parameters whose name contains none of these markers stop receiving gradients.
            if uses_timm:
                trainable_markers = ("layer2", "layer3", "layer4")
            else:
                trainable_markers = ("stage.1", "stage.2", "stage.3")
            for name, parameter in self.model.named_parameters():
                if not any(marker in name for marker in trainable_markers):
                    parameter.requires_grad_(False)

    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
        # The timm backbone returns the feature maps directly; otherwise they sit under `.feature_maps`.
        backbone_output = self.model(pixel_values)
        features = backbone_output if self.config.use_timm_backbone else backbone_output.feature_maps

        out = []
        for feature_map in features:
            # Resize the pixel mask to the spatial size of the feature map and turn it back into booleans.
            resized = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:])
            mask = resized.to(torch.bool)[0]
            out.append((feature_map, mask))
        return out
class DeformableDetrConvModel(nn.Module):
    """
    Runs the convolutional encoder and computes one position embedding per returned feature map.
    """

    def __init__(self, conv_encoder, position_embedding):
        super().__init__()
        self.conv_encoder = conv_encoder
        self.position_embedding = position_embedding

    def forward(self, pixel_values, pixel_mask):
        # `out` is an iterable of (feature_map, mask) pairs produced by the encoder.
        out = self.conv_encoder(pixel_values, pixel_mask)
        # Each embedding is cast to the dtype of the feature map it belongs to.
        pos = [self.position_embedding(feature_map, mask).to(feature_map.dtype) for feature_map, mask in out]
        return out, pos
class DeformableDetrSinePositionEmbedding(nn.Module):
    """
    Sinusoidal position embedding in the spirit of the Attention is all you need paper, generalized to the two
    spatial axes of an image.
    """

    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.temperature = temperature
        self.normalize = normalize
        # A custom scale only makes sense together with normalization.
        if normalize is False and scale is not None:
            raise ValueError("normalize should be True if scale is passed")
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, pixel_values, pixel_mask):
        if pixel_mask is None:
            raise ValueError("No pixel mask provided")

        # Running sums of the mask along dim 1 and dim 2 act as the y / x coordinates.
        y_coordinates = pixel_mask.cumsum(1, dtype=torch.float32)
        x_coordinates = pixel_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last running sum along the axis, then multiply by the scale.
            y_coordinates = (y_coordinates - 0.5) / (y_coordinates[:, -1:, :] + eps) * self.scale
            x_coordinates = (x_coordinates - 0.5) / (x_coordinates[:, :, -1:] + eps) * self.scale

        channel_index = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
        # Consecutive channel pairs share the same divisor.
        pair_index = torch.div(channel_index, 2, rounding_mode="floor")
        divisors = self.temperature ** (2 * pair_index / self.embedding_dim)

        def encode(coordinates):
            # Sine on the even channels, cosine on the odd ones, interleaved back into one axis.
            angles = coordinates[:, :, :, None] / divisors
            interleaved = torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4)
            return interleaved.flatten(3)

        pos_x = encode(x_coordinates)
        pos_y = encode(y_coordinates)
        # y channels first, then x channels; the channel axis is moved in front of the spatial axes.
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
class DeformableDetrLearnedPositionEmbedding(nn.Module):
    """
    Learned row and column embedding tables (50 entries each) combined into a 2D position embedding.
    """

    def __init__(self, embedding_dim=256):
        super().__init__()
        self.row_embeddings = nn.Embedding(50, embedding_dim)
        self.column_embeddings = nn.Embedding(50, embedding_dim)

    def forward(self, pixel_values, pixel_mask=None):
        batch_size = pixel_values.shape[0]
        height, width = pixel_values.shape[-2:]
        device = pixel_values.device

        column_part = self.column_embeddings(torch.arange(width, device=device))
        row_part = self.row_embeddings(torch.arange(height, device=device))

        # Tile both parts over the (height, width) grid: column features first, row features second.
        tiled_columns = column_part.unsqueeze(0).repeat(height, 1, 1)
        tiled_rows = row_part.unsqueeze(1).repeat(1, width, 1)
        grid = torch.cat([tiled_columns, tiled_rows], dim=-1)

        # Channels first, then one copy per batch element.
        return grid.permute(2, 0, 1).unsqueeze(0).repeat(batch_size, 1, 1, 1)
def build_position_encoding(config):
    """
    Create the position embedding module selected by `config.position_embedding_type`.

    The embedding dimension is half of `config.d_model`. Supported types are `"sine"` (built with
    `normalize=True`) and `"learned"`; any other value raises a `ValueError`.
    """
    n_steps = config.d_model // 2
    embedding_type = config.position_embedding_type

    if embedding_type == "sine":
        return DeformableDetrSinePositionEmbedding(n_steps, normalize=True)
    if embedding_type == "learned":
        return DeformableDetrLearnedPositionEmbedding(n_steps)
    raise ValueError(f"Not supported {config.position_embedding_type}")
def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
    """
    Pure PyTorch multi-scale deformable attention.

    `value` is split per feature level according to `value_spatial_shapes`, every level is sampled bilinearly
    at `sampling_locations`, and the samples are combined with `attention_weights`. The result has shape
    `(batch_size, num_queries, num_heads * hidden_dim)`.
    """
    batch_size, _, num_heads, hidden_dim = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape

    level_lengths = [height.item() * width.item() for height, width in value_spatial_shapes]
    per_level_values = value.split(level_lengths, dim=1)

    # Map the locations to the [-1, 1] range expected by `grid_sample`.
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level_id, (height, width) in enumerate(value_spatial_shapes):
        # (batch, height * width, heads, dim) -> (batch * heads, dim, height, width)
        level_value = per_level_values[level_id].flatten(2).transpose(1, 2)
        level_value = level_value.reshape(batch_size * num_heads, hidden_dim, height, width)
        # (batch, queries, heads, points, 2) -> (batch * heads, queries, points, 2)
        level_grid = grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
        # -> (batch * heads, dim, queries, points)
        sampled = nn.functional.grid_sample(
            level_value, level_grid, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampled_per_level.append(sampled)

    # (batch, queries, heads, levels, points) -> (batch * heads, 1, queries, levels * points)
    weights = attention_weights.transpose(1, 2).reshape(
        batch_size * num_heads, 1, num_queries, num_levels * num_points
    )
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    weighted_sum = (stacked * weights).sum(-1)
    output = weighted_sum.view(batch_size, num_heads * hidden_dim, num_queries)
    return output.transpose(1, 2).contiguous()
class DeformableDetrMultiscaleDeformableAttention(nn.Module):
"""
Multiscale deformable attention as proposed in Deformable DETR.
"""
def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int):
    """
    Set up the projections of the attention module.

    Args:
        config: Model configuration; `d_model`, `num_feature_levels` and `disable_custom_kernels` are read.
        num_heads: Number of attention heads; must divide `config.d_model`.
        n_points: Number of sampling points per head and feature level.
    """
    super().__init__()

    # Try to build the custom kernel only when CUDA and ninja are available and no kernel is loaded yet.
    # A failed build is logged and otherwise ignored.
    if is_torch_cuda_available() and is_ninja_available() and MultiScaleDeformableAttention is None:
        try:
            load_cuda_kernels()
        except Exception as e:
            logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")

    d_model = config.d_model
    if d_model % num_heads != 0:
        raise ValueError(
            f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
        )

    dim_per_head = d_model // num_heads
    # Bit trick: a non-zero integer is a power of two exactly when it shares no bit with its predecessor.
    is_power_of_two = dim_per_head != 0 and (dim_per_head & (dim_per_head - 1)) == 0
    if not is_power_of_two:
        warnings.warn(
            "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the"
            " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
            " implementation."
        )

    self.im2col_step = 64

    self.d_model = d_model
    self.n_levels = config.num_feature_levels
    self.n_heads = num_heads
    self.n_points = n_points

    # One (x, y) offset and one weight per head, level and point.
    points_per_query = num_heads * self.n_levels * n_points
    self.sampling_offsets = nn.Linear(d_model, points_per_query * 2)
    self.attention_weights = nn.Linear(d_model, points_per_query)
    self.value_proj = nn.Linear(d_model, d_model)
    self.output_proj = nn.Linear(d_model, d_model)

    self.disable_custom_kernels = config.disable_custom_kernels

    self._reset_parameters()
def _reset_parameters(self):
    """
    Initialise the projections: the offset weights are zeroed and the offset bias is set to one direction
    per head, scaled by the point index; the attention weights are zeroed; the value and output projections
    get Xavier-uniform weights with zero bias.
    """
    nn.init.constant_(self.sampling_offsets.weight.data, 0.0)

    default_dtype = torch.get_default_dtype()
    # One angle per head, evenly spread over the full circle.
    angle_step = 2.0 * math.pi / self.n_heads
    thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * angle_step
    directions = torch.stack([thetas.cos(), thetas.sin()], -1)
    # Rescale every direction so that its largest absolute component equals 1.
    directions = directions / directions.abs().max(-1, keepdim=True)[0]
    grid_init = directions.view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
    # The i-th sampling point is pushed (i + 1) steps along the direction of its head.
    for point_index in range(self.n_points):
        grid_init[:, :, point_index, :] *= point_index + 1

    with torch.no_grad():
        self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

    nn.init.constant_(self.attention_weights.weight.data, 0.0)
    nn.init.constant_(self.attention_weights.bias.data, 0.0)
    for projection in (self.value_proj, self.output_proj):
        nn.init.xavier_uniform_(projection.weight.data)
        nn.init.constant_(projection.bias.data, 0.0)
def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
    """Return `tensor` plus `position_embeddings`, or `tensor` itself when no embeddings are given."""
    if position_embeddings is None:
        return tensor
    return tensor + position_embeddings
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states=None,
encoder_attention_mask=None,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
class DeformableDetrMultiheadAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
"""
def __init__(
    self,
    embed_dim: int,
    num_heads: int,
    dropout: float = 0.0,
    bias: bool = True,
):
    """
    Args:
        embed_dim: Size of the hidden states; must be a multiple of `num_heads`.
        num_heads: Number of attention heads.
        dropout: Stored as `self.dropout`.
        bias: Whether the four projections carry a bias term.
    """
    super().__init__()
    head_dim = embed_dim // num_heads

    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.dropout = dropout
    self.head_dim = head_dim
    # The integer division above must not have dropped a remainder.
    if head_dim * num_heads != embed_dim:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
            f" {num_heads})."
        )
    self.scaling = head_dim**-0.5

    # The creation order (k, v, q, out) is kept so that random initialisation stays reproducible.
    self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
    self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
    self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
    self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
    """Split the last dimension into heads and return a contiguous `(batch, heads, seq_len, head_dim)` tensor."""
    per_head = tensor.view(batch_size, seq_len, self.num_heads, self.head_dim)
    heads_first = per_head.transpose(1, 2)
    return heads_first.contiguous()
def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
    """Add the position embeddings to `tensor` when they are provided; otherwise hand `tensor` back as is."""
    if position_embeddings is not None:
        return tensor + position_embeddings
    return tensor
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
class DeformableDetrEncoderLayer(nn.Module):
def __init__(self, config: DeformableDetrConfig):
    """Create the deformable self-attention, the two-layer feed-forward block and their layer norms."""
    super().__init__()
    hidden_size = config.d_model
    ffn_size = config.encoder_ffn_dim

    self.embed_dim = hidden_size

    # Attention sub-block.
    self.self_attn = DeformableDetrMultiscaleDeformableAttention(
        config,
        num_heads=config.encoder_attention_heads,
        n_points=config.encoder_n_points,
    )
    self.self_attn_layer_norm = nn.LayerNorm(hidden_size)

    # Dropout rates and activation used by `forward`.
    self.dropout = config.dropout
    self.activation_fn = ACT2FN[config.activation_function]
    self.activation_dropout = config.activation_dropout

    # Feed-forward sub-block.
    self.fc1 = nn.Linear(hidden_size, ffn_size)
    self.fc2 = nn.Linear(ffn_size, hidden_size)
    self.final_layer_norm = nn.LayerNorm(hidden_size)
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: torch.Tensor,
    position_embeddings: Optional[torch.Tensor] = None,
    reference_points=None,
    spatial_shapes=None,
    level_start_index=None,
    output_attentions: bool = False,
):
    """
    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Input to the layer.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Attention mask, indicating which elements have to be ignored.
        position_embeddings (`torch.FloatTensor`, *optional*):
            Position embeddings, to be added to `hidden_states`.
        reference_points (`torch.FloatTensor`, *optional*):
            Reference points.
        spatial_shapes (`torch.LongTensor`, *optional*):
            Spatial shapes of the backbone feature maps.
        level_start_index (`torch.LongTensor`, *optional*):
            Level start index.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.

    Returns:
        `tuple`: `(hidden_states,)`, followed by the attention weights when `output_attentions` is truthy.
    """
    residual = hidden_states

    # Deformable self-attention: the layer input also serves as the encoder states that are attended to.
    hidden_states, attn_weights = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        encoder_hidden_states=hidden_states,
        encoder_attention_mask=attention_mask,
        position_embeddings=position_embeddings,
        reference_points=reference_points,
        spatial_shapes=spatial_shapes,
        level_start_index=level_start_index,
        output_attentions=output_attentions,
    )

    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    hidden_states = residual + hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)

    # Feed-forward block with its own residual connection.
    residual = hidden_states
    hidden_states = self.activation_fn(self.fc1(hidden_states))
    hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)

    hidden_states = self.fc2(hidden_states)
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

    hidden_states = residual + hidden_states
    hidden_states = self.final_layer_norm(hidden_states)

    # During training, values are clamped into the finite range of the dtype once an inf or nan shows up.
    if self.training:
        if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (attn_weights,)

    return outputs
class DeformableDetrDecoderLayer(nn.Module):
    """
    Decoder layer made of a multi-head self-attention, a multi-scale deformable cross-attention over the
    encoder states and a two-layer feed-forward block, each followed by a layer norm.
    """

    def __init__(self, config: DeformableDetrConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # Self-attention.
        self.self_attn = DeformableDetrMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # Cross-attention.
        self.encoder_attn = DeformableDetrMultiscaleDeformableAttention(
            config,
            num_heads=config.decoder_attention_heads,
            n_points=config.decoder_n_points,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # Feed-forward block.
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        # `forward` ends with this norm; it used to be missing here, so every call raised AttributeError.
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    @staticmethod
    def _attention_output(attention_result):
        """Return the attended hidden states, whether the module returned them bare or as `(states, ...)`."""
        if isinstance(attention_result, tuple):
            return attention_result[0]
        return attention_result

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer.
            position_embeddings (`torch.Tensor`, *optional*):
                Position embeddings forwarded to both attention modules.
            reference_points, spatial_shapes, level_start_index (*optional*):
                Forwarded unchanged to the deformable cross-attention.
            encoder_hidden_states (`torch.Tensor`, *optional*):
                Encoder states attended to by the cross-attention.
            encoder_attention_mask (`torch.Tensor`, *optional*):
                Mask forwarded to the cross-attention.
            output_attentions (`bool`, *optional*):
                Forwarded to both attention modules.

        Returns:
            `torch.Tensor`: The hidden states after the final layer norm.
        """
        # NOTE(review): unlike the encoder layer, no residual connections are applied anywhere in this layer
        # and the activation is applied to the self-attention output - confirm against the reference
        # implementation before relying on the numerical results.

        # The multi-head self-attention only takes hidden_states / attention_mask / position_embeddings /
        # output_attentions, so the deformable-attention arguments must not be passed to it.
        self_attn_result = self.self_attn(
            hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )
        # Only the attended states go on; dropout cannot be applied to a tuple.
        hidden_states = self._attention_output(self_attn_result)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # The deformable cross-attention returns (hidden_states, attention_weights), as unpacked by the
        # encoder layer.
        encoder_attn_result = self.encoder_attn(
            hidden_states,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self._attention_output(encoder_attn_result)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Feed-forward block.
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states
def _init_weights(self, module):
    """
    Initialize the parameters of `module` according to its type.

    Learned position embeddings get a uniform init, deformable attention modules reset themselves, linear /
    conv / batch-norm weights and embedding tables are drawn from a normal distribution with standard
    deviation `config.init_std`. Modules that own `reference_points` or `level_embed` get extra handling.
    """
    init_std = self.config.init_std

    if isinstance(module, DeformableDetrLearnedPositionEmbedding):
        nn.init.uniform_(module.row_embeddings.weight)
        nn.init.uniform_(module.column_embeddings.weight)
    elif isinstance(module, DeformableDetrMultiscaleDeformableAttention):
        module._reset_parameters()
    elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
        module.weight.data.normal_(mean=0.0, std=init_std)
        bias = module.bias
        if bias is not None:
            bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=init_std)
        padding_idx = module.padding_idx
        if padding_idx is not None:
            # Keep the padding row at zero
            module.weight.data[padding_idx].zero_()

    # The reference-point projection is only re-initialized here when the model is not two-stage
    if hasattr(module, "reference_points") and not self.config.two_stage:
        reference_layer = module.reference_points
        nn.init.xavier_uniform_(reference_layer.weight.data, gain=1.0)
        nn.init.constant_(reference_layer.bias.data, 0.0)

    if hasattr(module, "level_embed"):
        nn.init.normal_(module.level_embed)
# Shared class-level documentation text; it is passed to `add_start_docstrings` on the model classes in this
# file, after each class's own summary.
DEFORMABLE_DETR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DeformableDetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Documentation of the `forward` arguments shared by the Deformable DETR models; it is attached to the
# `forward` methods through `add_start_docstrings_to_model_forward`. The entries mirror the parameters those
# methods actually accept (`pixel_values`, `pixel_mask`, ...).
DEFORMABLE_DETR_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.

            Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`]
            for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

            [What are attention masks?](../glossary#attention-mask)

        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class DeformableDetrEncoder(DeformableDetrPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
[`DeformableDetrEncoderLayer`].
The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
Args:
config: DeformableDetrConfig
"""
def __init__(self, config: DeformableDetrConfig):
    """Build the encoder: a stack of `config.encoder_layers` `DeformableDetrEncoderLayer` modules."""
    super().__init__(config)

    self.gradient_checkpointing = False
    # Dropout probability taken from the configuration
    self.dropout = config.dropout

    encoder_layers = [DeformableDetrEncoderLayer(config) for _ in range(config.encoder_layers)]
    self.layers = nn.ModuleList(encoder_layers)

    # Initialize weights and apply final processing
    self.post_init()
@staticmethod
def get_reference_points(spatial_shapes, valid_ratios, device):
    """
    Get reference points for each feature map. Used in decoder.

    Args:
        spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
            Spatial shapes of each feature map.
        valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
            Valid ratios of each feature map.
        device (`torch.device`):
            Device on which to create the tensors.
    Returns:
        `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
    """
    dtype = valid_ratios.dtype
    per_level_points = []

    for level, (height, width) in enumerate(spatial_shapes):
        # Cell centers of the feature map: 0.5, 1.5, ..., size - 0.5 along each axis
        rows = torch.linspace(0.5, height - 0.5, height, dtype=dtype, device=device)
        cols = torch.linspace(0.5, width - 0.5, width, dtype=dtype, device=device)
        grid_y, grid_x = meshgrid(rows, cols, indexing="ij")

        # Normalize by the valid extent of this level (valid ratio times feature-map size);
        # index 1 of the ratio is used for y and index 0 for x
        norm_y = grid_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
        norm_x = grid_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
        per_level_points.append(torch.stack((norm_x, norm_y), -1))

    # Concatenate all levels, then scale every point by the valid ratios of each level
    all_points = torch.cat(per_level_points, 1)
    return all_points[:, :, None] * valid_ratios[:, None]
def forward(
self,
inputs_embeds=None,
attention_mask=None,
position_embeddings=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
"""
Some tweaks for Deformable DETR:
- `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
- it also returns a stack of intermediate outputs and reference points from all decoding layers.
Args:
config: DeformableDetrConfig
"""
# 初始化函数,根据给定的配置参数初始化 Deformable DETR 模型
def __init__(self, config: DeformableDetrConfig):
    """Build the decoder: a stack of `config.decoder_layers` `DeformableDetrDecoderLayer` modules."""
    super().__init__(config)

    # Dropout probability taken from the configuration
    self.dropout = config.dropout
    decoder_layers = [DeformableDetrDecoderLayer(config) for _ in range(config.decoder_layers)]
    self.layers = nn.ModuleList(decoder_layers)
    self.gradient_checkpointing = False

    # Hack implementation for iterative bounding box refinement and two-stage Deformable DETR:
    # both heads start as None here
    self.bbox_embed = None
    self.class_embed = None

    # Initialize weights and apply final processing
    self.post_init()
# 前向传播函数,接收多个输入和参数,执行模型的前向计算过程
def forward(
self,
inputs_embeds=None, # 输入的嵌入表示,通常是编码器的输出
encoder_hidden_states=None, # 编码器的隐藏状态
encoder_attention_mask=None, # 编码器的注意力掩码
position_embeddings=None, # 位置嵌入,用于处理空间信息的嵌入向量
reference_points=None, # 参考点,用于变形注意力机制
spatial_shapes=None, # 空间形状,用于处理不同层次的空间信息
level_start_index=None, # 层级开始索引,用于多层级处理
valid_ratios=None, # 有效比率,用于多尺度处理
output_attentions=None, # 是否输出注意力权重
output_hidden_states=None, # 是否输出隐藏状态
return_dict=None, # 是否返回字典形式的输出
"""
The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
"""
# The decorator below combines this class summary with the shared DEFORMABLE_DETR_START_DOCSTRING text.
@add_start_docstrings(
"""
The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
""",
DEFORMABLE_DETR_START_DOCSTRING,
)
# 定义 DeformableDetrModel 类,继承自 DeformableDetrPreTrainedModel 类
class DeformableDetrModel(DeformableDetrPreTrainedModel):
# 构造函数,接收一个 DeformableDetrConfig 类型的 config 参数
def __init__(self, config: DeformableDetrConfig):
    """
    Assemble the bare model: convolutional backbone with position encoding, per-level input projections,
    encoder, decoder, the level embedding, and the modules specific to the one-stage / two-stage variants.
    """
    super().__init__(config)

    # Backbone combined with its positional encoding
    conv_backbone = DeformableDetrConvEncoder(config)
    position_encoding = build_position_encoding(config)
    self.backbone = DeformableDetrConvModel(conv_backbone, position_encoding)

    # Input projection layers
    d_model = config.d_model
    if config.num_feature_levels > 1:
        num_backbone_outs = len(conv_backbone.intermediate_channel_sizes)
        projections = []
        # One 1x1 projection per backbone output
        for stage in range(num_backbone_outs):
            in_channels = conv_backbone.intermediate_channel_sizes[stage]
            projections.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, d_model, kernel_size=1),
                    nn.GroupNorm(32, d_model),
                )
            )
        # Extra levels beyond the backbone outputs use a strided 3x3 convolution; the first one reads the
        # last backbone channel count, the following ones read `d_model` channels
        for _ in range(config.num_feature_levels - num_backbone_outs):
            projections.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, d_model, kernel_size=3, stride=2, padding=1),
                    nn.GroupNorm(32, d_model),
                )
            )
            in_channels = d_model
        self.input_proj = nn.ModuleList(projections)
    else:
        # Single feature level: project only the last backbone output
        single_projection = nn.Sequential(
            nn.Conv2d(conv_backbone.intermediate_channel_sizes[-1], d_model, kernel_size=1),
            nn.GroupNorm(32, d_model),
        )
        self.input_proj = nn.ModuleList([single_projection])

    # Query position embeddings are only created for the one-stage variant
    if not config.two_stage:
        self.query_position_embeddings = nn.Embedding(config.num_queries, d_model * 2)

    self.encoder = DeformableDetrEncoder(config)
    self.decoder = DeformableDetrDecoder(config)

    # One embedding vector per feature level (allocated uninitialized here)
    self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, d_model))

    if config.two_stage:
        # Extra layers used by the two-stage variant
        self.enc_output = nn.Linear(d_model, d_model)
        self.enc_output_norm = nn.LayerNorm(d_model)
        self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
        self.pos_trans_norm = nn.LayerNorm(d_model * 2)
    else:
        # One-stage variant: linear layer producing 2 values per input vector
        self.reference_points = nn.Linear(d_model, 2)

    # Initialize weights and apply final processing
    self.post_init()
# 返回 encoder 对象
def get_encoder(self):
    """Return the encoder module stored on this model."""
    encoder = self.encoder
    return encoder
# 返回 decoder 对象
def get_decoder(self):
    """Return the decoder module stored on this model."""
    decoder = self.decoder
    return decoder
# 冻结 backbone 的参数
def freeze_backbone(self):
    """Disable gradient computation for every parameter of `self.backbone.conv_encoder.model`."""
    backbone_model = self.backbone.conv_encoder.model
    for _, parameter in backbone_model.named_parameters():
        parameter.requires_grad_(False)
def unfreeze_backbone(self):
    """Re-enable gradient computation for every parameter of `self.backbone.conv_encoder.model`."""
    backbone_model = self.backbone.conv_encoder.model
    for _, parameter in backbone_model.named_parameters():
        parameter.requires_grad_(True)
def get_valid_ratio(self, mask, dtype=torch.float32):
    """
    Get the valid ratio of all feature maps.

    The mask must be 3-dimensional. The valid height is the sum of the first column of the mask and the
    valid width is the sum of its first row; each is divided by the full size of that axis.

    Returns:
        A tensor whose last dimension holds `(width_ratio, height_ratio)`, cast to `dtype` before dividing.
    """
    height, width = mask.shape[1:]

    first_column_sum = mask[:, :, 0].sum(1)
    first_row_sum = mask[:, 0, :].sum(1)

    height_ratio = first_column_sum.to(dtype) / height
    width_ratio = first_row_sum.to(dtype) / width

    # Width comes first in the stacked result
    return torch.stack([width_ratio, height_ratio], -1)
def get_proposal_pos_embed(self, proposals):
    """
    Get the position embedding of the proposals.

    The proposals go through a sigmoid and are scaled by `2 * pi`; each coordinate is then divided by a bank
    of `config.d_model // 2` temperature-based frequencies, and the sine of the even-indexed and cosine of
    the odd-indexed entries are interleaved. Everything after the second dimension is flattened.
    """
    num_pos_feats = self.config.d_model // 2
    temperature = 10000
    scale = 2 * math.pi

    # Frequency bank: consecutive pairs of indices share the same exponent
    feature_index = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float()
    exponent = 2 * torch.div(feature_index, 2, rounding_mode="floor") / num_pos_feats
    dim_t = temperature**exponent

    scaled_proposals = proposals.sigmoid() * scale
    angles = scaled_proposals[:, :, :, None] / dim_t

    sin_part = angles[:, :, :, 0::2].sin()
    cos_part = angles[:, :, :, 1::2].cos()
    return torch.stack((sin_part, cos_part), dim=4).flatten(2)
def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
    """Generate the encoder output proposals from encoded enc_output.

    Args:
        enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
        padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
        spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.

    Returns:
        `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
            - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
              directly predict a bounding box. (without the need of a decoder)
            - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
              sigmoid.
    """
    batch_size = enc_output.shape[0]
    device = enc_output.device

    level_proposals = []
    offset = 0  # start of the current level inside the flattened sequence
    for level, (height, width) in enumerate(spatial_shapes):
        num_positions = height * width

        # Un-flatten the padding mask of this level and count the non-padded rows / columns
        level_mask = padding_mask[:, offset : (offset + num_positions)].view(batch_size, height, width, 1)
        valid_height = torch.sum(~level_mask[:, :, 0, 0], 1)
        valid_width = torch.sum(~level_mask[:, 0, :, 0], 1)

        grid_y, grid_x = meshgrid(
            torch.linspace(0, height - 1, height, dtype=torch.float32, device=device),
            torch.linspace(0, width - 1, width, dtype=torch.float32, device=device),
            indexing="ij",
        )
        grid = torch.stack([grid_x, grid_y], -1)

        # Cell centers divided by the valid (width, height) of each batch element
        scale = torch.stack([valid_width, valid_height], 1).view(batch_size, 1, 1, 2)
        grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale

        # Box size doubles with every level: 0.05 * 2**level
        width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
        level_proposals.append(torch.cat((grid, width_height), -1).view(batch_size, -1, 4))
        offset += num_positions

    output_proposals = torch.cat(level_proposals, 1)
    # A proposal is kept only when all four values lie strictly inside (0.01, 0.99)
    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
    # Inverse sigmoid
    output_proposals = torch.log(output_proposals / (1 - output_proposals))

    padded_positions = padding_mask.unsqueeze(-1)
    invalid_positions = ~output_proposals_valid
    output_proposals = output_proposals.masked_fill(padded_positions, float("inf"))
    output_proposals = output_proposals.masked_fill(invalid_positions, float("inf"))

    # Assign each pixel as an object query; padded and invalid positions are zeroed out first
    object_query = enc_output.masked_fill(padded_positions, float(0))
    object_query = object_query.masked_fill(invalid_positions, float(0))
    object_query = self.enc_output_norm(self.enc_output(object_query))
    return object_query, output_proposals
# 给模型的前向传播方法添加文档字符串,文档字符串的内容来源于 DEFORMABLE_DETR_INPUTS_DOCSTRING
@add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
# 替换前向传播方法的返回文档字符串,指定输出类型为 DeformableDetrModelOutput,配置类为 _CONFIG_FOR_DOC
@replace_return_docstrings(output_type=DeformableDetrModelOutput, config_class=_CONFIG_FOR_DOC)
# 定义模型的前向传播方法
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
top, for tasks such as COCO detection.
"""
# Apply the start-docstring decorator with this class summary and the shared DEFORMABLE_DETR_START_DOCSTRING text.
@add_start_docstrings(
"""
Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
top, for tasks such as COCO detection.
""",
DEFORMABLE_DETR_START_DOCSTRING,
)
# 继承自预训练的 Deformable DETR 模型
class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
# We can't initialize the model on meta device as some weights are modified during the initialization
_no_split_modules = None
def __init__(self, config: DeformableDetrConfig):
    """
    Build the detection model: the bare `DeformableDetrModel` plus a classification head and a box head.

    The heads are replicated `num_pred` times, either as independent clones (`config.with_box_refine`) or as
    repeated references to the same module. With `config.two_stage` one extra head is created and the class
    heads are also handed to the decoder.
    """
    super().__init__(config)

    # Deformable DETR encoder-decoder model
    self.model = DeformableDetrModel(config)

    # Detection heads on top
    self.class_embed = nn.Linear(config.d_model, config.num_labels)
    self.bbox_embed = DeformableDetrMLPPredictionHead(
        input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
    )

    # Classification bias derived from a prior probability of 0.01; last box layer starts at zero
    prior_prob = 0.01
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value
    final_box_layer = self.bbox_embed.layers[-1]
    nn.init.constant_(final_box_layer.weight.data, 0)
    nn.init.constant_(final_box_layer.bias.data, 0)

    # If two-stage, the last class_embed and bbox_embed is for region proposal generation
    if config.two_stage:
        num_pred = config.decoder_layers + 1
    else:
        num_pred = config.decoder_layers

    if config.with_box_refine:
        self.class_embed = _get_clones(self.class_embed, num_pred)
        self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
        nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
        # Hack implementation for iterative bounding box refinement
        self.model.decoder.bbox_embed = self.bbox_embed
    else:
        nn.init.constant_(final_box_layer.bias.data[2:], -2.0)
        # Every entry of these lists is the same module object
        self.class_embed = nn.ModuleList([self.class_embed] * num_pred)
        self.bbox_embed = nn.ModuleList([self.bbox_embed] * num_pred)
        self.model.decoder.bbox_embed = None

    if config.two_stage:
        # Hack implementation for two-stage
        self.model.decoder.class_embed = self.class_embed
        for box_head in self.bbox_embed:
            nn.init.constant_(box_head.layers[-1].bias.data[2:], 0.0)

    # Initialize weights and apply final processing
    self.post_init()
# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
@torch.jit.unused
# 设置辅助损失函数,接受分类输出和坐标输出作为参数
def _set_aux_loss(self, outputs_class, outputs_coord):
# 这是为了使 torchscript 能够正常工作的一种解决方法,因为 torchscript
# 不支持包含非同质值的字典,例如既有张量又有列表的字典。
# 返回一个列表,其中每个元素是一个字典,包含"logits"和"pred_boxes"两个键,分别对应 outputs_class 和 outputs_coord 的每个元素(除最后一个)。
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
# 将模型前向方法(forward)添加文档字符串
@add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
# 替换返回值的文档字符串为 DeformableDetrObjectDetectionOutput 类型,使用 _CONFIG_FOR_DOC 作为配置类
@replace_return_docstrings(output_type=DeformableDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks

    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs (0 for the negative class and 1 for the positive
                 class).
        num_boxes: Value the summed loss is divided by.
    """
    # Sigmoid maps the predictions into (0, 1); everything after the first dimension is flattened
    probabilities = inputs.sigmoid().flatten(1)

    overlap = (probabilities * targets).sum(1)
    total = probabilities.sum(-1) + targets.sum(-1)

    # The +1 in numerator and denominator smooths the ratio
    per_example_loss = 1 - (2 * overlap + 1) / (total + 1)
    return per_example_loss.sum() / num_boxes
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes:
            Value the reduced loss is divided by.
        alpha (`float`, *optional*, defaults to `0.25`):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples. A negative value
            disables the weighting.
        gamma (`int`, *optional*, defaults to `2`):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    """
    probabilities = inputs.sigmoid()
    # Element-wise binary cross-entropy on the raw logits, without reduction
    cross_entropy = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

    # p_t: probability assigned to the true class of each element
    p_t = probabilities * targets + (1 - probabilities) * (1 - targets)
    modulating_factor = (1 - p_t) ** gamma
    focal = cross_entropy * modulating_factor

    if alpha >= 0:
        # alpha for positives, (1 - alpha) for negatives
        class_weight = alpha * targets + (1 - alpha) * (1 - targets)
        focal = class_weight * focal

    # Mean over dimension 1, summed over everything else, then normalized
    return focal.mean(1).sum() / num_boxes
class DeformableDetrLoss(nn.Module):
    """
    This class computes the losses for `DeformableDetrForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`DeformableDetrHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        num_classes (`int`):
            Number of object categories, omitting the special no-object category.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`List[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    """

    def __init__(self, matcher, num_classes, focal_alpha, losses):
        # The module is initialized and the four settings are stored exactly once.
        super().__init__()
        self.matcher = matcher
        self.num_classes = num_classes
        self.focal_alpha = focal_alpha
        self.losses = losses

    # removed logging parameter, which was part of the original implementation
    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]

        Raises:
            KeyError: if `outputs` has no "logits" entry.
        """
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        source_logits = outputs["logits"]

        idx = self._get_source_permutation_idx(indices)
        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
        # Every query starts with the extra index `num_classes`; matched queries get their target label
        target_classes = torch.full(
            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
        )
        target_classes[idx] = target_classes_o

        # One-hot encode with one extra slot for the `num_classes` index ...
        target_classes_onehot = torch.zeros(
            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
            dtype=source_logits.dtype,
            layout=source_logits.layout,
            device=source_logits.device,
        )
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)

        # ... then drop that slot, so unmatched queries end up with an all-zero target row
        target_classes_onehot = target_classes_onehot[:, :, :-1]
        loss_ce = (
            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
            * source_logits.shape[1]
        )
        losses = {"loss_ce": loss_ce}

        return losses

    @torch.no_grad()
    # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
        """
        logits = outputs["logits"]
        device = logits.device
        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count the number of predictions whose argmax is NOT the last class index
        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
        losses = {"cardinality_error": card_err}
        return losses

    # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """
        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.

        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
        are expected in format (center_x, center_y, w, h), normalized by the image size.

        Raises:
            KeyError: if `outputs` has no "pred_boxes" entry.
        """
        if "pred_boxes" not in outputs:
            raise KeyError("No predicted boxes found in outputs")
        idx = self._get_source_permutation_idx(indices)
        # Matched predictions and their targets, concatenated over the batch in the same order
        source_boxes = outputs["pred_boxes"][idx]
        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")

        losses = {}
        losses["loss_bbox"] = loss_bbox.sum() / num_boxes

        # Only the diagonal of the pairwise GIoU matrix is needed: pair i of the sources with pair i of the targets
        loss_giou = 1 - torch.diag(
            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
        )
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses

    # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx
    def _get_source_permutation_idx(self, indices):
        """Return `(batch_idx, source_idx)` built from the prediction side of the `(source, target)` pairs."""
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
        source_idx = torch.cat([source for (source, _) in indices])
        return batch_idx, source_idx

    # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx
    def _get_target_permutation_idx(self, indices):
        """Return `(batch_idx, target_idx)` built from the target side of the `(source, target)` pairs."""
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
        target_idx = torch.cat([target for (_, target) in indices])
        return batch_idx, target_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes):
        """
        Dispatch to the loss method registered under `loss` ("labels", "cardinality" or "boxes").

        Raises:
            ValueError: if `loss` is not one of the supported names.
        """
        loss_map = {
            "labels": self.loss_labels,
            "cardinality": self.loss_cardinality,
            "boxes": self.loss_boxes,
        }
        if loss not in loss_map:
            raise ValueError(f"Loss {loss} not supported")
        return loss_map[loss](outputs, targets, indices, num_boxes)

    def forward(self, outputs, targets):
        """
        This performs the loss computation.

        Args:
             outputs (`dict`, *optional*):
                Dictionary of tensors, see the output specification of the model for the format.
             targets (`List[dict]`, *optional*):
                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
                losses applied, see each loss' doc.

        Returns:
            `dict` mapping loss names to values. Entries computed from auxiliary outputs carry the suffix `_{i}`
            and entries computed from the encoder outputs carry the suffix `_enc`.
        """
        # Leave the auxiliary and encoder outputs out of the main matching
        outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["class_labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        world_size = 1
        if is_accelerate_available():
            # Only reduce when an accelerate state has already been set up
            if PartialState._shared_state != {}:
                num_boxes = reduce(num_boxes)
                world_size = PartialState().num_processes
        # Clamp so that the normalizer is at least 1
        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if "auxiliary_outputs" in outputs:
            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
                indices = self.matcher(auxiliary_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        if "enc_outputs" in outputs:
            enc_outputs = outputs["enc_outputs"]
            # Work on a copy of the targets in which every class label is replaced by 0
            bin_targets = copy.deepcopy(targets)
            for bt in bin_targets:
                bt["class_labels"] = torch.zeros_like(bt["class_labels"])
            indices = self.matcher(enc_outputs, bin_targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
                l_dict = {k + "_enc": v for k, v in l_dict.items()}
                losses.update(l_dict)

        return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DeformableDetrMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths from input to output: input_dim, hidden_dim x (num_layers - 1), output_dim
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        # ReLU follows every linear layer except the last one
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = nn.functional.relu(x)
        return x
class DeformableDetrHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
    """
    Store the three matching-cost weights.

    Raises:
        ValueError: if all three costs are equal to 0.
    """
    super().__init__()
    # The matcher needs the scipy backend
    requires_backends(self, ["scipy"])

    self.class_cost = class_cost
    self.bbox_cost = bbox_cost
    self.giou_cost = giou_cost

    # At least one cost must be non-zero
    if all(cost == 0 for cost in (class_cost, bbox_cost, giou_cost)):
        raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["logits"].flatten(0, 1).sigmoid() # 将分类 logits 展平并应用 sigmoid 函数,得到概率 [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1) # 将预测框坐标展平 [batch_size * num_queries, 4]
# Also concat the target labels and boxes
target_ids = torch.cat([v["class_labels"] for v in targets]) # 将所有目标的类别标签拼接起来
target_bbox = torch.cat([v["boxes"] for v in targets]) # 将所有目标的框坐标拼接起来
# Compute the classification cost.
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) # 计算分类损失中的负类损失项
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) # 计算分类损失中的正类损失项
class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] # 根据目标类别计算分类损失
# Compute the L1 cost between boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) # 计算框之间的 L1 损失
# Compute the giou cost between boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) # 计算框之间的 GIoU 损失
# Final cost matrix
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost # 组合成最终的损失矩阵
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() # 将损失矩阵调整形状并转移到 CPU 上处理
sizes = [len(v["boxes"]) for v in targets] # 获取每个目标的框数量
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] # 使用匈牙利算法计算最佳匹配索引
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] # 将匹配索引转换为张量并返回
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    """
    Return `t` in a wider dtype to protect against overflow in multiplications.

    Floating point tensors that are not already float32/float64 are converted with `.float()`; other tensors that
    are not already int32/int64 are converted with `.int()`. Tensors already in one of those dtypes are returned
    unchanged.
    """
    if t.is_floating_point():
        if t.dtype in (torch.float32, torch.float64):
            return t
        return t.float()
    if t.dtype in (torch.int32, torch.int64):
        return t
    return t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
            `0 <= x1 < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    x_min, y_min = boxes[:, 0], boxes[:, 1]
    x_max, y_max = boxes[:, 2], boxes[:, 3]
    return (x_max - x_min) * (y_max - y_min)
# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    """
    Pairwise intersection-over-union between two sets of boxes indexed as (x0, y0, x1, y1).

    Returns:
        A tuple `(iou, union)` of pairwise matrices with one row per box in `boxes1` and one column per box in
        `boxes2`.
    """
    areas1 = box_area(boxes1)
    areas2 = box_area(boxes2)

    # Corners of the overlap rectangle for every (box1, box2) pair.
    overlap_min = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    overlap_max = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Clamp at zero so that non-overlapping pairs contribute an empty intersection.
    overlap_size = (overlap_max - overlap_min).clamp(min=0)  # [N,M,2]
    intersection = overlap_size[:, :, 0] * overlap_size[:, :, 1]  # [N,M]

    union = areas1[:, None] + areas2 - intersection
    return intersection / union, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)

    Raises:
        ValueError: if any box in `boxes1` or `boxes2` has a max corner smaller than its min corner.
    """
    # Degenerate boxes give inf / nan results, so check the corner ordering up front.
    boxes1_valid = (boxes1[:, 2:] >= boxes1[:, :2]).all()
    if not boxes1_valid:
        raise ValueError(f"boxes1 必须以 [x0, y0, x1, y1] (角点) 格式提供,但给定的是 {boxes1}")
    boxes2_valid = (boxes2[:, 2:] >= boxes2[:, :2]).all()
    if not boxes2_valid:
        raise ValueError(f"boxes2 必须以 [x0, y0, x1, y1] (角点) 格式提供,但给定的是 {boxes2}")

    iou, union = box_iou(boxes1, boxes2)

    # Smallest rectangle enclosing each (box1, box2) pair.
    enclosing_min = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclosing_max = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclosing_size = (enclosing_max - enclosing_min).clamp(min=0)  # [N,M,2]
    enclosing_area = enclosing_size[:, :, 0] * enclosing_size[:, :, 1]

    # Subtract the fraction of the enclosing rectangle that is not covered by the union.
    return iou - (enclosing_area - union) / enclosing_area
# Copied from transformers.models.detr.modeling_detr._max_by_axis
def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    """
    Return the element-wise maximum over a list of integer lists.

    Args:
        the_list: Non-empty list of integer lists. Position `i` of the result is the largest value found at
            position `i` across the sub-lists.

    Returns:
        A new list holding the per-position maxima. The input lists are left untouched.

    Raises:
        IndexError: if `the_list` is empty, or if a later sub-list is longer than the first one.
    """
    # Work on a copy: aliasing `the_list[0]` would overwrite the caller's first sub-list in place.
    maxes = list(the_list[0])
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes
# NestedTensor bundles a tensor together with an optional mask.
class NestedTensor:
    """Container pairing a `tensors` object with an optional `mask` tensor."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors  # tensor data handed in by the caller
        self.mask = mask  # mask for `tensors`, or None when there is none

    def to(self, device):
        """Return a new `NestedTensor` whose tensors (and mask, if present) were moved with `.to(device)`."""
        moved_tensors = self.tensors.to(device)
        moved_mask = None if self.mask is None else self.mask.to(device)
        return NestedTensor(moved_tensors, moved_mask)

    def decompose(self):
        """Return the `(tensors, mask)` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        # The representation is simply that of the wrapped tensors; the mask is not shown.
        return str(self.tensors)
# Build a NestedTensor from a list of tensors.
# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """
    Batch a list of 3-dimensional tensors into a single zero-padded tensor plus a padding mask.

    Every tensor is copied into the top-left corner of its slot in a batch tensor sized to the per-axis maximum
    shape. The boolean mask is `False` over the region covered by each tensor and `True` over the padding.

    Args:
        tensor_list: Tensors to batch. Dtype and device of the result are taken from the first tensor.

    Returns:
        A `NestedTensor` holding the padded batch tensor and its mask.

    Raises:
        ValueError: if the first tensor is not 3-dimensional.
    """
    first = tensor_list[0]
    if first.ndim != 3:
        raise ValueError("Only 3-dimensional tensors are supported")

    # Largest extent along every axis across the whole list.
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    batch_size, _, height, width = batch_shape

    device = first.device
    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=device)
    # Start with everything flagged as padding; valid regions are cleared below.
    mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)

    for img, padded_slot, mask_slot in zip(tensor_list, tensor, mask):
        shape = img.shape
        padded_slot[: shape[0], : shape[1], : shape[2]].copy_(img)
        mask_slot[: shape[1], : shape[2]] = False
    return NestedTensor(tensor, mask)