Transformers 源码解析(四十二)
.\models\dpr\tokenization_dpr.py
"""Tokenization classes for DPR."""
import collections
from typing import List, Optional, Union
from ...tokenization_utils_base import BatchEncoding
from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
from ..bert.tokenization_bert import BertTokenizer
logger = logging.get_logger(__name__)
# File name used for each tokenizer artifact.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}


def _hub_files_map(checkpoints):
    """Return ``{artifact kind: {checkpoint: Hub URL}}`` for every entry of `VOCAB_FILES_NAMES`."""
    return {
        kind: {
            checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/{file_name}"
            for checkpoint in checkpoints
        }
        for kind, file_name in VOCAB_FILES_NAMES.items()
    }


# Checkpoint identifiers, grouped by the DPR component they belong to.
_CONTEXT_ENCODER_CHECKPOINTS = (
    "facebook/dpr-ctx_encoder-single-nq-base",
    "facebook/dpr-ctx_encoder-multiset-base",
)
_QUESTION_ENCODER_CHECKPOINTS = (
    "facebook/dpr-question_encoder-single-nq-base",
    "facebook/dpr-question_encoder-multiset-base",
)
_READER_CHECKPOINTS = (
    "facebook/dpr-reader-single-nq-base",
    "facebook/dpr-reader-multiset-base",
)

# Download locations of the vocabulary / serialized-tokenizer files.
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_CONTEXT_ENCODER_CHECKPOINTS)
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_QUESTION_ENCODER_CHECKPOINTS)
READER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_READER_CHECKPOINTS)

# Per-checkpoint sizes exposed by the tokenizer classes as `max_model_input_sizes`.
CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _CONTEXT_ENCODER_CHECKPOINTS}
QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _QUESTION_ENCODER_CHECKPOINTS}
READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _READER_CHECKPOINTS}

# Per-checkpoint constructor defaults; every entry gets its own dict object.
CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    name: {"do_lower_case": True} for name in _CONTEXT_ENCODER_CHECKPOINTS
}
QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    name: {"do_lower_case": True} for name in _QUESTION_ENCODER_CHECKPOINTS
}
READER_PRETRAINED_INIT_CONFIGURATION = {name: {"do_lower_case": True} for name in _READER_CHECKPOINTS}
class DPRContextEncoderTokenizer(BertTokenizer):
    r"""
    Tokenizer for the DPR context encoder.

    [`DPRContextEncoderTokenizer`] adds no behaviour of its own on top of [`BertTokenizer`]: it runs the same
    end-to-end tokenization (punctuation splitting followed by wordpiece) and only swaps in the DPR context-encoder
    checkpoint tables below.

    See [`BertTokenizer`] for usage examples and for the documentation of the constructor parameters.
    """

    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
class DPRQuestionEncoderTokenizer(BertTokenizer):
    r"""
    Tokenizer for the DPR question encoder.

    [`DPRQuestionEncoderTokenizer`] adds no behaviour of its own on top of [`BertTokenizer`]: it runs the same
    end-to-end tokenization (punctuation splitting followed by wordpiece) and only swaps in the DPR question-encoder
    checkpoint tables below.

    See [`BertTokenizer`] for usage examples and for the documentation of the constructor parameters.
    """

    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
# Record describing one candidate answer span.
DPRSpanPrediction = collections.namedtuple(
    "DPRSpanPrediction",
    "span_score relevance_score doc_id start_index end_index text",
)

# Record bundling the three logit outputs of the reader.
DPRReaderOutput = collections.namedtuple(
    "DPRReaderOutput",
    "start_logits end_logits relevance_logits",
)
# Shared documentation text attached to the reader tokenizer classes through the
# `add_start_docstrings` / `add_end_docstrings` decorators used further down.
CUSTOM_DPR_READER_DOCSTRING = r"""
    Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`.
    It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
    using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)`
    with the format:

    ```
    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
    ```

    Returns:
        `Dict[str, List[List[int]]]`: A dictionary containing the following keys:

        - `input_ids`: List of token ids to be fed to a model.
        - `attention_mask`: List indicating which tokens should be attended to by the model.
    """
# Mixin providing the DPR-reader specific `__call__` and span decoding; it is meant to be combined
# with a tokenizer base class that supplies `__call__`, `pad`, `decode`, `pad_token_id` and
# `sep_token_id`. The shared reader documentation is attached through `add_start_docstrings`.
@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class CustomDPRReaderTokenizerMixin:
    def __call__(
        self,
        questions,
        titles: Optional[str] = None,
        texts: Optional[str] = None,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode question(s), optionally together with passage titles and texts.

        - Neither `titles` nor `texts` given: `questions` is forwarded unchanged to the parent `__call__`.
        - Only one of them given: it is forwarded as the second sequence of a pair to the parent `__call__`.
        - Both given: every passage is encoded as `question + title` (with special tokens) followed by the text
          ids (without special tokens); the result is optionally cut to `max_length` and padded with `self.pad`.

        Args:
            questions: A single question string (repeated for every passage) or a sequence of questions.
            titles: A single title string or a sequence of titles.
            texts: A single text string or a sequence of texts; must have as many entries as `titles`.
            padding: Padding strategy, forwarded to the parent `__call__` / `self.pad`.
            truncation: When truthy and `max_length` is set, each concatenated sequence is cut to `max_length`.
            max_length: Maximum sequence length used for truncation and padding.
            return_tensors: Tensor type forwarded to the parent `__call__` / `self.pad`.
            return_attention_mask: The attention mask is built unless this is exactly `False`.
            **kwargs: Only forwarded in the first two cases above.

        Returns:
            The `BatchEncoding` produced by the parent `__call__` or by `self.pad`.

        Raises:
            ValueError: If `titles` and `texts` do not have the same number of entries.
        """
        if titles is None and texts is None:
            return super().__call__(
                questions,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        elif titles is None or texts is None:
            # Exactly one of the two was supplied: treat it as the pair sequence.
            text_pair = titles if texts is None else texts
            return super().__call__(
                questions,
                text_pair,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )

        # Normalise single strings to one-element lists.
        titles = titles if not isinstance(titles, str) else [titles]
        texts = texts if not isinstance(texts, str) else [texts]
        n_passages = len(titles)
        # A single question is paired with every passage.
        questions = questions if not isinstance(questions, str) else [questions] * n_passages
        if len(titles) != len(texts):
            raise ValueError(
                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
            )

        # Question/title pairs carry the special tokens; the passage text is appended without any.
        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
        encoded_inputs = {
            "input_ids": [
                (encoded_question_and_title + encoded_text)[:max_length]
                if max_length is not None and truncation
                else encoded_question_and_title + encoded_text
                for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts)
            ]
        }
        if return_attention_mask is not False:
            # 1 for every id that differs from the pad id, 0 otherwise.
            encoded_inputs["attention_mask"] = [
                [int(input_id != self.pad_token_id) for input_id in input_ids]
                for input_ids in encoded_inputs["input_ids"]
            ]
        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)

    def decode_best_spans(
        self,
        reader_input: BatchEncoding,
        reader_output: DPRReaderOutput,
        num_spans: int = 16,
        max_answer_length: int = 64,
        num_spans_per_passage: int = 4,
    ) -> List[DPRSpanPrediction]:
        """
        Turn reader logits into at most `num_spans` decoded answer spans.

        Passages are visited by decreasing relevance logit. For each one, up to `num_spans_per_passage` spans are
        selected by `_get_best_spans` and wrapped in a `DPRSpanPrediction`.

        Args:
            reader_input: Encoding holding `input_ids`, one row per passage.
            reader_output: Sequence whose first three items are the start, end and relevance logits.
            num_spans: Maximum number of predictions returned.
            max_answer_length: Maximum number of tokens in a span.
            num_spans_per_passage: Maximum number of spans taken from a single passage.

        Returns:
            A list of `DPRSpanPrediction`, grouped by passage in decreasing relevance order.
        """
        input_ids = reader_input["input_ids"]
        start_logits, end_logits, relevance_logits = reader_output[:3]
        n_passages = len(relevance_logits)
        sorted_docs = sorted(range(n_passages), reverse=True, key=lambda doc_id: relevance_logits[doc_id])

        nbest_spans_predictions: List[DPRSpanPrediction] = []
        for doc_id in sorted_docs:
            sequence_ids = list(input_ids[doc_id])
            # Assumes the question and title come first in the sequence.
            # NOTE(review): `index(sep, 2)` starts searching at position 2; it does not select the second
            # occurrence. Confirm whether the title tokens are meant to be excluded from the passage window.
            passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1
            if sequence_ids[-1] == self.pad_token_id:
                sequence_len = sequence_ids.index(self.pad_token_id)
            else:
                sequence_len = len(sequence_ids)

            best_spans = self._get_best_spans(
                start_logits=start_logits[doc_id][passage_offset:sequence_len],
                end_logits=end_logits[doc_id][passage_offset:sequence_len],
                max_answer_length=max_answer_length,
                top_spans=num_spans_per_passage,
            )
            for start_index, end_index in best_spans:
                # Span indices are relative to the passage window; shift them back to sequence positions.
                start_index += passage_offset
                end_index += passage_offset
                nbest_spans_predictions.append(
                    DPRSpanPrediction(
                        span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index],
                        relevance_score=relevance_logits[doc_id],
                        doc_id=doc_id,
                        start_index=start_index,
                        end_index=end_index,
                        text=self.decode(sequence_ids[start_index : end_index + 1]),
                    )
                )
            if len(nbest_spans_predictions) >= num_spans:
                break
        return nbest_spans_predictions[:num_spans]

    def _get_best_spans(
        self,
        start_logits,
        end_logits,
        max_answer_length: int,
        top_spans: int,
    ):
        """
        Pick the highest scoring spans of one passage.

        Every span of at most `max_answer_length` tokens is scored as `start_logit + end_logit`. Spans are then
        taken in decreasing score order, skipping any span that contains, or is contained in, an already chosen
        one, until `top_spans` spans have been collected.

        Returns:
            A list of inclusive `(start_index, end_index)` pairs.

        Raises:
            ValueError: If a candidate has `start_index > end_index` or is longer than `max_answer_length`.
        """
        scores = [
            ((start_index, start_index + answer_length), start_score + end_score)
            for start_index, start_score in enumerate(start_logits)
            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length])
        ]
        scores.sort(key=lambda item: item[1], reverse=True)

        chosen_span_intervals = []
        for (start_index, end_index), _score in scores:
            if start_index > end_index:
                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
            length = end_index - start_index + 1
            if length > max_answer_length:
                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
            # Only nesting is rejected here; partially overlapping spans are kept.
            if any(
                start_index <= prev_start_index <= prev_end_index <= end_index
                or prev_start_index <= start_index <= end_index <= prev_end_index
                for (prev_start_index, prev_end_index) in chosen_span_intervals
            ):
                continue
            chosen_span_intervals.append((start_index, end_index))

            if len(chosen_span_intervals) == top_spans:
                break
        return chosen_span_intervals
# Attach the shared reader documentation to the class through `add_end_docstrings`.
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
    r"""
    Tokenizer for the DPR reader.

    [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
    splitting and wordpiece. The difference is that it takes three input strings (question, titles and texts) that
    are combined to be fed to the [`DPRReader`] model.

    See [`BertTokenizer`] for usage examples and for the documentation of the constructor parameters.
    """

    # Names of the inputs this tokenizer produces for the model.
    model_input_names = ["input_ids", "attention_mask"]
    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
.\models\dpr\tokenization_dpr_fast.py
"""Tokenization classes for DPR."""
import collections
from typing import List, Optional, Union
from ...tokenization_utils_base import BatchEncoding
from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
logger = logging.get_logger(__name__)
# File name used for each tokenizer artifact.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}


def _hub_files_map(checkpoints):
    """Return ``{artifact kind: {checkpoint: Hub URL}}`` for every entry of `VOCAB_FILES_NAMES`."""
    return {
        kind: {
            checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/{file_name}"
            for checkpoint in checkpoints
        }
        for kind, file_name in VOCAB_FILES_NAMES.items()
    }


# Checkpoint identifiers, grouped by the DPR component they belong to.
_CONTEXT_ENCODER_CHECKPOINTS = (
    "facebook/dpr-ctx_encoder-single-nq-base",
    "facebook/dpr-ctx_encoder-multiset-base",
)
_QUESTION_ENCODER_CHECKPOINTS = (
    "facebook/dpr-question_encoder-single-nq-base",
    "facebook/dpr-question_encoder-multiset-base",
)
_READER_CHECKPOINTS = (
    "facebook/dpr-reader-single-nq-base",
    "facebook/dpr-reader-multiset-base",
)

# Download locations of the vocabulary / serialized-tokenizer files.
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_CONTEXT_ENCODER_CHECKPOINTS)
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_QUESTION_ENCODER_CHECKPOINTS)
READER_PRETRAINED_VOCAB_FILES_MAP = _hub_files_map(_READER_CHECKPOINTS)

# Per-checkpoint sizes exposed by the tokenizer classes as `max_model_input_sizes`.
CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _CONTEXT_ENCODER_CHECKPOINTS}
QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _QUESTION_ENCODER_CHECKPOINTS}
READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _READER_CHECKPOINTS}

# Per-checkpoint constructor defaults; every entry gets its own dict object.
CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    name: {"do_lower_case": True} for name in _CONTEXT_ENCODER_CHECKPOINTS
}
QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    name: {"do_lower_case": True} for name in _QUESTION_ENCODER_CHECKPOINTS
}
READER_PRETRAINED_INIT_CONFIGURATION = {name: {"do_lower_case": True} for name in _READER_CHECKPOINTS}
class DPRContextEncoderTokenizerFast(BertTokenizerFast):
    r"""
    "Fast" tokenizer for the DPR context encoder (backed by HuggingFace's *tokenizers* library).

    [`DPRContextEncoderTokenizerFast`] adds no behaviour of its own on top of [`BertTokenizerFast`]: it runs the same
    end-to-end tokenization (punctuation splitting followed by wordpiece) and only swaps in the DPR context-encoder
    checkpoint tables below.

    See [`BertTokenizerFast`] for usage examples and for the documentation of the constructor parameters.
    """

    # Slow counterpart of this tokenizer.
    slow_tokenizer_class = DPRContextEncoderTokenizer
    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
    r"""
    "Fast" tokenizer for the DPR question encoder (backed by HuggingFace's *tokenizers* library).

    [`DPRQuestionEncoderTokenizerFast`] adds no behaviour of its own on top of [`BertTokenizerFast`]: it runs the
    same end-to-end tokenization (punctuation splitting followed by wordpiece) and only swaps in the DPR
    question-encoder checkpoint tables below.

    See [`BertTokenizerFast`] for usage examples and for the documentation of the constructor parameters.
    """

    # Slow counterpart of this tokenizer.
    slow_tokenizer_class = DPRQuestionEncoderTokenizer
    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
# Record describing one candidate answer span.
DPRSpanPrediction = collections.namedtuple(
    "DPRSpanPrediction",
    "span_score relevance_score doc_id start_index end_index text",
)

# Record bundling the three logit outputs of the reader.
DPRReaderOutput = collections.namedtuple(
    "DPRReaderOutput",
    "start_logits end_logits relevance_logits",
)
# Shared documentation text attached to the reader tokenizer classes through the
# `add_start_docstrings` / `add_end_docstrings` decorators used further down.
CUSTOM_DPR_READER_DOCSTRING = r"""
    Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`.
    It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
    using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)`
    with the format:

    ```
    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
    ```

    Returns:
        `Dict[str, List[List[int]]]`: A dictionary with the following keys:

        - `input_ids`: List of token ids to be fed to a model.
        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
    """
# Mixin providing the DPR-reader specific `__call__` and span decoding; it is meant to be combined
# with a tokenizer base class that supplies `__call__`, `pad`, `decode`, `pad_token_id` and
# `sep_token_id`. The shared reader documentation is attached through `add_start_docstrings`.
@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class CustomDPRReaderTokenizerMixin:
    def __call__(
        self,
        questions,
        titles: Optional[str] = None,
        texts: Optional[str] = None,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode question(s), optionally together with passage titles and texts.

        - Neither `titles` nor `texts` given: `questions` is forwarded unchanged to the parent `__call__`.
        - Only one of them given: it is forwarded as the second sequence of a pair to the parent `__call__`.
        - Both given: every passage is encoded as `question + title` (with special tokens) followed by the text
          ids (without special tokens); the result is optionally cut to `max_length` and padded with `self.pad`.

        Args:
            questions: A single question string (repeated for every passage) or a sequence of questions.
            titles: A single title string or a sequence of titles.
            texts: A single text string or a sequence of texts; must have as many entries as `titles`.
            padding: Padding strategy, forwarded to the parent `__call__` / `self.pad`.
            truncation: When truthy and `max_length` is set, each concatenated sequence is cut to `max_length`.
            max_length: Maximum sequence length used for truncation and padding.
            return_tensors: Tensor type forwarded to the parent `__call__` / `self.pad`.
            return_attention_mask: The attention mask is built unless this is exactly `False`.
            **kwargs: Only forwarded in the first two cases above.

        Returns:
            The `BatchEncoding` produced by the parent `__call__` or by `self.pad`.

        Raises:
            ValueError: If `titles` and `texts` do not have the same number of entries.
        """
        if titles is None and texts is None:
            return super().__call__(
                questions,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )
        elif titles is None or texts is None:
            # Exactly one of the two was supplied: treat it as the pair sequence.
            text_pair = titles if texts is None else texts
            return super().__call__(
                questions,
                text_pair,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                return_attention_mask=return_attention_mask,
                **kwargs,
            )

        # Normalise single strings to one-element lists.
        titles = titles if not isinstance(titles, str) else [titles]
        texts = texts if not isinstance(texts, str) else [texts]
        n_passages = len(titles)
        # A single question is paired with every passage.
        questions = questions if not isinstance(questions, str) else [questions] * n_passages
        # Raise instead of asserting: assertions are stripped when Python runs with -O.
        if len(titles) != len(texts):
            raise ValueError(
                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
            )

        # Question/title pairs carry the special tokens; the passage text is appended without any.
        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
        encoded_inputs = {
            "input_ids": [
                (encoded_question_and_title + encoded_text)[:max_length]
                if max_length is not None and truncation
                else encoded_question_and_title + encoded_text
                for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts)
            ]
        }
        if return_attention_mask is not False:
            # 1 for every id that differs from the pad id, 0 otherwise.
            encoded_inputs["attention_mask"] = [
                [int(input_id != self.pad_token_id) for input_id in input_ids]
                for input_ids in encoded_inputs["input_ids"]
            ]
        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)

    def decode_best_spans(
        self,
        reader_input: BatchEncoding,
        reader_output: DPRReaderOutput,
        num_spans: int = 16,
        max_answer_length: int = 64,
        num_spans_per_passage: int = 4,
    ) -> List[DPRSpanPrediction]:
        """
        Turn reader logits into at most `num_spans` decoded answer spans.

        Passages are visited by decreasing relevance logit. For each one, up to `num_spans_per_passage` spans are
        selected by `_get_best_spans` and wrapped in a `DPRSpanPrediction`.

        Args:
            reader_input: Encoding holding `input_ids`, one row per passage.
            reader_output: Sequence whose first three items are the start, end and relevance logits.
            num_spans: Maximum number of predictions returned.
            max_answer_length: Maximum number of tokens in a span.
            num_spans_per_passage: Maximum number of spans taken from a single passage.

        Returns:
            A list of `DPRSpanPrediction`, grouped by passage in decreasing relevance order.
        """
        input_ids = reader_input["input_ids"]
        start_logits, end_logits, relevance_logits = reader_output[:3]
        n_passages = len(relevance_logits)
        sorted_docs = sorted(range(n_passages), reverse=True, key=lambda doc_id: relevance_logits[doc_id])

        nbest_spans_predictions: List[DPRSpanPrediction] = []
        for doc_id in sorted_docs:
            sequence_ids = list(input_ids[doc_id])
            # Assumes the question and title come first in the sequence.
            # NOTE(review): `index(sep, 2)` starts searching at position 2; it does not select the second
            # occurrence. Confirm whether the title tokens are meant to be excluded from the passage window.
            passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1
            if sequence_ids[-1] == self.pad_token_id:
                sequence_len = sequence_ids.index(self.pad_token_id)
            else:
                sequence_len = len(sequence_ids)

            best_spans = self._get_best_spans(
                start_logits=start_logits[doc_id][passage_offset:sequence_len],
                end_logits=end_logits[doc_id][passage_offset:sequence_len],
                max_answer_length=max_answer_length,
                top_spans=num_spans_per_passage,
            )
            for start_index, end_index in best_spans:
                # Span indices are relative to the passage window; shift them back to sequence positions.
                start_index += passage_offset
                end_index += passage_offset
                nbest_spans_predictions.append(
                    DPRSpanPrediction(
                        span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index],
                        relevance_score=relevance_logits[doc_id],
                        doc_id=doc_id,
                        start_index=start_index,
                        end_index=end_index,
                        text=self.decode(sequence_ids[start_index : end_index + 1]),
                    )
                )
            if len(nbest_spans_predictions) >= num_spans:
                break
        return nbest_spans_predictions[:num_spans]

    def _get_best_spans(
        self,
        start_logits,
        end_logits,
        max_answer_length: int,
        top_spans: int,
    ):
        """
        Pick the highest scoring spans of one passage.

        Every span of at most `max_answer_length` tokens is scored as `start_logit + end_logit`. Spans are then
        taken in decreasing score order, skipping any span that contains, or is contained in, an already chosen
        one, until `top_spans` spans have been collected.

        Returns:
            A list of inclusive `(start_index, end_index)` pairs.

        Raises:
            ValueError: If a candidate has `start_index > end_index` or is longer than `max_answer_length`.
        """
        scores = [
            ((start_index, start_index + answer_length), start_score + end_score)
            for start_index, start_score in enumerate(start_logits)
            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length])
        ]
        scores.sort(key=lambda item: item[1], reverse=True)

        chosen_span_intervals = []
        for (start_index, end_index), _score in scores:
            # Explicit raises (not asserts) so the checks survive `python -O`.
            if start_index > end_index:
                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
            length = end_index - start_index + 1
            if length > max_answer_length:
                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
            # Only nesting is rejected here; partially overlapping spans are kept.
            if any(
                start_index <= prev_start_index <= prev_end_index <= end_index
                or prev_start_index <= start_index <= end_index <= prev_end_index
                for (prev_start_index, prev_end_index) in chosen_span_intervals
            ):
                continue
            chosen_span_intervals.append((start_index, end_index))

            if len(chosen_span_intervals) == top_spans:
                break
        return chosen_span_intervals
# Attach the shared reader documentation to the class through `add_end_docstrings`.
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
    r"""
    "Fast" tokenizer for the DPR reader (backed by HuggingFace's *tokenizers* library).

    [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
    punctuation splitting and wordpiece. The difference is that it takes three input strings (question, titles and
    texts) that are combined to be fed to the [`DPRReader`] model.

    See [`BertTokenizerFast`] for usage examples and for the documentation of the constructor parameters.
    """

    # Slow counterpart of this tokenizer.
    slow_tokenizer_class = DPRReaderTokenizer
    # Names of the inputs this tokenizer produces for the model.
    model_input_names = ["input_ids", "attention_mask"]
    # Checkpoint-specific lookup tables defined at module level.
    pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
    vocab_files_names = VOCAB_FILES_NAMES
.\models\dpr\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Maps each submodule to the public names it provides. Entries that need an optional backend are
# only added when that backend is available; the mapping is handed to `_LazyModule` at the bottom.
_import_structure = {
    "configuration_dpr": ["DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPRConfig"],
    "tokenization_dpr": [
        "DPRContextEncoderTokenizer",
        "DPRQuestionEncoderTokenizer",
        "DPRReaderOutput",
        "DPRReaderTokenizer",
    ],
}

# Fast tokenizers: registered only when `is_tokenizers_available()` is true.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_dpr_fast"] = [
        "DPRContextEncoderTokenizerFast",
        "DPRQuestionEncoderTokenizerFast",
        "DPRReaderTokenizerFast",
    ]

# PyTorch models: registered only when `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_dpr"] = [
        "DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPRContextEncoder",
        "DPRPretrainedContextEncoder",
        "DPRPreTrainedModel",
        "DPRPretrainedQuestionEncoder",
        "DPRPretrainedReader",
        "DPRQuestionEncoder",
        "DPRReader",
    ]

# TensorFlow models: registered only when `is_tf_available()` is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_dpr"] = [
        "TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDPRContextEncoder",
        "TFDPRPretrainedContextEncoder",
        "TFDPRPretrainedQuestionEncoder",
        "TFDPRPretrainedReader",
        "TFDPRQuestionEncoder",
        "TFDPRReader",
    ]


if TYPE_CHECKING:
    # Real imports mirroring `_import_structure`, guarded by the same availability checks.
    from .configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
    from .tokenization_dpr import (
        DPRContextEncoderTokenizer,
        DPRQuestionEncoderTokenizer,
        DPRReaderOutput,
        DPRReaderTokenizer,
    )

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # The `else:` keeps this import behind the availability check, like the guards below.
        from .tokenization_dpr_fast import (
            DPRContextEncoderTokenizerFast,
            DPRQuestionEncoderTokenizerFast,
            DPRReaderTokenizerFast,
        )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_dpr import (
            DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPRContextEncoder,
            DPRPretrainedContextEncoder,
            DPRPreTrainedModel,
            DPRPretrainedQuestionEncoder,
            DPRPretrainedReader,
            DPRQuestionEncoder,
            DPRReader,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_dpr import (
            TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDPRContextEncoder,
            TFDPRPretrainedContextEncoder,
            TFDPRPretrainedQuestionEncoder,
            TFDPRPretrainedReader,
            TFDPRQuestionEncoder,
            TFDPRReader,
        )

else:
    import sys

    # Replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dpt\configuration_dpt.py
""" DPT model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING
from ..bit import BitConfig
logger = logging.get_logger(__name__)
# Hub location of the `config.json` of each listed DPT checkpoint.
DPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in ("Intel/dpt-large",)
}
class DPTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate a DPT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DPT
    [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import DPTModel, DPTConfig

    >>> # Initializing a DPT dpt-large style configuration
    >>> configuration = DPTConfig()

    >>> # Initializing a model from the dpt-large style configuration
    >>> model = DPTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dpt"

    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        image_size=384,
        patch_size=16,
        num_channels=3,
        is_hybrid=False,
        qkv_bias=True,
        backbone_out_indices=[2, 5, 8, 11],
        readout_type="project",
        reassemble_factors=[4, 2, 1, 0.5],
        neck_hidden_sizes=[96, 192, 384, 768],
        fusion_hidden_size=256,
        head_in_index=-1,
        use_batch_norm_in_fusion_residual=False,
        use_bias_in_fusion_residual=None,
        add_projection=False,
        use_auxiliary_head=True,
        auxiliary_loss_weight=0.4,
        semantic_loss_ignore_index=255,
        semantic_classifier_dropout=0.1,
        backbone_featmap_shape=[1, 1024, 24, 24],
        neck_ignore_stages=[0, 1],
        backbone_config=None,
        backbone=None,
        use_pretrained_backbone=False,
        use_timm_backbone=False,
        backbone_kwargs=None,
        **kwargs,
    ):
        """
        Store the model hyper-parameters on the configuration instance.

        Three mutually exclusive set-ups are handled:

        - `is_hybrid=True`: `backbone_config` becomes a `BitConfig` (a default one when neither `backbone_config`
          nor `backbone` is given) and `readout_type` must be `"project"`.
        - `is_hybrid=False` with a `backbone_config`: a dict is converted through `CONFIG_MAPPING` using its
          `"model_type"` entry, and the ViT-specific attributes are stored as `None`.
        - otherwise: the arguments are stored as given.

        Raises:
            ValueError: On an unsupported `readout_type`, on an invalid `backbone_config` type in hybrid mode, on
                `use_pretrained_backbone=True`, or when `backbone_config` is combined with `backbone` /
                `backbone_kwargs`.
        """
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.is_hybrid = is_hybrid

        if use_pretrained_backbone:
            raise ValueError("Pretrained backbones are not supported yet.")

        use_autobackbone = False
        if self.is_hybrid:
            if backbone_config is None and backbone is None:
                logger.info("Initializing the config with a `BiT` backbone.")
                backbone_config = {
                    "global_padding": "same",
                    "layer_type": "bottleneck",
                    "depths": [3, 4, 9],
                    "out_features": ["stage1", "stage2", "stage3"],
                    "embedding_dynamic_padding": True,
                }
                backbone_config = BitConfig(**backbone_config)
            elif isinstance(backbone_config, dict):
                logger.info("Initializing the config with a `BiT` backbone.")
                backbone_config = BitConfig(**backbone_config)
            elif not isinstance(backbone_config, PretrainedConfig):
                raise ValueError(
                    f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}."
                )
            self.backbone_config = backbone_config
            self.backbone_featmap_shape = backbone_featmap_shape
            self.neck_ignore_stages = neck_ignore_stages

            if readout_type != "project":
                raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")

        elif backbone_config is not None:
            use_autobackbone = True

            if isinstance(backbone_config, dict):
                # Resolve the concrete config class from the serialized `model_type`.
                backbone_model_type = backbone_config.get("model_type")
                config_class = CONFIG_MAPPING[backbone_model_type]
                backbone_config = config_class.from_dict(backbone_config)

            self.backbone_config = backbone_config
            self.backbone_featmap_shape = None
            self.neck_ignore_stages = []

        else:
            self.backbone_config = backbone_config
            self.backbone_featmap_shape = None
            self.neck_ignore_stages = []

        if use_autobackbone and backbone_config is not None and backbone is not None:
            raise ValueError("You can't specify both `backbone` and `backbone_config`.")

        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")

        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs

        # These describe the built-in ViT encoder and are left unset when an external backbone is used.
        self.num_hidden_layers = None if use_autobackbone else num_hidden_layers
        self.num_attention_heads = None if use_autobackbone else num_attention_heads
        self.intermediate_size = None if use_autobackbone else intermediate_size
        self.hidden_dropout_prob = None if use_autobackbone else hidden_dropout_prob
        self.attention_probs_dropout_prob = None if use_autobackbone else attention_probs_dropout_prob
        self.layer_norm_eps = None if use_autobackbone else layer_norm_eps
        self.image_size = None if use_autobackbone else image_size
        self.patch_size = None if use_autobackbone else patch_size
        self.num_channels = None if use_autobackbone else num_channels
        self.qkv_bias = None if use_autobackbone else qkv_bias
        self.backbone_out_indices = None if use_autobackbone else backbone_out_indices

        if readout_type not in ["ignore", "add", "project"]:
            raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.readout_type = readout_type
        self.reassemble_factors = reassemble_factors
        self.neck_hidden_sizes = neck_hidden_sizes
        self.fusion_hidden_size = fusion_hidden_size
        self.head_in_index = head_in_index
        self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
        self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
        self.add_projection = add_projection

        # Auxiliary-head / semantic-segmentation attributes.
        self.use_auxiliary_head = use_auxiliary_head
        self.auxiliary_loss_weight = auxiliary_loss_weight
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
        self.semantic_classifier_dropout = semantic_classifier_dropout

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, with the
            nested `backbone_config` (when set) converted through its own `to_dict` and `model_type` added.
        """
        output = copy.deepcopy(self.__dict__)

        if output["backbone_config"] is not None:
            output["backbone_config"] = self.backbone_config.to_dict()

        output["model_type"] = self.__class__.model_type
        return output
.\models\dpt\convert_dinov2_depth_to_hf.py
"""Convert DINOv2 + DPT checkpoints from the original repository. URL:
https://github.com/facebookresearch/dinov2/tree/main"""
import argparse
import itertools
import math
from pathlib import Path
import requests
import torch
from PIL import Image
from torchvision import transforms
from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dpt_config(model_name):
    """
    Build the `DPTConfig` matching a DINOv2 + DPT checkpoint name.

    The backbone size is selected by the first of "small", "base", "large", "giant" found in `model_name`.

    Args:
        model_name: Name of the checkpoint being converted.

    Returns:
        A `DPTConfig` with the DINOv2 backbone config, the matching neck hidden sizes,
        `use_bias_in_fusion_residual=False` and `add_projection=True`.

    Raises:
        NotImplementedError: If `model_name` contains none of the supported size keywords.
    """
    # (size keyword, DINOv2 checkpoint, backbone out_indices, neck hidden sizes), tested in this order.
    variants = (
        ("small", "facebook/dinov2-small", [3, 6, 9, 12], [48, 96, 192, 384]),
        ("base", "facebook/dinov2-base", [3, 6, 9, 12], [96, 192, 384, 768]),
        ("large", "facebook/dinov2-large", [5, 12, 18, 24], [128, 256, 512, 1024]),
        ("giant", "facebook/dinov2-giant", [10, 20, 30, 40], [192, 384, 768, 1536]),
    )
    for keyword, checkpoint, out_indices, neck_hidden_sizes in variants:
        if keyword in model_name:
            break
    else:
        supported = ", ".join(repr(keyword) for keyword, *_ in variants)
        raise NotImplementedError(
            f"Unsupported model name {model_name!r}: expected it to contain one of {supported}."
        )

    backbone_config = Dinov2Config.from_pretrained(
        checkpoint, out_indices=out_indices, apply_layernorm=False, reshape_hidden_states=False
    )
    return DPTConfig(
        backbone_config=backbone_config,
        neck_hidden_sizes=neck_hidden_sizes,
        use_bias_in_fusion_residual=False,
        add_projection=True,
    )
def create_rename_keys_dpt(config):
    """
    List the `(original key, converted key)` pairs for the DPT neck and head weights.

    Args:
        config: Accepted for symmetry with the other key builders; not read here.

    Returns:
        A list of `(old_name, new_name)` string tuples, in the order the parameters are enumerated below.
    """
    # NOTE(review): the guard scopes below (`i != 2` covering only the resize layer, `i != 0` covering only
    # `res_conv_unit1`) are reconstructed - confirm against a real checkpoint's state dict.
    rename_keys = []

    # Reassemble stage: projection and readout projection for every stage, resize for all but stage 2.
    for i in range(4):
        old_prefix = "decode_head.reassemble_blocks"
        new_prefix = "neck.reassemble_stage"
        for param in ("weight", "bias"):
            rename_keys.append(
                (f"{old_prefix}.projects.{i}.conv.{param}", f"{new_prefix}.layers.{i}.projection.{param}")
            )
        for param in ("weight", "bias"):
            rename_keys.append(
                (f"{old_prefix}.readout_projects.{i}.0.{param}", f"{new_prefix}.readout_projects.{i}.0.{param}")
            )
        if i != 2:
            for param in ("weight", "bias"):
                rename_keys.append(
                    (f"{old_prefix}.resize_layers.{i}.{param}", f"{new_prefix}.layers.{i}.resize.{param}")
                )

    # Fusion stage: only convolution weights are listed for the residual units (no bias entries).
    for i in range(4):
        old_prefix = f"decode_head.fusion_blocks.{i}"
        new_prefix = f"neck.fusion_stage.layers.{i}"
        for param in ("weight", "bias"):
            rename_keys.append((f"{old_prefix}.project.conv.{param}", f"{new_prefix}.projection.{param}"))
        residual_units = (1, 2) if i != 0 else (2,)
        for unit in residual_units:
            for conv in (1, 2):
                rename_keys.append(
                    (
                        f"{old_prefix}.res_conv_unit{unit}.conv{conv}.conv.weight",
                        f"{new_prefix}.residual_layer{unit}.convolution{conv}.weight",
                    )
                )

    # Neck convolutions.
    for i in range(4):
        rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight"))

    # Head: projection, then the layers at indices 0, 2 and 4 of the depth head.
    for param in ("weight", "bias"):
        rename_keys.append((f"decode_head.project.conv.{param}", f"head.projection.{param}"))
    for i in range(0, 5, 2):
        for param in ("weight", "bias"):
            rename_keys.append((f"decode_head.conv_depth.head.{i}.{param}", f"head.head.{i}.{param}"))

    return rename_keys
def create_rename_keys_backbone(config):
    """Build the (original name, converted name) pairs for the backbone weights.

    Covers the embeddings, every encoder layer (layer norms, MLP, layer scale
    and attention output projection) and the final layer norm. The fused
    ``attn.qkv`` parameters are not listed here.

    Args:
        config: Object exposing ``backbone_config.num_hidden_layers`` and
            ``backbone_config.use_swiglu_ffn``.

    Returns:
        list[tuple[str, str]]: The rename pairs, in a fixed order.
    """
    params = ("weight", "bias")

    # stem / embeddings
    pairs = [
        ("cls_token", "backbone.embeddings.cls_token"),
        ("mask_token", "backbone.embeddings.mask_token"),
        ("pos_embed", "backbone.embeddings.position_embeddings"),
    ]
    pairs += [
        (f"patch_embed.proj.{param}", f"backbone.embeddings.patch_embeddings.projection.{param}")
        for param in params
    ]

    # transformer encoder
    for idx in range(config.backbone_config.num_hidden_layers):
        old, new = f"blocks.{idx}", f"backbone.encoder.layer.{idx}"

        # layer norms
        for module in ("norm1", "norm2"):
            pairs += [(f"{old}.{module}.{param}", f"{new}.{module}.{param}") for param in params]

        # MLP: the sub-module names depend on whether the SwiGLU feed-forward is used
        mlp_modules = ("w12", "w3") if config.backbone_config.use_swiglu_ffn else ("fc1", "fc2")
        for module in mlp_modules:
            pairs += [(f"{old}.mlp.{module}.{param}", f"{new}.mlp.{module}.{param}") for param in params]

        # layer scale
        pairs.append((f"{old}.ls1.gamma", f"{new}.layer_scale1.lambda1"))
        pairs.append((f"{old}.ls2.gamma", f"{new}.layer_scale2.lambda1"))

        # attention output projection
        pairs += [(f"{old}.attn.proj.{param}", f"{new}.attention.output.dense.{param}") for param in params]

    # final layer norm
    pairs += [(f"norm.{param}", f"backbone.layernorm.{param}") for param in params]
    return pairs
def read_in_q_k_v(state_dict, config):
    """Split each layer's fused ``attn.qkv`` parameters into query/key/value entries.

    The fused entries are removed from ``state_dict`` and six new entries per
    layer are written back under ``backbone.encoder.layer.{i}.attention.attention``.
    The dictionary is modified in place.

    Args:
        state_dict: Mutable mapping holding ``blocks.{i}.attn.qkv.weight`` and
            ``blocks.{i}.attn.qkv.bias`` for every layer.
        config: Object exposing ``backbone_config.num_hidden_layers`` and
            ``backbone_config.hidden_size``.

    Raises:
        KeyError: If a fused qkv entry is missing for one of the layers.
    """
    hidden_size = config.backbone_config.hidden_size
    for i in range(config.backbone_config.num_hidden_layers):
        # read in the fused input projection weights + bias
        in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
        in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")

        # the rows are laid out as query, key, value (in that order)
        prefix = f"backbone.encoder.layer.{i}.attention.attention"
        state_dict[f"{prefix}.query.weight"] = in_proj_weight[:hidden_size, :]
        state_dict[f"{prefix}.query.bias"] = in_proj_bias[:hidden_size]
        state_dict[f"{prefix}.key.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"{prefix}.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"{prefix}.value.weight"] = in_proj_weight[-hidden_size:, :]
        state_dict[f"{prefix}.value.bias"] = in_proj_bias[-hidden_size:]
def rename_key(dct, old, new):
    """Move the value stored under ``old`` to ``new`` in ``dct`` (in place).

    Raises:
        KeyError: If ``old`` is not present in ``dct``.
    """
    dct[new] = dct.pop(old)
def prepare_img():
    """Fetch the example image over HTTP and return it opened via ``Image.open``."""
    response = requests.get("https://dl.fbaipublicfiles.com/dinov2/images/example.jpg", stream=True)
    return Image.open(response.raw)
# Maps each supported model name to the URL of its DPT head checkpoint.
# Entries are generated per backbone size (with its one-letter ViT suffix) and
# per dataset, in the order small/base/large/giant x nyu/kitti.
name_to_url = {
    f"dpt-dinov2-{size}-{dataset}": (
        f"https://dl.fbaipublicfiles.com/dinov2/dinov2_vit{suffix}14/dinov2_vit{suffix}14_{dataset}_dpt_head.pth"
    )
    for size, suffix in (("small", "s"), ("base", "b"), ("large", "l"), ("giant", "g"))
    for dataset in ("nyu", "kitti")
}
def get_original_pixel_values(image):
    """Preprocess ``image`` with a locally assembled ``transforms`` pipeline.

    The pipeline converts the image to a tensor, keeps the first three channels
    scaled by 255, normalizes with fixed per-channel mean/std, and pads the last
    two dimensions up to a multiple of 14.

    Args:
        image: Input accepted by ``transforms.ToTensor()``.

    Returns:
        The transformed tensor with a leading dimension added via ``unsqueeze(0)``.
    """

    class CenterPadding(object):
        """Pad the last two dimensions of a tensor up to the next multiple of ``multiple``."""

        def __init__(self, multiple):
            super().__init__()
            self.multiple = multiple

        def _get_pad(self, size):
            # Total padding needed to reach the next multiple; when it is odd the
            # extra element goes to the right-hand side.
            total = math.ceil(size / self.multiple) * self.multiple - size
            left = total // 2
            return left, total - left

        def __call__(self, img):
            # Pad amounts are listed for the last dimension first, then the one before it.
            pads = []
            for dim in reversed(img.shape[-2:]):
                pads.extend(self._get_pad(dim))
            return torch.nn.functional.pad(img, pads)

        def __repr__(self):
            return self.__class__.__name__ + "()"

    def _first_three_channels_times_255(x):
        return 255.0 * x[:3]

    pipeline = transforms.Compose(
        [
            transforms.ToTensor(),
            _first_three_channels_times_255,
            transforms.Normalize(
                mean=(123.675, 116.28, 103.53),
                std=(58.395, 57.12, 57.375),
            ),
            CenterPadding(multiple=14),
        ]
    )

    return pipeline(image).unsqueeze(0)
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Downloads the DPT head checkpoint registered for ``model_name`` and the
    matching DINOv2 backbone from ``torch.hub``, renames both sets of weights,
    loads them into a ``DPTForDepthEstimation`` model and runs one forward pass
    on an example image.

    Args:
        model_name: Key of ``name_to_url`` identifying the checkpoint to convert.
        pytorch_dump_folder_path: Directory to save the model and processor to,
            or ``None`` to skip saving.
        push_to_hub: Whether to push the model and processor to the hub.
        verify_logits: Whether to compare the predicted depth with stored
            reference values.

    Raises:
        KeyError: If ``model_name`` is not a key of ``name_to_url``.
        ValueError: If ``verify_logits`` is set for a model without reference values.
        NotImplementedError: If no backbone size can be derived from ``model_name``.
    """
    checkpoint_url = name_to_url[model_name]

    # Reference outputs only exist for one checkpoint. Fail before any download
    # instead of hitting undefined expected values at the end of the conversion.
    if verify_logits and model_name != "dpt-dinov2-small-nyu":
        raise ValueError(
            f"No reference outputs are available to verify {model_name!r}; rerun without --verify_logits."
        )

    config = get_dpt_config(model_name)

    # load the original DPT head state dict
    print("URL:", checkpoint_url)
    dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
    for src, dest in create_rename_keys_dpt(config):
        rename_key(dpt_state_dict, src, dest)

    # load the original backbone
    if "small" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
    elif "base" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
    elif "large" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14")
    elif "giant" in model_name:
        original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14")
    else:
        raise NotImplementedError("To do")
    original_model.eval()
    backbone_state_dict = original_model.state_dict()

    for src, dest in create_rename_keys_backbone(config):
        rename_key(backbone_state_dict, src, dest)
    read_in_q_k_v(backbone_state_dict, config)

    # rename the SwiGLU MLP sub-modules; keys without "w12"/"w3" are unchanged
    backbone_state_dict = {
        key.replace("w12", "weights_in").replace("w3", "weights_out"): val
        for key, val in backbone_state_dict.items()
    }

    # head entries win if a key appears in both dictionaries
    state_dict = {**backbone_state_dict, **dpt_state_dict}

    model = DPTForDepthEstimation(config)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    # exactly these two weights are expected to be absent from the checkpoint
    assert missing_keys == [
        "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight",
        "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight",
    ]
    model.eval()

    processor = DPTImageProcessor(
        do_resize=False,
        do_rescale=False,
        do_pad=True,
        size_divisor=14,
        do_normalize=True,
        image_mean=(123.675, 116.28, 103.53),
        image_std=(58.395, 57.12, 57.375),
    )

    image = prepare_img()
    pixel_values = processor(image, return_tensors="pt").pixel_values.float()
    original_pixel_values = get_original_pixel_values(image)

    # the processor must reproduce the reference preprocessing
    assert torch.allclose(pixel_values, original_pixel_values)

    # gradients are already disabled by the decorator
    outputs = model(pixel_values)
    predicted_depth = outputs.predicted_depth

    print("Shape of predicted depth:", predicted_depth.shape)
    print("First values of predicted depth:", predicted_depth[0, :3, :3])

    if verify_logits:
        # model_name is guaranteed to be "dpt-dinov2-small-nyu" here (checked above)
        expected_shape = torch.Size([1, 576, 736])
        expected_slice = torch.tensor(
            [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]]
        )
        assert predicted_depth.shape == torch.Size(expected_shape)
        assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5)
        print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        # parents=True so a nested output path does not fail
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing model and processor to hub...")
        model.push_to_hub(repo_id=f"facebook/{model_name}")
        processor.push_to_hub(repo_id=f"facebook/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="dpt-dinov2-small-nyu",
        type=str,
        choices=name_to_url.keys(),
        help="Name of the model you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion.",
    )
    parser.add_argument(
        "--verify_logits",
        action="store_true",
        required=False,
        # The previous help text was copied from --pytorch_dump_folder_path.
        help="Whether to verify the predicted depth against reference values after conversion.",
    )

    args = parser.parse_args()
    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)
.\models\dpt\convert_dpt_beit_to_hf.py
"""从 MiDaS 仓库转换 DPT 3.1 检查点。URL:https://github.com/isl-org/MiDaS"""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
from transformers.utils import logging
# Set the verbosity of the transformers logging utilities to the info level.
logging.set_verbosity_info()
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
def get_dpt_config(model_name):
    """Build the DPT configuration (with a BEiT backbone) for ``model_name``.

    The backbone dimensions depend on whether ``"large"`` occurs in the name;
    the image size is taken from ``"512"`` or ``"384"`` occurring in the name.

    Args:
        model_name: Name of the checkpoint being converted.

    Returns:
        tuple: ``(config, image_size)``.

    Raises:
        ValueError: If neither ``"512"`` nor ``"384"`` occurs in ``model_name``.
    """
    is_large = "large" in model_name
    if is_large:
        hidden_size, num_hidden_layers, num_attention_heads, intermediate_size = 1024, 24, 16, 4096
        out_features = ["stage6", "stage12", "stage18", "stage24"]
    else:
        hidden_size, num_hidden_layers, num_attention_heads, intermediate_size = 768, 12, 12, 3072
        out_features = ["stage3", "stage6", "stage9", "stage12"]

    # "512" is checked before "384", so it wins if both occur in the name
    for candidate in (512, 384):
        if str(candidate) in model_name:
            image_size = candidate
            break
    else:
        raise ValueError("Model not supported")

    backbone_config = BeitConfig(
        image_size=image_size,
        num_hidden_layers=num_hidden_layers,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
        use_relative_position_bias=True,
        reshape_hidden_states=False,
        out_features=out_features,
    )

    if is_large:
        neck_hidden_sizes = [256, 512, 1024, 1024]
    else:
        neck_hidden_sizes = [96, 192, 384, 768]

    return DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes), image_size
def create_rename_keys(config):
    """Build the (original name, converted name) pairs for a DPT-BEiT checkpoint.

    The fused ``attn.qkv`` weight and the ``q_bias``/``v_bias`` entries are not
    listed here.

    Args:
        config: Object exposing ``backbone_config.num_hidden_layers``.

    Returns:
        list[tuple[str, str]]: The rename pairs, in a fixed order.
    """
    params = ("weight", "bias")

    # stem
    pairs = [
        ("pretrained.model.cls_token", "backbone.embeddings.cls_token"),
        ("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"),
        ("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"),
    ]

    # transformer encoder: per-block suffixes (original -> converted)
    block_suffixes = (
        ("gamma_1", "lambda_1"),
        ("gamma_2", "lambda_2"),
        ("norm1.weight", "layernorm_before.weight"),
        ("norm1.bias", "layernorm_before.bias"),
        ("norm2.weight", "layernorm_after.weight"),
        ("norm2.bias", "layernorm_after.bias"),
        ("mlp.fc1.weight", "intermediate.dense.weight"),
        ("mlp.fc1.bias", "intermediate.dense.bias"),
        ("mlp.fc2.weight", "output.dense.weight"),
        ("mlp.fc2.bias", "output.dense.bias"),
        ("attn.proj.weight", "attention.output.dense.weight"),
        ("attn.proj.bias", "attention.output.dense.bias"),
        (
            "attn.relative_position_bias_table",
            "attention.attention.relative_position_bias.relative_position_bias_table",
        ),
        (
            "attn.relative_position_index",
            "attention.attention.relative_position_bias.relative_position_index",
        ),
    )
    for layer in range(config.backbone_config.num_hidden_layers):
        pairs.extend(
            (f"pretrained.model.blocks.{layer}.{old}", f"backbone.encoder.layer.{layer}.{new}")
            for old, new in block_suffixes
        )

    # activation postprocessing (readout projections + reassemble layers)
    for stage in range(4):
        old = f"pretrained.act_postprocess{stage + 1}"
        for param in params:
            pairs.append((f"{old}.0.project.0.{param}", f"neck.reassemble_stage.readout_projects.{stage}.0.{param}"))
        for param in params:
            pairs.append((f"{old}.3.{param}", f"neck.reassemble_stage.layers.{stage}.projection.{param}"))
        # stage index 2 has no resize parameters to rename
        if stage != 2:
            for param in params:
                pairs.append((f"{old}.4.{param}", f"neck.reassemble_stage.layers.{stage}.resize.{param}"))

    # refinenet: original indices 1..4 map to fusion layers 3..0
    for refine_idx in range(1, 5):
        old = f"scratch.refinenet{refine_idx}"
        new = f"neck.fusion_stage.layers.{4 - refine_idx}"
        for param in params:
            pairs.append((f"{old}.out_conv.{param}", f"{new}.projection.{param}"))
        for unit in (1, 2):
            for conv in (1, 2):
                for param in params:
                    pairs.append(
                        (
                            f"{old}.resConfUnit{unit}.conv{conv}.{param}",
                            f"{new}.residual_layer{unit}.convolution{conv}.{param}",
                        )
                    )

    # scratch convolutions
    pairs.extend((f"scratch.layer{idx + 1}_rn.weight", f"neck.convs.{idx}.weight") for idx in range(4))

    # head
    for idx in (0, 2, 4):
        for param in params:
            pairs.append((f"scratch.output_conv.{idx}.{param}", f"head.head.{idx}.{param}"))

    return pairs
def remove_ignore_keys_(state_dict):
    """Drop the ``pretrained.model.head`` weight and bias from ``state_dict`` in place.

    Keys that are not present are silently skipped.
    """
    for key in ("pretrained.model.head.weight", "pretrained.model.head.bias"):
        state_dict.pop(key, None)
def read_in_q_k_v(state_dict, config):
    """Split each block's fused qkv weight into query/key/value entries (in place).

    For every layer the fused ``attn.qkv.weight`` and the separate ``q_bias`` and
    ``v_bias`` entries are removed, and query/key/value weights plus query and
    value biases are written under ``backbone.encoder.layer.{i}.attention.attention``.
    No key bias is written.

    Args:
        state_dict: Mutable mapping holding the original ``pretrained.model.blocks`` entries.
        config: Object exposing ``backbone_config.hidden_size`` and
            ``backbone_config.num_hidden_layers``.
    """
    hidden_size = config.backbone_config.hidden_size
    for layer in range(config.backbone_config.num_hidden_layers):
        source = f"pretrained.model.blocks.{layer}.attn"
        target = f"backbone.encoder.layer.{layer}.attention.attention"

        # read in the fused weight and the two biases
        qkv_weight = state_dict.pop(f"{source}.qkv.weight")
        query_bias = state_dict.pop(f"{source}.q_bias")
        value_bias = state_dict.pop(f"{source}.v_bias")

        # rows are laid out as query, key, value (in that order)
        state_dict[f"{target}.query.weight"] = qkv_weight[:hidden_size, :]
        state_dict[f"{target}.query.bias"] = query_bias
        state_dict[f"{target}.key.weight"] = qkv_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"{target}.value.weight"] = qkv_weight[-hidden_size:, :]
        state_dict[f"{target}.value.bias"] = value_bias
def rename_key(dct, old, new):
    """Re-register the value of ``dct[old]`` under ``new``, removing ``old`` (in place).

    Raises:
        KeyError: If ``old`` is not a key of ``dct``.
    """
    value = dct.pop(old)
    dct.update({new: value})
def prepare_img():
    """Download the COCO val2017 example image and return it as a PIL image."""
    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    raw_stream = requests.get(image_url, stream=True).raw
    return Image.open(raw_stream)
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Downloads the MiDaS checkpoint registered for ``model_name``, renames its
    weights, loads them into a ``DPTForDepthEstimation`` model and checks the
    predicted depth of an example image against stored reference values.

    Args:
        model_name: One of ``"dpt-beit-large-512"``, ``"dpt-beit-large-384"``
            or ``"dpt-beit-base-384"``.
        pytorch_dump_folder_path: Directory to save the model and processor to,
            or ``None`` to skip saving.
        push_to_hub: Whether to push the model and processor to the hub.

    Raises:
        KeyError: If ``model_name`` is not one of the supported names.
    """
    # Imported under an alias so the module name is not rebound to a Compose
    # instance further down. Kept local because the file does not import it at
    # the top, and placed first so a missing dependency fails before any download.
    from torchvision import transforms as tv_transforms

    name_to_url = {
        "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
        "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt",
        "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt",
    }

    # define the DPT configuration based on the model name
    checkpoint_url = name_to_url[model_name]
    config, image_size = get_dpt_config(model_name)

    # load the original state dict from the URL
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")

    # remove unused keys, rename the rest, then split the fused qkv weights
    remove_ignore_keys_(state_dict)
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config)

    # load the HuggingFace model
    model = DPTForDepthEstimation(config)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    assert missing_keys == []
    model.eval()

    # The processor output is only printed for inspection; the forward pass
    # below uses the plain resize + ToTensor tensor instead.
    processor = DPTImageProcessor(
        size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32
    )

    image = prepare_img()
    pixel_values = processor(image, return_tensors="pt").pixel_values

    print("First values of pixel values:", pixel_values[0, 0, :3, :3])
    print("Mean of pixel values:", pixel_values.mean().item())
    print("Shape of pixel values:", pixel_values.shape)

    # Reuse the image downloaded above (prepare_img fetches the same URL the
    # original code downloaded a second time here).
    resize_and_to_tensor = tv_transforms.Compose(
        [
            tv_transforms.Resize((image_size, image_size)),
            tv_transforms.ToTensor(),
        ]
    )
    pixel_values = resize_and_to_tensor(image).unsqueeze(0)

    # forward pass; gradients are already disabled by the decorator
    outputs = model(pixel_values)
    predicted_depth = outputs.predicted_depth

    print("Shape of predicted depth:", predicted_depth.shape)
    print("First values of predicted depth:", predicted_depth[0, :3, :3])

    # assert shape and values against the stored reference for this checkpoint
    # (model_name is one of the three names below, otherwise the lookup above raised)
    if model_name == "dpt-beit-large-512":
        expected_shape = torch.Size([1, 512, 512])
        expected_slice = torch.tensor(
            [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]]
        )
    elif model_name == "dpt-beit-large-384":
        expected_shape = torch.Size([1, 384, 384])
        expected_slice = torch.tensor(
            [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]],
        )
    elif model_name == "dpt-beit-base-384":
        expected_shape = torch.Size([1, 384, 384])
        expected_slice = torch.tensor(
            [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]],
        )

    assert predicted_depth.shape == torch.Size(expected_shape)
    assert torch.allclose(predicted_depth[0, :3, :3], expected_slice)
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        # parents=True so a nested output path does not fail
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing model and processor to hub...")
        model.push_to_hub(repo_id=f"nielsr/{model_name}")
        processor.push_to_hub(repo_id=f"nielsr/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    supported_models = ["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"]

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_name",
        type=str,
        default=supported_models[0],
        choices=supported_models,
        help="Name of the model you'd like to convert.",
    )
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model directory.",
    )
    cli.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion.",
    )

    options = cli.parse_args()
    convert_dpt_checkpoint(options.model_name, options.pytorch_dump_folder_path, options.push_to_hub)
.\models\dpt\convert_dpt_hybrid_to_pytorch.py
"""从原始存储库中转换 DPT 检查点。URL:https://github.com/isl-org/DPT"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import cached_download, hf_hub_url
from PIL import Image
from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
from transformers.utils import logging
# Set the verbosity of the transformers logging utilities to the info level.
logging.set_verbosity_info()
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
def get_dpt_config(checkpoint_url):
    """Build the hybrid DPT configuration for the given checkpoint.

    Args:
        checkpoint_url: Checkpoint URL or path; substrings of it (``"large"``,
            ``"ade"``) select which settings are applied.

    Returns:
        tuple: ``(config, expected_shape)``, where ``expected_shape`` is the
        expected model output shape recorded for that kind of checkpoint.
    """
    config = DPTConfig(embedding_type="hybrid")

    if "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.backbone_out_indices = [5, 11, 17, 23]
        config.neck_hidden_sizes = [256, 512, 1024, 1024]
        expected_shape = (1, 384, 384)

    # NOTE(review): this block used to be guarded by
    # `if "nyu" or "midas" in checkpoint_url:`, which is always true because the
    # non-empty string "nyu" is truthy. The settings were therefore applied to
    # every checkpoint, including "ade" ones. They are kept unconditional so the
    # conversion results stay the same; confirm whether restricting them to
    # nyu/midas checkpoints was actually intended.
    config.hidden_size = 768
    config.reassemble_factors = [1, 1, 1, 0.5]
    config.neck_hidden_sizes = [256, 512, 768, 768]
    config.num_labels = 150
    config.patch_size = 16
    expected_shape = (1, 384, 384)
    config.use_batch_norm_in_fusion_residual = False
    config.readout_type = "project"

    if "ade" in checkpoint_url:
        config.use_batch_norm_in_fusion_residual = True
        config.hidden_size = 768
        # Was assigned to `config.reassemble_stage`, which only created a stray
        # attribute; `reassemble_factors` is the field set everywhere else.
        config.reassemble_factors = [1, 1, 1, 0.5]
        config.num_labels = 150
        config.patch_size = 16
        repo_id = "huggingface/label-files"
        filename = "ade20k-id2label.json"
        label_file = cached_download(hf_hub_url(repo_id, filename, repo_type="dataset"))
        # context manager so the file handle is closed deterministically
        with open(label_file, "r", encoding="utf-8") as handle:
            id2label = json.load(handle)
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        expected_shape = [1, 150, 480, 480]

    return config, expected_shape
def remove_ignore_keys_(state_dict):
    """Remove the original head weight and bias from ``state_dict`` (in place).

    Missing keys are ignored.
    """
    state_dict.pop("pretrained.model.head.weight", None)
    state_dict.pop("pretrained.model.head.bias", None)
def rename_key(name):
    """Translate one original checkpoint parameter name into the converted naming scheme.

    The name goes through an ordered series of substring replacements. The
    order matters: later rules operate on the output of earlier ones.

    Args:
        name: Parameter name from the original state dict.

    Returns:
        str: The converted parameter name.
    """

    def apply_all(text, rules):
        # str.replace leaves the text unchanged when the pattern is absent, so
        # no membership check is needed for the simple rules.
        for old, new in rules:
            text = text.replace(old, new)
        return text

    # encoder weights: everything under pretrained.model except the embeddings
    is_embedding = any(tag in name for tag in ("cls_token", "pos_embed", "patch_embed"))
    if "pretrained.model" in name and not is_embedding:
        name = name.replace("pretrained.model", "dpt.encoder")

    name = apply_all(
        name,
        (
            ("pretrained.model", "dpt.embeddings"),
            ("patch_embed", ""),
            ("pos_embed", "position_embeddings"),
            ("attn.proj", "attention.output.dense"),
        ),
    )

    # "proj" is expanded only when the name does not already contain "project"
    if "proj" in name and "project" not in name:
        name = name.replace("proj", "projection")

    name = apply_all(
        name,
        (
            ("blocks", "layer"),
            ("mlp.fc1", "intermediate.dense"),
            ("mlp.fc2", "output.dense"),
        ),
    )

    # layer norms are renamed everywhere except inside the backbone
    if "backbone" not in name:
        name = name.replace("norm1", "layernorm_before").replace("norm2", "layernorm_after")

    name = apply_all(
        name,
        (
            ("scratch.output_conv", "head"),
            ("scratch", "neck"),
            ("layer1_rn", "convs.0"),
            ("layer2_rn", "convs.1"),
            ("layer3_rn", "convs.2"),
            ("layer4_rn", "convs.3"),
        ),
    )

    if "refinenet" in name:
        # The index is read from a fixed position: the single character right
        # after a leading "neck.refinenet".
        offset = len("neck.refinenet")
        layer_idx = int(name[offset : offset + 1])
        # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3
        name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")

    name = apply_all(
        name,
        (
            ("out_conv", "projection"),
            ("resConfUnit1", "residual_layer1"),
            ("resConfUnit2", "residual_layer2"),
            ("conv1", "convolution1"),
            ("conv2", "convolution2"),
            # readout blocks
            ("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0"),
            ("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0"),
            ("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0"),
            ("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0"),
            # resize blocks
            ("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection"),
            ("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize"),
            ("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection"),
            ("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize"),
            ("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection"),
            ("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection"),
            ("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize"),
            ("pretrained", "dpt"),
            ("bn", "batch_norm"),
            ("head", "head.head"),
            ("encoder.norm", "layernorm"),
            ("auxlayer", "auxiliary_head.head"),
            ("backbone", "backbone.bit.encoder"),
            ("..", "."),
            ("stem.conv", "bit.embedder.convolution"),
            ("blocks", "layers"),
        ),
    )

    # backbone-only adjustments
    if "backbone" in name:
        name = name.replace("convolution", "conv").replace("layer", "layers")

    return apply_all(
        name,
        (
            ("backbone.bit.encoder.bit", "backbone.bit"),
            ("embedder.conv", "embedder.convolution"),
            ("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm"),
        ),
    )
def read_in_q_k_v(state_dict, config):
    """Split each encoder layer's fused qkv weight and bias into query/key/value entries.

    The fused ``attn.qkv`` entries are removed and six entries per layer are
    written under ``dpt.encoder.layer.{i}.attention.attention``. ``state_dict``
    is modified in place.

    Args:
        state_dict: Mutable mapping holding ``dpt.encoder.layer.{i}.attn.qkv.weight``
            and ``.bias`` for every layer.
        config: Object exposing ``num_hidden_layers`` and ``hidden_size``.
    """
    for layer in range(config.num_hidden_layers):
        base = f"dpt.encoder.layer.{layer}"

        # read in the fused input projection weights + bias
        fused_weight = state_dict.pop(f"{base}.attn.qkv.weight")
        fused_bias = state_dict.pop(f"{base}.attn.qkv.bias")

        # rows are laid out as query, key, value (in that order)
        size = config.hidden_size
        query_rows = slice(None, size)
        key_rows = slice(size, size * 2)
        value_rows = slice(-size, None)

        target = f"{base}.attention.attention"
        state_dict[f"{target}.query.weight"] = fused_weight[query_rows, :]
        state_dict[f"{target}.query.bias"] = fused_bias[query_rows]
        state_dict[f"{target}.key.weight"] = fused_weight[key_rows, :]
        state_dict[f"{target}.key.bias"] = fused_bias[key_rows]
        state_dict[f"{target}.value.weight"] = fused_weight[value_rows, :]
        state_dict[f"{target}.value.bias"] = fused_bias[value_rows]
def prepare_img():
    """Download the COCO val2017 example image and return it as a PIL image."""
    reply = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    picture = Image.open(reply.raw)
    return picture
@torch.no_grad()
def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Args:
        checkpoint_url: Location of the original checkpoint. An ``http(s)`` URL
            is downloaded through ``torch.hub``; anything else is treated as a
            local file and read with ``torch.load``.
        pytorch_dump_folder_path: Directory to save the model and image
            processor to, or ``None`` to skip saving.
        push_to_hub: Whether to push the model and image processor to the hub.
        model_name: Currently unused by this function.
        show_prediction: Whether to display the prediction resized to the
            input image size.
    """
    # the expected output shape returned alongside the config is not used here
    config, _ = get_dpt_config(checkpoint_url)

    # torch.load only opens local files, but the script's default argument is an
    # https URL, so remote checkpoints are fetched via torch.hub instead.
    if checkpoint_url.startswith(("http://", "https://")):
        state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    else:
        state_dict = torch.load(checkpoint_url, map_location="cpu")

    # remove certain keys
    remove_ignore_keys_(state_dict)
    # rename keys (iterate over a snapshot because the dict is mutated)
    for key in list(state_dict):
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # read in qkv matrices
    read_in_q_k_v(state_dict, config)

    # "ade" checkpoints are loaded as segmentation models, all others as depth estimation
    is_ade = "ade" in checkpoint_url
    model = DPTForSemanticSegmentation(config) if is_ade else DPTForDepthEstimation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # check outputs on an image
    size = 480 if is_ade else 384
    image_processor = DPTImageProcessor(size=size)

    image = prepare_img()
    encoding = image_processor(image, return_tensors="pt")

    # forward pass
    outputs = model(**encoding).logits if is_ade else model(**encoding).predicted_depth

    if show_prediction:
        # PIL's image.size is (width, height); interpolate expects (height, width)
        prediction = (
            torch.nn.functional.interpolate(
                outputs.unsqueeze(1),
                size=(image.size[1], image.size[0]),
                mode="bicubic",
                align_corners=False,
            )
            .squeeze()
            .cpu()
            .numpy()
        )
        Image.fromarray((prediction / prediction.max()) * 255).show()

    if pytorch_dump_folder_path is not None:
        # parents=True so a nested output path does not fail
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # NOTE(review): the target repository is hard-coded and model_name is
        # ignored; confirm whether model_name was meant to be used here.
        model.push_to_hub("ybelkada/dpt-hybrid-midas")
        image_processor.push_to_hub("ybelkada/dpt-hybrid-midas")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--checkpoint_url",
        type=str,
        default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt",
        help="URL of the original DPT checkpoint you'd like to convert.",
    )
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        required=False,
        help="Path to the output PyTorch model directory.",
    )
    cli.add_argument("--push_to_hub", action="store_true")
    cli.add_argument(
        "--model_name",
        type=str,
        default="dpt-large",
        help="Name of the model, in case you're pushing to the hub.",
    )
    cli.add_argument("--show_prediction", action="store_true")

    options = cli.parse_args()
    convert_dpt_checkpoint(
        options.checkpoint_url,
        options.pytorch_dump_folder_path,
        options.push_to_hub,
        options.model_name,
        options.show_prediction,
    )
.\models\dpt\convert_dpt_swinv2_to_hf.py
def create_rename_keys(config):
    """Build the (original name, converted name) pairs for a DPT-SwinV2 checkpoint.

    Only the patch-embedding stem, the refinenet/fusion layers, the scratch
    convolutions and the head are covered.

    NOTE(review): this excerpt contains no renames for the per-block encoder
    weights or for the reassemble stage, and ``config`` is not read. The
    original text also had a dangling, unclosed ``rename_keys.extend([`` which
    was a syntax error and has been removed. Confirm against the upstream
    script whether entries are missing here.

    Args:
        config: Model configuration (unused by the entries visible here).

    Returns:
        list[tuple[str, str]]: The rename pairs, in a fixed order.
    """
    rename_keys = []

    # stem
    rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
    rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
    rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight"))
    rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias"))

    # refinenet (tricky here): original indices 1..4 map to fusion layers 3..0
    mapping = {1: 3, 2: 2, 3: 1, 4: 0}
    for i in range(1, 5):
        j = mapping[i]
        rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
        rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))

    # scratch convolutions
    for i in range(4):
        rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))

    # head
    for i in range(0, 5, 2):
        rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight"))
        rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias"))

    return rename_keys
def remove_ignore_keys_(state_dict):
    """Remove the original classification-head entries from ``state_dict`` in place.

    Keys that are not present are skipped without error.
    """
    for stale_key in ("pretrained.model.head.weight", "pretrained.model.head.bias"):
        state_dict.pop(stale_key, None)
def read_in_q_k_v(state_dict, config, model):
    """Split every fused ``attn.qkv.weight`` entry into query/key/value weights, in place.

    For each stage ``i`` and block ``j`` listed in ``config.backbone_config.depths`` the
    fused matrix is popped from ``state_dict`` and three row slices of height
    ``all_head_size`` (read from the matching attention module of ``model``) are stored
    under the ``backbone.encoder.layers.{i}.blocks.{j}.attention.self.*`` keys.
    Only weights are handled here; no bias entries are read or written.
    """
    for stage_idx, stage_depth in enumerate(config.backbone_config.depths):
        for block_idx in range(stage_depth):
            attention = model.backbone.encoder.layers[stage_idx].blocks[block_idx].attention
            width = attention.self.all_head_size

            fused_weight = state_dict.pop(f"pretrained.model.layers.{stage_idx}.blocks.{block_idx}.attn.qkv.weight")
            target = f"backbone.encoder.layers.{stage_idx}.blocks.{block_idx}.attention.self"

            # Rows are laid out as [query | key | value].
            state_dict[f"{target}.query.weight"] = fused_weight[:width, :]
            state_dict[f"{target}.key.weight"] = fused_weight[width : width * 2, :]
            state_dict[f"{target}.value.weight"] = fused_weight[-width:, :]
def rename_key(dct, old, new):
    """Move the value stored under ``old`` to ``new`` inside ``dct`` (in place).

    Raises ``KeyError`` when ``old`` is not present.
    """
    dct[new] = dct.pop(old)
def prepare_img():
    """Fetch the sample COCO validation image over HTTP and return it opened via ``Image.open``."""
    response = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    return Image.open(response.raw)
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Args:
        model_name: Key of the checkpoint table below; selects which MiDaS v3.1 release file is downloaded.
        pytorch_dump_folder_path: Directory the converted model and processor are saved to, or `None` to skip saving.
        verify_logits: When true, run a forward pass on a sample image and compare the predicted depth against
            the reference values recorded below.
        push_to_hub: When true, push model and processor to the `Intel/{model_name}` repository.

    Raises:
        KeyError: If `model_name` is not one of the known checkpoints.
        AssertionError: If `verify_logits` is set and the output shape or values do not match the references.
    """
    name_to_url = {
        "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt",
        "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt",
        "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
    }

    # Resolve checkpoint and configuration for the requested model.
    checkpoint_url = name_to_url[model_name]
    config, image_size = get_dpt_config(model_name)
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")

    # The model is built first because the q/k/v split reads sizes from its attention modules.
    model = DPTForDepthEstimation(config)

    # Drop unused keys, rename the rest, then split the fused qkv matrices.
    remove_ignore_keys_(state_dict)
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, model)

    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    model.eval()

    processor = DPTImageProcessor(size={"height": image_size, "width": image_size})

    image = prepare_img()
    # The result is discarded: this call only checks that the processor runs on a sample image.
    processor(image, return_tensors="pt")

    if verify_logits:
        from torchvision import transforms

        # Use a distinct name so the `transforms` module is not shadowed by the pipeline object.
        to_pixel_values = transforms.Compose(
            [
                transforms.Resize((image_size, image_size)),
                transforms.ToTensor(),
            ]
        )
        # Reuse the image fetched above instead of downloading the same URL a second time.
        pixel_values = to_pixel_values(image).unsqueeze(0)

        # No inner `torch.no_grad()` needed: the whole function is already decorated with it.
        outputs = model(pixel_values)

        predicted_depth = outputs.predicted_depth

        print("Shape of predicted depth:", predicted_depth.shape)
        print("First values of predicted depth:", predicted_depth[0, :3, :3])

        # Reference values per checkpoint.
        if model_name == "dpt-swinv2-base-384":
            expected_shape = torch.Size([1, 384, 384])
            expected_slice = torch.tensor(
                [
                    [1998.5575, 1997.3887, 2009.2981],
                    [1952.8607, 1979.6488, 2001.0854],
                    [1953.7697, 1961.7711, 1968.8904],
                ],
            )
        elif model_name == "dpt-swinv2-tiny-256":
            expected_shape = torch.Size([1, 256, 256])
            expected_slice = torch.tensor(
                [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]],
            )
        elif model_name == "dpt-swinv2-large-384":
            expected_shape = torch.Size([1, 384, 384])
            expected_slice = torch.tensor(
                [
                    [1203.7206, 1200.1495, 1197.8234],
                    [1196.2484, 1183.5033, 1186.4640],
                    [1178.8131, 1182.3260, 1174.3975],
                ],
            )

        assert predicted_depth.shape == torch.Size(expected_shape)
        assert torch.allclose(predicted_depth[0, :3, :3], expected_slice)
        print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        # `parents=True` lets nested output directories be created in one go.
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing model and processor to hub...")
        model.push_to_hub(repo_id=f"Intel/{model_name}")
        processor.push_to_hub(repo_id=f"Intel/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser()
    supported_models = ["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"]
    parser.add_argument(
        "--model_name",
        default="dpt-swinv2-base-384",
        type=str,
        choices=supported_models,
        help="Name of the model you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument("--verify_logits", action="store_true", help="Whether to verify logits after conversion.")
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether to push the model to the hub after conversion."
    )

    args = parser.parse_args()
    convert_dpt_checkpoint(
        model_name=args.model_name,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        verify_logits=args.verify_logits,
        push_to_hub=args.push_to_hub,
    )
.\models\dpt\convert_dpt_to_pytorch.py
"""从原始代码库中转换 DPT 模型的检查点。URL: https://github.com/isl-org/DPT"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import cached_download, hf_hub_url
from PIL import Image
from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dpt_config(checkpoint_url):
    """Build the `DPTConfig` for a checkpoint URL and the output shape expected from it.

    The URL is inspected for two substrings:

    - `"large"`: switches to the large backbone hyper-parameters.
    - `"ade"`: configures a 150-label semantic segmentation head and loads the ADE20k label names.

    Args:
        checkpoint_url (`str`): URL of the original checkpoint.

    Returns:
        `tuple`: `(config, expected_shape)`. `expected_shape` is `(1, 384, 384)` unless the URL contains
        `"ade"`, in which case it is `[1, 150, 480, 480]`.
    """
    config = DPTConfig()
    # Bind a default up front so URLs that match neither branch do not hit an UnboundLocalError on return.
    # 384 is the image size the conversion uses for non-"ade" checkpoints.
    expected_shape = (1, 384, 384)

    if "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.backbone_out_indices = [5, 11, 17, 23]
        config.neck_hidden_sizes = [256, 512, 1024, 1024]
        expected_shape = (1, 384, 384)

    if "ade" in checkpoint_url:
        config.use_batch_norm_in_fusion_residual = True

        config.num_labels = 150
        repo_id = "huggingface/label-files"
        filename = "ade20k-id2label.json"
        # NOTE(review): `cached_download` is deprecated in recent huggingface_hub releases;
        # confirm the pinned version before relying on it.
        label_path = cached_download(hf_hub_url(repo_id, filename, repo_type="dataset"))
        # Context manager so the file handle is closed deterministically.
        with open(label_path, "r", encoding="utf-8") as label_file:
            id2label = json.load(label_file)
        # JSON object keys are strings; the config expects integer ids.
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        expected_shape = [1, 150, 480, 480]

    return config, expected_shape
def remove_ignore_keys_(state_dict):
    """Remove the original classification-head entries from ``state_dict`` in place.

    Keys that are not present are skipped without error.
    """
    for stale_key in ("pretrained.model.head.weight", "pretrained.model.head.bias"):
        state_dict.pop(stale_key, None)
def rename_key(name):
    """Translate one key of the original DPT checkpoint into the converted model's naming scheme.

    The substitutions are applied strictly in order, because later rules match text produced by
    earlier ones (for example `scratch` -> `neck` must run before the `neck.refinenet` index is parsed).

    Args:
        name (`str`): Parameter name as found in the original state dict.

    Returns:
        `str`: The renamed key.
    """
    # Backbone weights go under the encoder, except for the embedding-related tensors.
    is_embedding_key = any(tag in name for tag in ("cls_token", "pos_embed", "patch_embed"))
    if "pretrained.model" in name and not is_embedding_key:
        name = name.replace("pretrained.model", "dpt.encoder")

    # `str.replace` is a no-op when the pattern is absent, so no membership test is needed.
    for old, new in (
        ("pretrained.model", "dpt.embeddings"),
        ("patch_embed", "patch_embeddings"),
        ("pos_embed", "position_embeddings"),
        ("attn.proj", "attention.output.dense"),
    ):
        name = name.replace(old, new)

    # Skip names that already contain "project" so they are not expanded twice.
    if "proj" in name and "project" not in name:
        name = name.replace("proj", "projection")

    for old, new in (
        ("blocks", "layer"),
        ("mlp.fc1", "intermediate.dense"),
        ("mlp.fc2", "output.dense"),
        ("norm1", "layernorm_before"),
        ("norm2", "layernorm_after"),
        ("scratch.output_conv", "head"),
        ("scratch", "neck"),
        ("layer1_rn", "convs.0"),
        ("layer2_rn", "convs.1"),
        ("layer3_rn", "convs.2"),
        ("layer4_rn", "convs.3"),
    ):
        name = name.replace(old, new)

    if "refinenet" in name:
        # The digit right after "neck.refinenet" is the 1-based layer index.
        layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1])
        # refinenet4 maps to fusion layer 0, refinenet1 to fusion layer 3.
        name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")

    for old, new in (
        ("out_conv", "projection"),
        ("resConfUnit1", "residual_layer1"),
        ("resConfUnit2", "residual_layer2"),
        ("conv1", "convolution1"),
        ("conv2", "convolution2"),
        # readout blocks
        ("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0"),
        ("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0"),
        ("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0"),
        ("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0"),
        # resize blocks
        ("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection"),
        ("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize"),
        ("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection"),
        # Restored rule: mirrors the 1.4 and 4.4 entries; without it this key was left unmapped.
        ("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize"),
        ("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection"),
        ("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection"),
        ("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize"),
        # remaining generic rules
        ("pretrained", "dpt"),
        ("bn", "batch_norm"),
        ("head", "head.head"),
        ("encoder.norm", "layernorm"),
        ("auxlayer", "auxiliary_head.head"),
    ):
        name = name.replace(old, new)

    return name
def read_in_q_k_v(state_dict, config):
    """Split the fused qkv projection of every encoder layer into query/key/value entries, in place.

    For each of the ``config.num_hidden_layers`` layers the fused weight and bias are popped from
    ``state_dict`` and re-inserted as three slices of size ``config.hidden_size`` each, taken in
    query, key, value order.
    """
    width = config.hidden_size
    for layer_idx in range(config.num_hidden_layers):
        fused_prefix = f"dpt.encoder.layer.{layer_idx}.attn.qkv"
        split_prefix = f"dpt.encoder.layer.{layer_idx}.attention.attention"

        # In the original implementation the input projection is a single matrix plus bias.
        fused_weight = state_dict.pop(f"{fused_prefix}.weight")
        fused_bias = state_dict.pop(f"{fused_prefix}.bias")

        state_dict[f"{split_prefix}.query.weight"] = fused_weight[:width, :]
        state_dict[f"{split_prefix}.query.bias"] = fused_bias[:width]
        state_dict[f"{split_prefix}.key.weight"] = fused_weight[width : width * 2, :]
        state_dict[f"{split_prefix}.key.bias"] = fused_bias[width : width * 2]
        state_dict[f"{split_prefix}.value.weight"] = fused_weight[-width:, :]
        state_dict[f"{split_prefix}.value.bias"] = fused_bias[-width:]
def prepare_img():
    """Fetch the sample COCO validation image over HTTP and return it as an opened PIL image."""
    response = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    return Image.open(response.raw)
@torch.no_grad()
def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Args:
        checkpoint_url: URL of the original checkpoint. A URL containing `"ade"` is converted into a
            semantic segmentation model, anything else into a depth estimation model.
        pytorch_dump_folder_path: Directory the model and image processor are saved to, or `None` to skip saving.
        push_to_hub: When true, push model and image processor to the hub.
        model_name: Name used to build the hub repository path when pushing.

    Raises:
        AssertionError: If the output shape or the checked output slice differs from the recorded reference.
    """
    # Every branch below depends on this single check, so evaluate it once.
    is_segmentation = "ade" in checkpoint_url

    # Define the DPT configuration based on the URL.
    config, expected_shape = get_dpt_config(checkpoint_url)
    # Load the original state dict from the URL.
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    # Remove keys that have no counterpart in the converted model.
    remove_ignore_keys_(state_dict)
    # Rename keys; iterate over a copy because the dict is mutated inside the loop.
    for key in state_dict.copy().keys():
        val = state_dict.pop(key)
        state_dict[rename_key(key)] = val
    # Split the fused qkv matrices.
    read_in_q_k_v(state_dict, config)

    # Load the converted weights into the matching model class.
    model = DPTForSemanticSegmentation(config) if is_segmentation else DPTForDepthEstimation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # Check the outputs on a sample image.
    size = 480 if is_segmentation else 384
    image_processor = DPTImageProcessor(size=size)

    image = prepare_img()
    encoding = image_processor(image, return_tensors="pt")

    # Forward pass
    outputs = model(**encoding).logits if is_segmentation else model(**encoding).predicted_depth

    # Reference values recorded for each head.
    if is_segmentation:
        expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]])
    else:
        expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]])
    assert outputs.shape == torch.Size(expected_shape)
    assert (
        torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4)
        if is_segmentation
        else torch.allclose(outputs[0, :3, :3], expected_slice)
    )
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        # `parents=True` lets nested output directories be created in one go.
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing model to hub...")
        model.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add model",
            use_temp_dir=True,
        )
        image_processor.push_to_hub(
            repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
            organization="nielsr",
            commit_message="Add image processor",
            use_temp_dir=True,
        )
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint_url",
        default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt",
        type=str,
        help="URL of the original DPT checkpoint you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=False,
        help="Path to the output PyTorch model directory.",
    )
    parser.add_argument("--push_to_hub", action="store_true")
    parser.add_argument(
        "--model_name",
        default="dpt-large",
        type=str,
        required=False,
        help="Name of the model, in case you're pushing to the hub.",
    )

    args = parser.parse_args()
    convert_dpt_checkpoint(
        checkpoint_url=args.checkpoint_url,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        push_to_hub=args.push_to_hub,
        model_name=args.model_name,
    )
.\models\dpt\feature_extraction_dpt.py
import warnings
from ...utils import logging
from .image_processing_dpt import DPTImageProcessor
logger = logging.get_logger(__name__)
class DPTFeatureExtractor(DPTImageProcessor):
    """Deprecated subclass of `DPTImageProcessor` that emits a `FutureWarning` when constructed."""

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class DPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use DPTImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        # All arguments are forwarded unchanged to the image processor.
        super().__init__(*args, **kwargs)
.\models\dpt\image_processing_dpt.py
"""Image processor class for DPT."""
import math
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import pad, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_torch_available,
is_torch_tensor,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_torch_available():
import torch
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def get_resize_output_image_size(
    input_image: np.ndarray,
    output_size: Union[int, Iterable[int]],
    keep_aspect_ratio: bool,
    multiple: int,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Compute the `(height, width)` an image should be resized to.

    Args:
        input_image (`np.ndarray`):
            Image whose current height and width are read via `get_image_size`.
        output_size (`int` or `Iterable[int]`):
            Requested `(height, width)`; a single int is used for both dimensions.
        keep_aspect_ratio (`bool`):
            When true, both dimensions use the same scale factor: whichever of the height and width
            factors is closer to 1.
        multiple (`int`):
            Each output dimension is snapped to a multiple of this value.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Channel layout of `input_image`, forwarded to `get_image_size`.

    Returns:
        `Tuple[int, int]`: The output height and width.
    """

    def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
        """Round `val` to the nearest multiple of `multiple`, rounding down if above `max_val`, then up if below `min_val`."""
        snapped = round(val / multiple) * multiple

        if max_val is not None and snapped > max_val:
            snapped = math.floor(val / multiple) * multiple

        if snapped < min_val:
            snapped = math.ceil(val / multiple) * multiple

        return snapped

    if isinstance(output_size, int):
        output_size = (output_size, output_size)

    input_height, input_width = get_image_size(input_image, input_data_format)
    target_height, target_width = output_size

    # Scale factor needed along each axis.
    height_scale = target_height / input_height
    width_scale = target_width / input_width

    if keep_aspect_ratio:
        # Scale as little as possible: use the factor that is closest to 1 for both axes.
        if abs(1 - width_scale) < abs(1 - height_scale):
            height_scale = width_scale
        else:
            width_scale = height_scale

    resized_height = constraint_to_multiple_of(height_scale * input_height, multiple=multiple)
    resized_width = constraint_to_multiple_of(width_scale * input_width, multiple=multiple)
    return (resized_height, resized_width)
class DPTImageProcessor(BaseImageProcessor):
r"""
Constructs a DPT image processor.
This class extends BaseImageProcessor and provides methods specific to the DPT image processing.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
是否调整图像的(高度,宽度)尺寸。可以被 `preprocess` 中的 `do_resize` 覆盖。
size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
调整后的图像尺寸。可以被 `preprocess` 中的 `size` 覆盖。
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
定义调整图像时要使用的重采样滤波器。可以被 `preprocess` 中的 `resample` 覆盖。
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
如果为 `True`,则调整图像尺寸以保持宽高比最大可能的大小。可以被 `preprocess` 中的 `keep_aspect_ratio` 覆盖。
ensure_multiple_of (`int`, *optional*, defaults to 1):
如果 `do_resize` 为 `True`,则调整图像尺寸为此值的倍数。可以被 `preprocess` 中的 `ensure_multiple_of` 覆盖。
do_rescale (`bool`, *optional*, defaults to `True`):
是否按照指定的比例因子 `rescale_factor` 进行图像缩放。可以被 `preprocess` 中的 `do_rescale` 覆盖。
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
如果进行图像缩放,使用的缩放因子。可以被 `preprocess` 中的 `rescale_factor` 覆盖。
do_normalize (`bool`, *optional*, defaults to `True`):
是否对图像进行归一化。可以被 `preprocess` 中的 `do_normalize` 参数覆盖。
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
如果进行图像归一化,使用的均值。这是一个浮点数或与图像通道数相等长度的浮点数列表。可以被 `preprocess` 中的 `image_mean` 参数覆盖。
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
如果进行图像归一化,使用的标准差。这是一个浮点数或与图像通道数相等长度的浮点数列表。可以被 `preprocess` 中的 `image_std` 参数覆盖。
do_pad (`bool`, *optional*, defaults to `False`):
是否应用中心填充。这在与 DPT 结合使用的 DINOv2 论文中被引入。
size_divisor (`int`, *optional*):
如果 `do_pad` 为 `True`,则将图像维度填充为此值的倍数。这在与 DPT 结合使用的 DINOv2 论文中被引入。
"""
# 定义了模型输入的名称,这是一个包含单个元素 "pixel_values" 的列表
model_input_names = ["pixel_values"]
# 初始化方法,用于实例化对象时进行初始化设置
def __init__(
    self,
    do_resize: bool = True,
    size: Dict[str, int] = None,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    keep_aspect_ratio: bool = False,
    ensure_multiple_of: int = 1,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: bool = False,
    size_divisor: int = None,
    **kwargs,
) -> None:
    """
    Store the preprocessing defaults on the instance.

    `size` falls back to `{"height": 384, "width": 384}` and is normalised with `get_size_dict`;
    `image_mean` / `image_std` fall back to `IMAGENET_STANDARD_MEAN` / `IMAGENET_STANDARD_STD`.
    Every other argument is stored unchanged. Extra keyword arguments are forwarded to the base
    class constructor.
    """
    super().__init__(**kwargs)

    # Resolve the size default before normalising it into a size dictionary.
    if size is None:
        size = {"height": 384, "width": 384}
    size = get_size_dict(size)

    self.do_resize = do_resize
    self.size = size
    self.keep_aspect_ratio = keep_aspect_ratio
    self.ensure_multiple_of = ensure_multiple_of
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize

    # Normalisation statistics default to the ImageNet standard constants.
    if image_mean is None:
        image_mean = IMAGENET_STANDARD_MEAN
    self.image_mean = image_mean
    if image_std is None:
        image_std = IMAGENET_STANDARD_STD
    self.image_std = image_std

    self.do_pad = do_pad
    self.size_divisor = size_divisor

    # Names of the arguments accepted by the processor.
    self._valid_processor_keys = [
        "images",
        "do_resize",
        "size",
        "keep_aspect_ratio",
        "ensure_multiple_of",
        "resample",
        "do_rescale",
        "rescale_factor",
        "do_normalize",
        "image_mean",
        "image_std",
        "do_pad",
        "size_divisor",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]
def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    keep_aspect_ratio: bool = False,
    ensure_multiple_of: int = 1,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image
    is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is
    set, the image is resized to a size that is a multiple of this value.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Target size of the output image.
        keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
            If `True`, the image is resized while preserving its aspect ratio.
        ensure_multiple_of (`int`, *optional*, defaults to 1):
            The image is resized to a size that is a multiple of this value.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format of the output image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred.
        **kwargs:
            Forwarded to the module-level `resize` function.

    Returns:
        `np.ndarray`: The resized image.

    Raises:
        ValueError: If `size` lacks the `"height"` or `"width"` key after normalisation.
    """
    # Normalise `size` into a size dictionary.
    size = get_size_dict(size)
    if "height" not in size or "width" not in size:
        raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")

    # Compute the final (height, width), honouring aspect ratio and the `ensure_multiple_of` constraint.
    output_size = get_resize_output_image_size(
        image,
        output_size=(size["height"], size["width"]),
        keep_aspect_ratio=keep_aspect_ratio,
        multiple=ensure_multiple_of,
        input_data_format=input_data_format,
    )
    # Inside the method body `resize` resolves to the module-level function, not to this method.
    return resize(
        image,
        size=output_size,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )
def pad_image(
    self,
    image: np.array,
    size_divisor: int,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
):
    """
    Center pad an image so that its height and width are multiples of `size_divisor`.

    Args:
        image (`np.ndarray`):
            Image to pad.
        size_divisor (`int`):
            The width and height of the image will be padded to a multiple of this number.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        **kwargs:
            Accepted for call compatibility; not used by this method.

    Returns:
        `np.ndarray`: Padded image.
    """

    def _get_pad(size, size_divisor):
        """Return `(before, after)` padding that brings `size` up to the next multiple of `size_divisor`."""
        new_size = math.ceil(size / size_divisor) * size_divisor
        pad_size = new_size - size
        # Any odd pixel goes to the trailing side.
        pad_before = pad_size // 2
        pad_after = pad_size - pad_before
        return pad_before, pad_after

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    height, width = get_image_size(image, input_data_format)

    # Height padding is top/bottom and width padding is left/right; the original names were swapped.
    pad_top, pad_bottom = _get_pad(height, size_divisor)
    pad_left, pad_right = _get_pad(width, size_divisor)

    # Forward the resolved input format so `pad` uses the same layout as `get_image_size` above
    # instead of inferring it a second time.
    return pad(
        image,
        ((pad_top, pad_bottom), (pad_left, pad_right)),
        data_format=data_format,
        input_data_format=input_data_format,
    )
def preprocess(
    self,
    images: ImageInput,
    do_resize: bool = None,
    size: int = None,
    keep_aspect_ratio: bool = None,
    ensure_multiple_of: int = None,
    resample: PILImageResampling = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: bool = None,
    size_divisor: int = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: ChannelDimension = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
):
    """
    Preprocess images according to specified transformations.

    Args:
        images (`ImageInput`):
            Input images to be preprocessed.
        do_resize (`bool`, *optional*):
            Whether to resize the images.
        size (`int`, *optional*):
            Size to which images should be resized.
        keep_aspect_ratio (`bool`, *optional*):
            Whether to maintain aspect ratio during resizing.
        ensure_multiple_of (`int`, *optional*):
            Ensure image dimensions are multiples of this number.
        resample (`PILImageResampling`, *optional*):
            Resampling method for resizing.
        do_rescale (`bool`, *optional*):
            Whether to rescale image pixel values.
        rescale_factor (`float`, *optional*):
            Factor to rescale image pixel values.
        do_normalize (`bool`, *optional*):
            Whether to normalize image pixel values.
        image_mean (`float` or `List[float]`, *optional*):
            Mean values for image normalization.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation values for image normalization.
        do_pad (`bool`, *optional*):
            Whether to pad images.
        size_divisor (`int`, *optional*):
            Pad image dimensions to be multiples of this number.
        return_tensors (`str` or `TensorType`, *optional*):
            Desired tensor type for output images.
        data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
            Output image channel format.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Input image channel format.
        **kwargs:
            Additional keyword arguments for preprocessing.

    Returns:
        Preprocessed images according to the specified transformations.
    """
    # NOTE(review): no implementation follows the docstring in this excerpt, so as written the
    # function returns None. The body appears to have been dropped; restore it from the upstream
    # source before relying on this method.
# Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
    """
    Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.

    Args:
        outputs:
            Model output; only its `logits` attribute is read.
        target_sizes (`List[Tuple]`, *optional*):
            One requested `(height, width)` per batch item. When given, each item's logits are resized to
            that size with bilinear interpolation before the per-pixel argmax. When `None`, the argmax is
            taken at the logits' own resolution.

    Returns:
        `list`: One tensor of class ids per batch item.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
    """
    logits = outputs.logits

    # Resize logits and compute semantic segmentation maps.
    if target_sizes is not None:
        if len(logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )

        # A tensor of sizes is converted to a NumPy array before indexing.
        if is_torch_tensor(target_sizes):
            target_sizes = target_sizes.numpy()

        semantic_segmentation = []

        for idx in range(len(logits)):
            resized_logits = torch.nn.functional.interpolate(
                logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
            )
            # Class with the highest logit at every pixel.
            semantic_map = resized_logits[0].argmax(dim=0)
            semantic_segmentation.append(semantic_map)
    else:
        semantic_segmentation = logits.argmax(dim=1)
        semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]

    return semantic_segmentation
.\models\dpt\modeling_dpt.py
"""
PyTorch DPT (Dense Prediction Transformers) model.
This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.
"""
import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, logging
from ...utils.backbone_utils import load_backbone
from .configuration_dpt import DPTConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DPTConfig"
_CHECKPOINT_FOR_DOC = "Intel/dpt-large"
_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024]
DPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Intel/dpt-large",
"Intel/dpt-hybrid-midas",
]
@dataclass
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    """
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.

    Args:
        last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    """

    # Note the plural field name, unlike `last_hidden_state` on the other output classes.
    last_hidden_states: torch.FloatTensor = None
    intermediate_activations: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations.

    Args:
        last_hidden_state (`torch.FloatTensor`):
            Hidden states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor`):
            Pooled representation of the last hidden state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Per-layer hidden states, when returned. NOTE(review): exact contents and shapes are not
            visible here; confirm against the model's forward.
        attentions (`tuple(torch.FloatTensor)`, *optional*):
            Per-layer attention weights, when returned. NOTE(review): confirm shapes against the model's
            forward.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations kept for use at later stages.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    intermediate_activations: Optional[Tuple[torch.FloatTensor, ...]] = None
class DPTViTHybridEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config, feature_size=None):
        """
        Build the backbone, the 1x1 projection, the CLS token and the position embeddings.

        Args:
            config: Model configuration. Read for `image_size`, `patch_size`, `num_channels`, `hidden_size` and,
                when `feature_size` is `None`, `backbone_featmap_shape`.
            feature_size (`int` or iterable, *optional*): When given, the projection input width is taken from
                the last backbone channel count instead of `config.backbone_featmap_shape`.

        Raises:
            ValueError: If the backbone does not expose exactly 3 output feature maps.
        """
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # Scalars are promoted to (height, width) pairs.
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.backbone = load_backbone(config)
        feature_dim = self.backbone.channels[-1]
        if len(self.backbone.channels) != 3:
            raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}")
        # Feature maps at these backbone stages are returned as intermediate activations by `forward`.
        self.residual_feature_map_index = [0, 1]

        if feature_size is None:
            feat_map_shape = config.backbone_featmap_shape
            feature_size = feat_map_shape[-2:]
            feature_dim = feat_map_shape[1]
        else:
            feature_size = (
                feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
            )
            feature_dim = self.backbone.channels[-1]

        self.image_size = image_size
        # Only the first patch dimension is kept; `forward` uses it for both height and width.
        self.patch_size = patch_size[0]
        self.num_channels = num_channels

        self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        """
        Resize the positional embeddings to match a specified grid size.

        Args:
            posemb (torch.Tensor): Positional embeddings tensor.
            grid_size_height (int): Target grid height.
            grid_size_width (int): Target grid width.
            start_index (int, optional): Number of leading (non-grid) tokens kept untouched. Defaults to 1.

        Returns:
            torch.Tensor: Resized positional embeddings tensor.
        """
        posemb_tok = posemb[:, :start_index]
        posemb_grid = posemb[0, start_index:]

        # The stored grid is treated as square: its side is the integer square root of the token count.
        old_grid_size = int(math.sqrt(len(posemb_grid)))

        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(
        self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False, return_dict: bool = False
    ) -> Union[Tuple[torch.Tensor, List[torch.Tensor]], BaseModelOutputWithIntermediateActivations]:
        """
        Embed an image batch through the backbone and add CLS token and position embeddings.

        Args:
            pixel_values (torch.Tensor): Input tensor of shape `(batch_size, num_channels, height, width)`.
            interpolate_pos_encoding (bool): When `False`, the input size must equal the configured image size.
                Defaults to False.
            return_dict (bool): Whether to return a `BaseModelOutputWithIntermediateActivations` instead of a
                tuple. Defaults to False.

        Returns:
            `(embeddings, intermediate_feature_maps)` or the equivalent output object, depending on `return_dict`.

        Raises:
            ValueError: If the channel count differs from the configuration, or if the image size differs from
                the configured one while `interpolate_pos_encoding` is `False`.
        """
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )

        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // self.patch_size, width // self.patch_size
        )

        backbone_output = self.backbone(pixel_values)

        # The last feature map feeds the transformer; earlier ones are handed back to the caller.
        features = backbone_output.feature_maps[-1]
        output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index]

        embeddings = self.projection(features).flatten(2).transpose(1, 2)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        embeddings = embeddings + position_embeddings

        if not return_dict:
            return (embeddings, output_hidden_states)

        return BaseModelOutputWithIntermediateActivations(
            last_hidden_states=embeddings,
            intermediate_activations=output_hidden_states,
        )
class DPTViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings.
    """

    def __init__(self, config):
        super().__init__()

        hidden_size = config.hidden_size
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
        self.patch_embeddings = DPTViTPatchEmbeddings(config)
        # One extra position for the CLS token prepended in `forward`.
        sequence_length = self.patch_embeddings.num_patches + 1
        self.position_embeddings = nn.Parameter(torch.zeros(1, sequence_length, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        """Bilinearly resize the grid part of `posemb`, keeping the first `start_index` tokens as they are."""
        leading_tokens = posemb[:, :start_index]
        grid_tokens = posemb[0, start_index:]

        # The stored grid is treated as square.
        side = int(math.sqrt(len(grid_tokens)))
        target_size = (grid_size_height, grid_size_width)

        grid = grid_tokens.reshape(1, side, side, -1).permute(0, 3, 1, 2)
        grid = nn.functional.interpolate(grid, size=target_size, mode="bilinear")
        grid = grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        return torch.cat([leading_tokens, grid], dim=1)

    def forward(self, pixel_values, return_dict=False):
        """Embed `pixel_values`, prepend the CLS token, add position embeddings and apply dropout."""
        _, _, height, width = pixel_values.shape

        patch_size = self.config.patch_size
        grid_height = height // patch_size
        grid_width = width // patch_size
        position_embeddings = self._resize_pos_embed(self.position_embeddings, grid_height, grid_width)

        patch_tokens = self.patch_embeddings(pixel_values)
        batch_size, _, _ = patch_tokens.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, patch_tokens), dim=1)
        embeddings = self.dropout(embeddings + position_embeddings)

        if return_dict:
            return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings)
        return (embeddings,)
class DPTViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config):
        super().__init__()

        def _as_pair(value):
            # Scalars become (value, value); iterables are passed through untouched.
            return value if isinstance(value, collections.abc.Iterable) else (value, value)

        image_size = _as_pair(config.image_size)
        patch_size = _as_pair(config.patch_size)
        num_channels = config.num_channels
        hidden_size = config.hidden_size

        patches_along_width = image_size[1] // patch_size[1]
        patches_along_height = image_size[0] // patch_size[0]

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = patches_along_width * patches_along_height

        # Non-overlapping patches: kernel and stride are both the patch size.
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        """Project `pixel_values` to a `(batch, patches, hidden)` sequence; raise `ValueError` on channel mismatch."""
        num_channels = pixel_values.shape[1]
        _, _, _, _ = pixel_values.shape  # input must be 4-dimensional
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        projected = self.projection(pixel_values)
        return projected.flatten(2).transpose(1, 2)
class DPTViTSelfAttention(nn.Module):
    """Multi-head self-attention over a sequence of hidden states."""

    def __init__(self, config: DPTConfig) -> None:
        """
        Args:
            config: Read for `hidden_size`, `num_attention_heads`, `qkv_bias` and `attention_probs_dropout_prob`.

        Raises:
            ValueError: If `hidden_size` is not divisible by `num_attention_heads` and the config has no
                `embedding_size` attribute.
        """
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            # No trailing comma inside the braces: `{x,}` would format a one-element tuple, e.g. "(768,)".
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into `(heads, head_size)` and move the head axis to position 1."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """
        Args:
            hidden_states: Input sequence whose last dimension is `hidden_size`.
            head_mask: Optional multiplier applied to the attention probabilities.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(context_layer,)` or `(context_layer, attention_probs)`.
        """
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Scaled dot-product scores between every query and key.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the head axis back into the feature dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
class DPTViTSelfOutput(nn.Module):
    """
    Output projection of the attention block. No residual is added here: the residual connection lives in
    DPTLayer (unlike other models), because layernorm is applied before each block.
    """

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project and apply dropout; `input_tensor` is accepted for interface parity and not used."""
        return self.dropout(self.dense(hidden_states))
class DPTViTAttention(nn.Module):
    """Self-attention followed by its output projection, with support for pruning heads."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.attention = DPTViTSelfAttention(config)
        self.output = DPTViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        """Remove the given heads from the query/key/value/output projections and update the bookkeeping."""
        if not len(heads):
            return

        attn = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Shrink the three input projections, then the output projection along its input dimension.
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the head counters consistent with the pruned weights.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Return `(attention_output,)` followed by whatever extra items the self-attention returned."""
        attn_results = self.attention(hidden_states, head_mask, output_attentions)
        projected = self.output(attn_results[0], hidden_states)
        return (projected,) + attn_results[1:]
class DPTViTIntermediate(nn.Module):
    """First half of the feed-forward block: linear expansion followed by the activation."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string names an entry of ACT2FN; anything else is used directly as the activation.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer, then the activation."""
        expanded = self.dense(hidden_states)
        return self.intermediate_act_fn(expanded)
class DPTViTOutput(nn.Module):
    """Second half of the feed-forward block: projection back to `hidden_size`, dropout, residual add."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `dropout(dense(hidden_states)) + input_tensor`."""
        projected = self.dropout(self.dense(hidden_states))
        return projected + input_tensor
class DPTViTLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = DPTViTAttention(config)
        self.intermediate = DPTViTIntermediate(config)
        self.output = DPTViTOutput(config)
        norm_eps = config.layer_norm_eps
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Run pre-norm attention and pre-norm feed-forward, each wrapped in a residual connection."""
        # Layernorm is applied before self-attention.
        normed_input = self.layernorm_before(hidden_states)
        attn_results = self.attention(normed_input, head_mask, output_attentions=output_attentions)
        extras = attn_results[1:]

        # First residual connection.
        hidden_states = attn_results[0] + hidden_states

        # Layernorm again before the feed-forward part; the second residual is added inside `self.output`.
        feed_forward = self.intermediate(self.layernorm_after(hidden_states))
        layer_output = self.output(feed_forward, hidden_states)

        return (layer_output,) + extras
class DPTViTEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` transformer layers."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.config = config
        blocks = [DPTViTLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(blocks)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """Run every layer in order, optionally collecting per-layer hidden states and attentions."""
        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        for index, block in enumerate(self.layer):
            # The state entering each layer is recorded; the final one is appended after the loop.
            if output_hidden_states:
                collected_states += (hidden_states,)

            block_mask = None if head_mask is None else head_mask[index]

            if self.gradient_checkpointing and self.training:
                block_outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    block_mask,
                    output_attentions,
                )
            else:
                block_outputs = block(hidden_states, block_mask, output_attentions)

            hidden_states = block_outputs[0]

            if output_attentions:
                collected_attentions += (block_outputs[1],)

        if output_hidden_states:
            collected_states += (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=collected_states,
                attentions=collected_attentions,
            )
        candidates = [hidden_states, collected_states, collected_attentions]
        return tuple(item for item in candidates if item is not None)
class DPTReassembleStage(nn.Module):
    """
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.layers = nn.ModuleList()

        initializer = self._init_reassemble_dpt_hybrid if config.is_hybrid else self._init_reassemble_dpt
        initializer(config)

        self.neck_ignore_stages = config.neck_ignore_stages

    def _init_reassemble_dpt_hybrid(self, config):
        r"""
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        """
        stages = zip(config.neck_hidden_sizes, config.reassemble_factors)
        for stage_idx, (channels, factor) in enumerate(stages):
            if stage_idx <= 1:
                self.layers.append(nn.Identity())
            else:
                self.layers.append(DPTReassembleLayer(config, channels=channels, factor=factor))

        if config.readout_type != "project":
            raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.")

        # Readout projections mirror the layers above: identity for the first two stages.
        self.readout_projects = nn.ModuleList()
        hidden_size = _get_backbone_hidden_size(config)
        for stage_idx in range(len(config.neck_hidden_sizes)):
            if stage_idx <= 1:
                readout = nn.Sequential(nn.Identity())
            else:
                readout = nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
            self.readout_projects.append(readout)

    def _init_reassemble_dpt(self, config):
        """Create one reassemble layer per neck stage and, for the "project" readout, one projection per stage."""
        stages = zip(config.neck_hidden_sizes, config.reassemble_factors)
        for channels, factor in stages:
            self.layers.append(DPTReassembleLayer(config, channels=channels, factor=factor))

        if config.readout_type != "project":
            return

        self.readout_projects = nn.ModuleList()
        hidden_size = _get_backbone_hidden_size(config)
        for _ in config.neck_hidden_sizes:
            self.readout_projects.append(
                nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
            )

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        """
        reassembled = []
        readout_type = self.config.readout_type

        for stage_idx, state in enumerate(hidden_states):
            # Stages listed in `neck_ignore_stages` are passed through untouched.
            if stage_idx not in self.neck_ignore_stages:
                # Separate the readout ([CLS]) token from the patch tokens.
                cls_token = state[:, 0]
                state = state[:, 1:]
                batch_size, sequence_length, num_channels = state.shape

                # Reshape to (batch_size, num_channels, height, width).
                if patch_height is None or patch_width is None:
                    side = int(math.sqrt(sequence_length))
                    state = state.reshape(batch_size, side, side, num_channels)
                else:
                    state = state.reshape(batch_size, patch_height, patch_width, num_channels)
                state = state.permute(0, 3, 1, 2).contiguous()

                feature_shape = state.shape
                if readout_type == "project":
                    # Concatenate the readout to every token, project, then restore the image layout.
                    tokens = state.flatten(2).permute((0, 2, 1))
                    readout = cls_token.unsqueeze(1).expand_as(tokens)
                    tokens = self.readout_projects[stage_idx](torch.cat((tokens, readout), -1))
                    state = tokens.permute(0, 2, 1).reshape(feature_shape)
                elif readout_type == "add":
                    state = state.flatten(2) + cls_token.unsqueeze(-1)
                    state = state.reshape(feature_shape)

                state = self.layers[stage_idx](state)
            reassembled.append(state)

        return reassembled
def _get_backbone_hidden_size(config):
    """Return the backbone's hidden size for non-hybrid configs that define a backbone, else `config.hidden_size`."""
    uses_backbone_width = config.backbone_config is not None and config.is_hybrid is False
    return config.backbone_config.hidden_size if uses_backbone_width else config.hidden_size
class DPTReassembleLayer(nn.Module):
    """Project a feature map to `channels` with a 1x1 convolution, then rescale it spatially by `factor`."""

    def __init__(self, config, channels, factor):
        super().__init__()
        # 1x1 projection from the backbone width to the requested channel count.
        in_channels = _get_backbone_hidden_size(config)
        self.projection = nn.Conv2d(in_channels=in_channels, out_channels=channels, kernel_size=1)

        # factor == 1 keeps the size, factor > 1 upsamples, factor < 1 downsamples.
        if factor == 1:
            self.resize = nn.Identity()
        elif factor > 1:
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor < 1:
            # The stride is the inverse of the factor, truncated to an integer.
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)

    def forward(self, hidden_state):
        """Apply the projection followed by the resize module."""
        return self.resize(self.projection(hidden_state))
class DPTFeatureFusionStage(nn.Module):
    """Chain of fusion layers, one per neck stage, applied from the last feature map to the first."""

    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in config.neck_hidden_sizes:
            self.layers.append(DPTFeatureFusionLayer(config))

    def forward(self, hidden_states):
        """Return the running fused state after each layer, starting from the last input feature map."""
        # Process the feature maps in reverse order.
        reversed_states = hidden_states[::-1]

        # The first layer receives only the last hidden state.
        fused = self.layers[0](reversed_states[0])
        results = [fused]

        # Every following layer merges the running fused state with the next feature map.
        for layer, state in zip(self.layers[1:], reversed_states[1:]):
            fused = layer(fused, state)
            results.append(fused)

        return results
class DPTPreActResidualLayer(nn.Module):
    """
    Pre-activation residual layer, i.e. ResidualConvUnit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.use_batch_norm = config.use_batch_norm_in_fusion_residual

        # An explicit bias setting wins; otherwise a bias is used only when batch norm is off.
        explicit_bias = config.use_bias_in_fusion_residual
        conv_bias = explicit_bias if explicit_bias is not None else not self.use_batch_norm

        width = config.fusion_hidden_size

        def _conv3x3():
            return nn.Conv2d(width, width, kernel_size=3, stride=1, padding=1, bias=conv_bias)

        self.activation1 = nn.ReLU()
        self.convolution1 = _conv3x3()

        self.activation2 = nn.ReLU()
        self.convolution2 = _conv3x3()

        if self.use_batch_norm:
            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Apply (ReLU -> conv -> optional batch norm) twice and add the input back."""
        shortcut = hidden_state

        out = self.convolution1(self.activation1(hidden_state))
        if self.use_batch_norm:
            out = self.batch_norm1(out)

        out = self.convolution2(self.activation2(out))
        if self.use_batch_norm:
            out = self.batch_norm2(out)

        return out + shortcut
"""
Inputs:
hidden_state (`torch.Tensor`):
The input tensor representing the feature maps.
residual (`torch.Tensor`, *optional*):
The tensor representing residual feature maps from previous stages.
Default is `None`.
Returns:
`torch.Tensor`: The output tensor after feature fusion.
Raises:
None
"""
if residual is not None:
if hidden_state.shape != residual.shape:
residual = nn.functional.interpolate(
residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
)
hidden_state = hidden_state + self.residual_layer1(residual)
hidden_state = self.residual_layer2(hidden_state)
hidden_state = nn.functional.interpolate(
hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
)
hidden_state = self.projection(hidden_state)
return hidden_state
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
定义一个 DPT 模型,该模型是 DPTPreTrainedModel 的子类,用于输出原始的隐藏状态,没有额外的特定输出头部。
:param config: 模型的配置对象,包含了模型的各种参数和设置
:param add_pooling_layer: 是否添加池化层,默认为 True
"""
class DPTModel(DPTPreTrainedModel):
    """
    Bare DPT model that outputs raw hidden states, without any task-specific head on top.

    Args:
        config: Model configuration object.
        add_pooling_layer (`bool`, *optional*, defaults to `True`): Whether to add the pooling layer.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # ViT-style embeddings; the hybrid variant runs a backbone first.
        if config.is_hybrid:
            self.embeddings = DPTViTHybridEmbeddings(config)
        else:
            self.embeddings = DPTViTEmbeddings(config)
        self.encoder = DPTViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = DPTViTPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self):
        """Return the embeddings module itself for hybrid models, otherwise its patch-embedding submodule."""
        if self.config.is_hybrid:
            return self.embeddings
        else:
            return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prune attention heads of the model.

        Args:
            heads_to_prune: Dict of `{layer_index: list of heads to prune in that layer}`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndIntermediateActivations]:
        """
        Embed the image, run the encoder, apply the final layernorm and (optionally) the pooler.

        Args:
            pixel_values: Input pixel values of shape `(batch_size, num_channels, height, width)`.
            head_mask: Optional mask used to nullify selected attention heads.
            output_attentions: Whether to return attention weights; falls back to the config when `None`.
            output_hidden_states: Whether to return all hidden states; falls back to the config when `None`.
            return_dict: Whether to return an output object instead of a tuple; falls back to the config
                when `None`.

        Returns:
            A tuple or a `BaseModelOutputWithPoolingAndIntermediateActivations`, depending on `return_dict`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Expand / normalize the head mask to one entry per hidden layer.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, return_dict=return_dict)

        embedding_last_hidden_states = embedding_output[0] if not return_dict else embedding_output.last_hidden_states

        encoder_outputs = self.encoder(
            embedding_last_hidden_states,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            # Extra embedding outputs (the hybrid intermediate activations) are appended last.
            return head_outputs + encoder_outputs[1:] + embedding_output[1:]

        return BaseModelOutputWithPoolingAndIntermediateActivations(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            intermediate_activations=embedding_output.intermediate_activations,
        )
class DPTViTPooler(nn.Module):
    """Pool a sequence by passing its first token through a linear layer and a tanh."""

    def __init__(self, config: DPTConfig):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # The model is "pooled" by taking the hidden state of the first token only.
        leading_token = hidden_states[:, 0]
        return self.activation(self.dense(leading_token))
class DPTNeck(nn.Module):
    """
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # The reassemble stage is skipped for "swinv2" backbones.
        backbone_config = config.backbone_config
        skip_reassemble = backbone_config is not None and backbone_config.model_type in ["swinv2"]
        self.reassemble_stage = None if skip_reassemble else DPTReassembleStage(config)

        # One 3x3 convolution per stage, mapping each neck width to the fusion width.
        self.convs = nn.ModuleList()
        for in_channels in config.neck_hidden_sizes:
            conv = nn.Conv2d(in_channels, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)
            self.convs.append(conv)

        self.fusion_stage = DPTFeatureFusionStage(config)

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise ValueError("hidden_states should be a tuple or list of tensors")

        expected_count = len(self.config.neck_hidden_sizes)
        if len(hidden_states) != expected_count:
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # Turn the token sequences into image-like feature maps, when a reassemble stage exists.
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        features = []
        for stage_idx, feature in enumerate(hidden_states):
            features.append(self.convs[stage_idx](feature))

        return self.fusion_stage(features)
class DPTDepthEstimationHead(nn.Module):
    """
    Output head consisting of 3 convolutional layers. It halves the feature dimension in the first convolution,
    upsamples by a factor of 2 right after it, and reduces to a single non-negative channel.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        # Optional extra 3x3 convolution applied before the head.
        self.projection = None
        if config.add_projection:
            self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

        width = config.fusion_hidden_size
        half_width = width // 2
        head_layers = [
            nn.Conv2d(width, half_width, kernel_size=3, stride=1, padding=1),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
            nn.Conv2d(half_width, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        """Pick the feature map at `config.head_in_index` and return the depth map with its channel axis removed."""
        features = hidden_states[self.config.head_in_index]

        if self.projection is not None:
            features = nn.functional.relu(self.projection(features))

        depth = self.head(features)
        return depth.squeeze(dim=1)
"""
DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
"""
@add_start_docstrings(
    """
Add docstring for model initialization with DPT-specific documentation.
""",
    DPT_START_DOCSTRING,
)
class DPTForDepthEstimation(DPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Either a standalone backbone or a full DPTModel is created, never both:
        # a backbone only for non-hybrid configs that define `backbone_config`.
        self.backbone = None
        if config.backbone_config is not None and config.is_hybrid is False:
            self.backbone = load_backbone(config)
        else:
            self.dpt = DPTModel(config, add_pooling_layer=False)

        # Neck
        self.neck = DPTNeck(config)

        # Depth estimation head
        self.head = DPTDepthEstimationHead(config)

        self.post_init()

    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Perform forward pass of the DPT model for depth estimation.
        """
        # NOTE(review): in this excerpt the body is only `pass`, so the method returns `None`;
        # the actual depth-estimation logic is not present here — TODO confirm against the full source.
        pass
"""
DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
"""
@add_start_docstrings(
"""
Add docstring for model initialization with DPT-specific documentation.
""",
DPT_START_DOCSTRING,
)
class DPTForSemanticSegmentation(DPTPreTrainedModel):
def __init__(self, config):
    super().__init__(config)

    # Base model without the pooling layer.
    self.dpt = DPTModel(config, add_pooling_layer=False)

    # Neck
    self.neck = DPTNeck(config)

    # Segmentation head(s); the auxiliary head exists only when `config.use_auxiliary_head` is truthy.
    self.head = DPTSemanticSegmentationHead(config)
    self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None

    self.post_init()
@add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\dpt\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
from ...utils import OptionalDependencyNotAvailable

# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom.
_import_structure = {"configuration_dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"]}

# Image-processing entries are registered only when the vision dependency is available.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["feature_extraction_dpt"] = ["DPTFeatureExtractor"]
    _import_structure["image_processing_dpt"] = ["DPTImageProcessor"]

# Modeling entries are registered only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_dpt"] = [
        "DPT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DPTForDepthEstimation",
        "DPTForSemanticSegmentation",
        "DPTModel",
        "DPTPreTrainedModel",
    ]

# Under static type checking the names are imported directly, mirroring the table above.
if TYPE_CHECKING:
    from .configuration_dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_dpt import DPTFeatureExtractor
        from .image_processing_dpt import DPTImageProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_dpt import (
            DPT_PRETRAINED_MODEL_ARCHIVE_LIST,
            DPTForDepthEstimation,
            DPTForSemanticSegmentation,
            DPTModel,
            DPTPreTrainedModel,
        )
else:
    import sys

    # At runtime this module object is replaced in `sys.modules` by a `_LazyModule` built from the table;
    # presumably the submodules are then imported on first attribute access — TODO confirm in `_LazyModule`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\efficientformer\configuration_efficientformer.py
""" EfficientFormer 模型配置 """
from typing import List
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
# Maps a pretrained checkpoint identifier to the URL of its config.json file.
EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "snap-research/efficientformer-l1-300": (
        "https://huggingface.co/snap-research/efficientformer-l1-300/resolve/main/config.json"
    ),
}
class EfficientFormerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to
    instantiate an EfficientFormer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer
    [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored on the instance under an attribute of the same name; any extra keyword
    arguments are forwarded to [`PretrainedConfig`]. The list-valued arguments (`depths`, `hidden_sizes`,
    `downsamples`) are shallow-copied before being stored, so modifying them on one configuration never affects
    another configuration or the defaults. How each value is interpreted is defined by the modeling code, which is
    not part of this file.

    Example:

    ```
    >>> from transformers import EfficientFormerConfig, EfficientFormerModel

    >>> # Initializing an EfficientFormer efficientformer-l1 style configuration
    >>> configuration = EfficientFormerConfig()

    >>> # Initializing an EfficientFormerModel (with random weights) from the efficientformer-l1 style configuration
    >>> model = EfficientFormerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "efficientformer"

    def __init__(
        self,
        depths: List[int] = [3, 2, 6, 4],
        hidden_sizes: List[int] = [48, 96, 224, 448],
        downsamples: List[bool] = [True, True, True, True],
        dim: int = 448,
        key_dim: int = 32,
        attention_ratio: int = 4,
        resolution: int = 7,
        num_hidden_layers: int = 5,
        num_attention_heads: int = 8,
        mlp_expansion_ratio: int = 4,
        hidden_dropout_prob: float = 0.0,
        patch_size: int = 16,
        num_channels: int = 3,
        pool_size: int = 3,
        downsample_patch_size: int = 3,
        downsample_stride: int = 2,
        downsample_pad: int = 1,
        drop_path_rate: float = 0.0,
        num_meta3d_blocks: int = 1,
        distillation: bool = True,
        use_layer_scale: bool = True,
        layer_scale_init_value: float = 1e-5,
        hidden_act: str = "gelu",
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        image_size: int = 224,
        batch_norm_eps: float = 1e-05,
        **kwargs,
    ) -> None:
        # Imported locally so the fix does not depend on the file's import block.
        import copy

        super().__init__(**kwargs)

        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        # The list defaults in the signature are created once and shared by every
        # call. Storing them directly would let `config.hidden_sizes.append(...)`
        # silently change the defaults of all later instances, so keep a shallow
        # copy instead (copy.copy leaves None and tuples as they are).
        self.hidden_sizes = copy.copy(hidden_sizes)
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.depths = copy.copy(depths)
        self.mlp_expansion_ratio = mlp_expansion_ratio
        self.downsamples = copy.copy(downsamples)
        self.dim = dim
        self.key_dim = key_dim
        self.attention_ratio = attention_ratio
        self.resolution = resolution
        self.pool_size = pool_size
        self.downsample_patch_size = downsample_patch_size
        self.downsample_stride = downsample_stride
        self.downsample_pad = downsample_pad
        self.drop_path_rate = drop_path_rate
        self.num_meta3d_blocks = num_meta3d_blocks
        self.distillation = distillation
        self.use_layer_scale = use_layer_scale
        self.layer_scale_init_value = layer_scale_init_value
        self.image_size = image_size
        self.batch_norm_eps = batch_norm_eps
.\models\efficientformer\convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
"""从原始仓库转换EfficientFormer检查点。
URL: https://github.com/snap-research/EfficientFormer
"""
import argparse
import re
from pathlib import Path
import requests
import torch
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
from transformers import (
EfficientFormerConfig,
EfficientFormerForImageClassificationWithTeacher,
EfficientFormerImageProcessor,
)
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
def rename_key(old_name, num_meta4D_last_stage):
    """
    Translate one parameter name of the original EfficientFormer checkpoint into the converted model's naming.

    Args:
        old_name (`str`):
            Key of the original state dict, e.g. `"patch_embed.0.weight"` or `"network.6.4.mlp.fc1.weight"`.
        num_meta4D_last_stage (`int`):
            Block-index threshold inside the last stage (`network.<6 or higher>`): indices below it are mapped to
            `meta4D_layers`, the others to `meta3D_layers` (re-numbered from 0).

    Returns:
        `str`: The renamed key.

    Raises:
        ValueError: If a `patch_embed` key does not consist of exactly three dot-separated parts.
    """
    new_name = old_name

    if "patch_embed" in old_name:
        # Keys look like "patch_embed.<layer>.<param>"; only the layer index decides the new name.
        _, layer, _ = old_name.split(".")

        if layer == "0":
            new_name = old_name.replace("0", "convolution1")
        elif layer == "1":
            new_name = old_name.replace("1", "batchnorm_before")
        elif layer == "3":
            new_name = old_name.replace("3", "convolution2")
        else:
            new_name = old_name.replace("4", "batchnorm_after")

    if "network" in old_name and re.search(r"\d\.\d", old_name):
        # "network.<stage>.<block>." -- the block index may have one or two digits.
        two_digit_num = r"\b\d{2}\b"
        if re.search(two_digit_num, old_name):
            match = re.search(r"\d\.\d\d.", old_name).group()
        else:
            match = re.search(r"\d\.\d.", old_name).group()

        stage_index = match[0]
        # Everything between the first dot and the trailing character. Reading only
        # match[2] would truncate a two-digit block index to its first digit.
        block_index = match[2:-1]
        trimmed_name = old_name.replace(match, "")

        if int(stage_index) < 6:
            trimmed_name = trimmed_name.replace("network", stage_index + ".meta4D_layers.blocks." + block_index)
            new_name = "intermediate_stages." + trimmed_name
        else:
            if int(block_index) < num_meta4D_last_stage:
                trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + block_index)
            else:
                layer_index = str(int(block_index) - num_meta4D_last_stage)
                trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index)
                # meta3D blocks get layernorm/linear names here, which also keeps
                # them out of the batchnorm/convolution renames further down.
                if "norm1" in old_name:
                    trimmed_name = trimmed_name.replace("norm1", "layernorm1")
                elif "norm2" in old_name:
                    trimmed_name = trimmed_name.replace("norm2", "layernorm2")
                elif "fc1" in old_name:
                    trimmed_name = trimmed_name.replace("fc1", "linear_in")
                elif "fc2" in old_name:
                    trimmed_name = trimmed_name.replace("fc2", "linear_out")

            new_name = "last_stage." + trimmed_name

    elif "network" in old_name and re.search(r".\d.", old_name):
        # "network.<index>.<...>" without a block index.
        new_name = old_name.replace("network", "intermediate_stages")

    if "fc" in new_name:
        new_name = new_name.replace("fc", "convolution")
    elif ("norm1" in new_name) and ("layernorm1" not in new_name):
        new_name = new_name.replace("norm1", "batchnorm_before")
    elif ("norm2" in new_name) and ("layernorm2" not in new_name):
        new_name = new_name.replace("norm2", "batchnorm_after")

    if "proj" in new_name:
        new_name = new_name.replace("proj", "projection")

    # Classifier heads keep a top-level name; everything else is prefixed.
    if "dist_head" in new_name:
        new_name = new_name.replace("dist_head", "distillation_classifier")
    elif "head" in new_name:
        new_name = new_name.replace("head", "classifier")
    elif "patch_embed" in new_name:
        new_name = "efficientformer." + new_name
    elif new_name == "norm.weight" or new_name == "norm.bias":
        new_name = new_name.replace("norm", "layernorm")
        new_name = "efficientformer." + new_name
    else:
        new_name = "efficientformer.encoder." + new_name

    return new_name
def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage):
    """
    Rename every key of `checkpoint` in place with `rename_key` and return the same mapping.

    Args:
        checkpoint: Mutable mapping of parameter names to values; it is modified in place.
        num_meta4D_last_stage (`int`): Forwarded unchanged to `rename_key`.

    Returns:
        The `checkpoint` object that was passed in, now holding the renamed keys.
    """
    # Iterate over a snapshot of the keys because the mapping is mutated inside the loop.
    for original_key in checkpoint.copy().keys():
        value = checkpoint.pop(original_key)
        renamed_key = rename_key(original_key, num_meta4D_last_stage)
        checkpoint[renamed_key] = value

    return checkpoint
def prepare_img():
    """Download the COCO val2017 sample picture used for the sanity check and open it as a PIL image."""
    sample_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    response = requests.get(sample_url, stream=True)
    # Image.open reads from the raw, still-streaming response body.
    return Image.open(response.raw)
def convert_efficientformer_checkpoint(
    checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool
):
    """
    Convert an original EfficientFormer checkpoint, verify the converted model and save it.

    The function loads the original state dict, renames its keys, loads them into an
    `EfficientFormerForImageClassificationWithTeacher`, checks preprocessing and logits on a sample image, and writes
    the model and its image processor to `pytorch_dump_path`.

    Args:
        checkpoint_path (`str` or `Path`):
            Original checkpoint file. It is read with `torch.load` and must contain a `"model"` entry. The file name
            (without extension and without its last `_`-separated field) must contain `l1`, `l3` or `l7`.
        efficientformer_config_file (`str` or `Path`):
            JSON file passed to `EfficientFormerConfig.from_json_file`.
        pytorch_dump_path (`str` or `Path`):
            Output directory; created (including missing parents) if it does not exist.
        push_to_hub (`bool`):
            Whether to upload the model and image processor after saving.

    Raises:
        ValueError: If the model variant cannot be recognised from the checkpoint file name.
        AssertionError: If the preprocessed pixel values or the logits do not match the reference values.
    """
    # NOTE(review): torch.load unpickles the file -- only convert checkpoints from a trusted source.
    orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    config = EfficientFormerConfig.from_json_file(efficientformer_config_file)
    model = EfficientFormerForImageClassificationWithTeacher(config)
    # Path(...).name works for both str and Path arguments and for any path separator,
    # unlike the str-only `.split("/")`. The last "_"-separated field of the stem is dropped.
    checkpoint_file_name = Path(checkpoint_path).name
    model_name = "_".join(checkpoint_file_name.split(".")[0].split("_")[:-1])

    num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1
    new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage)

    model.load_state_dict(new_state_dict)
    model.eval()

    pillow_resamplings = {
        "bilinear": PILImageResampling.BILINEAR,
        "bicubic": PILImageResampling.BICUBIC,
        "nearest": PILImageResampling.NEAREST,
    }

    # Preprocess the sample image with the image processor ...
    image = prepare_img()
    image_size = 256
    crop_size = 224
    processor = EfficientFormerImageProcessor(
        size={"shortest_edge": image_size},
        crop_size={"height": crop_size, "width": crop_size},
        resample=pillow_resamplings["bicubic"],
    )
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # ... and with the equivalent torchvision pipeline; both must agree.
    image_transforms = Compose(
        [
            Resize(image_size, interpolation=pillow_resamplings["bicubic"]),
            CenterCrop(crop_size),
            ToTensor(),
            Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
        ]
    )
    original_pixel_values = image_transforms(image).unsqueeze(0)

    assert torch.allclose(original_pixel_values, pixel_values)

    # Verification only needs the forward pass, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(pixel_values)
    logits = outputs.logits

    expected_shape = (1, 1000)

    if "l1" in model_name:
        expected_logits = torch.Tensor(
            [-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328]
        )
        assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
        assert logits.shape == expected_shape
    elif "l3" in model_name:
        expected_logits = torch.Tensor(
            [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127]
        )
        assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
        assert logits.shape == expected_shape
    elif "l7" in model_name:
        # NOTE(review): unlike l1/l3, these reference logits are never compared with the
        # model output; only the shape is checked. Confirm whether that is intentional
        # before adding a torch.allclose assertion.
        expected_logits = torch.Tensor(
            [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878]
        )
        assert logits.shape == expected_shape
    else:
        raise ValueError(
            f"Unknown model checkpoint: {checkpoint_path}. Supported versions of efficientformer are l1, l3, and l7"
        )

    # parents=True so that a nested output directory does not fail on a missing parent.
    Path(pytorch_dump_path).mkdir(parents=True, exist_ok=True)
    model.save_pretrained(pytorch_dump_path)
    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
    processor.save_pretrained(pytorch_dump_path)
    print(f"Processor successfully saved at {pytorch_dump_path}")

    if push_to_hub:
        print("Pushing model to the hub...")

        # NOTE(review): the hub namespace "Bearnardd" is hard-coded in the repo id.
        model.push_to_hub(
            repo_id=f"Bearnardd/{pytorch_dump_path}",
            commit_message="Add model",
            use_temp_dir=True,
        )
        processor.push_to_hub(
            repo_id=f"Bearnardd/{pytorch_dump_path}",
            commit_message="Add image processor",
            use_temp_dir=True,
        )
if __name__ == "__main__":
    cli = argparse.ArgumentParser()

    # The three mandatory path options share the same shape, so they are registered
    # from a table; the order matches the order shown in --help.
    required_path_options = (
        ("--pytorch_model_path", "Path to EfficientFormer pytorch checkpoint."),
        ("--config_file", "The json file for EfficientFormer model config."),
        ("--pytorch_dump_path", "Path to the output PyTorch model."),
    )
    for option_name, option_help in required_path_options:
        cli.add_argument(option_name, default=None, type=str, required=True, help=option_help)

    # Uploading is enabled by default (see set_defaults); --no-push_to_hub turns it off.
    cli.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
    cli.add_argument(
        "--no-push_to_hub",
        dest="push_to_hub",
        action="store_false",
        help="Do not push model and image processor to the hub",
    )
    cli.set_defaults(push_to_hub=True)

    options = cli.parse_args()
    convert_efficientformer_checkpoint(
        checkpoint_path=options.pytorch_model_path,
        efficientformer_config_file=options.config_file,
        pytorch_dump_path=options.pytorch_dump_path,
        push_to_hub=options.push_to_hub,
    )