Transformers 源码解析(五十)
.\models\fnet\tokenization_fnet.py
""" Tokenization classes for FNet model."""
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/spiece.model",
"google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/spiece.model",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/fnet-base": 512,
"google/fnet-large": 512,
}
SPIECE_UNDERLINE = "▁"
class FNetTokenizer(PreTrainedTokenizer):
"""
Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from [`PreTrainedTokenizer`]
which contains most of the main methods. Users should refer to this superclass for more information regarding those
methods.
"""
sp_model = SentencePieceProcessor()
sp_model.Load(vocab_file)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.unk_token = unk_token
self.sep_token = sep_token
self.pad_token = pad_token
self.cls_token = cls_token
self.mask_token = mask_token
self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# 定义类变量,包含模型需要的文件名列表
vocab_files_names = VOCAB_FILES_NAMES
# 预训练模型的词汇文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 预训练模型的最大输入尺寸
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 模型输入名称列表
model_input_names = ["input_ids", "token_type_ids"]
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=True,
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# 如果 mask_token 是字符串,则创建一个特殊的 AddedToken 对象
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# 如果 cls_token 是字符串,则创建一个特殊的 AddedToken 对象
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
# 如果 sep_token 是字符串,则创建一个特殊的 AddedToken 对象
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
# 如果 mask_token 是字符串,则创建一个特殊的 AddedToken 对象
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
# 如果未提供 sp_model_kwargs,则设为默认空字典
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# 初始化参数赋值
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
# 使用 SentencePieceProcessor 初始化 sp_model,并加载词汇文件
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
# 调用父类的初始化方法
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def vocab_size(self):
# 返回当前 sp_model 的词汇大小
return len(self.sp_model)
def get_vocab(self):
# 生成词汇表,将 ID 映射到对应的词汇符号
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
# 将已添加的特殊符号编码器合并到词汇表中
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
# 获取对象状态的副本,去除 sp_model 属性以便序列化
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
# 恢复对象状态,包括 sp_model 属性的重新初始化
self.__dict__ = d
# 兼容旧版本的处理
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# 使用 SentencePieceProcessor 重新初始化 sp_model 并加载词汇文件
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
# 预处理文本,根据初始化时的设置进行处理
def preprocess_text(self, inputs):
if self.remove_space:
# 如果需要移除空格,则去除首尾空格并用单个空格重新连接单词
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
# 替换文本中的特殊引号格式为标准双引号
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
# 如果不保留重音符号,则使用Unicode标准化处理文本
outputs = unicodedata.normalize("NFKD", outputs)
# 过滤掉所有组合字符,保留文本中的基本字符
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
# 如果需要转换为小写,则将文本全部转换为小写
outputs = outputs.lower()
return outputs
# 使用SentencePiece模型进行分词
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string."""
# 预处理文本
text = self.preprocess_text(text)
# 使用SentencePiece模型对文本进行编码,返回编码后的片段列表
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
# 处理以数字结尾且倒数第二个字符为逗号的片段
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
# 将Token转换为对应的ID
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.PieceToId(token)
# 将ID转换为对应的Token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
# 从tokens序列中恢复成单个字符串
# 参考自transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
# 确保特殊token不会被SentencePiece模型解码
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
# 解码token_ids列表
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = False,
**kwargs,
) -> str:
# 调用父类的 _decode 方法,解码 token_ids 为文本
text = super()._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
spaces_between_special_tokens=spaces_between_special_tokens,
**kwargs,
)
# 模仿 Rust 分词器的行为:
# 在 <unk> 后面不加空格
if not spaces_between_special_tokens:
text = text.replace("<unk> ", "<unk>")
# 返回处理后的文本
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
通过连接和添加特殊标记,构建用于序列分类任务的模型输入。一个 FNet 序列的格式如下:
- 单个序列:`[CLS] X [SEP]`
- 序列对:`[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
将添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个 ID 列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入 ID 列表。
"""
sep = [self.sep_token_id] # 获取 SEP token 的 ID
cls = [self.cls_token_id] # 获取 CLS token 的 ID
if token_ids_1 is None:
return cls + token_ids_0 + sep # 返回单个序列的特殊标记输入
return cls + token_ids_0 + sep + token_ids_1 + sep # 返回序列对的特殊标记输入
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从未添加特殊标记的标记列表中检索序列 ID。在使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个 ID 列表,用于序列对。
already_has_special_tokens (`bool`, *optional*, 默认为 `False`):
标记列表是否已经格式化为模型的特殊标记。
Returns:
`List[int]`: 一个整数列表,范围在 [0, 1] 内:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
"""
根据输入的序列列表创建 token_type_ids。在使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
第一个序列的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个序列的 ID 列表,用于序列对。
Returns:
None
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence
pair mask has the following format: :
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary
"""
# Define the separator and classification tokens
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If token_ids_1 is None, return a mask with all zeros for the first sequence part
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Otherwise, concatenate the lengths of cls, token_ids_0, sep with all zeros,
# and concatenate the length of token_ids_1 and sep with all ones for the second sequence part
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if the save_directory exists; if not, log an error and return None
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Construct the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocab_file path is different from the output path and is a file,
# copy the current vocab_file to the output vocab_file path
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# If the current vocab_file does not exist, write the serialized sp_model proto to the output vocab_file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the path of the saved vocabulary file
return (out_vocab_file,)
.\models\fnet\tokenization_fnet_fast.py
""" Tokenization classes for FNet model."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_fnet import FNetTokenizer
else:
FNetTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/spiece.model",
"google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/spiece.model",
},
"tokenizer_file": {
"google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/tokenizer.json",
"google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/fnet-base": 512,
"google/fnet-large": 512,
}
SPIECE_UNDERLINE = "▁"
class FNetTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
[`AlbertTokenizerFast`]. Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "token_type_ids"]
slow_tokenizer_class = FNetTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=True,
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs,
):
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
通过连接并添加特殊 token 构建用于序列分类任务的模型输入。FNet 序列有以下格式:
- 单序列:`[CLS] X [SEP]`
- 序列对:`[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
要添加特殊 token 的 ID 列表
token_ids_1 (`List[int]`, *optional*):
第二个序列的可选 ID 列表,用于序列对任务
Returns:
`List[int]`: 包含适当特殊 token 的输入 ID 列表
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\fnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_fnet"] = ["FNetTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_fnet_fast"] = ["FNetTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_fnet"] = [
"FNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"FNetForMaskedLM",
"FNetForMultipleChoice",
"FNetForNextSentencePrediction",
"FNetForPreTraining",
"FNetForQuestionAnswering",
"FNetForSequenceClassification",
"FNetForTokenClassification",
"FNetLayer",
"FNetModel",
"FNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_fnet import FNetTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_fnet_fast import FNetTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fnet import (
FNET_PRETRAINED_MODEL_ARCHIVE_LIST,
FNetForMaskedLM,
FNetForMultipleChoice,
FNetForNextSentencePrediction,
FNetForPreTraining,
FNetForQuestionAnswering,
FNetForSequenceClassification,
FNetForTokenClassification,
FNetLayer,
FNetModel,
FNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\focalnet\configuration_focalnet.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/focalnet-tiny": "https://huggingface.co/microsoft/focalnet-tiny/resolve/main/config.json",
}
class FocalNetConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a
FocalNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the FocalNet
[microsoft/focalnet-tiny](https://huggingface.co/microsoft/focalnet-tiny) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import FocalNetConfig, FocalNetModel
>>> # Initializing a FocalNet microsoft/focalnet-tiny style configuration
>>> configuration = FocalNetConfig()
>>> # Initializing a model (with random weights) from the microsoft/focalnet-tiny style configuration
>>> model = FocalNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "focalnet"
def __init__(
self,
image_size=224,
patch_size=4,
num_channels=3,
embed_dim=96,
use_conv_embed=False,
hidden_sizes=[192, 384, 768, 768],
depths=[2, 2, 6, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
hidden_act="gelu",
mlp_ratio=4.0,
hidden_dropout_prob=0.0,
drop_path_rate=0.1,
use_layerscale=False,
layerscale_value=1e-4,
use_post_layernorm=False,
use_post_layernorm_in_modulation=False,
normalize_modulator=False,
initializer_range=0.02,
layer_norm_eps=1e-5,
encoder_stride=32,
out_features=None,
out_indices=None,
**kwargs,
):
"""
初始化方法,用于配置FocalNet模型的各种参数和选项
Parameters:
- image_size (int): 输入图像的尺寸,默认为224
- patch_size (int): 感兴趣区域(patch)的尺寸,默认为4
- num_channels (int): 输入图像的通道数,默认为3(RGB)
- embed_dim (int): 嵌入维度,默认为96
- use_conv_embed (bool): 是否使用卷积进行嵌入,默认为False
- hidden_sizes (list of int): 隐藏层的大小列表,默认为[192, 384, 768, 768]
- depths (list of int): 各阶段的深度列表,默认为[2, 2, 6, 2]
- focal_levels (list of int): 各阶段的聚焦级别列表,默认为[2, 2, 2, 2]
- focal_windows (list of int): 各阶段的聚焦窗口大小列表,默认为[3, 3, 3, 3]
- hidden_act (str): 隐藏层激活函数,默认为"gelu"
- mlp_ratio (float): MLP扩展比例,默认为4.0
- hidden_dropout_prob (float): 隐藏层的dropout概率,默认为0.0
- drop_path_rate (float): drop path的概率,默认为0.1
- use_layerscale (bool): 是否使用层标准化,默认为False
- layerscale_value (float): 层标准化的值,默认为1e-4
- use_post_layernorm (bool): 是否使用后层标准化,默认为False
- use_post_layernorm_in_modulation (bool): 是否在调制中使用后层标准化,默认为False
- normalize_modulator (bool): 是否正常化调制器,默认为False
- initializer_range (float): 初始化范围,默认为0.02
- layer_norm_eps (float): 层标准化的epsilon值,默认为1e-5
- encoder_stride (int): 编码器步长,默认为32
- out_features (None or list of int): 输出特征的索引列表,默认为None
- out_indices (None or list of int): 输出索引的列表,默认为None
- **kwargs: 其他参数
Notes:
- Parameters prefixed with 'use_' control the activation of various features in the model.
- The defaults are set to mimic the microsoft/focalnet-tiny architecture as closely as possible.
"""
super().__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.use_conv_embed = use_conv_embed
self.hidden_sizes = hidden_sizes
self.depths = depths
self.focal_levels = focal_levels
self.focal_windows = focal_windows
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.hidden_dropout_prob = hidden_dropout_prob
self.drop_path_rate = drop_path_rate
self.use_layerscale = use_layerscale
self.layerscale_value = layerscale_value
self.use_post_layernorm = use_post_layernorm
self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation
self.normalize_modulator = normalize_modulator
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.encoder_stride = encoder_stride
self.out_features = out_features
self.out_indices = out_indices
):
super().__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.use_conv_embed = use_conv_embed
self.hidden_sizes = hidden_sizes
self.depths = depths
self.focal_levels = focal_levels
self.focal_windows = focal_windows
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.hidden_dropout_prob = hidden_dropout_prob
self.drop_path_rate = drop_path_rate
self.use_layerscale = use_layerscale
self.layerscale_value = layerscale_value
self.use_post_layernorm = use_post_layernorm
self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation
self.normalize_modulator = normalize_modulator
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.encoder_stride = encoder_stride
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
.\models\focalnet\convert_focalnet_to_hf_format.py
import argparse
import json
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms
from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
def get_focalnet_config(model_name):
depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2]
use_conv_embed = True if "large" in model_name or "huge" in model_name else False
use_post_layernorm = True if "large" in model_name or "huge" in model_name else False
use_layerscale = True if "large" in model_name or "huge" in model_name else False
if "large" in model_name or "xlarge" in model_name or "huge" in model_name:
if "fl3" in model_name:
focal_levels = [3, 3, 3, 3]
focal_windows = [5, 5, 5, 5]
elif "fl4" in model_name:
focal_levels = [4, 4, 4, 4]
focal_windows = [3, 3, 3, 3]
if "tiny" in model_name or "small" in model_name or "base" in model_name:
focal_windows = [3, 3, 3, 3]
if "lrf" in model_name:
focal_levels = [3, 3, 3, 3]
else:
focal_levels = [2, 2, 2, 2]
if "tiny" in model_name:
embed_dim = 96
elif "small" in model_name:
embed_dim = 96
elif "base" in model_name:
embed_dim = 128
elif "large" in model_name:
embed_dim = 192
elif "xlarge" in model_name:
embed_dim = 256
elif "huge" in model_name:
embed_dim = 352
repo_id = "huggingface/label-files"
if "large" in model_name or "huge" in model_name:
filename = "imagenet-22k-id2label.json"
else:
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
config = FocalNetConfig(
embed_dim=embed_dim,
depths=depths,
focal_levels=focal_levels,
focal_windows=focal_windows,
use_conv_embed=use_conv_embed,
id2label=id2label,
label2id=label2id,
use_post_layernorm=use_post_layernorm,
use_layerscale=use_layerscale,
)
return config
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.norm")
if "layers" in name:
name = "encoder." + name
if "encoder.layers" in name:
name = name.replace("encoder.layers", "encoder.stages")
if "downsample.proj" in name:
name = name.replace("downsample.proj", "downsample.projection")
if "blocks" in name:
name = name.replace("blocks", "layers")
if "modulation.f.weight" in name or "modulation.f.bias" in name:
name = name.replace("modulation.f", "modulation.projection_in")
if "modulation.h.weight" in name or "modulation.h.bias" in name:
name = name.replace("modulation.h", "modulation.projection_context")
if "modulation.proj.weight" in name or "modulation.proj.bias" in name:
name = name.replace("modulation.proj", "modulation.projection_out")
if name == "norm.weight":
name = "layernorm.weight"
if name == "norm.bias":
name = "layernorm.bias"
if "head" in name:
name = name.replace("head", "classifier")
else:
name = "focalnet." + name
return name
for key in state_dict.copy().keys():
val = state_dict.pop(key)
state_dict[rename_key(key)] = val
config = get_focalnet_config(model_name)
model = FocalNetForImageClassification(config)
model.eval()
model.load_state_dict(state_dict)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
processor = BitImageProcessor(
do_resize=True,
size={"shortest_edge": 256},
resample=PILImageResampling.BILINEAR,
do_center_crop=True,
crop_size=224,
do_normalize=True,
image_mean=IMAGENET_DEFAULT_MEAN,
image_std=IMAGENET_DEFAULT_STD,
)
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
image_transforms = transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
original_pixel_values = image_transforms(image).unsqueeze(0)
assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4)
outputs = model(**inputs)
predicted_class_idx = outputs.logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])
print("First values of logits:", outputs.logits[0, :3])
if model_name == "focalnet-tiny":
expected_slice = torch.tensor([0.2166, -0.4368, 0.2191])
elif model_name == "focalnet-tiny-lrf":
expected_slice = torch.tensor([1.1669, 0.0125, -0.1695])
elif model_name == "focalnet-small":
expected_slice = torch.tensor([0.4917, -0.0430, 0.1341])
elif model_name == "focalnet-small-lrf":
expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331])
elif model_name == "focalnet-base":
expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730])
elif model_name == "focalnet-base-lrf":
expected_slice = torch.tensor([0.5306, -0.0483, -0.3928])
assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor of {model_name} to the hub...")
model.push_to_hub(f"{model_name}")
processor.push_to_hub(f"{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="focalnet-tiny",
type=str,
help="Name of the FocalNet model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub.",
)
args = parser.parse_args()
convert_focalnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\focalnet\modeling_focalnet.py
""" PyTorch FocalNet model."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "FocalNetConfig"
_CHECKPOINT_FOR_DOC = "microsoft/focalnet-tiny"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]
_IMAGE_CLASS_CHECKPOINT = "microsoft/focalnet-tiny"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/focalnet-tiny",
]
@dataclass
class FocalNetEncoderOutput(ModelOutput):
"""
FocalNet encoder's outputs, with potential hidden states.
This dataclass inherits from ModelOutput provided by Hugging Face.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态输出序列,形状为(batch_size, sequence_length, hidden_size)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态组成的元组,包括初始嵌入层输出。
形状为(batch_size, sequence_length, hidden_size)。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态组成的元组,包括初始嵌入层输出,并且重新整形以包括空间维度。
形状为(batch_size, hidden_size, height, width)。
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class FocalNetModelOutput(ModelOutput):
"""
FocalNet model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class FocalNetMaskedImageModelingOutput(ModelOutput):
"""
FocalNet masked image model outputs.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Masked image modeling (MLM) loss.
图像模型的掩码损失,如果提供了 `bool_masked_pos`,则返回。
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed pixel values.
重建后的像素数值,形状为 `(batch_size, num_channels, height, width)`。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer plus the initial embedding outputs.
模型在每层输出和初始嵌入输出时的隐藏状态的元组。
形状为 `(batch_size, sequence_length, hidden_size)`。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to include the spatial dimensions.
模型在每层输出和初始嵌入输出时的隐藏状态的元组,已重塑以包括空间维度。
形状为 `(batch_size, hidden_size, height, width)`。
"""
# 可选的损失值,如果没有提供将为 None
loss: Optional[torch.FloatTensor] = None
# 可选的重建像素值
reconstruction: torch.FloatTensor = None
# 可选的隐藏状态,表示模型在每层输出和初始嵌入输出时的隐藏状态
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# 可选的重塑后的隐藏状态,表示模型在每层输出和初始嵌入输出时的隐藏状态,并已重塑以包括空间维度
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class FocalNetImageClassifierOutput(ModelOutput):
"""
FocalNet outputs for image classification.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
loss: Optional[torch.FloatTensor] = None # 损失值,用于分类或回归任务的损失
logits: torch.FloatTensor = None # 分类(或回归)得分,经过 SoftMax 之前的输出
hidden_states: Optional[Tuple[torch.FloatTensor]] = None # 每个层输出的隐藏状态,包括初始嵌入输出
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None # 每个层输出的隐藏状态,包括空间维度的重塑
class FocalNetEmbeddings(nn.Module):
"""
Construct the patch embeddings and layernorm. Optionally, also the mask token.
"""
def __init__(self, config, use_mask_token=False):
super().__init__()
self.patch_embeddings = FocalNetPatchEmbeddings(
config=config,
image_size=config.image_size,
patch_size=config.patch_size,
num_channels=config.num_channels,
embed_dim=config.embed_dim,
use_conv_embed=config.use_conv_embed,
is_stem=True,
) # 创建图像的补丁嵌入
self.patch_grid = self.patch_embeddings.grid_size # 获取补丁嵌入的网格大小
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None # 可选地创建掩码令牌
self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps) # LayerNorm 归一化层
self.dropout = nn.Dropout(config.hidden_dropout_prob) # 随机失活层
def forward(
self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
# 前向传播方法,接收像素值和可选的掩码位置张量
)
# 前向传播方法,接收像素值和可选的掩码位置张量
) -> Tuple[torch.Tensor]:
# 获取图像的补丁嵌入和输出维度信息
embeddings, output_dimensions = self.patch_embeddings(pixel_values)
# 对嵌入向量进行归一化处理
embeddings = self.norm(embeddings)
# 获取当前批次的大小和序列长度
batch_size, seq_len, _ = embeddings.size()
# 如果存在布尔类型的遮罩位置信息
if bool_masked_pos is not None:
# 将遮罩标记扩展到与嵌入向量相同的维度
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
# 将布尔类型的遮罩位置转换成与mask_tokens相同类型的张量
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
# 使用遮罩标记替换被遮罩的视觉标记
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
# 对嵌入向量应用dropout操作
embeddings = self.dropout(embeddings)
# 返回处理后的嵌入向量和输出维度信息
return embeddings, output_dimensions
class FocalNetPatchEmbeddings(nn.Module):
def __init__(
self,
config,
image_size,
patch_size,
num_channels,
embed_dim,
add_norm=False,
use_conv_embed=False,
is_stem=False,
):
super().__init__()
# 将图像大小和补丁大小转换为元组,如果它们不是可迭代对象,则分别使用默认值
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# 计算图像中的补丁数量
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
# 计算网格大小,即图像尺寸与补丁尺寸的整除结果
self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
if use_conv_embed:
# 如果选择使用卷积嵌入,则根据是否是 stem 层选择不同的卷积参数
if is_stem:
kernel_size = 7
padding = 2
stride = 4
else:
kernel_size = 3
padding = 1
stride = 2
# 设置卷积投影层,根据参数创建卷积层对象
self.projection = nn.Conv2d(
num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
)
else:
# 否则,使用常规的卷积设置补丁大小作为卷积核大小和步幅
self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
if add_norm:
# 如果指定要添加 LayerNorm,则创建 LayerNorm 层
self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
else:
# 否则,不添加标准化层
self.norm = None
def maybe_pad(self, pixel_values, height, width):
# 如果图像宽度不能被补丁宽度整除,则对像素值进行填充
if width % self.patch_size[1] != 0:
pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
pixel_values = nn.functional.pad(pixel_values, pad_values)
# 如果图像高度不能被补丁高度整除,则对像素值进行填充
if height % self.patch_size[0] != 0:
pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
pixel_values = nn.functional.pad(pixel_values, pad_values)
return pixel_values
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
# 获取输入张量的形状信息
_, num_channels, height, width = pixel_values.shape
# 检查通道数是否与配置中指定的数值相符合
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# 对输入像素进行可能的填充,使其能够被补丁大小整除
pixel_values = self.maybe_pad(pixel_values, height, width)
# 使用投影层进行特征提取,得到嵌入特征张量
embeddings = self.projection(pixel_values)
# 获取嵌入特征张量的新形状信息
_, _, height, width = embeddings.shape
output_dimensions = (height, width)
# 对嵌入特征进行展平和转置操作,以便后续处理
embeddings = embeddings.flatten(2).transpose(1, 2)
if self.norm is not None:
# 如果存在 LayerNorm 层,则对嵌入特征进行标准化处理
embeddings = self.norm(embeddings)
return embeddings, output_dimensions
# Copied from transformers.models.beit.modeling_beit.drop_path
# 定义一个函数用于在神经网络中应用路径丢弃(Stochastic Depth),每个样本都可能执行该操作(当应用于残差块的主路径时)。
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
# 如果丢失概率为0或者当前非训练状态,则直接返回输入
if drop_prob == 0.0 or not training:
return input
# 计算保留概率
keep_prob = 1 - drop_prob
# 创建一个与输入形状兼容的随机张量,用于随机选择保留的路径
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # 适用于不同维度的张量,而不仅仅是二维卷积网络
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # 对随机张量进行二值化处理
# 对输入进行路径丢弃操作,并返回处理后的输出
output = input.div(keep_prob) * random_tensor
return output
# 从transformers.models.beit.modeling_beit.BeitDropPath复制并更改为FocalNet
class FocalNetDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 调用上面定义的drop_path函数来实现路径丢弃操作
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
# 返回当前模块的额外描述信息,这里是丢弃概率(drop_prob)
return "p={}".format(self.drop_prob)
class FocalNetModulation(nn.Module):
# 这里可以添加FocalNetModulation的具体实现,以进行FocalNet的特定调制
def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
super().__init__()
self.dim = dim # 设置对象的维度属性
self.focal_window = config.focal_windows[index] # 获取配置中的焦点窗口大小
self.focal_level = config.focal_levels[index] # 获取配置中的焦点级别
self.focal_factor = focal_factor # 设置焦点因子
self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation # 是否使用后层标准化调制
self.normalize_modulator = config.normalize_modulator # 是否标准化调制器
self.projection_in = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias) # 输入投影层
self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias) # 上下文投影卷积层
self.activation = nn.GELU() # 激活函数
self.projection_out = nn.Linear(dim, dim) # 输出投影层
self.projection_dropout = nn.Dropout(projection_dropout) # 投影层的dropout
self.focal_layers = nn.ModuleList() # 焦点层列表
self.kernel_sizes = [] # 焦点层的卷积核尺寸列表
for k in range(self.focal_level):
kernel_size = self.focal_factor * k + self.focal_window # 计算每个焦点层的卷积核尺寸
self.focal_layers.append(
nn.Sequential(
nn.Conv2d(
dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias=False
), # 焦点层的卷积操作
nn.GELU(), # 焦点层后的激活函数
)
)
self.kernel_sizes.append(kernel_size) # 将卷积核尺寸添加到列表中
if self.use_post_layernorm_in_modulation:
self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps) # 后层标准化层
def forward(self, hidden_state):
"""
Args:
hidden_state:
Input features with shape of (batch_size, height, width, num_channels)
"""
num_channels = hidden_state.shape[-1] # 获取输入张量中的通道数
# pre linear projection
x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous() # 线性投影操作,并对张量维度进行转置和连续化处理
q, ctx, self.gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1) # 按通道数切分张量为q, ctx和门控信号
# context aggreation
ctx_all = 0 # 初始化上下文聚合变量
for level in range(self.focal_level):
ctx = self.focal_layers[level](ctx) # 使用每个焦点层处理上下文
ctx_all = ctx_all + ctx * self.gates[:, level : level + 1] # 加权累积上下文特征
ctx_global = self.activation(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) # 全局上下文特征的平均池化和激活处理
ctx_all = ctx_all + ctx_global * self.gates[:, self.focal_level :] # 添加全局上下文特征的加权结果
# normalize context
if self.normalize_modulator:
ctx_all = ctx_all / (self.focal_level + 1) # 如果需要,对上下文进行标准化
# focal modulation
self.modulator = self.projection_context(ctx_all) # 使用上下文调制器对输入进行调制
x_out = q * self.modulator # 根据调制结果对q进行调制
x_out = x_out.permute(0, 2, 3, 1).contiguous() # 对输出张量进行转置和连续化处理
if self.use_post_layernorm_in_modulation:
x_out = self.layernorm(x_out) # 如果需要,对调制后的输出进行后层标准化处理
# post linear projection
x_out = self.projection_out(x_out) # 输出层的线性投影
x_out = self.projection_dropout(x_out) # 输出层的dropout处理
return x_out # 返回最终的输出张量
# 定义一个名为 FocalNetLayer 的自定义神经网络层
class FocalNetLayer(nn.Module):
r"""Focal Modulation Network layer (block).
Args:
config (`FocalNetConfig`):
Model config.
index (`int`):
Layer index.
dim (`int`):
Number of input channels.
input_resolution (`Tuple[int]`):
Input resolution.
drop_path (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
"""
# 初始化函数,用于设置层的各种属性和参数
def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
super().__init__()
self.config = config
# 设置层特定的属性
self.dim = dim # 输入通道数
self.input_resolution = input_resolution # 输入分辨率
# 设置通用属性
self.drop = config.hidden_dropout_prob # 隐藏层的 Dropout 概率
self.use_post_layernorm = config.use_post_layernorm # 是否使用层归一化
# 第一个层归一化模块
self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# FocalNetModulation 类的实例化,用于模块化调节
self.modulation = FocalNetModulation(
config=config,
index=index,
dim=dim,
projection_dropout=self.drop,
)
# 根据 drop_path 参数选择是否应用随机深度
self.drop_path = FocalNetDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
# 第二个层归一化模块
self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 计算 MLP 隐藏层的维度
mlp_hidden_dim = int(dim * config.mlp_ratio)
# 实例化 FocalNetMlp 类,定义 MLP 结构
self.mlp = FocalNetMlp(config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=self.drop)
# 初始化 layerscale 的 gamma 参数
self.gamma_1 = 1.0
self.gamma_2 = 1.0
if config.use_layerscale:
self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
# 定义前向传播函数,接收隐藏状态和输入尺寸作为参数
def forward(self, hidden_state, input_dimensions):
# 解包输入尺寸为高度和宽度
height, width = input_dimensions
# 获取隐藏状态的批大小、深度和通道数
batch_size, _, num_channels = hidden_state.shape
# 保存原始的隐藏状态作为快捷方式
shortcut = hidden_state
# Focal Modulation(集中调制)
# 如果未使用后层归一化,则对隐藏状态进行归一化处理
hidden_state = hidden_state if self.use_post_layernorm else self.norm1(hidden_state)
# 将隐藏状态重新调整形状为(batch_size, height, width, num_channels)
hidden_state = hidden_state.view(batch_size, height, width, num_channels)
# 应用调制器(modulation)到隐藏状态,再将其展平为(batch_size, height * width, num_channels)
hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
# 如果使用后层归一化,则再次对隐藏状态进行归一化处理
hidden_state = hidden_state if not self.use_post_layernorm else self.norm1(hidden_state)
# FFN(Feed Forward Network,前馈神经网络)
# 结合快捷方式和经过DropPath处理的隐藏状态乘以gamma_1
hidden_state = shortcut + self.drop_path(self.gamma_1 * hidden_state)
# 将DropPath处理后的MLP输出乘以gamma_2加回到隐藏状态上
hidden_state = hidden_state + self.drop_path(
self.gamma_2
* (self.norm2(self.mlp(hidden_state)) if self.use_post_layernorm else self.mlp(self.norm2(hidden_state)))
)
# 返回最终的隐藏状态
return hidden_state
# 定义 FocalNetStage 类,继承自 nn.Module,用于 FocalNet 的每个阶段处理
class FocalNetStage(nn.Module):
# 初始化方法,接收配置、阶段索引和输入分辨率作为参数
def __init__(self, config, index, input_resolution):
super().__init__()
# 将配置参数保存到实例变量中
self.config = config
# 计算深度列表的长度,即阶段数
self.num_stages = len(config.depths)
# 计算当前阶段的嵌入维度和输出维度
embed_dim = [config.embed_dim * (2**i) for i in range(self.num_stages)]
dim = embed_dim[index]
out_dim = embed_dim[index + 1] if (index < self.num_stages - 1) else None
# 如果不是最后一个阶段,则设置下采样函数
downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None
# 根据随机深度衰减规则生成当前阶段的丢弃路径率
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
drop_path = dpr[sum(config.depths[:index]):sum(config.depths[:index + 1])]
# 创建当前阶段的层列表,每一层使用 FocalNetLayer 类处理
self.layers = nn.ModuleList(
[
FocalNetLayer(
config=config,
index=index,
dim=dim,
input_resolution=input_resolution,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
)
for i in range(config.depths[index])
]
)
# 如果有下采样函数,则初始化它
if downsample is not None:
self.downsample = downsample(
config=config,
image_size=input_resolution,
patch_size=2,
num_channels=dim,
embed_dim=out_dim,
add_norm=True,
use_conv_embed=config.use_conv_embed,
is_stem=False,
)
else:
self.downsample = None
# 初始化指针状态为 False
self.pointing = False
# 前向传播方法,接收隐藏状态张量和输入尺寸元组作为参数,返回包含三个张量的元组
def forward(self, hidden_states: torch.Tensor, input_dimensions: Tuple[int, int]) -> Tuple[torch.Tensor]:
height, width = input_dimensions
# 遍历所有层,逐层进行前向传播计算
for layer_module in self.layers:
hidden_states = layer_module(hidden_states, input_dimensions)
# 在进行下采样之前保存当前隐藏状态
hidden_states_before_downsampling = hidden_states
# 如果有下采样函数,则对隐藏状态进行形状变换和下采样操作
if self.downsample is not None:
height, width = input_dimensions
hidden_states = hidden_states.transpose(1, 2).reshape(
hidden_states_before_downsampling.shape[0], -1, height, width
)
hidden_states, output_dimensions = self.downsample(hidden_states)
else:
# 如果没有下采样函数,则直接使用原始的输入尺寸作为输出尺寸
output_dimensions = (height, width, height, width)
# 返回阶段的输出元组,包括下采样后的隐藏状态、未下采样前的隐藏状态和输出尺寸
stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
return stage_outputs
# 初始化方法,用于创建 FocalNet 对象实例
def __init__(self, config, grid_size):
# 调用父类的初始化方法
super().__init__()
# 获取深度网络层数
self.num_stages = len(config.depths)
# 保存配置对象
self.config = config
# 创建一个包含多个 FocalNetStage 实例的列表,每个实例对应一个深度网络阶段
self.stages = nn.ModuleList(
[
FocalNetStage(
config=config,
index=i_layer,
# 设置输入分辨率,根据层次 index 和网格大小计算
input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
)
for i_layer in range(self.num_stages)
]
)
# 梯度检查点设为 False
self.gradient_checkpointing = False
# 前向传播方法,接收隐藏状态张量、输入维度、可选输出隐藏状态标志、可选输出前采样隐藏状态标志和返回字典标志
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, FocalNetEncoderOutput]:
# 如果需要输出隐藏状态,则初始化空元组来存储所有隐藏状态
all_hidden_states = () if output_hidden_states else None
# 如果需要输出隐藏状态,则初始化空元组来存储所有重塑后的隐藏状态
all_reshaped_hidden_states = () if output_hidden_states else None
# 如果需要输出隐藏状态,则重塑隐藏状态张量的形状
if output_hidden_states:
batch_size, _, hidden_size = hidden_states.shape
# rearrange b (h w) c -> b c h w
reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
# 遍历所有阶段模块进行处理
for i, stage_module in enumerate(self.stages):
# 如果启用了梯度检查点且正在训练阶段,则使用梯度检查点函数来计算阶段输出
if self.gradient_checkpointing and self.training:
stage_outputs = self._gradient_checkpointing_func(
stage_module.__call__,
hidden_states,
input_dimensions,
)
else:
# 否则直接调用阶段模块来计算阶段输出
stage_outputs = stage_module(hidden_states, input_dimensions)
# 更新隐藏状态为当前阶段的主要输出
hidden_states = stage_outputs[0]
# 保存当前阶段的下采样之前的隐藏状态
hidden_states_before_downsampling = stage_outputs[1]
# 更新输出的尺寸维度信息
output_dimensions = stage_outputs[2]
# 更新输入尺寸为当前输出尺寸的高度和宽度
input_dimensions = (output_dimensions[-2], output_dimensions[-1])
# 如果需要输出隐藏状态且输出下采样之前的隐藏状态
if output_hidden_states and output_hidden_states_before_downsampling:
batch_size, _, hidden_size = hidden_states_before_downsampling.shape
# rearrange b (h w) c -> b c h w
# 使用原始(未下采样)的高度和宽度来重塑隐藏状态张量的形状
reshaped_hidden_state = hidden_states_before_downsampling.view(
batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
)
reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states_before_downsampling,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
# 如果需要输出隐藏状态但不需要输出下采样之前的隐藏状态
elif output_hidden_states and not output_hidden_states_before_downsampling:
batch_size, _, hidden_size = hidden_states.shape
# rearrange b (h w) c -> b c h w
# 重塑隐藏状态张量的形状
reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
# 如果不需要以字典形式返回结果,则返回元组形式的隐藏状态和所有隐藏状态
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
# 否则以 FocalNetEncoderOutput 类的形式返回结果,包括最后的隐藏状态、所有隐藏状态和所有重塑后的隐藏状态
return FocalNetEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
reshaped_hidden_states=all_reshaped_hidden_states,
)
# 从transformers.models.swin.modeling_swin.SwinPreTrainedModel复制过来,并将Swin->FocalNet,swin->focalnet
class FocalNetPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 使用FocalNetConfig作为配置类
config_class = FocalNetConfig
# base_model_prefix指定模型前缀为"focalnet"
base_model_prefix = "focalnet"
# 主输入的名称为"pixel_values"
main_input_name = "pixel_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# 初始化权重:对于Linear和Conv2d层使用正态分布初始化权重,均值为0,标准差为config.initializer_range
if isinstance(module, (nn.Linear, nn.Conv2d)):
# 与TF版本稍有不同,TF版本使用截断正态分布进行初始化
# 参考 https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
# 如果存在偏置项,则将其初始化为0
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
# 如果是LayerNorm层,将偏置项初始化为0,权重初始化为1
module.bias.data.zero_()
module.weight.data.fill_(1.0)
# FOCALNET_START_DOCSTRING是FocalNetModel的文档字符串的一部分,包含模型的基本用法和参数说明
FOCALNET_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`FocalNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# FOCALNET_INPUTS_DOCSTRING是FocalNetModel的输入参数说明文档字符串的一部分,详细描述了输入参数的类型和含义
FOCALNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# 使用@add_start_docstrings注解,将FocalNetModel的文档字符串合并生成
@add_start_docstrings(
"The bare FocalNet Model outputting raw hidden-states without any specific head on top.",
FOCALNET_START_DOCSTRING,
)
# 定义FocalNetModel类,继承自FocalNetPreTrainedModel类
class FocalNetModel(FocalNetPreTrainedModel):
pass # Placeholder for future model implementation
# 初始化函数,用于初始化模型
def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
# 调用父类的初始化函数
super().__init__(config)
# 将配置参数保存到对象中
self.config = config
# 计算深度列表的长度,确定特征数量
self.num_stages = len(config.depths)
self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
# 创建嵌入层对象
self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
# 创建编码器对象,使用嵌入层的 patch_grid 参数
self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)
# 创建 LayerNorm 层,设置归一化的特征数量和 epsilon 值
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
# 如果指定要添加池化层,则创建 AdaptiveAvgPool1d 层,用于池化操作
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# 初始化权重并进行最终处理
self.post_init()
# 获取输入嵌入的函数,返回 patch_embeddings 属性
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
# 前向传播函数,根据输入参数进行模型的前向计算
@add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=FocalNetModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, FocalNetModelOutput]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
# 如果未指定 output_hidden_states,则使用配置中的设定
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定 return_dict,则使用配置中的设定
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 pixel_values 为 None,则抛出数值错误异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 调用嵌入层的前向函数,得到嵌入输出和输入维度
embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
# 调用编码器的前向函数,得到编码器的输出
encoder_outputs = self.encoder(
embedding_output,
input_dimensions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取序列输出并进行 LayerNorm 处理
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
# 初始化池化输出为 None
pooled_output = None
# 如果池化层不为 None,则进行池化操作和扁平化处理
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
# 如果不返回字典,则返回元组形式的输出
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# 如果返回字典,则返回 FocalNetModelOutput 类型的对象
return FocalNetModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
FocalNet Model with a decoder on top for masked image modeling.
This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).
<Tip>
Note that we provide a script to pre-train this model on custom data in our [examples
directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
</Tip>
""",
FOCALNET_START_DOCSTRING,
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
"""
FocalNet Model for masked image modeling, extending FocalNetPreTrainedModel.
Inherits from FocalNetPreTrainedModel and implements a model architecture with a decoder.
"""
def __init__(self, config):
"""
Initializes the FocalNetForMaskedImageModeling.
Args:
config: FocalNet configuration class instance.
"""
super().__init__(config)
# Initialize FocalNet model with specified configuration
self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)
# Calculate number of stages and features for the decoder
self.num_stages = len(config.depths)
num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
# Define decoder architecture using convolution and pixel shuffle
self.decoder = nn.Sequential(
nn.Conv2d(
in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
),
nn.PixelShuffle(config.encoder_stride),
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FocalNetMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
ImageNet.
""",
FOCALNET_START_DOCSTRING,
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
"""
FocalNet Model for image classification tasks, extending FocalNetPreTrainedModel.
Inherits from FocalNetPreTrainedModel and implements a model architecture with an image classification head.
"""
# Copied from transformers.models.swin.modeling_swin.SwinForImageClassification.__init__ with Swin->FocalNet, swin->focalnet
def __init__(self, config):
"""
Initializes the FocalNetForImageClassification.
Args:
config: FocalNet configuration class instance.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.focalnet = FocalNetModel(config)
# Define classifier head
self.classifier = (
nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=FocalNetImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, FocalNetImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 如果 return_dict 为 None,则使用 self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 FocalNet 模型进行前向传播
outputs = self.focalnet(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取池化后的输出,通常是经过全局平均池化的结果
pooled_output = outputs[1]
# 对池化后的输出进行分类器的前向传播,得到分类器的输出 logits
logits = self.classifier(pooled_output)
# 初始化损失为 None
loss = None
# 如果 labels 不为 None,则计算损失
if labels is not None:
# 如果问题类型未定义,则根据条件自动定义问题类型
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型选择相应的损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss() # 使用均方误差损失函数
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss() # 使用交叉熵损失函数
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss() # 使用带 logits 的二元交叉熵损失函数
loss = loss_fct(logits, labels)
# 如果 return_dict 为 False,则返回损失和模型输出,否则返回自定义输出对象 FocalNetImageClassifierOutput
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return FocalNetImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
FocalNet backbone, to be used with frameworks like X-Decoder.
""",
FOCALNET_START_DOCSTRING,
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
def __init__(self, config: FocalNetConfig):
super().__init__(config)
super()._init_backbone(config)
# 设置特征的维度列表,包括嵌入维度和隐藏层尺寸
self.num_features = [config.embed_dim] + config.hidden_sizes
# 创建 FocalNet 模型对象
self.focalnet = FocalNetModel(config)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
Returns:
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
>>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
# 如果 return_dict 为 None,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果 output_hidden_states 为 None,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 调用 FocalNet 模型进行前向传播
outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)
# 获取重塑后的隐藏状态
hidden_states = outputs.reshaped_hidden_states
feature_maps = ()
# 遍历阶段名称和输出特征名称,获取特征映射
for idx, stage in enumerate(self.stage_names):
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
# 如果不要求返回字典,则返回一个元组
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# 返回 BackboneOutput 对象,包含特征映射、隐藏状态(如果需要)、注意力(暂时为 None)
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\focalnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_focalnet": ["FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FocalNetConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_focalnet"] = [
"FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"FocalNetForImageClassification",
"FocalNetForMaskedImageModeling",
"FocalNetBackbone",
"FocalNetModel",
"FocalNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_focalnet import FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FocalNetConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_focalnet import (
FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST,
FocalNetBackbone,
FocalNetForImageClassification,
FocalNetForMaskedImageModeling,
FocalNetModel,
FocalNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\fsmt\configuration_fsmt.py
"""
FSMT configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DecoderConfig(PretrainedConfig):
r"""
Configuration class for FSMT's decoder specific things. note: this is a private helper class
"""
model_type = "fsmt_decoder"
def __init__(self, vocab_size=0, bos_token_id=0):
super().__init__()
self.vocab_size = vocab_size
self.bos_token_id = bos_token_id
class FSMTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to instantiate a FSMT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the FSMT
[facebook/wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import FSMTConfig, FSMTModel
>>> # Initializing a FSMT facebook/wmt19-en-ru style configuration
>>> config = FSMTConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = FSMTModel(config)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fsmt"
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
langs=["en", "de"],
src_vocab_size=42024,
tgt_vocab_size=42024,
activation_function="relu",
d_model=1024,
max_length=200,
max_position_embeddings=1024,
encoder_ffn_dim=4096,
encoder_layers=12,
encoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_ffn_dim=4096,
decoder_layers=12,
decoder_attention_heads=16,
decoder_layerdrop=0.0,
attention_dropout=0.0,
dropout=0.1,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
is_encoder_decoder=True,
scale_embedding=True,
tie_word_embeddings=False,
num_beams=5,
length_penalty=1.0,
early_stopping=False,
use_cache=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
forced_eos_token_id=2,
**common_kwargs,
):
self.langs = langs
self.src_vocab_size = src_vocab_size
self.tgt_vocab_size = tgt_vocab_size
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = self.num_hidden_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.max_position_embeddings = max_position_embeddings
self.init_std = init_std
self.activation_function = activation_function
self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
if "decoder" in common_kwargs:
del common_kwargs["decoder"]
self.scale_embedding = scale_embedding
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.dropout = dropout
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
is_encoder_decoder=is_encoder_decoder,
tie_word_embeddings=tie_word_embeddings,
forced_eos_token_id=forced_eos_token_id,
max_length=max_length,
num_beams=num_beams,
length_penalty=length_penalty,
early_stopping=early_stopping,
**common_kwargs,
)
.\models\fsmt\convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import os
import re
from collections import OrderedDict
from os.path import basename, dirname
import fairseq
import torch
from fairseq import hub_utils
from fairseq.data.dictionary import Dictionary
from transformers import FSMTConfig, FSMTForConditionalGeneration
from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
from transformers.utils import WEIGHTS_NAME, logging
logging.set_verbosity_warning()
json_indent = 2
best_score_hparams = {
"wmt19-ru-en": {"length_penalty": 1.1},
"wmt19-en-ru": {"length_penalty": 1.15},
"wmt19-en-de": {"length_penalty": 1.0},
"wmt19-de-en": {"length_penalty": 1.1},
"wmt16-en-de-dist-12-1": {"length_penalty": 0.6},
"wmt16-en-de-dist-6-1": {"length_penalty": 0.6},
"wmt16-en-de-12-1": {"length_penalty": 0.8},
"wmt19-de-en-6-6-base": {"length_penalty": 0.6},
"wmt19-de-en-6-6-big": {"length_penalty": 0.6},
}
org_names = {}
for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
org_names[m] = "facebook"
for m in [
"wmt16-en-de-dist-12-1",
"wmt16-en-de-dist-6-1",
"wmt16-en-de-12-1",
"wmt19-de-en-6-6-base",
"wmt19-de-en-6-6-big",
]:
org_names[m] = "allenai"
def rewrite_dict_keys(d):
d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "</w>", k), v) for k, v in d.items())
keep_keys = "<s> <pad> </s> <unk>".split()
for k in keep_keys:
del d2[f"{k}</w>"]
d2[k] = d[k]
return d2
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):
assert os.path.exists(fsmt_checkpoint_path)
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
print(f"Writing results to {pytorch_dump_folder_path}")
checkpoint_file = basename(fsmt_checkpoint_path)
fsmt_folder_path = dirname(fsmt_checkpoint_path)
cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
models = cls.hub_models()
kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
data_name_or_path = "."
print(f"using checkpoint {checkpoint_file}")
chkpt = hub_utils.from_pretrained(
fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
)
args = vars(chkpt["args"]["model"])
src_lang = args["source_lang"]
tgt_lang = args["target_lang"]
data_root = dirname(pytorch_dump_folder_path)
model_dir = basename(pytorch_dump_folder_path)
src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")
src_dict = Dictionary.load(src_dict_file)
src_vocab = rewrite_dict_keys(src_dict.indices)
src_vocab_size = len(src_vocab)
src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
with open(src_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
do_lower_case = True
for k in src_vocab.keys():
if not k.islower():
do_lower_case = False
break
tgt_dict = Dictionary.load(tgt_dict_file)
tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
tgt_vocab_size = len(tgt_vocab)
tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
with open(tgt_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))
merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
for fn in ["bpecodes", "code"]:
fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
if os.path.exists(fsmt_merges_file):
break
with open(fsmt_merges_file, encoding="utf-8") as fin:
merges = fin.read()
merges = re.sub(r" \d+$", "", merges, 0, re.M)
print(f"Generating {merges_file}")
with open(merges_file, "w", encoding="utf-8") as fout:
fout.write(merges)
fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"
model_conf = {
"architectures": ["FSMTForConditionalGeneration"],
"model_type": "fsmt",
"activation_dropout": args["activation_dropout"],
"activation_function": "relu",
"attention_dropout": args["attention_dropout"],
"d_model": args["decoder_embed_dim"],
"dropout": args["dropout"],
"init_std": 0.02,
"max_position_embeddings": args["max_source_positions"],
"num_hidden_layers": args["encoder_layers"],
"src_vocab_size": src_vocab_size,
"tgt_vocab_size": tgt_vocab_size,
"langs": [src_lang, tgt_lang],
"encoder_attention_heads": args["encoder_attention_heads"],
"encoder_ffn_dim": args["encoder_ffn_embed_dim"],
"encoder_layerdrop": args["encoder_layerdrop"],
"encoder_layers": args["encoder_layers"],
"decoder_attention_heads": args["decoder_attention_heads"],
"decoder_ffn_dim": args["decoder_ffn_embed_dim"],
"decoder_layerdrop": args["decoder_layerdrop"],
"decoder_layers": args["decoder_layers"],
"bos_token_id": 0,
"pad_token_id": 1,
"eos_token_id": 2,
"is_encoder_decoder": True,
"scale_embedding": not args["no_scale_embedding"],
"tie_word_embeddings": args["share_all_embeddings"],
}
model_conf["num_beams"] = 5
model_conf["early_stopping"] = False
if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
else:
model_conf["length_penalty"] = 1.0
print(f"Generating {fsmt_model_config_file}")
with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
tokenizer_conf = {
"langs": [src_lang, tgt_lang],
"model_max_length": 1024,
"do_lower_case": do_lower_case,
}
print(f"Generating {fsmt_tokenizer_config_file}")
with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
model = chkpt["models"][0]
model_state_dict = model.state_dict()
model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())
ignore_keys = [
"model.model",
"model.encoder.version",
"model.decoder.version",
"model.encoder_embed_tokens.weight",
"model.decoder_embed_tokens.weight",
"model.encoder.embed_positions._float_tensor",
"model.decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
model_state_dict.pop(k, None)
config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
model_new = FSMTForConditionalGeneration(config)
model_new.load_state_dict(model_state_dict, strict=False)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
print(f"Generating {pytorch_weights_dump_path}")
torch.save(model_state_dict, pytorch_weights_dump_path)
print("Conversion is done!")
print("\nLast step is to upload the files to s3")
print(f"cd {data_root}")
print(f"transformers-cli upload {model_dir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--fsmt_checkpoint_path",
default=None,
type=str,
required=True,
help=(
"Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
" bpecodes, etc."
),
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_fsmt_checkpoint_to_pytorch(args.fsmt_checkpoint_path, args.pytorch_dump_folder_path)
.\models\fsmt\modeling_fsmt.py
"""
PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19
PyTorch Fairseq模型,从https://github.com/pytorch/fairseq/tree/master/examples/wmt19 迁移而来
"""
import math
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, LayerNorm
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_fsmt import FSMTConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/wmt19-ru-en"
_CONFIG_FOR_DOC = "FSMTConfig"
"""
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# 运行一个命令行脚本,评估模型翻译效果并生成评估结果文件,包括源文件、目标文件、BLEU 分数等参数设置
# (fairseq BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
# 指示 fairseq 模型的 BLEU 分数及其详细信息的链接
"""
FSMT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FSMTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FSMT_GENERATION_EXAMPLE = r"""
Translation example::
```
>>> from transformers import AutoTokenizer, FSMTForConditionalGeneration
>>> mname = "facebook/wmt19-ru-en"
>>> model = FSMTForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> src_text = "Машинное обучение - это здорово, не так ли?"
>>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
"Machine learning is great, isn't it?"
```
"""
FSMT_INPUTS_DOCSTRING = r"""
"""
def invert_mask(attention_mask):
"""Turns 1->0, 0->1, False->True, True-> False"""
assert attention_mask.dim() == 2
return attention_mask.eq(0)
def triu_onnx(x, diagonal=0):
l = x.shape[0]
arange = torch.arange(l, device=x.device)
mask = arange.expand(l, l)
arange = arange.unsqueeze(-1)
if diagonal:
arange = arange + diagonal
mask = mask >= arange
return x.masked_fill(mask == 0, 0)
def _prepare_fsmt_decoder_inputs(
config,
input_ids,
decoder_input_ids=None,
decoder_padding_mask=None,
causal_mask_dtype=torch.float32,
):
"""
Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
generation
"""
pad_token_id = config.pad_token_id
if decoder_input_ids is None:
decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
bsz, tgt_len = decoder_input_ids.size()
if decoder_padding_mask is None:
decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
else:
decoder_padding_mask = invert_mask(decoder_padding_mask)
causal_mask = triu_onnx(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len, dtype=causal_mask_dtype)), 1).to(
device=decoder_input_ids.device
)
return decoder_input_ids, decoder_padding_mask, causal_mask
class PretrainedFSMTModel(PreTrainedModel):
config_class = FSMTConfig
base_model_prefix = "model"
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, SinusoidalPositionalEmbedding):
pass
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
def _make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def _check_shapes(shape_1, shape2):
if shape_1 != shape2:
raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")
def shift_tokens_right(input_ids, pad_token_id):
"""Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""
input_ids.masked_fill_(input_ids == -100, pad_token_id)
prev_output_tokens = input_ids.clone()
index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
prev_output_tokens[:, 1:] = input_ids[:, :-1]
return prev_output_tokens
def make_padding_mask(input_ids, padding_idx=1):
"""True for pad tokens"""
padding_mask = input_ids.eq(padding_idx)
if not padding_mask.any():
padding_mask = None
return padding_mask
class EncoderLayer(nn.Module):
def __init__(self, config: FSMTConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim)
def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
"""
Args:
x (`torch.Tensor`): 输入到层的输入,形状为 *(seq_len, batch, embed_dim)*
encoder_padding_mask (`torch.ByteTensor`): 二进制 ByteTensor,形状为
*(batch, src_len)*,其中填充元素由 `1` 表示。
对于 t_tgt,t_src 被排除在外(或者被掩盖),=0 表示在注意力机制中包含它们。
layer_head_mask (`torch.FloatTensor`): 给定层中注意力头的掩码,大小为
*(config.encoder_attention_heads,)*。
Returns:
编码后的输出,形状为 *(seq_len, batch, embed_dim)*
"""
residual = x
x, attn_weights = self.self_attn(
query=x,
key=x,
key_padding_mask=encoder_padding_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.self_attn_layer_norm(x)
residual = x
x = self.activation_fn(self.fc1(x))
x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.final_layer_norm(x)
return x, attn_weights
class FSMTEncoder(nn.Module):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].
Args:
config: FSMTConfig
"""
def __init__(self, config: FSMTConfig, embed_tokens):
super().__init__()
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
self.padding_idx = embed_tokens.padding_idx
self.embed_tokens = embed_tokens
embed_dim = embed_tokens.embedding_dim
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_positions = SinusoidalPositionalEmbedding(
config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
)
self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: torch.Tensor = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
"""
Transformer 编码器的前向传播方法。
Args:
input_ids: 输入的 token ids
attention_mask: 注意力遮罩,可选
inputs_embeds: 嵌入的输入,可选
head_mask: 头部遮罩,可选
output_attentions: 是否输出注意力权重,可选
output_hidden_states: 是否输出隐藏状态,可选
return_dict: 是否返回字典形式的输出,可选
Returns:
根据 return_dict 返回不同形式的输出结果
"""
pass
class DecoderLayer(nn.Module):
def __init__(self, config: FSMTConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = Attention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
self.encoder_attn = Attention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
encoder_decoder_attention=True,
)
self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim)
def forward(
self,
x,
encoder_hidden_states,
encoder_attn_mask=None,
layer_state=None,
causal_mask=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
decoder_padding_mask=None,
output_attentions=False,
):
"""
Transformer 解码器层的前向传播方法。
Args:
x: 输入张量
encoder_hidden_states: 编码器的隐藏状态
encoder_attn_mask: 编码器注意力的遮罩,可选
layer_state: 层状态,可选
causal_mask: 因果遮罩,可选
layer_head_mask: 层头部遮罩,可选
cross_attn_layer_head_mask: 交叉注意力层头部遮罩,可选
decoder_padding_mask: 解码器填充遮罩,可选
output_attentions: 是否输出注意力权重,可选
Returns:
根据不同的参数返回不同形式的输出结果
"""
pass
):
residual = x
if layer_state is None:
layer_state = {}
x, self_attn_weights = self.self_attn(
query=x,
key=x,
layer_state=layer_state,
key_padding_mask=decoder_padding_mask,
attn_mask=causal_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.self_attn_layer_norm(x)
residual = x
assert self.encoder_attn.cache_key != self.self_attn.cache_key
x, cross_attn_weights = self.encoder_attn(
query=x,
key=encoder_hidden_states,
key_padding_mask=encoder_attn_mask,
layer_state=layer_state,
layer_head_mask=cross_attn_layer_head_mask,
output_attentions=output_attentions,
)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.encoder_attn_layer_norm(x)
residual = x
x = self.activation_fn(self.fc1(x))
x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.final_layer_norm(x)
return (
x,
self_attn_weights,
layer_state,
cross_attn_weights,
)
class FSMTDecoder(nn.Module):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]
Args:
config: FSMTConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding):
super().__init__()
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = embed_tokens.padding_idx
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = embed_tokens
embed_dim = embed_tokens.embedding_dim
self.embed_positions = SinusoidalPositionalEmbedding(
config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
)
self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.decoder_layers)])
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.embed_tokens.weight, modifier_rank=None):
embed_tokens_weight_shape = self.embed_tokens.weight.shape
else:
embed_tokens_weight_shape = self.embed_tokens.weight.shape
self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False)
self.output_projection.weight = self.embed_tokens.weight
def forward(
self,
input_ids: torch.Tensor,
encoder_hidden_states: torch.Tensor,
encoder_padding_mask: torch.Tensor,
decoder_padding_mask: torch.Tensor,
decoder_causal_mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
"""
Defines the forward pass for the FSMTDecoder module.
Args:
input_ids (torch.Tensor): Input token IDs.
encoder_hidden_states (torch.Tensor): Hidden states from the encoder.
encoder_padding_mask (torch.Tensor): Mask for encoder padding.
decoder_padding_mask (torch.Tensor): Mask for decoder padding.
decoder_causal_mask (torch.Tensor): Mask for causal (autoregressive) decoding.
head_mask (Optional[torch.Tensor]): Mask for attention heads.
inputs_embeds (Optional[torch.Tensor]): Embedded inputs.
cross_attn_head_mask (Optional[torch.Tensor]): Mask for cross-attention heads.
past_key_values (Optional[List[torch.FloatTensor]]): Cached key-value states.
use_cache (bool): Whether to use cached key-values.
output_attentions (bool): Whether to output attention weights.
output_hidden_states (bool): Whether to output hidden states.
return_dict (bool): Whether to return a dictionary.
Returns:
torch.Tensor or Dict[str, torch.Tensor]: Depending on `return_dict` flag, either logits or dictionary.
"""
pass
def _reorder_buffer(attn_cache, new_order):
"""
Reorders the attention cache according to the new order of indices.
Args:
attn_cache (Dict[str, torch.Tensor]): Attention cache dictionary.
new_order (torch.Tensor): New order of indices.
Returns:
Dict[str, torch.Tensor]: Reordered attention cache.
"""
for k, input_buffer_k in attn_cache.items():
if input_buffer_k is not None:
attn_cache[k] = input_buffer_k.index_select(0, new_order)
return attn_cache
class Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim,
num_heads,
dropout=0.0,
bias=True,
encoder_decoder_attention=False,
):
"""
Initializes the Attention module.
Args:
embed_dim (int): Dimensionality of input embeddings.
num_heads (int): Number of attention heads.
dropout (float): Dropout probability.
bias (bool): Whether to use bias in linear layers.
encoder_decoder_attention (bool): Whether it's encoder-decoder attention or self-attention.
"""
super().__init__()
pass
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5
self.encoder_decoder_attention = encoder_decoder_attention
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"
def _shape(self, tensor, seq_len, bsz):
return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
def forward(
self,
query,
key: Optional[Tensor],
key_padding_mask: Optional[Tensor] = None,
layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
attn_mask: Optional[Tensor] = None,
layer_head_mask: Optional[Tensor] = None,
output_attentions=False,
def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
if "prev_key" in saved_state:
_prev_key = saved_state["prev_key"]
assert _prev_key is not None
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
if static_kv:
k = prev_key
else:
assert k is not None
k = torch.cat([prev_key, k], dim=1)
if "prev_value" in saved_state:
_prev_value = saved_state["prev_value"]
assert _prev_value is not None
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
if static_kv:
v = prev_value
else:
assert v is not None
v = torch.cat([prev_value, v], dim=1)
assert k is not None and v is not None
prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None)
if prev_key_padding_mask is not None:
if static_kv:
new_key_padding_mask = prev_key_padding_mask
else:
new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1)
else:
new_key_padding_mask = key_padding_mask
return k, v, new_key_padding_mask
def fill_with_neg_inf(t):
return t.float().fill_(torch.finfo(t.dtype).min).type_as(t)
def _get_shape(t):
return getattr(t, "shape", None)
@add_start_docstrings(
"The bare FSMT Model outputting raw hidden-states without any specific head on top.",
FSMT_START_DOCSTRING,
)
class FSMTModel(PretrainedFSMTModel):
_tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]
def __init__(self, config: FSMTConfig):
super().__init__(config)
padding_idx = config.pad_token_id
encoder_embed_tokens = nn.Embedding(config.src_vocab_size, config.d_model, padding_idx)
decoder_embed_tokens = nn.Embedding(config.tgt_vocab_size, config.d_model, padding_idx)
self.encoder = FSMTEncoder(config, encoder_embed_tokens)
self.decoder = FSMTDecoder(config, decoder_embed_tokens)
self.post_init()
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
self._tie_or_clone_weights(self.decoder.output_projection, self.get_input_embeddings())
@add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
):
def get_input_embeddings(self):
return self.encoder.embed_tokens
def set_input_embeddings(self, value):
self.encoder.embed_tokens = value
def get_output_embeddings(self):
return self.decoder.embed_tokens
def set_output_embeddings(self, value):
self.decoder.embed_tokens = value
@add_start_docstrings(
"The FSMT Model with a language modeling head. Can be used for summarization.", FSMT_START_DOCSTRING
)
class FSMTForConditionalGeneration(PretrainedFSMTModel):
base_model_prefix = "model"
_tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]
def __init__(self, config: FSMTConfig):
super().__init__(config)
base_model = FSMTModel(config)
self.model = base_model
self.post_init()
@add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(FSMT_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Depending on `return_dict`, either a tuple containing `masked_lm_loss` and model outputs or a `Seq2SeqLMOutput`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
outputs = self.model(
input_ids,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_inputs_embeds=decoder_inputs_embeds,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = outputs[0]
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.tgt_vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = []
for layer_past in past_key_values:
layer_past_new = {
attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
}
reordered_past.append(layer_past_new)
return reordered_past
def get_encoder(self):
return self.model.encoder
def get_decoder(self):
return self.model.decoder
def get_output_embeddings(self):
return self.model.decoder.embed_tokens
def set_output_embeddings(self, value):
self.model.decoder.embed_tokens = value
class SinusoidalPositionalEmbedding(nn.Embedding):
"""
This module produces sinusoidal positional embeddings of any length.
We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.
Padding symbols are ignored.
These embeddings get automatically extended in forward if more positions is needed.
"""
def __init__(self, num_positions, embedding_dim, padding_idx):
self.make_weight(num_positions, embedding_dim, padding_idx)
def make_weight(self, num_positions, embedding_dim, padding_idx):
weight = self.get_embedding(num_positions, embedding_dim, padding_idx)
if not hasattr(self, "weight"):
super().__init__(num_positions, embedding_dim, padding_idx, _weight=weight)
else:
weight = weight.to(dtype=self.weight.dtype, device=self.weight.device)
self.weight = nn.Parameter(weight)
self.weight.detach_()
self.weight.requires_grad = False
@staticmethod
def get_embedding(num_embeddings, embedding_dim, padding_idx):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb
@staticmethod
def make_positions(tensor, padding_idx: int):
"""
Replace non-padding symbols with their position numbers.
Position numbers begin at padding_idx+1. Padding symbols are ignored.
"""
mask = tensor.ne(padding_idx).int()
return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
def forward(
self,
input,
incremental_state: Optional[Any] = None,
timestep: Optional[Tensor] = None,
):
"""
Input is expected to be of size [bsz x seqlen].
"""
bsz, seq_len = input.shape[:2]
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weight.size(0):
self.make_weight(max_pos, self.embedding_dim, self.padding_idx)
positions = self.make_positions(input, self.padding_idx)
return super().forward(positions)
.\models\fsmt\tokenization_fsmt.py
"""Tokenization classes for FSMT."""
import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"src_vocab_file": "vocab-src.json",
"tgt_vocab_file": "vocab-tgt.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"src_vocab_file": {
"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json"
},
"tgt_vocab_file": {
"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json"
},
"merges_file": {
"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024}
PRETRAINED_INIT_CONFIGURATION = {
"stas/tiny-wmt19-en-de": {
"langs": ["en", "de"],
"model_max_length": 1024,
"special_tokens_map_file": None,
"full_tokenizer_file": None,
}
}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
class FSMTTokenizer(PreTrainedTokenizer):
"""
Construct an FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
"__classify__") to a vocabulary.
- The argument `langs` defines a pair of languages.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
langs (`List[str]`, *optional*):
A list of two languages to translate from and to, for instance `["en", "ru"]`.
src_vocab_file (`str`, *optional*):
File containing the vocabulary for the source language.
tgt_vocab_file (`st`, *optional*):
File containing the vocabulary for the target language.
merges_file (`str`, *optional*):
File containing the merges.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
langs=None,
src_vocab_file=None,
tgt_vocab_file=None,
merges_file=None,
do_lower_case=False,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
**kwargs,
):
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.src_vocab_file = src_vocab_file
self.tgt_vocab_file = tgt_vocab_file
self.merges_file = merges_file
self.do_lower_case = do_lower_case
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.cache_moses_detokenizer = {}
if langs and len(langs) == 2:
self.src_lang, self.tgt_lang = langs
else:
raise ValueError(
f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
"Usually that means that tokenizer can't find a mapping for the given model path "
"in PRETRAINED_VOCAB_FILES_MAP, and other maps of this tokenizer."
)
with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
self.encoder = json.load(src_vocab_handle)
with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
tgt_vocab = json.load(tgt_vocab_handle)
self.decoder = {v: k for k, v in tgt_vocab.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
langs=langs,
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file,
merges_file=merges_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
**kwargs,
)
def get_vocab(self) -> Dict[str, int]:
return self.get_src_vocab()
@property
def vocab_size(self) -> int:
return self.src_vocab_size
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
return self.cache_moses_punct_normalizer[lang].normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
return self.cache_moses_tokenizer[lang].tokenize(
text, aggressive_dash_splits=True, return_str=False, escape=True
)
def moses_detokenize(self, tokens, lang):
if lang not in self.cache_moses_detokenizer:
moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
self.cache_moses_detokenizer[lang] = moses_detokenizer
return self.cache_moses_detokenizer[lang].detokenize(tokens)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
@property
def src_vocab_size(self):
return len(self.encoder)
@property
def tgt_vocab_size(self):
return len(self.decoder)
def get_src_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def get_tgt_vocab(self):
return dict(self.decoder, **self.added_tokens_decoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
def _tokenize(self, text, lang="en", bypass_tokenizer=False):
"""
Tokenize a string given language code using Moses.
Details of tokenization:
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- Install with `pip install sacremoses`
Args:
- lang: ISO language code (default = 'en') (string). Languages should belong of the model supported
languages. However, we don't enforce it.
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
(bool). If True, we only apply BPE.
Returns:
List of tokens.
"""
lang = self.src_lang
if self.do_lower_case:
text = text.lower()
if bypass_tokenizer:
text = text.split()
else:
text = self.moses_pipeline(text, lang=lang)
text = self.moses_tokenize(text, lang=lang)
split_tokens = []
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
tokens = "".join(tokens).split()
text = self.moses_detokenize(tokens, self.tgt_lang)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: [CLS] X [SEP]
pair of sequences: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs corresponding to the first sequence.
token_ids_1 (:obj:`List[int]`, `optional`):
List of IDs corresponding to the second sequence.
Returns:
:obj:`List[int]`: List of IDs with the appropriate special tokens.
"""
input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
input_ids += token_ids_1 + [self.sep_token_id]
return input_ids
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A FAIRSEQ Transformer sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
return token_ids_0 + sep
return token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs tensor from a list of token ids. In a sequence pair, A and B would have different types (0 and 1).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs.
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in
# 定义 __getstate__ 方法,用于返回对象的状态字典
def __getstate__(self):
# 复制对象的 __dict__ 属性,获取当前对象的状态
state = self.__dict__.copy()
# 将对象的 "sm" 属性设为 None,可能是为了清除敏感信息或重置状态
state["sm"] = None
# 返回对象的状态字典
return state
# 定义 __setstate__ 方法,用于设置对象的状态
def __setstate__(self, d):
# 将传入的状态字典 d 直接赋给对象的 __dict__ 属性,以恢复对象的状态
self.__dict__ = d
# 尝试导入 sacremoses 库,如果导入失败则抛出 ImportError
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
# 将导入的 sacremoses 库赋给对象的 "sm" 属性,可能用于后续的操作
self.sm = sacremoses
.\models\fsmt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig"],
"tokenization_fsmt": ["FSMTTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_fsmt"] = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"]
if TYPE_CHECKING:
from .configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig
from .tokenization_fsmt import FSMTTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\funnel\configuration_funnel.py
""" Funnel Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"funnel-transformer/small": "https://huggingface.co/funnel-transformer/small/resolve/main/config.json",
"funnel-transformer/small-base": "https://huggingface.co/funnel-transformer/small-base/resolve/main/config.json",
"funnel-transformer/medium": "https://huggingface.co/funnel-transformer/medium/resolve/main/config.json",
"funnel-transformer/medium-base": "https://huggingface.co/funnel-transformer/medium-base/resolve/main/config.json",
"funnel-transformer/intermediate": (
"https://huggingface.co/funnel-transformer/intermediate/resolve/main/config.json"
),
"funnel-transformer/intermediate-base": (
"https://huggingface.co/funnel-transformer/intermediate-base/resolve/main/config.json"
),
"funnel-transformer/large": "https://huggingface.co/funnel-transformer/large/resolve/main/config.json",
"funnel-transformer/large-base": "https://huggingface.co/funnel-transformer/large-base/resolve/main/config.json",
"funnel-transformer/xlarge": "https://huggingface.co/funnel-transformer/xlarge/resolve/main/config.json",
"funnel-transformer/xlarge-base": "https://huggingface.co/funnel-transformer/xlarge-base/resolve/main/config.json",
}
class FunnelConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FunnelModel`] or a [`TFBertModel`]. It is used to
instantiate a Funnel Transformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Funnel
Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "funnel"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "n_head",
}
def __init__(
self,
vocab_size=30522,
block_sizes=[4, 4, 4],
block_repeats=None,
num_decoder_layers=2,
d_model=768,
n_head=12,
d_head=64,
d_inner=3072,
hidden_act="gelu_new",
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
initializer_range=0.1,
initializer_std=None,
layer_norm_eps=1e-9,
pooling_type="mean",
attention_type="relative_shift",
separate_cls=True,
truncate_seq=True,
pool_q_only=True,
**kwargs,
):
self.vocab_size = vocab_size
self.block_sizes = block_sizes
self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
assert len(block_sizes) == len(
self.block_repeats
), "`block_sizes` and `block_repeats` should have the same length."
self.num_decoder_layers = num_decoder_layers
self.d_model = d_model
self.n_head = n_head
self.d_head = d_head
self.d_inner = d_inner
self.hidden_act = hidden_act
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.initializer_range = initializer_range
self.initializer_std = initializer_std
self.layer_norm_eps = layer_norm_eps
assert pooling_type in [
"mean",
"max",
], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
self.pooling_type = pooling_type
assert attention_type in [
"relative_shift",
"factorized",
], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
self.attention_type = attention_type
self.separate_cls = separate_cls
self.truncate_seq = truncate_seq
self.pool_q_only = pool_q_only
super().__init__(**kwargs)
@property
def num_hidden_layers(self):
return sum(self.block_sizes)
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
)
@property
def num_blocks(self):
return len(self.block_sizes)
@num_blocks.setter
def num_blocks(self, value):
raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")
.\models\funnel\convert_funnel_original_tf_checkpoint_to_pytorch.py
"""Convert Funnel checkpoint."""
import argparse
import torch
from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model):
config = FunnelConfig.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
model = FunnelBaseModel(config) if base_model else FunnelModel(config)
load_tf_weights_in_funnel(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model
)
.\models\funnel\modeling_funnel.py
""" PyTorch Funnel Transformer model."""
import os
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_funnel import FunnelConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "FunnelConfig"
_CHECKPOINT_FOR_DOC = "funnel-transformer/small"
FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"funnel-transformer/small",
"funnel-transformer/small-base",
"funnel-transformer/medium",
"funnel-transformer/medium-base",
"funnel-transformer/intermediate",
"funnel-transformer/intermediate-base",
"funnel-transformer/large",
"funnel-transformer/large-base",
"funnel-transformer/xlarge-base",
"funnel-transformer/xlarge",
]
INF = 1e6
def load_tf_weights_in_funnel(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
_layer_map = {
"k": "k_head",
"q": "q_head",
"v": "v_head",
"o": "post_proj",
"layer_1": "linear_1",
"layer_2": "linear_2",
"rel_attn": "attention",
"ff": "ffn",
"kernel": "weight",
"gamma": "weight",
"beta": "bias",
"lookup_table": "weight",
"word_embedding": "word_embeddings",
"input": "embeddings",
}
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
if name[0] == "generator":
continue
pointer = model
skipped = False
for m_name in name[1:]:
if not isinstance(pointer, FunnelPositionwiseFFN) and re.fullmatch(r"layer_\d+", m_name):
layer_index = int(re.search(r"layer_(\d+)", m_name).groups()[0])
if layer_index < config.num_hidden_layers:
block_idx = 0
while layer_index >= config.block_sizes[block_idx]:
layer_index -= config.block_sizes[block_idx]
block_idx += 1
pointer = pointer.blocks[block_idx][layer_index]
else:
layer_index -= config.num_hidden_layers
pointer = pointer.layers[layer_index]
elif m_name == "r" and isinstance(pointer, FunnelRelMultiheadAttention):
pointer = pointer.r_kernel
break
elif m_name in _layer_map:
pointer = getattr(pointer, _layer_map[m_name])
else:
try:
pointer = getattr(pointer, m_name)
except AttributeError:
print(f"Skipping {'/'.join(name)}", array.shape)
skipped = True
break
if not skipped:
if len(pointer.shape) != len(array.shape):
array = array.reshape(pointer.shape)
if m_name == "kernel":
array = np.transpose(array)
pointer.data = torch.from_numpy(array)
return model
class FunnelEmbeddings(nn.Module):
def __init__(self, config: FunnelConfig) -> None:
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout)
def forward(
self, input_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None
) -> torch.Tensor:
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
embeddings = self.layer_norm(inputs_embeds)
embeddings = self.dropout(embeddings)
return embeddings
class FunnelAttentionStructure(nn.Module):
"""
Contains helpers for `FunnelRelMultiheadAttention `.
"""
cls_token_type_id: int = 2
def __init__(self, config: FunnelConfig) -> None:
super().__init__()
self.config = config
self.sin_dropout = nn.Dropout(config.hidden_dropout)
self.cos_dropout = nn.Dropout(config.hidden_dropout)
self.pooling_mult = None
def init_attention_inputs(
self,
inputs_embeds: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor]:
"""Returns the attention inputs associated to the inputs of the model."""
self.pooling_mult = 1
self.seq_len = seq_len = inputs_embeds.size(1)
position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype, inputs_embeds.device)
token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None
cls_mask = (
nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0))
if self.config.separate_cls
else None
)
return (position_embeds, token_type_mat, attention_mask, cls_mask)
def token_type_ids_to_mat(self, token_type_ids: torch.Tensor) -> torch.Tensor:
"""Convert `token_type_ids` to `token_type_mat`."""
token_type_mat = token_type_ids[:, :, None] == token_type_ids[:, None]
cls_ids = token_type_ids == self.cls_token_type_id
cls_mat = cls_ids[:, :, None] | cls_ids[:, None]
return cls_mat | token_type_mat
def get_position_embeds(
self, seq_len: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
pass
def stride_pool_pos(self, pos_id: torch.Tensor, block_index: int):
"""
Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
"""
if self.config.separate_cls:
cls_pos = pos_id.new_tensor([-(2**block_index) + 1])
pooled_pos_id = pos_id[1:-1] if self.config.truncate_seq else pos_id[1:]
return torch.cat([cls_pos, pooled_pos_id[::2]], 0)
else:
return pos_id[::2]
def relative_pos(self, pos: torch.Tensor, stride: int, pooled_pos=None, shift: int = 1) -> torch.Tensor:
"""
Build the relative positional vector between `pos` and `pooled_pos`.
"""
if pooled_pos is None:
pooled_pos = pos
ref_point = pooled_pos[0] - pos[0]
num_remove = shift * len(pooled_pos)
max_dist = ref_point + num_remove * stride
min_dist = pooled_pos[0] - pos[-1]
return torch.arange(max_dist, min_dist - 1, -stride, dtype=torch.long, device=pos.device)
def stride_pool(
self,
tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]],
axis: Union[int, Tuple[int], List[int]],
) -> torch.Tensor:
"""
Perform pooling by stride slicing the tensor along the given axis.
"""
if tensor is None:
return None
if isinstance(axis, (list, tuple)):
for ax in axis:
tensor = self.stride_pool(tensor, ax)
return tensor
if isinstance(tensor, (tuple, list)):
return type(tensor)(self.stride_pool(x, axis) for x in tensor)
axis %= tensor.ndim
axis_slice = (
slice(None, -1, 2) if self.config.separate_cls and self.config.truncate_seq else slice(None, None, 2)
)
enc_slice = [slice(None)] * axis + [axis_slice]
if self.config.separate_cls:
cls_slice = [slice(None)] * axis + [slice(None, 1)]
tensor = torch.cat([tensor[cls_slice], tensor], axis=axis)
return tensor[enc_slice]
def pool_tensor(
self, tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]], mode: str = "mean", stride: int = 2
) -> torch.Tensor:
"""
Perform pooling operation on the input tensor.
"""
pass
) -> torch.Tensor:
"""Apply 1D pooling to a tensor of size [B x T (x H)]."""
if tensor is None:
return None
if isinstance(tensor, (tuple, list)):
return type(tensor)(self.pool_tensor(tensor, mode=mode, stride=stride) for x in tensor)
if self.config.separate_cls:
suffix = tensor[:, :-1] if self.config.truncate_seq else tensor
tensor = torch.cat([tensor[:, :1], suffix], dim=1)
ndim = tensor.ndim
if ndim == 2:
tensor = tensor[:, None, :, None]
elif ndim == 3:
tensor = tensor[:, None, :, :]
stride = (stride, 1)
if mode == "mean":
tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True)
elif mode == "max":
tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True)
elif mode == "min":
tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True)
else:
raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")
if ndim == 2:
return tensor[:, 0, :, 0]
elif ndim == 3:
return tensor[:, 0]
return tensor
def pre_attention_pooling(
self, output, attention_inputs: Tuple[torch.Tensor]
) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
"""Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
if self.config.pool_q_only:
if self.config.attention_type == "factorized":
position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
token_type_mat = self.stride_pool(token_type_mat, 1)
cls_mask = self.stride_pool(cls_mask, 0)
output = self.pool_tensor(output, mode=self.config.pooling_type)
else:
self.pooling_mult *= 2
if self.config.attention_type == "factorized":
position_embeds = self.stride_pool(position_embeds, 0)
token_type_mat = self.stride_pool(token_type_mat, [1, 2])
cls_mask = self.stride_pool(cls_mask, [1, 2])
attention_mask = self.pool_tensor(attention_mask, mode="min")
output = self.pool_tensor(output, mode=self.config.pooling_type)
attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
return output, attention_inputs
def post_attention_pooling(self, attention_inputs: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
"""Pool the proper parts of `attention_inputs` after the attention layer."""
position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
if self.config.pool_q_only:
self.pooling_mult *= 2
if self.config.attention_type == "factorized":
position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
token_type_mat = self.stride_pool(token_type_mat, 2)
cls_mask = self.stride_pool(cls_mask, 1)
attention_mask = self.pool_tensor(attention_mask, mode="min")
attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
return attention_inputs
def _relative_shift_gather(positional_attn: torch.Tensor, context_len: int, shift: int) -> torch.Tensor:
batch_size, n_head, seq_len, max_rel_len = positional_attn.shape
positional_attn = torch.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len])
positional_attn = positional_attn[:, :, shift:, :]
positional_attn = torch.reshape(positional_attn, [batch_size, n_head, seq_len, max_rel_len - shift])
positional_attn = positional_attn[..., :context_len]
return positional_attn
class FunnelRelMultiheadAttention(nn.Module):
def __init__(self, config: FunnelConfig, block_index: int) -> None:
super().__init__()
self.config = config
self.block_index = block_index
d_model, n_head, d_head = config.d_model, config.n_head, config.d_head
self.hidden_dropout = nn.Dropout(config.hidden_dropout)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self.q_head = nn.Linear(d_model, n_head * d_head, bias=False)
self.k_head = nn.Linear(d_model, n_head * d_head)
self.v_head = nn.Linear(d_model, n_head * d_head)
self.r_w_bias = nn.Parameter(torch.zeros([n_head, d_head]))
self.r_r_bias = nn.Parameter(torch.zeros([n_head, d_head]))
self.r_kernel = nn.Parameter(torch.zeros([d_model, n_head, d_head]))
self.r_s_bias = nn.Parameter(torch.zeros([n_head, d_head]))
self.seg_embed = nn.Parameter(torch.zeros([2, n_head, d_head]))
self.post_proj = nn.Linear(n_head * d_head, d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps)
self.scale = 1.0 / (d_head**0.5)
def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
"""Relative attention score for the positional encodings"""
if self.config.attention_type == "factorized":
phi, pi, psi, omega = position_embeds
u = self.r_r_bias * self.scale
w_r = self.r_kernel
q_r_attention = torch.einsum("binh,dnh->bind", q_head + u, w_r)
q_r_attention_1 = q_r_attention * phi[:, None]
q_r_attention_2 = q_r_attention * pi[:, None]
positional_attn = torch.einsum("bind,jd->bnij", q_r_attention_1, psi) + torch.einsum(
"bind,jd->bnij", q_r_attention_2, omega
)
else:
shift = 2 if q_head.shape[1] != context_len else 1
r = position_embeds[self.block_index][shift - 1]
v = self.r_r_bias * self.scale
w_r = self.r_kernel
r_head = torch.einsum("td,dnh->tnh", r, w_r)
positional_attn = torch.einsum("binh,tnh->bnit", q_head + v, r_head)
positional_attn = _relative_shift_gather(positional_attn, context_len, shift)
if cls_mask is not None:
positional_attn *= cls_mask
return positional_attn
def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
"""Relative attention score for the token_type_ids"""
if token_type_mat is None:
return 0
batch_size, seq_len, context_len = token_type_mat.shape
r_s_bias = self.r_s_bias * self.scale
token_type_bias = torch.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
token_type_mat = token_type_mat[:, None].expand([batch_size, q_head.shape[2], seq_len, context_len])
diff_token_type, same_token_type = torch.split(token_type_bias, 1, dim=-1)
token_type_attn = torch.where(
token_type_mat,
same_token_type.expand(token_type_mat.shape),
diff_token_type.expand(token_type_mat.shape)
)
if cls_mask is not None:
token_type_attn *= cls_mask
return token_type_attn
) -> Tuple[torch.Tensor, ...]:
position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
batch_size, seq_len, _ = query.shape
context_len = key.shape[1]
n_head, d_head = self.config.n_head, self.config.d_head
q_head = self.q_head(query).view(batch_size, seq_len, n_head, d_head)
k_head = self.k_head(key).view(batch_size, context_len, n_head, d_head)
v_head = self.v_head(value).view(batch_size, context_len, n_head, d_head)
q_head = q_head * self.scale
r_w_bias = self.r_w_bias * self.scale
content_score = torch.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)
attn_score = content_score + positional_attn + token_type_attn
dtype = attn_score.dtype
attn_score = attn_score.float()
if attention_mask is not None:
attn_score = attn_score - INF * (1 - attention_mask[:, None, None].float())
attn_prob = torch.softmax(attn_score, dim=-1, dtype=dtype)
attn_prob = self.attention_dropout(attn_prob)
attn_vec = torch.einsum("bnij,bjnd->bind", attn_prob, v_head)
attn_out = self.post_proj(attn_vec.reshape(batch_size, seq_len, n_head * d_head))
attn_out = self.hidden_dropout(attn_out)
output = self.layer_norm(query + attn_out)
return (output, attn_prob) if output_attentions else (output,)
class FunnelEncoder(nn.Module):
def __init__(self, config: FunnelConfig) -> None:
super().__init__()
self.config = config
self.attention_structure = FunnelAttentionStructure(config)
self.blocks = nn.ModuleList(
[
nn.ModuleList([FunnelLayer(config, block_index) for _ in range(block_size)])
for block_index, block_size in enumerate(config.block_sizes)
]
)
def forward(
self,
inputs_embeds: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[Tuple[torch.Tensor, Tuple], torch.Tensor]:
) -> Union[Tuple, BaseModelOutput]:
attention_mask = attention_mask.type_as(inputs_embeds)
attention_inputs = self.attention_structure.init_attention_inputs(
inputs_embeds,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
)
hidden = inputs_embeds
all_hidden_states = (inputs_embeds,) if output_hidden_states else None
all_attentions = () if output_attentions else None
for block_index, block in enumerate(self.blocks):
pooling_flag = hidden.size(1) > (2 if self.config.separate_cls else 1)
pooling_flag = pooling_flag and block_index > 0
if pooling_flag:
pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
hidden, attention_inputs
)
for layer_index, layer in enumerate(block):
for repeat_index in range(self.config.block_repeats[block_index]):
do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag
if do_pooling:
query = pooled_hidden
key = value = hidden if self.config.pool_q_only else pooled_hidden
else:
query = key = value = hidden
layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions)
hidden = layer_output[0]
if do_pooling:
attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)
if output_attentions:
all_attentions = all_attentions + layer_output[1:]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden,)
if not return_dict:
return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
def upsample(
x: torch.Tensor, stride: int, target_len: int, separate_cls: bool = True, truncate_seq: bool = False
) -> torch.Tensor:
"""
Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
"""
if stride == 1:
return x
if separate_cls:
cls = x[:, :1]
x = x[:, 1:]
output = torch.repeat_interleave(x, repeats=stride, dim=1)
if separate_cls:
if truncate_seq:
output = nn.functional.pad(output, (0, 0, 0, stride - 1, 0, 0))
output = output[:, : target_len - 1]
output = torch.cat([cls, output], dim=1)
else:
output = output[:, :target_len]
return output
class FunnelDecoder(nn.Module):
def __init__(self, config: FunnelConfig) -> None:
super().__init__()
self.config = config
self.attention_structure = FunnelAttentionStructure(config)
self.layers = nn.ModuleList([FunnelLayer(config, 0) for _ in range(config.num_decoder_layers)])
def forward(
self,
final_hidden: torch.Tensor,
first_block_hidden: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[Tuple, BaseModelOutput]:
upsampled_hidden = upsample(
final_hidden,
stride=2 ** (len(self.config.block_sizes) - 1),
target_len=first_block_hidden.shape[1],
separate_cls=self.config.separate_cls,
truncate_seq=self.config.truncate_seq,
)
hidden = upsampled_hidden + first_block_hidden
all_hidden_states = (hidden,) if output_hidden_states else None
all_attentions = () if output_attentions else None
attention_inputs = self.attention_structure.init_attention_inputs(
hidden,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
)
for layer in self.layers:
layer_output = layer(hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions)
hidden = layer_output[0]
if output_attentions:
all_attentions = all_attentions + layer_output[1:]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden,)
if not return_dict:
return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
class FunnelDiscriminatorPredictions(nn.Module):
"""Prediction module for the discriminator, made up of two dense layers."""
def __init__(self, config: FunnelConfig) -> None:
super().__init__()
self.config = config
self.dense = nn.Linear(config.d_model, config.d_model)
self.dense_prediction = nn.Linear(config.d_model, 1)
def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(discriminator_hidden_states)
hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
logits = self.dense_prediction(hidden_states).squeeze(-1)
return logits
class FunnelPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FunnelConfig
load_tf_weights = load_tf_weights_in_funnel
base_model_prefix = "funnel"
def _init_weights(self, module):
classname = module.__class__.__name__
if classname.find("Linear") != -1:
if getattr(module, "weight", None) is not None:
if self.config.initializer_std is None:
fan_out, fan_in = module.weight.shape
std = np.sqrt(1.0 / float(fan_in + fan_out))
else:
std = self.config.initializer_std
nn.init.normal_(module.weight, std=std)
if getattr(module, "bias", None) is not None:
nn.init.constant_(module.bias, 0.0)
elif classname == "FunnelRelMultiheadAttention":
nn.init.uniform_(module.r_w_bias, b=self.config.initializer_range)
nn.init.uniform_(module.r_r_bias, b=self.config.initializer_range)
nn.init.uniform_(module.r_kernel, b=self.config.initializer_range)
nn.init.uniform_(module.r_s_bias, b=self.config.initializer_range)
nn.init.uniform_(module.seg_embed, b=self.config.initializer_range)
elif classname == "FunnelEmbeddings":
std = 1.0 if self.config.initializer_std is None else self.config.initializer_std
nn.init.normal_(module.word_embeddings.weight, std=std)
if module.word_embeddings.padding_idx is not None:
module.word_embeddings.weight.data[module.padding_idx].zero_()
class FunnelClassificationHead(nn.Module):
def __init__(self, config: FunnelConfig, n_labels: int) -> None:
super().__init__()
self.linear_hidden = nn.Linear(config.d_model, config.d_model)
self.dropout = nn.Dropout(config.hidden_dropout)
self.linear_out = nn.Linear(config.d_model, n_labels)
def forward(self, hidden: torch.Tensor) -> torch.Tensor:
hidden = self.linear_hidden(hidden)
hidden = torch.tanh(hidden)
hidden = self.dropout(hidden)
return self.linear_out(hidden)
@dataclass
class FunnelForPreTrainingOutput(ModelOutput):
"""
Output type of [`FunnelForPreTraining`].
"""
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
ELECTRA-style目标函数的总损失。
如果提供了`labels`,则返回该损失。
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
模型头部的预测分数(每个token的分数,未经过SoftMax)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,以及初始嵌入输出的元组。
每个元素是`torch.FloatTensor`,形状为`(batch_size, sequence_length, hidden_size)`。
当参数`output_hidden_states=True`或`config.output_hidden_states=True`时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
每层注意力权重的元组。
每个元素是`torch.FloatTensor`,形状为`(batch_size, num_heads, sequence_length, sequence_length)`。
在注意力softmax后得到的注意力权重,用于计算自注意力头部的加权平均值。
当参数`output_attentions=True`或`config.output_attentions=True`时返回。
FUNNEL_START_DOCSTRING = r"""
The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
注释:
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
FunnelTransformer 模型的基础类,输出原始的隐藏状态,没有上采样头(也称为解码器)或任何特定任务的顶部头部。
该类包含了 FunnelTransformer 模型的基本结构和方法。
"""
@add_start_docstrings(
"""
The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
decoder) or any task-specific head on top.
""",
FUNNEL_START_DOCSTRING,
)
class FunnelBaseModel(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
"""
初始化 FunnelBaseModel 类的实例。
Args:
config (FunnelConfig): 包含模型配置信息的对象。
"""
super().__init__(config)
self.embeddings = FunnelEmbeddings(config)
self.encoder = FunnelEncoder(config)
self.post_init()
def get_input_embeddings(self) -> nn.Embedding:
"""
获取输入嵌入层的方法。
Returns:
nn.Embedding: 返回用于词嵌入的 nn.Embedding 对象。
"""
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
"""
设置输入嵌入层的方法。
Args:
new_embeddings (nn.Embedding): 新的词嵌入对象。
"""
self.embeddings.word_embeddings = new_embeddings
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="funnel-transformer/small-base",
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
这段代码定义了一个名为 `FunnelBaseModel` 的类,作为 Funnel Transformer 模型的基础实现。它包含了模型的初始化方法、嵌入层和编码器的设置方法,以及模型的前向传播方法,用于处理输入并生成输出隐藏状态。
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids)
encoder_outputs = self.encoder(
inputs_embeds,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return encoder_outputs
@add_start_docstrings(
"The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.",
FUNNEL_START_DOCSTRING,
)
class FunnelModel(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.config = config
self.embeddings = FunnelEmbeddings(config)
self.encoder = FunnelEncoder(config)
self.decoder = FunnelDecoder(config)
self.post_init()
def get_input_embeddings(self) -> nn.Embedding:
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
self.embeddings.word_embeddings = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids)
encoder_outputs = self.encoder(
inputs_embeds,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
output_attentions=output_attentions,
output_hidden_states=True,
return_dict=return_dict,
)
decoder_outputs = self.decoder(
final_hidden=encoder_outputs[0],
first_block_hidden=encoder_outputs[1][self.config.block_sizes[0]],
attention_mask=attention_mask,
token_type_ids=token_type_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
idx = 0
outputs = (decoder_outputs[0],)
if output_hidden_states:
idx += 1
outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)
if output_attentions:
idx += 1
outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)
return outputs
return BaseModelOutput(
last_hidden_state=decoder_outputs[0],
hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
if output_hidden_states
else None,
attentions=(encoder_outputs.attentions + decoder_outputs.attentions)
if output_attentions
else None,
)
add_start_docstrings(
"""
Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
generated tokens.
""",
FUNNEL_START_DOCSTRING,
)
class FunnelForPreTraining(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.funnel = FunnelModel(config)
self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Funnel 模型的前向传播方法,支持的输入参数包括:
- input_ids: 输入的 token IDs
- attention_mask: 注意力掩码
- token_type_ids: token 类型 IDs
- inputs_embeds: 输入的嵌入向量
- labels: 标签
- output_attentions: 是否输出注意力权重
- output_hidden_states: 是否输出隐藏状态
- return_dict: 是否返回结果字典形式
返回一个包含预测输出的 FunnelForPreTrainingOutput 对象。
"""
) -> Union[Tuple, FunnelForPreTrainingOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
docstring) Indices should be in `[0, 1]`:
- 0 indicates the token is an original token,
- 1 indicates the token was replaced.
Returns:
Examples:
```
>>> from transformers import AutoTokenizer, FunnelForPreTraining
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
>>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> logits = model(**inputs).logits
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
discriminator_sequence_output = discriminator_hidden_states[0]
logits = self.discriminator_predictions(discriminator_sequence_output)
loss = None
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
if attention_mask is not None:
active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1
active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss]
active_labels = labels[active_loss]
loss = loss_fct(active_logits, active_labels.float())
else:
loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())
if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
return FunnelForPreTrainingOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)
@add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
class FunnelForMaskedLM(FunnelPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.funnel = FunnelModel(config)
self.lm_head = nn.Linear(config.d_model, config.vocab_size)
self.post_init()
def get_output_embeddings(self) -> nn.Linear:
return self.lm_head
def set_output_embeddings(self, new_embeddings: nn.Embedding) -> None:
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
prediction_logits = self.lm_head(last_hidden_state)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
first timestep of the last hidden state) e.g. for GLUE tasks.
""",
FUNNEL_START_DOCSTRING,
)
class FunnelForSequenceClassification(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.funnel = FunnelBaseModel(config)
self.classifier = FunnelClassificationHead(config, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="funnel-transformer/small-base",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
将输入传递给Funnel模型以执行前向传播。
"""
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
pooled_output = last_hidden_state[:, 0]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of the first
timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks.
""",
FUNNEL_START_DOCSTRING,
)
class FunnelForMultipleChoice(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.funnel = FunnelBaseModel(config)
self.classifier = FunnelClassificationHead(config, 1)
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint="funnel-transformer/small-base",
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
FunnelForMultipleChoice 模型的前向传播方法,接收多个输入参数,返回模型输出结果。
Args:
input_ids (Optional[torch.Tensor], optional): 输入序列的 token IDs. Defaults to None.
attention_mask (Optional[torch.Tensor], optional): 注意力遮罩,掩盖无效输入. Defaults to None.
token_type_ids (Optional[torch.Tensor], optional): token 类型 IDs, 用于区分 segment. Defaults to None.
inputs_embeds (Optional[torch.Tensor], optional): 替代输入 token IDs 的嵌入. Defaults to None.
labels (Optional[torch.Tensor], optional): 真实标签. Defaults to None.
output_attentions (Optional[bool], optional): 是否输出注意力权重. Defaults to None.
output_hidden_states (Optional[bool], optional): 是否输出隐藏状态. Defaults to None.
return_dict (Optional[bool], optional): 是否返回字典格式的输出. Defaults to None.
Returns:
输出结果,根据 return_dict 的设置返回不同的格式,可能包括分类结果、注意力权重或隐藏状态等信息.
"""
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
pooled_output = last_hidden_state[:, 0]
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
"""
@add_start_docstrings(
"""
Funnel Transformer Model with a token classification head on top (a linear layer on top of the hidden-states
output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
FUNNEL_START_DOCSTRING,
)
class FunnelForTokenClassification(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.funnel = FunnelModel(config)
self.dropout = nn.Dropout(config.hidden_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
last_hidden_state = self.dropout(last_hidden_state)
logits = self.classifier(last_hidden_state)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
(a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
FUNNEL_START_DOCSTRING,
)
class FunnelForQuestionAnswering(FunnelPreTrainedModel):
def __init__(self, config: FunnelConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.funnel = FunnelModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.funnel(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
logits = self.qa_outputs(last_hidden_state)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)