Transformers Source Code Walkthrough (Part 81)
.\models\mvp\tokenization_mvp.py
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/vocab.json",
},
"added_tokens.json": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/added_tokens.json",
},
"merges_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"RUCAIBox/mvp": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a corresponding mapping to unicode strings. We specifically avoid mapping to
whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
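# --- Illustrative sketch (not part of the original file) ---
# What bytes_to_unicode() buys us: every possible byte value maps to a printable unicode
# character and the mapping is reversible, so byte-level BPE can round-trip arbitrary UTF-8
# text without ever needing an <unk> token. (Hypothetical demo values.)
demo_byte_encoder = bytes_to_unicode()
demo_byte_decoder = {v: k for k, v in demo_byte_encoder.items()}
assert len(demo_byte_encoder) == 256  # all byte values are covered
demo_bytes = "Héllo!".encode("utf-8")
demo_mapped = "".join(demo_byte_encoder[b] for b in demo_bytes)  # bytes -> printable characters
assert bytes(demo_byte_decoder[c] for c in demo_mapped).decode("utf-8") == "Héllo!"  # lossless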
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
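# --- Illustrative sketch (not part of the original file) ---
# get_pairs() yields the merge candidates the BPE loop ranks on each iteration; for the
# word "hello" represented as a tuple of single characters:
assert get_pairs(("h", "e", "l", "l", "o")) == {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}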
class MvpTokenizer(PreTrainedTokenizer):
"""
Constructs a MVP tokenizer, which is smilar to the RoBERTa tokenizer, using byte-level Byte-Pair-Encoding.
"""
>>> from transformers import MvpTokenizer
>>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows the leading word to be treated like any
other word. (The MVP tokenizer detects the beginning of words by the preceding space.)
"""
# Constants and mappings describing the pretrained model's vocabulary files and maximum input sizes
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
# Constructor: build a new tokenizer instance and initialize its attributes
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
# If mask_token is a string, wrap it in a special AddedToken with lstrip=True so the mask token absorbs the space before it
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Load the vocabulary JSON (UTF-8) into self.encoder
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# Build self.decoder as the inverse of self.encoder, mapping ids back to tokens
self.decoder = {v: k for k, v in self.encoder.items()}
# How to handle errors when decoding bytes back to text (default: replace)
self.errors = errors
# Byte-to-unicode mapping used by byte-level BPE
self.byte_encoder = bytes_to_unicode()
# Inverse mapping from unicode characters back to raw bytes
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
# Read the BPE merge rules from merges_file (UTF-8), skipping the version header and trailing newline
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
# Turn each merge rule line into a tuple of two symbols
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
# self.bpe_ranks maps each merge rule to its priority (lower rank = applied earlier)
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
# Cache for previously computed BPE segmentations
self.cache = {}
# Whether to prepend a space to the input text
self.add_prefix_space = add_prefix_space
# Regex used to pre-split the text into chunks before BPE:
# contractions, runs of letters, runs of digits, other non-space characters, and whitespace
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# Call the parent constructor with the same arguments
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
def vocab_size(self):
# Size of the base vocabulary, i.e. the number of entries in self.encoder
return len(self.encoder)
def get_vocab(self):
# Return the full vocabulary, including added special tokens
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
# Collect the BPE tokens produced for the input text
bpe_tokens = []
# Split the text with the pre-tokenization regex and process each chunk
for token in re.findall(self.pat, text):
# Encode each chunk to bytes and map every byte to its unicode placeholder, so BPE never sees raw control characters or spaces
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
# Run BPE on the chunk and split the result into sub-tokens
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
# Return all BPE tokens
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# Look the token up in the vocabulary, falling back to the unknown token's id
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# Map an id back to its token string
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# Concatenate the tokens into a single string of unicode placeholders
text = "".join(tokens)
# Map the placeholders back to bytes and decode as UTF-8, honoring the configured error policy
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
# Return the decoded string
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Make sure the target directory exists; otherwise log an error and bail out
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Path of the vocabulary file to write
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Path of the merges file to write
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Dump the encoder as JSON into the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
# Write the BPE merges, ordered by rank, into the merges file
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
# Warn if the merge indices are not consecutive, which would indicate a corrupted tokenizer
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# Return both file paths
return vocab_file, merge_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A MVP sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# Single sequence: add the classifier token at the start and the separator token at the end
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# Sequence pair: <s> A </s></s> B </s>
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If special tokens are already present, delegate to the base class implementation
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# Without a second sequence: 1 for <s>, zeros for the sequence, 1 for </s>
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
# With a pair: 1 for <s>, zeros for A, 1, 1 for the two </s>, zeros for B, 1 for the final </s>
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator and classifier token ids
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# Without a second sequence, return zeros over <s> A </s>
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# With a pair, return zeros over <s> A </s></s> B </s>
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
# If the input is pretokenized or add_prefix_space is set, and the text does not already start with a space, prepend one
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
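# --- Illustrative sketch (not part of the original file) ---
# Layout produced by build_inputs_with_special_tokens above, using the documented ids
# 0 (<s>/cls) and 2 (</s>/sep); the other ids are placeholders:
cls_id, sep_id = 0, 2
ids_a, ids_b = [31414, 232], [713, 16]
single = [cls_id] + ids_a + [sep_id]                           # <s> A </s>
pair = [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]  # <s> A </s></s> B </s>
assert single == [0, 31414, 232, 2]
assert pair == [0, 31414, 232, 2, 2, 713, 16, 2]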
.\models\mvp\tokenization_mvp_fast.py
import json
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers, processors
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_mvp import MvpTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/vocab.json",
},
"added_tokens.json": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/added_tokens.json",
},
"merges_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/merges.txt",
},
"tokenizer_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"RUCAIBox/mvp": 1024,
}
class MvpTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" MVP tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import MvpTokenizerFast
>>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
# MvpTokenizerFast is the MVP tokenizer backed by HuggingFace's tokenizers library, using byte-level BPE
# It inherits from PreTrainedTokenizerFast, which provides most of the main methods and from_pretrained support
# The example above shows how spaces are treated and how a word is encoded differently depending on its position
# In some situations (pretokenized input) extra arguments must be passed when the tokenizer is instantiated
# Class-level attributes: vocabulary file names and the map of pretrained vocabulary files
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input sizes, taken from the pretrained positional embedding sizes
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs the model expects
model_input_names = ["input_ids", "attention_mask"]
# The corresponding slow tokenizer class
slow_tokenizer_class = MvpTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
trim_offsets=True,
**kwargs,
):
# Constructor: set up the various initialization parameters
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
MVP tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
# Getter for mask_token: return the current mask token as a string, or None (with an error log) if it is not set
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on Mvp.
"""
# Setter for mask_token: make the mask token consume the space before it
# If value is a string, wrap it in an AddedToken with lstrip=True and rstrip=False
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
# Batch encoding entry point, returns a BatchEncoding
is_split_into_words = kwargs.get("is_split_into_words", False)
if is_split_into_words and not self.add_prefix_space:
# Pretokenized input requires add_prefix_space=True; otherwise raise
raise ValueError(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
# Single-example encoding entry point, returns a BatchEncoding
is_split_into_words = kwargs.get("is_split_into_words", False)
if is_split_into_words and not self.add_prefix_space:
# Pretokenized input requires add_prefix_space=True; otherwise raise
raise ValueError(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Save the vocabulary files into the given directory
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
# Build model inputs by adding the special tokens around the token ids
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
# Start with <s>, then the first sequence, then </s>
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
# With a single sequence, return it directly
if token_ids_1 is None:
return output
# With a pair, append </s>, the second sequence, and a final </s>
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
# Create the token type ids for one or two sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator token id
sep = [self.sep_token_id]
# Classifier (start) token id
cls = [self.cls_token_id]
# Single sequence: zeros over <s> A </s>
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Pair: zeros over <s> A </s></s> B </s>
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
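# --- Illustrative usage sketch (not part of the original file) ---
# Assuming network access to the RUCAIBox/mvp checkpoint. Note that pretokenized input
# requires add_prefix_space=True, otherwise _encode_plus above raises a ValueError.
from transformers import MvpTokenizerFast

fast_tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp", add_prefix_space=True)
encoding = fast_tokenizer(["Hello", "world"], is_split_into_words=True)
print(encoding["input_ids"])  # starts with 0 (<s>) and ends with 2 (</s>)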
.\models\mvp\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_mvp": ["MVP_PRETRAINED_CONFIG_ARCHIVE_MAP", "MvpConfig", "MvpOnnxConfig"],
"tokenization_mvp": ["MvpTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mvp_fast"] = ["MvpTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mvp"] = [
"MVP_PRETRAINED_MODEL_ARCHIVE_LIST",
"MvpForCausalLM",
"MvpForConditionalGeneration",
"MvpForQuestionAnswering",
"MvpForSequenceClassification",
"MvpModel",
"MvpPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mvp import MVP_PRETRAINED_CONFIG_ARCHIVE_MAP, MvpConfig, MvpOnnxConfig
from .tokenization_mvp import MvpTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mvp_fast import MvpTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mvp import (
MVP_PRETRAINED_MODEL_ARCHIVE_LIST,
MvpForCausalLM,
MvpForConditionalGeneration,
MvpForQuestionAnswering,
MvpForSequenceClassification,
MvpModel,
MvpPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
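# --- Illustrative sketch (not part of the original file) ---
# Because of the _LazyModule registration above, users simply import the public names from
# `transformers`; the heavy submodules are only loaded on first attribute access.
from transformers import MvpConfig, MvpTokenizer

config = MvpConfig()  # defaults follow the RUCAIBox/mvp architecture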
.\models\nat\configuration_nat.py
""" Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
NAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/nat-mini-in1k-224": "https://huggingface.co/shi-labs/nat-mini-in1k-224/resolve/main/config.json",
}
class NatConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nat
[shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import NatConfig, NatModel
>>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
>>> configuration = NatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
>>> model = NatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "nat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
initializer_range=0.02,
layer_norm_eps=1e-5,
layer_scale_init_value=0.0,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
self.layer_scale_init_value = layer_scale_init_value
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
.\models\nat\modeling_nat.py
""" PyTorch Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_nat import NatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "NatConfig"
_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
NAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/nat-mini-in1k-224",
]
@dataclass
class NatEncoderOutput(ModelOutput):
"""
Nat encoder's outputs, with potential hidden states and attentions.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列。
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层输出的隐藏状态的元组。
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
模型在每一层输出的注意力权重的元组,用于计算自注意力头中的加权平均值。
Tuple of `torch.FloatTensor` representing attention weights after attention softmax.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层输出的隐藏状态的元组,包含了空间维度重塑后的输出。
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer, reshaped to include spatial dimensions.
"""
# Output fields, all defaulting to None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# NatModelOutput: the NAT model's outputs, including a pooled summary of the last hidden state
@dataclass
class NatModelOutput(ModelOutput):
"""
Nat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
# Output fields of the NAT model
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# NatImageClassifierOutput: outputs of NAT for image classification
@dataclass
class NatImageClassifierOutput(ModelOutput):
"""
Nat outputs for image classification.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(如果 `config.num_labels==1` 则为回归)损失。
分类损失或回归损失的张量。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(如果 `config.num_labels==1` 则为回归)得分(SoftMax 之前)。
模型输出的分类或回归得分(经过 SoftMax 之前的)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor` 元组(当 `output_hidden_states=True` 传入或 `config.output_hidden_states=True` 时返回),
包含形状为 `(batch_size, sequence_length, hidden_size)` 的张量。
每个层的模型隐藏状态以及初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
`torch.FloatTensor` 元组(当 `output_attentions=True` 传入或 `config.output_attentions=True` 时返回),
包含形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的张量。
经过注意力 softmax 后的注意力权重,用于计算自注意力头部的加权平均值。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor` 元组(当 `output_hidden_states=True` 传入或 `config.output_hidden_states=True` 时返回),
包含形状为 `(batch_size, hidden_size, height, width)` 的张量。
每个层的模型隐藏状态以及初始嵌入输出重塑为包括空间维度。
"""
# Optional loss, returned when `labels` is provided
loss: Optional[torch.FloatTensor] = None
# Classification or regression scores of shape `(batch_size, config.num_labels)`
logits: torch.FloatTensor = None
# Optional tuple of hidden states, returned when `output_hidden_states=True`
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of attention weights, returned when `output_attentions=True`
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of reshaped hidden states, returned when `output_hidden_states=True`
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
class NatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = NatPatchEmbeddings(config)  # patch embedding module
self.norm = nn.LayerNorm(config.embed_dim)  # LayerNorm over the embedding dimension
self.dropout = nn.Dropout(config.hidden_dropout_prob)  # dropout on the embeddings
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
embeddings = self.patch_embeddings(pixel_values)  # compute the patch embeddings
embeddings = self.norm(embeddings)  # apply LayerNorm
embeddings = self.dropout(embeddings)  # apply dropout
return embeddings
class NatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass  # patch size 4 needs no extra handling
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")  # only patch size 4 is supported for now
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)  # two stride-2 convolutions that project the pixels to the embedding dimension
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)  # the channel dimension must match the configuration
embeddings = self.projection(pixel_values)  # project the pixels
embeddings = embeddings.permute(0, 2, 3, 1)  # reorder to (batch, height, width, hidden_size)
return embeddings
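# --- Illustrative sketch (not part of the original file) ---
# The two stride-2 convolutions above give an overall 4x spatial reduction (the "patch size
# of 4"); for a 224x224 RGB image with embed_dim=64 the shapes evolve as
# (1, 3, 224, 224) -> (1, 32, 112, 112) -> (1, 64, 56, 56) -> permute -> (1, 56, 56, 64):
import torch
from torch import nn

demo_projection = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
    nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
demo_embeddings = demo_projection(torch.randn(1, 3, 224, 224)).permute(0, 2, 3, 1)
assert demo_embeddings.shape == (1, 56, 56, 64)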
class NatDownsampler(nn.Module):
"""
Convolutional Downsampling Layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)  # normalization layer over the doubled channel dimension
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)  # strided convolution for 2x downsampling
input_feature = self.norm(input_feature)  # apply the normalization layer
return input_feature
# Drop paths (stochastic depth) per sample, applied in the main path of residual blocks
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
# If the drop probability is zero or we are not training, return the input unchanged
if drop_prob == 0.0 or not training:
return input
# Probability of keeping the path
keep_prob = 1 - drop_prob
# Build a random tensor broadcastable over all non-batch dimensions
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize
# Rescale the kept samples and zero out the dropped ones
output = input.div(keep_prob) * random_tensor
return output
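# --- Illustrative sketch (not part of the original file) ---
# Stochastic depth zeroes a whole sample's residual branch with probability drop_prob and
# rescales the survivors by 1 / keep_prob, so the branch keeps the same expected value:
import torch

demo_input = torch.ones(8, 4)
demo_output = drop_path(demo_input, drop_prob=0.5, training=True)
# every row is either all zeros (dropped) or all 2.0 (kept and rescaled by 1 / 0.5)
assert set(demo_output.unique().tolist()) <= {0.0, 2.0}
assert torch.equal(drop_path(demo_input, drop_prob=0.5, training=False), demo_input)  # identity at eval time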
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat
class NatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Delegate to the drop_path function above with this module's drop probability and training flag
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
# Extra repr string showing the drop probability
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
# The hidden size must be divisible by the number of attention heads
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
# rpb is the learnable relative positional bias, the same concept as in Swin
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
# Linear projections for queries, keys and values
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
# Dropout on the attention probabilities, as configured
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
# Reshape the tensor so attention scores can be computed per head
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
# Forward pass: takes the hidden states and whether to return the attention weights
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Project the hidden states to queries and reshape per head
query_layer = self.transpose_for_scores(self.query(hidden_states))
# Project the hidden states to keys and reshape per head
key_layer = self.transpose_for_scores(self.key(hidden_states))
# Project the hidden states to values and reshape per head
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Apply the scaling factor to the queries before computing attention scores; since scalars
# commute in matrix multiplication the result is the same, but this is usually more efficient
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute the "query" x "key" attention scores with the relative positional bias added
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
# Normalize the attention scores to probabilities
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Randomly drop entire attention probabilities, as in the original Transformer paper
attention_probs = self.dropout(attention_probs)
# Weight the values by the attention probabilities to obtain the context
context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1)
# Reorder the context back to (batch, height, width, heads, head_dim)
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
# Merge the head dimensions back into a single hidden dimension
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Optionally return the attention probabilities along with the context
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
# Return the outputs tuple
return outputs
# Output projection module of the neighborhood attention block
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear projection that keeps the dimension at dim
self.dense = nn.Linear(dim, dim)
# Dropout with the configured attention dropout probability
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Apply the linear projection
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
return hidden_states
# Full neighborhood attention module: self-attention plus output projection
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
# The neighborhood self-attention sub-module
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
# The output projection sub-module
self.output = NeighborhoodAttentionOutput(config, dim)
# Set of attention heads that have been pruned
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
# Find the prunable heads and the indices of their parameters
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyper-parameters and remember the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Run neighborhood self-attention on the hidden states
self_outputs = self.self(hidden_states, output_attentions)
# Feed the self-attention output (together with the original hidden states) through the output projection
attention_output = self.output(self_outputs[0], hidden_states)
# Build the output tuple, appending the attention weights if requested
outputs = (attention_output,) + self_outputs[1:]
return outputs
# MLP intermediate layer of a NAT block
class NatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer expanding dim to mlp_ratio * dim
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# Resolve the hidden activation, either from the ACT2FN mapping or as a callable
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the expanding linear layer
hidden_states = self.dense(hidden_states)
# Apply the non-linearity
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# MLP output layer of a NAT block
class NatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer projecting mlp_ratio * dim back to dim
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# Dropout with the configured hidden dropout probability
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass: project back to dim and apply dropout
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the projection
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
# Return the processed tensor
return hidden_states
class NatLayer(nn.Module):
def __init__(self, config, dim, num_heads, drop_path_rate=0.0):
super().__init__()
# Chunk size for the feed-forward pass
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# Neighborhood attention kernel size
self.kernel_size = config.kernel_size
# LayerNorm applied before the attention module
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# Neighborhood attention module
self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size)
# Stochastic depth, or identity when the drop path rate is zero
self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# LayerNorm applied after the attention module
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# MLP intermediate layer
self.intermediate = NatIntermediate(config, dim)
# MLP output layer
self.output = NatOutput(config, dim)
# Optional layer-scale parameters, enabled when layer_scale_init_value > 0
self.layer_scale_parameters = (
nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
if config.layer_scale_init_value > 0
else None
)
def maybe_pad(self, hidden_states, height, width):
# The window size equals the kernel size
window_size = self.kernel_size
pad_values = (0, 0, 0, 0, 0, 0)
# Pad if the input height or width is smaller than the window size
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Unpack batch size, height, width and channels
batch_size, height, width, channels = hidden_states.size()
# Keep the input as the residual shortcut
shortcut = hidden_states
# LayerNorm before attention
hidden_states = self.layernorm_before(hidden_states)
# Pad if the input is smaller than the kernel size
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# Height and width after padding
_, height_pad, width_pad, _ = hidden_states.shape
# Run neighborhood attention
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
# Attention output
attention_output = attention_outputs[0]
# Check whether padding was applied
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
# Crop the attention output back to the original spatial size
attention_output = attention_output[:, :height, :width, :].contiguous()
# Apply the first layer-scale parameter, if enabled
if self.layer_scale_parameters is not None:
attention_output = self.layer_scale_parameters[0] * attention_output
# Residual connection around the attention branch, with drop path
hidden_states = shortcut + self.drop_path(attention_output)
# LayerNorm after attention
layer_output = self.layernorm_after(hidden_states)
# MLP: intermediate expansion followed by the output projection
layer_output = self.output(self.intermediate(layer_output))
# Apply the second layer-scale parameter, if enabled
if self.layer_scale_parameters is not None:
layer_output = self.layer_scale_parameters[1] * layer_output
# Residual connection around the MLP branch, with drop path
layer_output = hidden_states + self.drop_path(layer_output)
# Optionally include the attention weights in the returned tuple
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
class NatStage(nn.Module):
# One stage of the NAT encoder: a stack of NatLayer blocks plus an optional downsampler
def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample):
# Call the parent constructor
super().__init__()
# Store the configuration and the input dimension
self.config = config
self.dim = dim
# Build the stack of NatLayer blocks for this stage
self.layers = nn.ModuleList(
[
NatLayer(
config=config,
dim=dim,
num_heads=num_heads,
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)  # one layer per unit of depth
]
)
# Create the downsampling layer if one was requested
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None  # no downsampling for this stage
# Indicator flag (pointing state)
self.pointing = False
# Forward pass through all layers of this stage, followed by optional downsampling
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()  # spatial size of the input
# Pass the hidden states through every layer of the stage in order
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]  # updated hidden states
hidden_states_before_downsampling = hidden_states  # keep the pre-downsampling states
# Downsample if this stage has a downsampler
if self.downsample is not None:
hidden_states = self.downsample(hidden_states_before_downsampling)
stage_outputs = (hidden_states, hidden_states_before_downsampling)  # stage output tuple
# Append the attention weights if requested
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs  # return the stage outputs
class NatEncoder(nn.Module):
# NAT encoder: a sequence of NatStage levels
def __init__(self, config):
super().__init__()
# Constructor takes the model configuration
# Number of stages, i.e. the length of the depths list
self.num_levels = len(config.depths)
# Keep the configuration around
self.config = config
# Linearly spaced drop-path rates over all layers of the network
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
# Build one NatStage per level
self.levels = nn.ModuleList(
[
NatStage(
config=config,
# embedding dimension of this stage (doubles at every stage)
dim=int(config.embed_dim * 2**i_layer),
# number of layers in this stage
depth=config.depths[i_layer],
# number of attention heads in this stage
num_heads=config.num_heads[i_layer],
# slice of the drop-path schedule belonging to this stage
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
# downsample after every stage except the last one
downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None,
)
# iterate over all levels
for i_layer in range(self.num_levels)
]
)
# Forward pass over all stages
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, NatEncoderOutput]:
all_hidden_states = () if output_hidden_states else None
all_reshaped_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# Rearrange the hidden states from (b, h, w, c) to (b, c, h, w)
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# Record the current hidden states
all_hidden_states += (hidden_states,)
# Record the reshaped hidden states
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
# Run the current stage on the hidden states
layer_outputs = layer_module(hidden_states, output_attentions)
# Hidden states after the stage (including downsampling)
hidden_states = layer_outputs[0]
# Hidden states of the stage before downsampling
hidden_states_before_downsampling = layer_outputs[1]
if output_hidden_states and output_hidden_states_before_downsampling:
# Rearrange the pre-downsampling hidden states to (b, c, h, w)
reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
# Record the pre-downsampling hidden states
all_hidden_states += (hidden_states_before_downsampling,)
# Record their reshaped version
all_reshaped_hidden_states += (reshaped_hidden_state,)
elif output_hidden_states and not output_hidden_states_before_downsampling:
# Rearrange the current hidden states to (b, c, h, w)
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# Record the current hidden states
all_hidden_states += (hidden_states,)
# Record their reshaped version
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
# Record the attention weights of this stage
all_self_attentions += layer_outputs[2:]
if not return_dict:
# Without return_dict, return the non-None parts as a plain tuple
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise, wrap everything in a NatEncoderOutput
return NatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
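# --- Illustrative sketch (not part of the original file) ---
# The `dpr` schedule built in NatEncoder.__init__ ramps the drop-path rate linearly over
# all layers of the network and slices it per stage, so deeper layers drop more often:
import torch

demo_depths, demo_drop_path_rate = [3, 4, 6, 5], 0.1
demo_dpr = [x.item() for x in torch.linspace(0, demo_drop_path_rate, sum(demo_depths))]
demo_per_stage = [demo_dpr[sum(demo_depths[:i]) : sum(demo_depths[: i + 1])] for i in range(len(demo_depths))]
assert [len(rates) for rates in demo_per_stage] == demo_depths
assert demo_dpr[0] == 0.0 and abs(demo_dpr[-1] - demo_drop_path_rate) < 1e-6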
class NatPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# The configuration class for this model
config_class = NatConfig
# Prefix used for the base model's weights
base_model_prefix = "nat"
# Name of the main input
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
# Linear and convolutional layers are initialized from a normal distribution
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version, which uses a truncated normal distribution
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# Biases, if present, are initialized to zero
if module.bias is not None:
module.bias.data.zero_()
# LayerNorm layers get zero bias and unit weight
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
NAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`NatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the inputs accepted by the NAT models
NAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Decorate NatModel with add_start_docstrings to attach the model description and parameter docs
@add_start_docstrings(
"The bare Nat Model transformer outputting raw hidden-states without any specific head on top.",
NAT_START_DOCSTRING,
)
class NatModel(NatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
# Make sure the required natten backend is available
requires_backends(self, ["natten"])
# Store the configuration
self.config = config
# Number of stages in the model
self.num_levels = len(config.depths)
# Dimension of the final feature map
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
# Patch embedding module
self.embeddings = NatEmbeddings(config)
# NAT encoder
self.encoder = NatEncoder(config)
# Final layer normalization
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
# Optional adaptive average pooling layer
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the patch embeddings
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over the layers and the heads to prune in each of them
for layer, heads in heads_to_prune.items():
# Prune the attention heads of the given layer in the encoder
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=NatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatModelOutput]:
# Fall back to the configuration's output_attentions setting when not specified
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Fall back to the configuration's output_hidden_states setting when not specified
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to the configuration's use_return_dict setting when not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# pixel_values is required
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# Compute the patch embeddings
embedding_output = self.embeddings(pixel_values)
# Run the encoder on the embeddings
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Last hidden state of the encoder
sequence_output = encoder_outputs[0]
# Apply the final layer normalization
sequence_output = self.layernorm(sequence_output)
# Pooled output defaults to None
pooled_output = None
# If a pooler is present, average-pool the sequence output
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
# Plain tuple output when return_dict is False
if not return_dict:
# Return the sequence output, the pooled output and any extra encoder outputs
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# Otherwise return a NatModelOutput
return NatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
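# --- Illustrative usage sketch (not part of the original file) ---
# Assuming the `natten` backend and the shi-labs/nat-mini-in1k-224 weights are available,
# the bare model maps a 224x224 input to a (1, 7, 7, 512) feature map (matching
# _EXPECTED_OUTPUT_SHAPE above) plus a (1, 512) pooled vector:
import torch
from transformers import NatModel

demo_model = NatModel.from_pretrained("shi-labs/nat-mini-in1k-224")
with torch.no_grad():
    demo_outputs = demo_model(pixel_values=torch.randn(1, 3, 224, 224))
print(demo_outputs.last_hidden_state.shape)  # torch.Size([1, 7, 7, 512])
print(demo_outputs.pooler_output.shape)      # torch.Size([1, 512])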
# Decorator adding the docstring: NAT transformer with an image classification head (a linear layer on top of the pooled features)
@add_start_docstrings(
"""
Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
NAT_START_DOCSTRING,
)
class NatForImageClassification(NatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Make sure the required natten backend is available
requires_backends(self, ["natten"])
# Number of classification labels
self.num_labels = config.num_labels
# Backbone NAT model
self.nat = NatModel(config)
# Classification head
self.classifier = (
nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
# Decorators documenting the forward method's inputs and adding a code sample
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=NatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the configuration's use_return_dict setting when not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the NAT backbone, forwarding the output flags
outputs = self.nat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Pooled output used for classification
pooled_output = outputs[1]
# Compute the classification logits
logits = self.classifier(pooled_output)
# Loss defaults to None
loss = None
# 如果提供了标签 labels,则计算损失
if labels is not None:
# 如果问题类型未指定,则根据标签类型确定问题类型
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型选择对应的损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
# 对于单标签回归任务,计算均方误差损失
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
# 对于多标签回归任务,计算均方误差损失
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# 对于单标签分类任务,计算交叉熵损失
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# 对于多标签分类任务,计算二元交叉熵损失
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不需要返回字典形式的输出,则按照元组的形式返回结果
if not return_dict:
output = (logits,) + outputs[2:] # outputs[2:] 包含额外的隐藏状态信息
return ((loss,) + output) if loss is not None else output
# 返回自定义的输出类 NatImageClassifierOutput,包含损失、logits、隐藏状态和注意力权重等信息
return NatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
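As a quick, hedged illustration of how this classification head is used end to end (not part of the library source; it assumes the `shi-labs/nat-mini-in1k-224` checkpoint shown in the backbone example below and that the optional `natten` backend is installed):
```
# Hypothetical inference sketch for NatForImageClassification (requires the `natten` backend).
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, NatForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224")

with torch.no_grad():
    outputs = model(**processor(image, return_tensors="pt"))
# The predicted ImageNet class is the argmax over the logits produced by self.classifier.
predicted_label = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```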
# Decorator: document the class as the NAT backbone, usable with frameworks such as DETR and MaskFormer.
# NAT_START_DOCSTRING is the shared, predefined part of the docstring.
@add_start_docstrings(
"NAT backbone, to be used with frameworks like DETR and MaskFormer.",
NAT_START_DOCSTRING,
)
class NatBackbone(NatPreTrainedModel, BackboneMixin):
def __init__(self, config):
# Initialize the PreTrainedModel part with the NAT config
super().__init__(config)
# Initialize the backbone bookkeeping (stage names, out_features, channels)
super()._init_backbone(config)
# Make sure the required "natten" backend is available
requires_backends(self, ["natten"])
# Patch embedding layer
self.embeddings = NatEmbeddings(config)
# Encoder stack
self.encoder = NatEncoder(config)
# Number of channels of each feature stage
self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
# One LayerNorm per requested output stage
hidden_states_norms = {}
for stage, num_channels in zip(self.out_features, self.channels):
hidden_states_norms[stage] = nn.LayerNorm(num_channels)
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
# Initialize weights and apply final processing
self.post_init()
# Return the patch embedding layer
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
# Forward pass, with decorators adding the input docstring and the return-type docstring
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
根据输入的参数返回BackboneOutput对象。
Parameters:
return_dict (bool, optional): 控制是否返回字典形式的输出,默认从self.config.use_return_dict获取。
output_hidden_states (bool, optional): 控制是否输出隐藏状态,默认从self.config.output_hidden_states获取。
output_attentions (bool, optional): 控制是否输出注意力,默认从self.config.output_attentions获取。
Returns:
BackboneOutput: 包含特征图、隐藏状态和注意力的输出对象。
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
... "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 512, 7, 7]
```
"""
# Fall back to self.config.use_return_dict when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Fall back to self.config.output_hidden_states when output_hidden_states is not given
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to self.config.output_attentions when output_attentions is not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Embed the input pixel values
embedding_output = self.embeddings(pixel_values)
# Run the encoder on the embedded patches
outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=True,
output_hidden_states_before_downsampling=True,
return_dict=True,
)
# Hidden states reshaped back to (batch, channels, height, width)
hidden_states = outputs.reshaped_hidden_states
# Feature maps start as an empty tuple
feature_maps = ()
# For every stage requested in out_features, normalize its hidden state and collect it as a feature map
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
# TODO can we simplify this?
batch_size, num_channels, height, width = hidden_state.shape
hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
hidden_state = hidden_state.view(batch_size, height * width, num_channels)
hidden_state = self.hidden_states_norms[stage](hidden_state)
hidden_state = hidden_state.view(batch_size, height, width, num_channels)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
# Tuple output when return_dict is disabled
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# Otherwise return a BackboneOutput with the feature maps, hidden states and attentions
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
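The loop above applies each stage's LayerNorm in channels-last layout before converting back to channels-first feature maps. A standalone sketch of that reshape round-trip with made-up sizes (illustrative only):
```
# Illustrative reshape round-trip used for the per-stage LayerNorm (made-up sizes).
import torch
from torch import nn

batch_size, num_channels, height, width = 2, 64, 7, 7
hidden_state = torch.randn(batch_size, num_channels, height, width)
norm = nn.LayerNorm(num_channels)

# (B, C, H, W) -> (B, H*W, C): LayerNorm normalizes over the channel dimension.
x = hidden_state.permute(0, 2, 3, 1).contiguous().view(batch_size, height * width, num_channels)
x = norm(x)
# Back to (B, C, H, W) for downstream frameworks that expect channels-first maps.
x = x.view(batch_size, height, width, num_channels).permute(0, 3, 1, 2).contiguous()
assert x.shape == hidden_state.shape
```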
.\models\nat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nat"] = [
"NAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"NatForImageClassification",
"NatModel",
"NatPreTrainedModel",
"NatBackbone",
]
if TYPE_CHECKING:
from .configuration_nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nat import (
NAT_PRETRAINED_MODEL_ARCHIVE_LIST,
NatBackbone,
NatForImageClassification,
NatModel,
NatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nezha\configuration_nezha.py
from ... import PretrainedConfig
NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"sijunhe/nezha-cn-base": "https://huggingface.co/sijunhe/nezha-cn-base/resolve/main/config.json",
}
class NezhaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`NezhaModel`]. It is used to instantiate an Nezha
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nezha
[sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Configuration class for the NEZHA model: it defines the model's parameters and hyperparameters.
Args:
vocab_size (`int`, optional, defaults to 21128):
Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the
`input_ids` passed to [`NezhaModel`].
hidden_size (`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, optional, defaults to "gelu"):
The non-linear activation function in the encoder and pooler.
hidden_dropout_prob (`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set to a large value
(e.g. 512, 1024 or 2048).
type_vocab_size (`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed to [`NezhaModel`].
initializer_range (`float`, optional, defaults to 0.02):
The standard deviation of the truncated normal initializer used to initialize all weight matrices.
layer_norm_eps (`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout (`float`, optional, defaults to 0.1):
The dropout ratio for the attached classifiers.
is_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as a decoder. If `False`, the model is used as an encoder.
Example:
```
>>> from transformers import NezhaConfig, NezhaModel
>>> # Initializing an Nezha configuration
>>> configuration = NezhaConfig()
>>> # Initializing a model (with random weights) from the Nezha-base style configuration model
>>> model = NezhaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# Archive map of pretrained NEZHA configuration files
pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP
# Model type identifier
model_type = "nezha"
# Constructor, creates a new configuration instance
def __init__(
self,
vocab_size=21128,  # vocabulary size, defaults to 21128
hidden_size=768,  # hidden size, defaults to 768
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
intermediate_size=3072,  # intermediate (feed-forward) size, defaults to 3072
hidden_act="gelu",  # hidden activation function, defaults to GELU
hidden_dropout_prob=0.1,  # hidden dropout probability, defaults to 0.1
attention_probs_dropout_prob=0.1,  # attention dropout probability, defaults to 0.1
max_position_embeddings=512,  # maximum number of positions, defaults to 512
max_relative_position=64,  # maximum relative position, defaults to 64
type_vocab_size=2,  # token type vocabulary size, defaults to 2
initializer_range=0.02,  # initializer range, defaults to 0.02
layer_norm_eps=1e-12,  # layer norm epsilon, defaults to 1e-12
classifier_dropout=0.1,  # classifier dropout probability, defaults to 0.1
pad_token_id=0,  # padding token id, defaults to 0
bos_token_id=2,  # beginning-of-sequence token id, defaults to 2
eos_token_id=3,  # end-of-sequence token id, defaults to 3
use_cache=True,  # whether to use the cache, defaults to True
**kwargs,  # any additional keyword arguments
):
# Call the parent constructor with the special token ids and the remaining keyword arguments
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store all configuration values on the instance
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.max_relative_position = max_relative_position
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
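A short sketch (not from the source) of overriding the defaults above, including the Nezha-specific `max_relative_position`:
```
# Hypothetical example: a smaller Nezha configuration with non-default hyperparameters.
from transformers import NezhaConfig, NezhaModel

config = NezhaConfig(hidden_size=256, num_hidden_layers=4, num_attention_heads=4, intermediate_size=1024)
model = NezhaModel(config)  # randomly initialized weights
print(config.max_relative_position)  # 64, the Nezha-specific relative-position bound
```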
.\models\nezha\modeling_nezha.py
"""PyTorch Nezha model."""
import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_nezha import NezhaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "sijunhe/nezha-cn-base"
_CONFIG_FOR_DOC = "NezhaConfig"
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"sijunhe/nezha-cn-base",
"sijunhe/nezha-cn-large",
"sijunhe/nezha-base-wwm",
"sijunhe/nezha-large-wwm",
]
def load_tf_weights_in_nezha(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
"""Implement the Functional Relative Position Encoding"""
def __init__(self, length, depth, max_relative_position=127):
super().__init__()
vocab_size = max_relative_position * 2 + 1
range_vec = torch.arange(length)
range_mat = range_vec.repeat(length).view(length, length)
distance_mat = range_mat - torch.t(range_mat)
distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)
final_mat = distance_mat_clipped + max_relative_position
embeddings_table = torch.zeros(vocab_size, depth)
position = torch.arange(0, vocab_size, dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth))
embeddings_table[:, 0::2] = torch.sin(position * div_term)
embeddings_table[:, 1::2] = torch.cos(position * div_term)
flat_relative_positions_matrix = final_mat.view(-1)
one_hot_relative_positions_matrix = torch.nn.functional.one_hot(
flat_relative_positions_matrix, num_classes=vocab_size
).float()
positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table)
my_shape = list(final_mat.size())
my_shape.append(depth)
positions_encoding = positions_encoding.view(my_shape)
self.register_buffer("positions_encoding", positions_encoding, persistent=False)
def forward(self, length):
return self.positions_encoding[:length, :length, :]
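A toy-sized sanity check (illustrative only) of the table built above: for a requested length `L`, `forward` returns an `(L, L, depth)` slice of sinusoidal embeddings indexed by clipped pairwise distances, so each entry depends only on `j - i`.
```
# Toy-sized check of the relative-position encoding shape (illustrative only).
encoding = NezhaRelativePositionsEncoding(length=16, depth=8, max_relative_position=4)
table = encoding(10)   # slice for a 10-token sequence
print(table.shape)     # torch.Size([10, 10, 8])
# All diagonal entries share the distance-0 embedding, i.e. embeddings_table[max_relative_position].
```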
class NezhaEmbeddings(nn.Module):
"""Construct the embeddings from word and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class NezhaSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.relative_positions_encoding = NezhaRelativePositionsEncoding(
length=config.max_position_embeddings,
depth=self.attention_head_size,
max_relative_position=config.max_relative_position,
)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
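`transpose_for_scores` splits the hidden dimension into `(num_heads, head_size)` and moves the head axis forward; a minimal shape sketch with toy sizes (not from the source):
```
# Toy illustration of the head-splitting reshape performed by transpose_for_scores.
import torch

batch, seq_len, hidden = 2, 5, 12
num_heads, head_size = 3, 4          # hidden == num_heads * head_size
x = torch.randn(batch, seq_len, hidden)

x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 3, 5, 4]) -> (batch, num_heads, seq_len, head_size)
```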
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
):
class NezhaSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NezhaAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = NezhaSelfAttention(config)
self.output = NezhaSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class NezhaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class NezhaOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NezhaLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = NezhaAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = NezhaAttention(config)
self.intermediate = NezhaIntermediate(config)
self.output = NezhaOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
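`apply_chunking_to_forward` (imported from `...pytorch_utils`) runs `feed_forward_chunk` on slices of the sequence dimension and concatenates the results, trading a little compute for memory. A hedged sketch of that equivalence with a toy one-argument forward function (illustrative only):
```
# Sketch: chunked feed-forward over the sequence dimension (toy module, illustrative only).
import torch
from torch import nn
from transformers.pytorch_utils import apply_chunking_to_forward

ff = nn.Linear(8, 8)
hidden_states = torch.randn(2, 10, 8)  # (batch, seq_len, hidden)

def ff_chunk(chunk):
    # One-argument wrapper, mirroring NezhaLayer.feed_forward_chunk
    return ff(chunk)

# chunk_size=5 along dim=1 runs the wrapper twice on (2, 5, 8) slices and concatenates the results.
chunked = apply_chunking_to_forward(ff_chunk, 5, 1, hidden_states)
assert torch.allclose(chunked, ff(hidden_states), atol=1e-6)
```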
class NezhaEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([NezhaLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class NezhaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class NezhaPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class NezhaLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = NezhaPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class NezhaOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NezhaLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class NezhaOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class NezhaPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NezhaLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class NezhaPreTrainedModel(PreTrainedModel):
"""
一个处理权重初始化、下载和加载预训练模型的抽象类。
"""
config_class = NezhaConfig
load_tf_weights = load_tf_weights_in_nezha
base_model_prefix = "nezha"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
class NezhaForPreTrainingOutput(ModelOutput):
"""
Output type of [`NezhaForPreTraining`].
"""
loss: Optional[torch.FloatTensor] = None
prediction_logits: torch.FloatTensor = None
seq_relationship_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
"""
This constant defines a docstring for Nezha models, providing an overview and usage guidelines.
It inherits from `PreTrainedModel`, indicating that it leverages methods defined in the superclass
for tasks like downloading, saving, resizing embeddings, and pruning heads.
Additionally, it specifies that the model is a subclass of PyTorch's `torch.nn.Module`, implying
that it can be used as a regular PyTorch module. Users are directed to consult the PyTorch
documentation for general usage and behavior details.
Parameters:
config (`NezhaConfig`): A configuration object holding all model parameters. When initializing
with a config file, only configuration settings are loaded, not model weights. Refer to
`PreTrainedModel.from_pretrained` to load both configuration and weights.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
@add_start_docstrings(
"The bare Nezha Model transformer outputting raw hidden-states without any specific head on top.",
NEZHA_START_DOCSTRING,
)
class NezhaModel(NezhaPreTrainedModel):
"""
# 初始化函数
def __init__(self, config, add_pooling_layer=True):
# 调用父类初始化函数
super().__init__(config)
# 保存配置
self.config = config
# 初始化嵌入层
self.embeddings = NezhaEmbeddings(config)
# 初始化编码器
self.encoder = NezhaEncoder(config)
# 如果需要添加池化层,则初始化池化层
self.pooler = NezhaPooler(config) if add_pooling_layer else None
# 初始化权重并应用最终处理
self.post_init()
# 获取输入嵌入层
def get_input_embeddings(self):
return self.embeddings.word_embeddings
# 设置输入嵌入层
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
# 剪枝模型中的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
# 前向传播函数
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# The multi-line string below documents the two pretraining heads
# (masked language modeling and next sentence prediction / classification).
@add_start_docstrings(
"""
Nezha Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForPreTraining(NezhaPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
self.cls = NezhaPreTrainingHeads(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
next_sentence_label: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
class NezhaForMaskedLM(NezhaPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `NezhaForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.nezha = NezhaModel(config, add_pooling_layer=False)
self.cls = NezhaOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
if self.config.pad_token_id is None:
raise ValueError("The PAD token should be defined for generation")
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
return {"input_ids": input_ids, "attention_mask": attention_mask}
@add_start_docstrings(
"""Nezha Model with a `next sentence prediction (classification)` head on top.""",
NEZHA_START_DOCSTRING,
)
class NezhaForNextSentencePrediction(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
self.cls = NezhaOnlyNSPHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
A [`NextSentencePredictorOutput`] if `return_dict=True`, otherwise a plain tuple containing the same elements.
Example:
```
>>> from transformers import AutoTokenizer, NezhaForNextSentencePrediction
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
>>> model = NezhaForNextSentencePrediction.from_pretrained("sijunhe/nezha-cn-base")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
```
"""
if "next_sentence_label" in kwargs:
warnings.warn(
"The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
" `labels` instead.",
FutureWarning,
)
labels = kwargs.pop("next_sentence_label")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
seq_relationship_scores = self.cls(pooled_output)
next_sentence_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
if not return_dict:
output = (seq_relationship_scores,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
return NextSentencePredictorOutput(
loss=next_sentence_loss,
logits=seq_relationship_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Nezha Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForSequenceClassification(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.nezha = NezhaModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
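The `problem_type` branch above picks MSE, cross-entropy, or BCE-with-logits from `num_labels` and the label dtype. A compact, hypothetical restatement of that decision rule (the function name `infer_problem_type` is ours, not the library's):
```
# Standalone restatement of the problem_type selection used above (illustrative only).
import torch

def infer_problem_type(num_labels: int, labels: torch.Tensor) -> str:
    if num_labels == 1:
        return "regression"                   # MSELoss on squeezed logits
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"  # CrossEntropyLoss
    return "multi_label_classification"       # BCEWithLogitsLoss on float targets

print(infer_problem_type(1, torch.tensor([0.3])))              # regression
print(infer_problem_type(3, torch.tensor([2])))                # single_label_classification
print(infer_problem_type(3, torch.tensor([[1.0, 0.0, 1.0]])))  # multi_label_classification
```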
@add_start_docstrings(
"""
Nezha Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForMultipleChoice(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
print(pooled_output.shape)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
print(logits.shape)
print(num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
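Multiple-choice inputs arrive as `(batch_size, num_choices, seq_len)`, are flattened to one row per choice before the encoder, and the per-choice scores are folded back into `(batch_size, num_choices)`. A toy shape walk-through (illustrative only; the random tensors stand in for real model outputs):
```
# Toy shape walk-through of the flatten / reshape used for multiple choice (illustrative only).
import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 6, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_ids = input_ids.view(-1, input_ids.size(-1))      # (8, 6): one row per (example, choice)
pooled = torch.randn(flat_ids.size(0), hidden)          # stand-in for the pooled encoder output
logits = torch.nn.Linear(hidden, 1)(pooled)              # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): scores grouped per example
print(reshaped_logits.shape)  # torch.Size([2, 4])
```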
@add_start_docstrings(
"""
Nezha Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForTokenClassification(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.nezha = NezhaModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Nezha Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(NEZHA_START_DOCSTRING)
class NezhaForQuestionAnswering(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.nezha = NezhaModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
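The QA head emits two scores per token; `split(1, dim=-1)` separates them into start and end logits, from which a greedy answer span can be read with an argmax. A hedged post-processing sketch (illustrative only):
```
# Sketch: splitting QA logits and reading off a greedy answer span (illustrative only).
import torch

seq_len = 10
logits = torch.randn(1, seq_len, 2)     # stand-in for the qa_outputs projection of one example

start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)  # (1, seq_len)
end_logits = end_logits.squeeze(-1)

start_index = int(start_logits.argmax(dim=-1))
end_index = int(end_logits.argmax(dim=-1))
print(start_index, end_index)  # greedy span; real pipelines also enforce start <= end
```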
.\models\nezha\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_nezha": ["NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP", "NezhaConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nezha"] = [
"NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST",
"NezhaForNextSentencePrediction",
"NezhaForMaskedLM",
"NezhaForPreTraining",
"NezhaForMultipleChoice",
"NezhaForQuestionAnswering",
"NezhaForSequenceClassification",
"NezhaForTokenClassification",
"NezhaModel",
"NezhaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nezha import NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP, NezhaConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nezha import (
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST,
NezhaForMaskedLM,
NezhaForMultipleChoice,
NezhaForNextSentencePrediction,
NezhaForPreTraining,
NezhaForQuestionAnswering,
NezhaForSequenceClassification,
NezhaForTokenClassification,
NezhaModel,
NezhaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nllb\tokenization_nllb.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/nllb-200-distilled-600M": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']
class NllbTokenizer(PreTrainedTokenizer):
"""
Construct an NLLB tokenizer.
Adapted from RobertaTokenizer and XLNetTokenizer. Based on SentencePiece (https://github.com/google/sentencepiece).
The tokenization method is '<tokens> <eos> <language code>' for source language documents, and
'<language code> <tokens> <eos>' for target language documents.
Examples:
```
>>> from transformers import NllbTokenizer
>>> tokenizer = NllbTokenizer.from_pretrained(
...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
... )
```
"""
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values.
tokenizer_file (`str`, *optional*):
The path to a tokenizer file to use instead of the vocab file.
src_lang (`str`, *optional*):
The language to use as source language for translation.
tgt_lang (`str`, *optional*):
The language to use as target language for translation.
sp_model_kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed to the `SentencePieceProcessor` initialization.
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
tokenizer_file=None,
src_lang=None,
tgt_lang=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
legacy_behaviour=False,
**kwargs,
):
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset
@property
def src_lang(self) -> str:
return self._src_lang
@property
def lang_code_to_id(self):
logger.warning_once(
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._lang_code_to_id
@property
def fairseq_tokens_to_ids(self):
logger.warning_once(
"the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._fairseq_tokens_to_ids
@property
def id_to_lang_code(self):
logger.warning_once(
"the `id_to_lang_code` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._id_to_lang_code
@property
def fairseq_ids_to_tokens(self):
logger.warning_once(
"the `fairseq_ids_to_tokens` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._fairseq_ids_to_tokens
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
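The mask simply marks every prefix and suffix token with 1 and every regular token with 0. A minimal, self-contained re-implementation of that logic (a hypothetical helper, not part of the library) makes the shapes explicit:
```
from typing import List, Optional

def special_tokens_mask(
    token_ids_0: List[int],
    token_ids_1: Optional[List[int]] = None,
    prefix_len: int = 1,  # e.g. [src_lang_code] in the default (non-legacy) mode
    suffix_len: int = 1,  # e.g. [eos]
) -> List[int]:
    # 1 for special (prefix/suffix) positions, 0 for sequence tokens
    prefix_ones = [1] * prefix_len
    suffix_ones = [1] * suffix_len
    if token_ids_1 is None:
        return prefix_ones + [0] * len(token_ids_0) + suffix_ones
    return prefix_ones + [0] * len(token_ids_0) + [0] * len(token_ids_1) + suffix_ones

print(special_tokens_mask([100, 200, 300]))         # [1, 0, 0, 0, 1]
print(special_tokens_mask([100, 200], [300, 400]))  # [1, 0, 0, 0, 0, 1]
```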
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
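A quick sanity check of the resulting layout. This is a sketch that assumes the `facebook/nllb-200-distilled-600M` checkpoint can be downloaded; in the default (non-legacy) mode the language code is the prefix and `</s>` the suffix:
```
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn"
)
ids = tokenizer("Hello world")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# Expected form: ['eng_Latn', '▁Hello', '▁world', '</s>']
```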
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
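`_build_translation_inputs` is what the `translation` pipeline calls under the hood: it encodes the source text with `src_lang` set and stores the target language id as `forced_bos_token_id`, so that `generate` starts decoding with the right language code. A hedged end-to-end sketch, assuming the `facebook/nllb-200-distilled-600M` checkpoint is available:
```
from transformers import pipeline

translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
)
print(translator("UN Chief says there is no military solution in Syria"))
```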
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
return self.sp_model.IdToPiece(index - self.fairseq_offset)
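Note the `if spm_id` guard above: `sp_model.PieceToId` returns 0 for out-of-vocabulary pieces, so 0 is mapped to the unknown-token id instead of being shifted by `fairseq_offset`. A toy illustration of the arithmetic with purely hypothetical values (the real offset and unk id come from the tokenizer's fairseq alignment):
```
# Hypothetical values, chosen only to illustrate the offset bookkeeping.
fairseq_offset = 1
unk_token_id = 3

def token_to_id(spm_id: int) -> int:
    # PieceToId returns 0 for unknown pieces, hence the falsy check
    return spm_id + fairseq_offset if spm_id else unk_token_id

def id_to_spm_index(index: int) -> int:
    return index - fairseq_offset

print(token_to_id(5), token_to_id(0), id_to_spm_index(6))  # 6 3 5
```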
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
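The detokenization step only needs to undo the SentencePiece whitespace marker; no model files are required to see it in action:
```
SPIECE_UNDERLINE = "▁"
tokens = ["▁Hello", "▁world", "!"]
print("".join(tokens).replace(SPIECE_UNDERLINE, " ").strip())  # -> "Hello world!"
```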
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng_Latn",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra_Latn",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
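These two hooks let the regular `__call__` API prepare encoder and decoder inputs with the right language codes: when `text_target` is passed, the tokenizer temporarily switches to target mode. A usage sketch, again assuming the `facebook/nllb-200-distilled-600M` checkpoint:
```
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
)
batch = tokenizer(
    ["UN Chief says there is no military solution in Syria"],
    text_target=["Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["labels"].shape)
```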
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting.
- In legacy mode: No prefix and suffix=[eos, src_lang_code].
- In default mode: Prefix=[src_lang_code], suffix = [eos]
"""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target lang setting.
- In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
- In default mode: Prefix=[tgt_lang_code], suffix = [eos]
"""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
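The two modes only differ in where the language code goes. With placeholder ids the layouts look like this (pure illustration; the ids are not real vocabulary entries):
```
eos_id, lang_code_id, x = 2, 999999, [10, 11, 12]

legacy = [] + x + [eos_id, lang_code_id]    # legacy mode:  X </s> lang_code
default = [lang_code_id] + x + [eos_id]     # default mode: lang_code X </s>
print(legacy)
print(default)
```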
.\models\nllb\tokenization_nllb_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_nllb import NllbTokenizer
else:
NllbTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/nllb-large-en-ro": 1024,
"facebook/nllb-200-distilled-600M": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']
class NllbTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from `PreTrainedTokenizerFast`, which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = NllbTokenizer
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
src_lang=None,
tgt_lang=None,
additional_special_tokens=None,
legacy_behaviour=False,
**kwargs,
):
if additional_special_tokens is None:
additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
self.vocab_file = vocab_file
mask_token = (
AddedToken(mask_token, normalized=True, lstrip=True, special=True)
if isinstance(mask_token, str)
else mask_token
)
self.legacy_behaviour = legacy_behaviour
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
legacy_behaviour=legacy_behaviour,
**kwargs,
)
self._lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(str(lang_code)) for lang_code in additional_special_tokens
}
self._src_lang = src_lang if src_lang is not None else "eng_Latn"
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
@property
def lang_code_to_id(self):
logger.warning_once(
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._lang_code_to_id
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
An NLLB sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng_Latn",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra_Latn",
**kwargs,
):
"""
Prepare a batch of inputs for sequence-to-sequence tasks, including source and target texts and languages.
Args:
src_texts (`List[str]`):
List of source texts.
src_lang (`str`):
Source language identifier.
tgt_texts (`List[str]`, *optional*):
List of target texts.
tgt_lang (`str`):
Target language identifier.
**kwargs:
Additional keyword arguments for further customization.
Returns:
Dictionary containing prepared inputs for the model.
"""
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting.
- In legacy mode: No prefix and suffix=[eos, src_lang_code].
- In default mode: Prefix=[src_lang_code], suffix=[eos].
"""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target lang setting.
- In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
- In default mode: Prefix=[tgt_lang_code], suffix=[eos].
"""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
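Unlike the slow tokenizer, the fast one must rebuild its `tokenizers` post-processor every time the language changes, because the backend applies special tokens through a fixed template. A self-contained sketch of the same idea using the `tokenizers` API directly (token strings and ids are placeholders):
```
from tokenizers import processors

post_processor = processors.TemplateProcessing(
    single=["fra_Latn", "$A", "</s>"],
    pair=["fra_Latn", "$A", "$B", "</s>"],
    special_tokens=[("fra_Latn", 999999), ("</s>", 2)],
)
# Attached via `tokenizer._tokenizer.post_processor = post_processor`, every encoded
# sequence is then wrapped as: <lang_code> X </s>
```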
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\nllb\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb"] = ["NllbTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb_fast"] = ["NllbTokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb import NllbTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb_fast import NllbTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nllb_moe\configuration_nllb_moe.py
"""
NLLB-MoE model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/nllb-moe-54B": "https://huggingface.co/facebook/nllb-moe-54b/resolve/main/config.json",
}
class NllbMoeConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NllbMoeModel`]. It is used to instantiate an
NLLB-MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the NLLB-MoE
[facebook/nllb-moe-54b](https://huggingface.co/facebook/nllb-moe-54b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import NllbMoeModel, NllbMoeConfig
>>> # Initializing a NllbMoe facebook/nllb-moe-54b style configuration
>>> configuration = NllbMoeConfig()
>>> # Initializing a model from the facebook/nllb-moe-54b style configuration
>>> model = NllbMoeModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "nllb-moe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=128112,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.05,
decoder_layerdrop=0.05,
use_cache=True,
is_encoder_decoder=True,
activation_function="relu",
d_model=1024,
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
scale_embedding=True,
router_bias=False,
router_dtype="float32",
router_ignore_padding_tokens=False,
num_experts=128,
expert_capacity=64,
encoder_sparse_step=4,
decoder_sparse_step=4,
router_z_loss_coef=0.001,
router_aux_loss_coef=0.001,
second_expert_policy="all",
normalize_router_prob_before_dropping=False,
batch_prioritized_routing=False,
moe_eval_capacity_token_fraction=1.0,
moe_token_dropout=0.2,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
output_router_logits=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.router_z_loss_coef = router_z_loss_coef
self.router_aux_loss_coef = router_aux_loss_coef
self.decoder_sparse_step = decoder_sparse_step
self.encoder_sparse_step = encoder_sparse_step
self.num_experts = num_experts
self.expert_capacity = expert_capacity
self.router_bias = router_bias
if router_dtype not in ["float32", "float16", "bfloat16"]:
raise ValueError(f"`router_dtype` must be one of 'float32', 'float16' or 'bfloat16', got {router_dtype}")
self.router_dtype = router_dtype
self.router_ignore_padding_tokens = router_ignore_padding_tokens
self.batch_prioritized_routing = batch_prioritized_routing
self.second_expert_policy = second_expert_policy
self.normalize_router_prob_before_dropping = normalize_router_prob_before_dropping
self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
self.moe_token_dropout = moe_token_dropout
self.output_router_logits = output_router_logits
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
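`encoder_sparse_step`/`decoder_sparse_step` control how often a dense feed-forward block is replaced by a sparse MoE block. Assuming, as in comparable MoE implementations, that layer `i` is sparse when `(i + 1) % sparse_step == 0`, a 12-layer encoder with step 4 gets three MoE layers:
```
from transformers import NllbMoeConfig

# Small expert count so the config is cheap to instantiate for inspection.
config = NllbMoeConfig(encoder_layers=12, encoder_sparse_step=4, num_experts=8)
sparse_layers = [
    i for i in range(config.encoder_layers)
    if (i + 1) % config.encoder_sparse_step == 0
]
print(sparse_layers)  # [3, 7, 11]
```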
.\models\nllb_moe\convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
import argparse
import json
import os
import torch
from torch import nn
from transformers import NllbMoeConfig, NllbMoeModel
from transformers.modeling_utils import dtype_byte_size
from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"encoder.embed_positions._float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def rename_fairseq_keys(state_dict, expert_idx=None):
new_dict = {}
for old_key in state_dict.keys():
key = old_key
if "moe_layer.experts." in key:
if expert_idx is not None:
key = key.replace("moe_layer.experts.0", f"ffn.experts.expert_{expert_idx}")
else:
key = key.replace("moe_layer.experts.", "ffn.experts.expert_")
if "gate" in key:
key = key.replace(".moe_layer.gate.wg", ".ffn.router.classifier")
if "fc2" in key and "experts" not in key:
key = key.replace(".fc2.", ".ffn.fc2.")
if "fc1" in key and "experts" not in key:
key = key.replace(".fc1.", ".ffn.fc1.")
if ".encoder_attn." in key:
key = key.replace(".encoder_attn.", ".cross_attention.")
if "encoder_attn_layer_norm" in key:
key = key.replace("encoder_attn_layer_norm", "cross_attention_layer_norm")
if "final_layer_norm" in key:
key = key.replace("final_layer_norm", "ff_layer_norm")
new_dict[key] = state_dict[old_key]
return new_dict
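A quick check of the renaming on a toy fairseq-style state dict (this assumes the `rename_fairseq_keys` function above is in scope; the values are dummies since only the keys matter here):
```
toy_state = {
    "encoder.layers.0.moe_layer.experts.0.fc1.weight": None,
    "encoder.layers.0.moe_layer.gate.wg.weight": None,
    "decoder.layers.3.encoder_attn.k_proj.weight": None,
}
for key in rename_fairseq_keys(toy_state, expert_idx=7):
    print(key)
# encoder.layers.0.ffn.experts.expert_7.fc1.weight
# encoder.layers.0.ffn.router.classifier.weight
# decoder.layers.3.cross_attention.k_proj.weight
```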
def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weights_name: str = WEIGHTS_NAME):
sharded_state_dicts = []
total_size = 0
os.makedirs(dump_path, exist_ok=True)
for expert in range(num_experts):
expert_path = switch_checkpoint_path + f"-rank-{expert}.pt"
if os.path.isfile(expert_path):
expert_state = torch.load(expert_path)["model"]
remove_ignore_keys_(expert_state)
expert_state = rename_fairseq_keys(expert_state, expert)
save_path = os.path.join(
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin")
)
torch.save(expert_state, save_path)
sharded_state_dicts.append(expert_state.keys())
total_size += sum([value.numel() for key, value in expert_state.items()]) * dtype_byte_size(
expert_state[list(expert_state)[0]].dtype
)
save_path = os.path.join(dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin"))
shared_weights = torch.load(switch_checkpoint_path + "-shared.pt")["model"]
remove_ignore_keys_(shared_weights)
shared_weights = rename_fairseq_keys(shared_weights, None)
shared_weights["shared.weight"] = shared_weights["decoder.embed_tokens.weight"]
sharded_state_dicts.append(shared_weights.keys())
if len(sharded_state_dicts) == 1:
save_path = os.path.join(dump_path, weights_name)
torch.save(shared_weights, save_path)
return {weights_name: sharded_state_dicts[0]}, None
else:
torch.save(shared_weights, save_path)
weight_map = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
temp_filename = os.path.join(dump_path, weights_name.replace(".bin", f"-{idx+1:05d}-of-???.bin"))
os.rename(temp_filename, os.path.join(dump_path, shard_file))
for key in shard:
weight_map[key] = shard_file
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
with open(os.path.join(dump_path, WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
return metadata, index
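The index written to `WEIGHTS_INDEX_NAME` is the standard sharded-checkpoint index: a per-parameter mapping to the shard file that stores it, plus the total byte size. It has roughly this shape (illustrative values only):
```
# Hypothetical excerpt of the index JSON, shown as a Python dict.
index_example = {
    "metadata": {"total_size": 123456789},
    "weight_map": {
        "encoder.layers.3.ffn.experts.expert_0.fc1.weight": "pytorch_model-00001-of-00129.bin",
        "shared.weight": "pytorch_model-00129-of-00129.bin",
    },
}
```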
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--nllb_moe_checkpoint_path",
default="/home/arthur_huggingface_co/fairseq/weights/checkpoints/model_moe_54b/checkpoint_2_300000",
type=str,
required=False,
help="Path to a directory containing a folder per layer. Follows the original Google format.",
)
parser.add_argument("--dtype", default="float32", type=str, required=False, help="dtype of the saved model")
parser.add_argument(
"--pytorch_dump_folder_path",
default="/home/arthur_huggingface_co/fairseq/weights/checkpoints/hf-converted-moe-54b",
type=str,
required=False,
help="Path to the output pytorch model.",
)
args = parser.parse_args()
metadata, index = shard_on_the_fly(
args.nllb_moe_checkpoint_path,
args.pytorch_dump_folder_path,
128,
args.dtype,
)
config = NllbMoeConfig.from_pretrained(
"facebook/nllb-200-3.3B", encoder_sparse_step=4, decoder_sparse_step=4, num_experts=128
)
config.save_pretrained(args.pytorch_dump_folder_path)
model = NllbMoeModel.from_pretrained(args.pytorch_dump_folder_path)
print("Done")
model.save_pretrained(args.pytorch_dump_folder_path)