Transformers Source Code Analysis (Part 16)
.\models\bert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tensorflow_text_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_bert": ["BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BertConfig", "BertOnnxConfig"],
"tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bert"] = [
"BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BertForMaskedLM",
"BertForMultipleChoice",
"BertForNextSentencePrediction",
"BertForPreTraining",
"BertForQuestionAnswering",
"BertForSequenceClassification",
"BertForTokenClassification",
"BertLayer",
"BertLMHeadModel",
"BertModel",
"BertPreTrainedModel",
"load_tf_weights_in_bert",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_bert"] = [
"TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFBertEmbeddings",
"TFBertForMaskedLM",
"TFBertForMultipleChoice",
"TFBertForNextSentencePrediction",
"TFBertForPreTraining",
"TFBertForQuestionAnswering",
"TFBertForSequenceClassification",
"TFBertForTokenClassification",
"TFBertLMHeadModel",
"TFBertMainLayer",
"TFBertModel",
"TFBertPreTrainedModel",
]
try:
if not is_tensorflow_text_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_bert"] = [
"FlaxBertForCausalLM",
"FlaxBertForMaskedLM",
"FlaxBertForMultipleChoice",
"FlaxBertForNextSentencePrediction",
"FlaxBertForPreTraining",
"FlaxBertForQuestionAnswering",
"FlaxBertForSequenceClassification",
"FlaxBertForTokenClassification",
"FlaxBertModel",
"FlaxBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_bert_fast import BertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
BertForMaskedLM,
BertForMultipleChoice,
BertForNextSentencePrediction,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertLayer,
BertLMHeadModel,
BertModel,
BertPreTrainedModel,
load_tf_weights_in_bert,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_bert import (
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFBertEmbeddings,
TFBertForMaskedLM,
TFBertForMultipleChoice,
TFBertForNextSentencePrediction,
TFBertForPreTraining,
TFBertForQuestionAnswering,
TFBertForSequenceClassification,
TFBertForTokenClassification,
TFBertLMHeadModel,
TFBertMainLayer,
TFBertModel,
TFBertPreTrainedModel,
)
try:
if not is_tensorflow_text_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_bert_tf import TFBertTokenizer
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_bert import (
FlaxBertForCausalLM,
FlaxBertForMaskedLM,
FlaxBertForMultipleChoice,
FlaxBertForNextSentencePrediction,
FlaxBertForPreTraining,
FlaxBertForQuestionAnswering,
FlaxBertForSequenceClassification,
FlaxBertForTokenClassification,
FlaxBertModel,
FlaxBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
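# --- Illustrative usage sketch (not part of the library source) ---
# Because of the _LazyModule registration above, names exported by transformers.models.bert are
# resolved lazily: importing the config or slow tokenizer pulls in no deep-learning backend, and
# backend-specific classes only import when that backend is installed.
from transformers.models.bert import BertConfig, BertTokenizer

config = BertConfig(num_hidden_layers=2, hidden_size=128, num_attention_heads=2)
print(type(config).__name__, config.num_hidden_layers)  # BertConfig 2
# from transformers.models.bert import BertModel  # only succeeds when torch is available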
.\models\bertweet\tokenization_bertweet.py
import html
import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple
import regex
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.txt",
"merges_file": "bpe.codes",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/vocab.txt",
},
"merges_file": {
"vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/bpe.codes",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"vinai/bertweet-base": 128,
}
def get_pairs(word):
"""
Return the set of symbol pairs in a word.
A word is represented as a tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
pairs = set(pairs)
return pairs
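# Illustrative check (not part of the source): for the symbol tuple of "low" with the
# end-of-word marker appended, get_pairs returns the adjacent symbol pairs that the BPE
# loop below will rank and merge.
print(get_pairs(("l", "o", "w</w>")))  # {('l', 'o'), ('o', 'w</w>')} (a set, so order may vary)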
class BertweetTokenizer(PreTrainedTokenizer):
"""
Construct a BERTweet tokenizer, using Byte-Pair Encoding.
This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
normalization=False,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs,
):
try:
from emoji import demojize
self.demojizer = demojize
except ImportError:
logger.warning(
"emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3"
" install emoji==0.6.0"
)
self.demojizer = None
self.vocab_file = vocab_file
self.merges_file = merges_file
self.encoder = {}
self.encoder[str(bos_token)] = 0
self.encoder[str(pad_token)] = 1
self.encoder[str(eos_token)] = 2
self.encoder[str(unk_token)] = 3
self.add_from_file(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:-1]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
self.normalization = normalization
self.tweetPreprocessor = TweetTokenizer()
self.special_puncts = {"’": "'", "…": "..."}
super().__init__(
normalization=normalization,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
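# Illustrative behaviour of the two methods above (not part of the source). With
# token_ids_0 = [5, 6] and token_ids_1 = [7], BERTweet builds <s> 5 6 </s> </s> 7 </s>, so:
#   get_special_tokens_mask([5, 6], [7])              -> [1, 0, 0, 1, 1, 0, 1]
#   create_token_type_ids_from_sequences([5, 6], [7]) -> [0, 0, 0, 0, 0, 0, 0]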
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
if self.normalization:
text = self.normalizeTweet(text)
split_tokens = []
words = re.findall(r"\S+\n?", text)
for token in words:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
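# Self-contained sketch of the merge loop above (illustrative; uses a toy merge table instead of
# the real bpe.codes file). It mirrors bpe()/get_pairs() to show how a token is split into
# subwords, with "@@" marking pieces that continue in the next subword.
def _toy_bpe(token, bpe_ranks):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)
    pairs = get_pairs(word)
    if not pairs:
        return token
    while True:
        # pick the pair with the lowest merge rank; stop when no known pair is left
        bigram = min(pairs, key=lambda pair: bpe_ranks.get(pair, float("inf")))
        if bigram not in bpe_ranks:
            break
        first, second = bigram
        new_word, i = [], 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j
            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)
    return "@@ ".join(word)[:-4]

print(_toy_bpe("low", {("l", "o"): 0}))  # "lo@@ w" -> _tokenize would emit ["lo@@", "w"]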
def normalizeTweet(self, tweet):
"""
Normalize a raw Tweet
"""
for punct in self.special_puncts:
tweet = tweet.replace(punct, self.special_puncts[punct])
tokens = self.tweetPreprocessor.tokenize(tweet)
normTweet = " ".join([self.normalizeToken(token) for token in tokens])
normTweet = (
normTweet.replace("cannot ", "can not ")
.replace("n't ", " n't ")
.replace("n 't ", " n't ")
.replace("ca n't", "can't")
.replace("ai n't", "ain't")
)
normTweet = (
normTweet.replace("'m ", " 'm ")
.replace("'re ", " 're ")
.replace("'s ", " 's ")
.replace("'ll ", " 'll ")
.replace("'d ", " 'd ")
.replace("'ve ", " 've ")
)
normTweet = (
normTweet.replace(" p . m .", " p.m.")
.replace(" p . m ", " p.m ")
.replace(" a . m .", " a.m.")
.replace(" a . m ", " a.m ")
)
return " ".join(normTweet.split())
def normalizeToken(self, token):
lowercased_token = token.lower()
if token.startswith("@"):
return "@USER"
elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
return "HTTPURL"
elif len(token) == 1:
if token in self.special_puncts:
return self.special_puncts[token]
if self.demojizer is not None:
return self.demojizer(token)
else:
return token
else:
return token
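# Illustrative behaviour of normalizeToken (not part of the source): user mentions and URLs are
# collapsed to placeholder tokens, single-character special punctuation is mapped through
# special_puncts, and other tokens pass through unchanged (emojis are demojized when the
# emoji package is installed). Roughly:
#   "@huggingface"        -> "@USER"
#   "https://example.com" -> "HTTPURL"
#   "’"                   -> "'"
#   "hello"               -> "hello"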
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
out_merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
copyfile(self.merges_file, out_merge_file)
return out_vocab_file, out_merge_file
def add_from_file(self, f):
"""
Load a pre-existing dictionary from a text file and add its symbols to this instance.
"""
if isinstance(f, str):
try:
with open(f, "r", encoding="utf-8") as fd:
self.add_from_file(fd)
except FileNotFoundError as fnfe:
raise fnfe
except UnicodeError:
raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
return
lines = f.readlines()
for lineTmp in lines:
line = lineTmp.strip()
idx = line.rfind(" ")
if idx == -1:
raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
word = line[:idx]
self.encoder[word] = len(self.encoder)
"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression strings.
2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.
3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
the class Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will lowercase everything except for emoticons.
"""
EMOTICONS = r"""
(?:
[<>]? # optional opening angle bracket
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]? # optional closing angle bracket
|
<3 # heart
)"""
URLS = r""" # Capture 1: entire matched URL
(?:
https?: # URL protocol and colon
(?:
/{1,3} # 1-3 slashes
| # or
[a-z0-9%] # Single letter or digit or '%'
# (Trying not to match e.g. "URI::Escape")
)
| # or
# looks like domain name followed by a slash:
[a-z0-9.\-]+[.]
(?:[a-z]{2,13})
/
)
(?:                                 # One or more:
[^\s()<>{}\[\]]+                    # Run of non-space, non-()<>{}[]
|                                   # or
\([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\)                         # balanced parens, non-recursive: (...)
)+
(?:                                 # End with:
\([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\)                         # balanced parens, non-recursive: (...)
|                                   # or
[^\s`!()\[\]{};:'".,<>?«»“”‘’]      # not a space or one of these punctuation characters
)
|                                   # OR, to match bare domain names:
(?:
(?<!@)                              # not preceded by a @, avoids matching e.g. "foo@_gmail.com_"
[a-z0-9]+
(?:[.\-][a-z0-9]+)*
[.]
(?:[a-z]{2,13})
\b
/?
(?!@)                               # not followed by a @, avoids matching "foo.na" in "foo.na@example.com"
)
"""
# The pattern above matches URLs with balanced parentheses as well as bare domain names.
# Regular expression patterns used to recognize the different token types:
# URLs, phone numbers, ASCII emoticons, HTML tags, ASCII arrows, Twitter usernames,
# Twitter hashtags, email addresses, and the remaining word classes.
REGEXPS = (
URLS,  # matches URLs
r"""
(?:
(?:            # (international)
\+?[01]
[ *\-.\)]*
)?
(?:            # (area code)
[\(]?
\d{3}
[ *\-.\)]*
)?
\d{3}          # exchange
[ *\-.\)]*
\d{4}          # base
)""",  # matches phone numbers
EMOTICONS,  # matches ASCII emoticons
r"""<[^>\s]+>""",  # matches HTML tags
r"""[\-]+>|<[\-]+""",  # matches ASCII arrows
r"""(?:@[\w_]+)""",  # matches Twitter usernames
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",  # matches Twitter hashtags
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",  # matches email addresses
r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])  # words with apostrophes or dashes
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?)             # numbers, including fractions and decimals
|
(?:[\w_]+)                                # words without apostrophes or dashes
|
(?:\.(?:\s*\.){1,})                       # ellipsis dots
|
(?:\S)                                    # everything else that isn't whitespace
""",  # matches the remaining word classes
)
######################################################################
# This is the core tokenizing regex:
# all the patterns in REGEXPS are combined into one large regular expression
WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
# HANG_RE matches long runs of the same non-alphanumeric character
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
# EMOTICON_RE matches emoticons
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
# ENT_RE is used to convert HTML entities into Unicode characters
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
# Import the HTML-entity replacement helper
from nltk.tokenize.casual import _replace_html_entities
# Replacing HTML entities in a byte string returns the decoded Unicode string, for example:
#   _replace_html_entities(b"Price: &pound;100")         -> 'Price: \xa3100'
#   print(_replace_html_entities(b"Price: &pound;100"))  -> Price: £100
class TweetTokenizer:
r"""
Examples:
```
>>>
>>> from nltk.tokenize import TweetTokenizer
>>> tknzr = TweetTokenizer()
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
>>> tknzr.tokenize(s0)
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
>>>
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
>>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
>>> tknzr.tokenize(s1)
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
```"""
def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
# Initialize the TweetTokenizer with options to preserve case, reduce elongated words, and strip handles.
self.preserve_case = preserve_case
self.reduce_len = reduce_len
self.strip_handles = strip_handles
def tokenize(self, text):
"""
Tokenize a given text into a list of words.
Args:
text: str
Returns:
list(str): A list of tokens extracted from the text.
"""
# Fix HTML character entities before tokenization
text = _replace_html_entities(text)
# Remove Twitter handles if strip_handles is enabled
if self.strip_handles:
text = remove_handles(text)
# Reduce elongated words to their base form if reduce_len is enabled
if self.reduce_len:
text = reduce_lengthening(text)
# Replace problematic sequences of characters for safe tokenization
safe_text = HANG_RE.sub(r"\1\1\1", text)
# Tokenize the text using a regular expression for word boundaries
words = WORD_RE.findall(safe_text)
# Adjust word case unless it is part of an emoticon to preserve emoticon capitalization
if not self.preserve_case:
words = [x if EMOTICON_RE.search(x) else x.lower() for x in words]
return words
######################################################################
# Normalization Functions
######################################################################
def reduce_lengthening(text):
"""
Reduce repeated character sequences of length 3 or greater to sequences of length 3.
Args:
text: str
Returns:
str: Text with reduced elongations.
"""
pattern = regex.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)
def remove_handles(text):
"""
Remove Twitter username handles from text.
Args:
text: str
Returns:
str: Text with removed handles replaced by spaces.
"""
pattern = regex.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
# Substitute handles with ' ' to ensure correct tokenization around removed handles
return pattern.sub(" ", text)
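# Quick checks of the two normalization helpers above (illustrative, not part of the source):
print(reduce_lengthening("waaaaayyyy"))  # waaayyy -- runs of 3+ identical characters are capped at 3
print(remove_handles("hi @remy !"))      # the @-handle is replaced by a single space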
######################################################################
# Tokenization Function
######################################################################
def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
"""
Tokenize a text string using casual tokenization rules.
Args:
text: str
preserve_case: bool, optional (default=True)
Whether to preserve case in tokens.
reduce_len: bool, optional (default=False)
Whether to reduce elongated words.
strip_handles: bool, optional (default=False)
Whether to remove Twitter handles.
Returns:
list(str): A list of tokens extracted from the text based on specified rules.
"""
# Create a TweetTokenizer configured with the given case, length-reduction, and handle-stripping
# options, then tokenize the text with it and return the resulting list of tokens
return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
text
)
###############################################################################
# Define a function `calculate_total` that takes a single argument `items`
def calculate_total(items):
# Initialize an accumulator `total` for the running sum
total = 0
# Iterate over every element of `items` and add it to `total`
for item in items:
total += item
# Return the accumulated total
return total
.\models\bertweet\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
_import_structure = {"tokenization_bertweet": ["BertweetTokenizer"]}
if TYPE_CHECKING:
from .tokenization_bertweet import BertweetTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bert_generation\configuration_bert_generation.py
""" BertGeneration model configuration"""
from ...configuration_utils import PretrainedConfig
class BertGenerationConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BertGenerationPreTrainedModel`]. It is used to
instantiate a BertGeneration model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the BertGeneration
[google/bert_for_seq_generation_L-24_bbc_encoder](https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import BertGenerationConfig, BertGenerationEncoder
>>> # Initializing a BertGeneration config
>>> configuration = BertGenerationConfig()
>>> # Initializing a model (with random weights) from the config
>>> model = BertGenerationEncoder(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""

# Set the model_type attribute to "bert-generation"
model_type = "bert-generation"
# Define the constructor (__init__) method for initializing an instance of BertGenerationConfig
def __init__(
self,
vocab_size=50358, # Size of the vocabulary used by the model
hidden_size=1024, # Dimensionality of the encoder layers and the pooler layer
num_hidden_layers=24, # Number of hidden layers in the Transformer encoder
num_attention_heads=16, # Number of attention heads for each attention layer in the Transformer encoder
intermediate_size=4096, # Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder
hidden_act="gelu", # The activation function to be used in the hidden layers
hidden_dropout_prob=0.1, # The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
attention_probs_dropout_prob=0.1, # The dropout ratio for the attention probabilities
max_position_embeddings=512, # The maximum sequence length that this model might ever be used with
initializer_range=0.02, # The standard deviation of the truncated_normal_initializer for initializing all weight matrices
layer_norm_eps=1e-12, # The epsilon used by the layer normalization layers
pad_token_id=0, # The token id for padding
bos_token_id=2, # The token id for the beginning of sentence token
eos_token_id=1, # The token id for the end of sentence token
position_embedding_type="absolute", # Type of position embedding to use
use_cache=True, # Whether to use an output cache
**kwargs, # Additional keyword arguments for future expansion
):
# Call the constructor of the base class, passing only the special token ids and any remaining kwargs
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Vocabulary size of the model
self.vocab_size = vocab_size
# Dimensionality of the encoder layers and the pooler layer
self.hidden_size = hidden_size
# Number of hidden layers in the Transformer encoder
self.num_hidden_layers = num_hidden_layers
# Number of attention heads for each attention layer
self.num_attention_heads = num_attention_heads
# Activation function used in the hidden layers
self.hidden_act = hidden_act
# Dimensionality of the intermediate (feed-forward) layer
self.intermediate_size = intermediate_size
# Dropout probability for the fully connected layers
self.hidden_dropout_prob = hidden_dropout_prob
# Dropout ratio for the attention probabilities
self.attention_probs_dropout_prob = attention_probs_dropout_prob
# Maximum sequence length the model might be used with
self.max_position_embeddings = max_position_embeddings
# Standard deviation of the truncated-normal initializer
self.initializer_range = initializer_range
# Epsilon used by the layer normalization layers
self.layer_norm_eps = layer_norm_eps
# Type of position embedding to use
self.position_embedding_type = position_embedding_type
# Whether the model should use the past key/values cache
self.use_cache = use_cache
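# Short usage sketch (not part of the source): build a small config and read back a few of the
# hyperparameters stored by __init__ above.
from transformers import BertGenerationConfig

small_config = BertGenerationConfig(hidden_size=256, num_hidden_layers=4, num_attention_heads=4)
print(small_config.vocab_size)   # 50358 (the default)
print(small_config.hidden_size)  # 256
print(small_config.model_type)   # bert-generation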
.\models\bert_generation\modeling_bert_generation.py
"""PyTorch BERT model specific for generation."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_bert_generation import BertGenerationConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/bert_for_seq_generation_L-24_bbc_encoder"
_CONFIG_FOR_DOC = "BertGenerationConfig"
class BertGenerationSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertGenerationSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
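# Shape check for transpose_for_scores (illustrative, not part of the source): a tensor of shape
# (batch, seq_len, hidden) is reshaped to (batch, num_heads, seq_len, head_size) so that attention
# scores can be computed independently per head. Assuming hidden_size=768 and 12 heads:
x = torch.randn(2, 5, 768)
y = x.view(2, 5, 12, 64).permute(0, 2, 1, 3)
print(y.shape)  # torch.Size([2, 12, 5, 64])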
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
class BertGenerationAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = BertGenerationSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = BertGenerationSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class BertGenerationIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertGenerationOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertGenerationLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BertGenerationAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = BertGenerationAttention(config, position_embedding_type="absolute")
self.intermediate = BertGenerationIntermediate(config)
self.output = BertGenerationOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([BertGenerationLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
def load_tf_weights_in_bert_generation(
model, tf_hub_path, model_class, is_encoder_named_decoder=False, is_encoder=False
):
try:
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tensorflow_text
tf.disable_eager_execution()
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_model = hub.Module(tf_hub_path)
init = tf.global_variables_initializer()
class BertGenerationEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = inputs_embeds + position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
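# Minimal shape check (illustrative, assumes torch is installed): word and position embeddings are
# summed, layer-normalized, and passed through dropout, so the output keeps the
# (batch, seq_len, hidden_size) shape.
tiny_cfg = BertGenerationConfig(vocab_size=100, hidden_size=32, max_position_embeddings=64, pad_token_id=0)
emb = BertGenerationEmbeddings(tiny_cfg)
ids = torch.tensor([[5, 6, 7, 8]])
print(emb(input_ids=ids).shape)  # torch.Size([1, 4, 32])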
class BertGenerationPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertGenerationConfig
base_model_prefix = "bert"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
BERT_GENERATION_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BertGenerationConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BERT_GENERATION_INPUTS_DOCSTRING = r"""
Inputs:
input_ids (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
details.
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, optional):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
Defaults to ``None``.
decoder_input_ids (:obj:`torch.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, optional):
Provide for generation tasks to guide decoding. Indices can be obtained using
:class:`~transformers.BertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.__call__` and
:meth:`transformers.PreTrainedTokenizer.encode` for details.
decoder_attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, optional):
Mask to avoid performing attention on padding token indices for the decoder input. Mask values selected in
``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
Defaults to ``None``.
head_mask (:obj:`torch.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, optional):
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
Defaults to ``None``.
inputs_embeds (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, optional):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert tokens to embeddings before feeding them into
the model.
decoder_inputs_embeds (:obj:`torch.Tensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`,
optional):
Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
representation. This is useful if you want more control over how to convert tokens to embeddings before
feeding them into the model.
labels (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, optional):
Labels for computing the masked language modeling loss.
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (padding value).
Tokens with labels set to -100 are ignored (masked), otherwise compute loss.
Returns:
:obj:`~transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions` or :obj:`tuple`:
The model outputs. The first element is the final hidden states, which can be used for further
downstream tasks such as classification, regression, or sequence generation.
Example::
from transformers import BertGenerationTokenizer, BertGenerationEncoder
tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
model = BertGenerationEncoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
"""
# Attach a docstring to BertGenerationEncoder describing it as the bare BertGeneration transformer that outputs raw hidden states without any specific head on top
@add_start_docstrings(
"The bare BertGeneration model transformer outputting raw hidden-states without any specific head on top.",
BERT_GENERATION_START_DOCSTRING,
)
"""
"""
# BertGenerationEncoder类,用于BertGenerationPreTrainedModel的扩展
class BertGenerationEncoder(BertGenerationPreTrainedModel):
"""
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = BertGenerationEmbeddings(config)
self.encoder = BertEncoder(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
"""
"""
@add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class BertGenerationOnlyLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
logits = self.decoder(hidden_states)
return logits
def _tie_weights(self):
self.bias = self.decoder.bias
@add_start_docstrings(
"""BertGeneration Model with a `language modeling` head on top for CLM fine-tuning.""",
BERT_GENERATION_START_DOCSTRING,
)
class BertGenerationDecoder(BertGenerationPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `BertGenerationDecoder` as a standalone, add `is_decoder=True.`")
self.bert = BertGenerationEncoder(config)
self.lm_head = BertGenerationOnlyLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
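# End-to-end sketch (illustrative, not part of the source; assumes torch is installed): build a
# tiny randomly initialized decoder and run a causal-LM forward pass with labels to get a loss.
demo_cfg = BertGenerationConfig(
    vocab_size=100,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
    is_decoder=True,
)
decoder = BertGenerationDecoder(demo_cfg)
ids = torch.tensor([[2, 5, 6, 7]])
out = decoder(input_ids=ids, labels=ids)
print(out.loss.item(), out.logits.shape)  # scalar loss, torch.Size([1, 4, 100])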
.\models\bert_generation\tokenization_bert_generation.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert_for_seq_generation": (
"https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}
class BertGenerationTokenizer(PreTrainedTokenizer):
"""
Construct a BertGeneration tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The begin of sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
sep_token (`str`, *optional*, defaults to `"<::::>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
# File names expected for the vocabulary of pretrained models
vocab_files_names = VOCAB_FILES_NAMES
# Map from pretrained model identifiers to the locations of their vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input sizes of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Prefix tokens, initially an empty list
prefix_tokens: List[int] = []
# Names of the inputs expected by the model
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
sep_token="<::::>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
"""
Initialize a new BertGenerationTokenizer.
Parameters:
vocab_file (str):
SentencePiece file containing the vocabulary needed to instantiate the tokenizer.
bos_token (str, optional, defaults to "<s>"):
The beginning-of-sequence token.
eos_token (str, optional, defaults to "</s>"):
The end-of-sequence token.
unk_token (str, optional, defaults to "<unk>"):
The unknown token; tokens that are not in the vocabulary are mapped to it.
pad_token (str, optional, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
sep_token (str, optional, defaults to "<::::>"):
The separator token, used when building a sequence from multiple sequences; it is also the
last token of a sequence built with special tokens.
sp_model_kwargs (dict, optional):
Arguments passed on to `SentencePieceProcessor.__init__()`, e.g. to enable subword
regularization.
**kwargs: Additional keyword arguments.
"""
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.vocab_file = vocab_file
# Initialize the SentencePieceProcessor with the given arguments
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# Load the given vocabulary file into the SentencePieceProcessor
self.sp_model.Load(vocab_file)
# Call the parent constructor to register the special tokens
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
sep_token=sep_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def vocab_size(self):
"""获取当前SentencePiece模型的词汇大小(词汇量)。"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""返回一个词汇表字典,将token转换为对应的id。"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
# 添加额外的特殊token编码到词汇表字典中
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
"""获取对象的状态信息,用于序列化对象。"""
state = self.__dict__.copy()
state["sp_model"] = None # 将sp_model设置为None,以便序列化时不包含模型
return state
def __setstate__(self, d):
"""设置对象的状态信息,用于反序列化对象。"""
self.__dict__ = d
# 为了向后兼容性,如果不存在sp_model_kwargs属性,则设置为空字典
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# 重新创建SentencePieceProcessor对象并加载词汇文件
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text: str) -> List[str]:
"""将输入的文本进行分词(tokenize),返回token列表。"""
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""将token转换为对应的id。"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""将id转换为对应的token。"""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""将token序列转换为单个字符串。"""
current_sub_tokens = []
out_string = ""
for token in tokens:
# Make sure that special tokens are not decoded with the SentencePiece model
if token in self.all_special_tokens:
out_string += self.sp_model.decode(current_sub_tokens) + token
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
# Save the vocabulary to the given directory, optionally prefixing the file name
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output vocabulary file path, adding the prefix to the file name if given
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocabulary file is a different file and exists, copy it to the output path
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Otherwise serialize the SentencePiece model and write it to the output file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return a tuple containing the path of the saved vocabulary file
return (out_vocab_file,)
.\models\bert_generation\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
_import_structure = {"configuration_bert_generation": ["BertGenerationConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bert_generation"] = [
"BertGenerationDecoder",
"BertGenerationEncoder",
"BertGenerationPreTrainedModel",
"load_tf_weights_in_bert_generation",
]
if TYPE_CHECKING:
from .configuration_bert_generation import BertGenerationConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_bert_generation import BertGenerationTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bert_generation import (
BertGenerationDecoder,
BertGenerationEncoder,
BertGenerationPreTrainedModel,
load_tf_weights_in_bert_generation,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bert_japanese\tokenization_bert_japanese.py
"""Tokenization classes."""
import collections
import copy
import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging
if is_sentencepiece_available():
import sentencepiece as spm
else:
spm = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"}
SPIECE_UNDERLINE = "▁"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt",
"cl-tohoku/bert-base-japanese-whole-word-masking": (
"https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt"
),
"cl-tohoku/bert-base-japanese-char": (
"https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt"
),
"cl-tohoku/bert-base-japanese-char-whole-word-masking": (
"https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"cl-tohoku/bert-base-japanese": 512,
"cl-tohoku/bert-base-japanese-whole-word-masking": 512,
"cl-tohoku/bert-base-japanese-char": 512,
"cl-tohoku/bert-base-japanese-char-whole-word-masking": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"cl-tohoku/bert-base-japanese": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "wordpiece",
},
"cl-tohoku/bert-base-japanese-whole-word-masking": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "wordpiece",
},
"cl-tohoku/bert-base-japanese-char": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "character",
},
"cl-tohoku/bert-base-japanese-char-whole-word-masking": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "character",
},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
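# --- Example (not part of the original file): a small usage sketch of the two helpers above,
# written against a throwaway vocabulary file.
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("[PAD]\n[UNK]\n日本\n##語\n")
    vocab_path = f.name

print(load_vocab(vocab_path))                # OrderedDict with {'[PAD]': 0, '[UNK]': 1, '日本': 2, '##語': 3}
print(whitespace_tokenize("  日本 ##語  "))  # ['日本', '##語']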
class BertJapaneseTokenizer(PreTrainedTokenizer):
r"""
Construct a BERT tokenizer for Japanese text.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to a one-wordpiece-per-line vocabulary file.
spm_file (`str`, *optional*):
Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model
extension) that contains the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
do_word_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do word tokenization.
do_subword_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do subword tokenization.
word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece",].
mecab_kwargs (`dict`, *optional*):
Dictionary passed to the `MecabTokenizer` constructor.
sudachi_kwargs (`dict`, *optional*):
Dictionary passed to the `SudachiTokenizer` constructor.
jumanpp_kwargs (`dict`, *optional*):
Dictionary passed to the `JumanppTokenizer` constructor.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
spm_file=None,
do_lower_case=False,
do_word_tokenize=True,
do_subword_tokenize=True,
word_tokenizer_type="basic",
subword_tokenizer_type="wordpiece",
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
mecab_kwargs=None,
sudachi_kwargs=None,
jumanpp_kwargs=None,
**kwargs,
):
@property
def do_lower_case(self):
return self.lower_case
def __getstate__(self):
state = dict(self.__dict__)
if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
del state["word_tokenizer"]
return state
def __setstate__(self, state):
self.__dict__ = state
if self.word_tokenizer_type == "mecab":
self.word_tokenizer = MecabTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
)
elif self.word_tokenizer_type == "sudachi":
self.word_tokenizer = SudachiTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
)
elif self.word_tokenizer_type == "jumanpp":
self.word_tokenizer = JumanppTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
)
def _tokenize(self, text):
if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
else:
tokens = [text]
if self.do_subword_tokenize:
split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
else:
split_tokens = tokens
return split_tokens
@property
def vocab_size(self):
if self.subword_tokenizer_type == "sentencepiece":
return len(self.subword_tokenizer.sp_model)
return len(self.vocab)
def get_vocab(self):
if self.subword_tokenizer_type == "sentencepiece":
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
return dict(self.vocab, **self.added_tokens_encoder)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if self.subword_tokenizer_type == "sentencepiece":
return self.subword_tokenizer.sp_model.PieceToId(token)
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if self.subword_tokenizer_type == "sentencepiece":
return self.subword_tokenizer.sp_model.IdToPiece(index)
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
if self.subword_tokenizer_type == "sentencepiece":
return self.subword_tokenizer.sp_model.decode(tokens)
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
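# --- Example (not part of the original file): with the wordpiece sub-tokenizer, detokenization
# is the simple join above, which drops the "##" continuation markers. The token split used here
# is hypothetical.
tokens = ["吾輩", "##は", "猫", "##で", "##ある"]
print(" ".join(tokens).replace(" ##", "").strip())  # -> "吾輩は 猫である"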
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
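# --- Example (not part of the original file): the construction above, illustrated with
# hypothetical ids (cls_token_id=2, sep_token_id=3).
cls, sep = [2], [3]
token_ids_0, token_ids_1 = [5, 6, 7], [8, 9]
print(cls + token_ids_0 + sep)                      # [2, 5, 6, 7, 3]          -> [CLS] X [SEP]
print(cls + token_ids_0 + sep + token_ids_1 + sep)  # [2, 5, 6, 7, 3, 8, 9, 3] -> [CLS] A [SEP] B [SEP]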
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: List of integers indicating special tokens (1 for special token, 0 for regular token).
"""
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
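# --- Example (not part of the original file): token type ids produced by the method above for a
# hypothetical sequence pair (special-token ids as in the previous example).
token_ids_0, token_ids_1 = [5, 6, 7], [8, 9]
print(len([2] + token_ids_0 + [3]) * [0] + len(token_ids_1 + [3]) * [1])
# -> [0, 0, 0, 0, 0, 1, 1, 1]  (first segment incl. [CLS]/[SEP] -> 0, second segment incl. trailing [SEP] -> 1)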
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
if self.subword_tokenizer_type == "sentencepiece":
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
)
else:
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
if self.subword_tokenizer_type == "sentencepiece":
with open(vocab_file, "wb") as writer:
content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
writer.write(content_spiece_model)
else:
with open(vocab_file, "w", encoding="utf-8") as writer:
index = 0
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class MecabTokenizer:
"""Runs basic tokenization with MeCab morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
mecab_dic: Optional[str] = "ipadic",
mecab_option: Optional[str] = None,
):
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for word in self.mecab(text):
token = word.surface
if self.do_lower_case and token not in never_split:
token = token.lower()
tokens.append(token)
return tokens
class SudachiTokenizer:
"""Runs basic tokenization with Sudachi morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
sudachi_split_mode="A",
sudachi_config_path=None,
sudachi_resource_dir=None,
sudachi_dict_type="core",
sudachi_projection=None,
):
"""
Constructs a SudachiTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim all whitespace, tab, newline from tokens.
**sudachi_split_mode**: (*optional*) string
Split mode of sudachi, choose from `["A", "B", "C"]`.
**sudachi_config_path**: (*optional*) string
Path to Sudachi configuration file.
**sudachi_resource_dir**: (*optional*) string
Directory containing Sudachi resources.
**sudachi_dict_type**: (*optional*) string
Dictionary type of Sudachi, choose from `["small", "core", "full"]`.
**sudachi_projection**: (*optional*) string
Word projection mode of Sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install sudachipy to use SudachiTokenizer. "
"See https://github.com/WorksApplications/SudachiPy for installation."
)
if sudachi_split_mode == "A":
self.split_mode = tokenizer.Tokenizer.SplitMode.A
elif sudachi_split_mode == "B":
self.split_mode = tokenizer.Tokenizer.SplitMode.B
elif sudachi_split_mode == "C":
self.split_mode = tokenizer.Tokenizer.SplitMode.C
else:
raise ValueError("Invalid sudachi_split_mode is specified.")
self.projection = sudachi_projection
sudachi_dictionary = dictionary.Dictionary(
config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
)
if is_sudachi_projection_available():
self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection)
elif self.projection is not None:
raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.")
else:
self.sudachi = sudachi_dictionary.create(self.split_mode)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for word in self.sudachi.tokenize(text):
token = word.surface()
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
class JumanppTokenizer:
"""Runs basic tokenization with jumanpp morphological parser."""
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
trim_whitespace=False,
):
"""
Constructs a JumanppTokenizer.
Args:
**do_lower_case**: (*optional*) boolean (default False)
Whether to lowercase the input.
**never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
**normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**trim_whitespace**: (*optional*) boolean (default False)
Whether to trim all whitespace, tab, newline from tokens.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
self.trim_whitespace = trim_whitespace
try:
import rhoknp
except ImportError:
raise ImportError(
"You need to install rhoknp to use JumanppTokenizer. "
"See https://github.com/ku-nlp/rhoknp for installation."
)
self.juman = rhoknp.Jumanpp()
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
text = text.strip()
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
for mrph in self.juman.apply_to_sentence(text).morphemes:
token = mrph.text
if self.do_lower_case and token not in never_split:
token = token.lower()
if self.trim_whitespace:
if token.strip() == "":
continue
else:
token = token.strip()
tokens.append(token)
return tokens
class CharacterTokenizer:
"""Runs Character tokenization."""
def __init__(self, vocab, unk_token, normalize_text=True):
"""
Constructs a CharacterTokenizer.
Args:
**vocab**:
Vocabulary object.
**unk_token**: str
A special symbol for out-of-vocabulary token.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
"""
self.vocab = vocab
self.unk_token = unk_token
self.normalize_text = normalize_text
def tokenize(self, text):
"""
Tokenizes a piece of text into characters.
For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.
Args:
text: A single token or whitespace separated tokens.
This should have already been passed through *BasicTokenizer*.
Returns:
A list of characters.
"""
if self.normalize_text:
text = unicodedata.normalize("NFKC", text)
output_tokens = []
for char in text:
if char not in self.vocab:
output_tokens.append(self.unk_token)
continue
output_tokens.append(char)
return output_tokens
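# --- Example (not part of the original file): usage sketch of the class above with a toy
# vocabulary; "p" is not in the vocabulary, so it maps to the unknown token.
char_tok = CharacterTokenizer(vocab={"a", "l", "e"}, unk_token="[UNK]")
print(char_tok.tokenize("apple"))  # ['a', '[UNK]', '[UNK]', 'l', 'e']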
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
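# --- Example (not part of the original file): usage sketch of the greedy longest-match-first
# algorithm above with a toy vocabulary.
wp = WordpieceTokenizer(vocab={"un", "##aff", "##able"}, unk_token="[UNK]")
print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wp.tokenize("xyz"))        # ['[UNK]']  (no piece of the word is in the vocabulary)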
class SentencepieceTokenizer(object):
"""
Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer.
"""
def __init__(
self,
vocab,
unk_token,
do_lower_case=False,
remove_space=True,
keep_accents=True,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
):
self.vocab = vocab
self.unk_token = unk_token
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def tokenize(self, text):
"""
Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece).
Tokenization needs the given vocabulary.
Args:
text: A string needs to be tokenized.
Returns:
A list of sentencepiece tokens.
"""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
.\models\bert_japanese\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
_import_structure = {"tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"]}
if TYPE_CHECKING:
from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bigbird_pegasus\configuration_bigbird_pegasus.py
""" BigBirdPegasus 模型配置"""
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import TensorType, is_torch_available, logging
logger = logging.get_logger(__name__)
BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/bigbird-pegasus-large-arxiv": (
"https://huggingface.co/google/bigbird-pegasus-large-arxiv/resolve/main/config.json"
),
"google/bigbird-pegasus-large-pubmed": (
"https://huggingface.co/google/bigbird-pegasus-large-pubmed/resolve/main/config.json"
),
"google/bigbird-pegasus-large-bigpatent": (
"https://huggingface.co/google/bigbird-pegasus-large-bigpatent/resolve/main/config.json"
),
}
class BigBirdPegasusConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a BigBirdPegasusModel. It is used to instantiate a BigBirdPegasus model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the BigBirdPegasus google/bigbird-pegasus-large-arxiv architecture.
Configuration objects inherit from PretrainedConfig and can be used to control the model outputs. Read the documentation of PretrainedConfig for more information.
Example:
```
>>> from transformers import BigBirdPegasusConfig, BigBirdPegasusModel
>>> # Initializing a BigBirdPegasus bigbird-pegasus-base style configuration
>>> configuration = BigBirdPegasusConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = BigBirdPegasusModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "bigbird_pegasus"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model",
"attention_probs_dropout_prob": "attention_dropout",
}
def __init__(
self,
vocab_size=96103,
max_position_embeddings=4096,
encoder_layers=16,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=16,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu_new",
d_model=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
classifier_dropout=0.0,
scale_embedding=True,
pad_token_id=0,
bos_token_id=2,
eos_token_id=1,
attention_type="block_sparse",
block_size=64,
num_random_blocks=3,
use_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.attention_type = attention_type
self.block_size = block_size
self.num_random_blocks = num_random_blocks
self.use_bias = use_bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
class BigBirdPegasusOnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
elif self.task == "causal-lm":
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
else:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
]
)
return common_inputs
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_outputs = super().outputs
else:
common_outputs = super(OnnxConfigWithPast, self).outputs
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
return common_outputs
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
common_inputs = dict(**encoder_inputs, **decoder_inputs)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length + 3
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
common_inputs["past_key_values"] = []
num_encoder_layers, num_decoder_layers = self.num_layers
min_num_layers = min(num_encoder_layers, num_decoder_layers)
max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
for _ in range(min_num_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
for _ in range(min_num_layers, max_num_layers):
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
return common_inputs
def _generate_dummy_inputs_for_causal_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
num_encoder_layers, _ = self.num_layers
num_encoder_attention_heads, _ = self.num_attention_heads
past_shape = (
batch,
num_encoder_attention_heads,
past_key_values_length,
self._config.hidden_size // num_encoder_attention_heads,
)
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
common_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
]
return common_inputs
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
elif self.task == "causal-lm":
common_inputs = self._generate_dummy_inputs_for_causal_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
else:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
return common_inputs
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
if self.task in ["default", "seq2seq-lm"]:
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
else:
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
flattened_output, name, idx, t
)
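# --- Example (not part of the original file): a usage sketch of the ONNX config above, assuming
# a recent transformers version. For the default task without past key values, the `inputs`
# property yields the four encoder/decoder tensors with their dynamic axes.
from transformers import BigBirdPegasusConfig
from transformers.models.bigbird_pegasus.configuration_bigbird_pegasus import BigBirdPegasusOnnxConfig

onnx_config = BigBirdPegasusOnnxConfig(BigBirdPegasusConfig(), task="default")
print(onnx_config.inputs)
# OrderedDict with:
#   input_ids              {0: 'batch', 1: 'encoder_sequence'}
#   attention_mask         {0: 'batch', 1: 'encoder_sequence'}
#   decoder_input_ids      {0: 'batch', 1: 'decoder_sequence'}
#   decoder_attention_mask {0: 'batch', 1: 'decoder_sequence'}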
.\models\bigbird_pegasus\convert_bigbird_pegasus_tf_to_pytorch.py
import argparse
from typing import Dict
import tensorflow as tf
import torch
from tqdm import tqdm
from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration
INIT_COMMON = [
("/", "."),
("layer_", "layers."),
("kernel", "weight"),
("beta", "bias"),
("gamma", "weight"),
("pegasus", "model"),
]
END_COMMON = [
(".output.dense", ".fc2"),
("intermediate.LayerNorm", "final_layer_norm"),
("intermediate.dense", "fc1"),
]
DECODER_PATTERNS = (
INIT_COMMON
+ [
("attention.self.LayerNorm", "self_attn_layer_norm"),
("attention.output.dense", "self_attn.out_proj"),
("attention.self", "self_attn"),
("attention.encdec.LayerNorm", "encoder_attn_layer_norm"),
("attention.encdec_output.dense", "encoder_attn.out_proj"),
("attention.encdec", "encoder_attn"),
("key", "k_proj"),
("value", "v_proj"),
("query", "q_proj"),
("decoder.LayerNorm", "decoder.layernorm_embedding"),
]
+ END_COMMON
)
REMAINING_PATTERNS = (
INIT_COMMON
+ [
("embeddings.word_embeddings", "shared.weight"),
("embeddings.position_embeddings", "embed_positions.weight"),
("attention.self.LayerNorm", "self_attn_layer_norm"),
("attention.output.dense", "self_attn.output"),
("attention.self", "self_attn.self"),
("encoder.LayerNorm", "encoder.layernorm_embedding"),
]
+ END_COMMON
)
KEYS_TO_IGNORE = [
"encdec/key/bias",
"encdec/query/bias",
"encdec/value/bias",
"self/key/bias",
"self/query/bias",
"self/value/bias",
"encdec_output/dense/bias",
"attention/output/dense/bias",
]
def rename_state_dict_key(k, patterns):
for tf_name, hf_name in patterns:
k = k.replace(tf_name, hf_name)
return k
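# --- Example (not part of the original file): a worked example with a hypothetical TF variable
# name, applying the DECODER_PATTERNS substitutions above in order.
print(rename_state_dict_key("pegasus/decoder/layer_0/attention/self/query/kernel", DECODER_PATTERNS))
# -> "model.decoder.layers.0.self_attn.q_proj.weight"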
def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration:
cfg = BigBirdPegasusConfig(**config_update)
torch_model = BigBirdPegasusForConditionalGeneration(cfg)
state_dict = torch_model.state_dict()
mapping = {}
decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")}
remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")}
for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"):
conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE]
if any(conditions):
continue
patterns = DECODER_PATTERNS
new_k = rename_state_dict_key(k, patterns)
if new_k not in state_dict:
raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})")
if any(True if i in k else False for i in ["dense", "query", "key", "value"]):
v = v.T
mapping[new_k] = torch.from_numpy(v)
assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}"
for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"):
conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE]
if any(conditions):
continue
patterns = REMAINING_PATTERNS
new_k = rename_state_dict_key(k, patterns)
if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings":
raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})")
if any(True if i in k else False for i in ["dense", "query", "key", "value"]):
v = v.T
mapping[new_k] = torch.from_numpy(v)
if k != "pegasus/embeddings/position_embeddings":
assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}"
mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"]
mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight")
missing, extra = torch_model.load_state_dict(mapping, strict=False)
unexpected_missing = [
k
for k in missing
if k
not in [
"final_logits_bias",
"model.encoder.embed_tokens.weight",
"model.decoder.embed_tokens.weight",
"lm_head.weight",
]
]
assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}"
assert extra == [], f"no matches found for the following tf keys {extra}"
return torch_model
def get_tf_weights_as_numpy(path) -> Dict:
init_vars = tf.train.list_variables(path)
tf_weights = {}
ignore_name = ["global_step"]
for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"):
skip_key = any(pat in name for pat in ignore_name)
if skip_key:
continue
array = tf.train.load_variable(path, name)
tf_weights[name] = array
return tf_weights
def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict):
tf_weights = get_tf_weights_as_numpy(ckpt_path)
torch_model = convert_bigbird_pegasus(tf_weights, config_update)
torch_model.save_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables")
parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.")
args = parser.parse_args()
config_update = {}
convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update)
.\models\bigbird_pegasus\modeling_bigbird_pegasus.py
""" PyTorch BigBirdPegasus model."""
import copy
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_bigbird_pegasus import BigBirdPegasusConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/bigbird-pegasus-large-arxiv"
_CONFIG_FOR_DOC = "BigBirdPegasusConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 1024]
BIGBIRD_PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/bigbird-pegasus-large-arxiv",
"google/bigbird-pegasus-large-pubmed",
"google/bigbird-pegasus-large-bigpatent",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class BigBirdPegasusLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
super().__init__(num_embeddings, embedding_dim)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
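# --- Example (not part of the original file): the position indices computed by the forward
# above. When generating with a cache, past_key_values_length shifts the positions of the newly
# fed tokens.
import torch
past_key_values_length, seq_len = 4, 2
print(torch.arange(past_key_values_length, past_key_values_length + seq_len))  # tensor([4, 5])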
class BigBirdPegasusSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
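# --- Example (not part of the original file): shape sketch for transpose_for_scores above with
# the default d_model=1024 and 16 heads: [batch, seq, hidden] -> [batch, heads, seq, head_dim].
import torch
x = torch.randn(2, 7, 1024)
print(x.view(2, 7, 16, 64).permute(0, 2, 1, 3).shape)  # torch.Size([2, 16, 7, 64])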
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
class BigBirdPegasusBlockSparseAttention(nn.Module):
def __init__(self, config, seed=None):
super().__init__()
self.max_seqlen = config.max_position_embeddings
self.seed = seed
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.num_random_blocks = config.num_random_blocks
self.block_size = config.block_size
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
band_mask=None,
from_mask=None,
to_mask=None,
from_blocked_mask=None,
to_blocked_mask=None,
output_attentions=None,
):
batch_size, seqlen, _ = hidden_states.size()
to_seq_length = from_seq_length = seqlen
from_block_size = to_block_size = self.block_size
if from_seq_length % from_block_size != 0:
raise ValueError("Query sided sequence length must be multiple of block size")
if to_seq_length % to_block_size != 0:
raise ValueError("Key/Value sided sequence length must be multiple of block size")
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
context_layer, attention_probs = self.bigbird_block_sparse_attention(
query_layer,
key_layer,
value_layer,
band_mask,
from_mask,
to_mask,
from_blocked_mask,
to_blocked_mask,
self.num_attention_heads,
self.num_random_blocks,
self.attention_head_size,
from_block_size,
to_block_size,
batch_size,
from_seq_length,
to_seq_length,
seed=self.seed,
plan_from_length=None,
plan_num_rand_blocks=None,
output_attentions=output_attentions,
)
context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
@staticmethod
def torch_bmm_nd(inp_1, inp_2, ndim=None):
"""快速的多维矩阵乘法"""
return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1])
)
@staticmethod
def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None):
"""带转置的快速多维矩阵乘法"""
return torch.bmm(
inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
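# --- Example (not part of the original file): shape sketch for the two helpers above with 4-D
# inputs (ndim=4); batch and head dimensions are flattened for torch.bmm and restored afterwards.
# The shapes chosen here are illustrative.
import torch
a = torch.randn(2, 16, 64, 64)   # e.g. [batch, heads, block_size, head_dim]
b = torch.randn(2, 16, 64, 192)
out = torch.bmm(a.reshape(-1, 64, 64), b.reshape(-1, 64, 192)).view(2, 16, 64, 192)
print(out.shape)                 # torch.Size([2, 16, 64, 192]) -- same shape as torch_bmm_nd(a, b, ndim=4)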
def bigbird_block_sparse_attention(
self,
query_layer,
key_layer,
value_layer,
band_mask,
from_mask,
to_mask,
from_blocked_mask,
to_blocked_mask,
n_heads,
n_rand_blocks,
attention_head_size,
from_block_size,
to_block_size,
batch_size,
from_seq_len,
to_seq_len,
seed,
plan_from_length,
plan_num_rand_blocks,
output_attentions,
):
@staticmethod
def torch_gather_b2(params, indices):
if params.shape[:2] != indices.shape[:2]:
raise ValueError(
"Make sure that the first two dimensions of params and indices are identical, but"
f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}"
)
num_indices_to_gather = indices.shape[-2] * indices.shape[-1]
num_indices_to_pick_from = params.shape[2]
shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
flattened_indices = indices.view(-1) + indices_shift
flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
out_flattened = flattened_params.index_select(0, flattened_indices)
out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
return out
@staticmethod
def _create_rand_mask_from_inputs(
from_blocked_mask,
to_blocked_mask,
rand_attn,
num_attention_heads,
num_rand_blocks,
batch_size,
from_seq_length,
from_block_size,
):
"""
Create 3D attention mask from a 2D tensor mask.
Args:
from_blocked_mask: 2D Tensor of shape [batch_size,
from_seq_length//from_block_size, from_block_size].
输入的来自的序列被块化后的掩码,形状为 [batch_size, from_seq_length//from_block_size, from_block_size]。
to_blocked_mask: int32 Tensor of shape [batch_size,
to_seq_length//to_block_size, to_block_size].
输入的目标序列被块化后的掩码,形状为 [batch_size, to_seq_length//to_block_size, to_block_size]。
rand_attn: [batch_size, num_attention_heads,
from_seq_length//from_block_size-2, num_rand_blocks]
随机注意力的掩码,形状为 [batch_size, num_attention_heads, from_seq_length//from_block_size-2, num_rand_blocks]。
num_attention_heads: int. Number of attention heads.
注意力头的数量。
num_rand_blocks: int. Number of random chunks per row.
每行的随机块数。
batch_size: int. Batch size for computation.
计算的批次大小。
from_seq_length: int. length of from sequence.
输入序列的长度。
from_block_size: int. size of block in from sequence.
输入序列中的块大小。
Returns:
float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2,
from_block_size, num_rand_blocks*to_block_size].
返回形状为 [batch_size, num_attention_heads, from_seq_length//from_block_size-2,
from_block_size, num_rand_blocks*to_block_size] 的浮点数张量。
"""
num_windows = from_seq_length // from_block_size - 2
rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
return rand_mask
@staticmethod
def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks):
"""
Gives the plan of where to put random attention.
Args:
from_seq_length: int. length of from sequence.
输入序列的长度。
from_block_size: int. size of block in from sequence.
输入序列中的块大小。
num_rand_blocks: int. Number of random chunks per row.
每行的随机块数。
Returns:
plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for
each block
返回计划的输入序列块结束位置和每个块的随机结束位置的计划。
"""
plan_from_length = []
plan_num_rand_blocks = []
if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size):
plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size))
plan_num_rand_blocks.append(num_rand_blocks)
plan_from_length.append(from_seq_length)
plan_num_rand_blocks.append(0)
elif (num_rand_blocks + 5) < (from_seq_length // from_block_size):
plan_from_length.append(int((num_rand_blocks + 5) * from_block_size))
plan_num_rand_blocks.append(num_rand_blocks // 2)
plan_from_length.append(from_seq_length)
plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2))
else:
plan_from_length.append(from_seq_length)
plan_num_rand_blocks.append(num_rand_blocks)
return plan_from_length, plan_num_rand_blocks
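# --- Example (not part of the original file): a worked example of the plan above for the default
# configuration (max_position_embeddings=4096, block_size=64, num_rand_blocks=3).
from_seq_length, from_block_size, num_rand_blocks = 4096, 64, 3
assert (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size)  # 11 < 64, so the first branch applies
# -> plan_from_length = [11 * 64, 4096] = [704, 4096]
# -> plan_num_rand_blocks = [3, 0]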
def _bigbird_block_rand_mask(
self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1
):
"""
Create adjacency list of random attention.
Args:
from_seq_length: int. length of from sequence.
to_seq_length: int. length of to sequence.
from_block_size: int. size of block in from sequence.
to_block_size: int. size of block in to sequence.
num_rand_blocks: int. Number of random chunks per row.
last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
if positive then num_rand_blocks blocks chosen only up to last_idx.
Returns:
adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
"""
if from_seq_length // from_block_size != to_seq_length // to_block_size:
raise ValueError("Error the number of blocks needs to be same!")
rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
if not self.training:
return rand_attn
middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
last = to_seq_length // to_block_size - 1
if last_idx > (2 * to_block_size):
last = (last_idx // to_block_size) - 1
r = num_rand_blocks
for i in range(1, from_seq_length // from_block_size - 1):
start = i - 2
end = i
if i == 1:
rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r]
elif i == 2:
rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r]
elif i == from_seq_length // from_block_size - 3:
rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
elif i == from_seq_length // from_block_size - 2:
rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
else:
if start > last:
start = last
rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
elif (end + 1) == last:
rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
else:
rand_attn[i - 1, :] = np.random.permutation(
np.concatenate((middle_seq[:start], middle_seq[end + 1 : last]))
)[:r]
return rand_attn
def _bigbird_block_rand_mask_with_head(
self,
from_seq_length,
to_seq_length,
from_block_size,
to_block_size,
num_heads,
plan_from_length,
plan_num_rand_blocks,
window_block_left=1,
window_block_right=1,
global_block_top=1,
global_block_bottom=1,
global_block_left=1,
global_block_right=1,
):
"""
Generates a random mask for BigBird attention with head.
Args:
from_seq_length: int. Length of the source sequence.
to_seq_length: int. Length of the target sequence.
from_block_size: int. Block size of the source sequence.
to_block_size: int. Block size of the target sequence.
num_heads: int. Number of attention heads.
plan_from_length: int. Planned length of the source sequence.
plan_num_rand_blocks: int. Planned number of random blocks.
window_block_left: int. Number of blocks of window to the left of a block.
window_block_right: int. Number of blocks of window to the right of a block.
global_block_top: int. Number of blocks globally used at the top.
global_block_bottom: int. Number of blocks globally used at the bottom.
global_block_left: int. Number of blocks globally used to the left.
global_block_right: int. Number of blocks globally used to the right.
Returns:
Random mask with head for BigBird attention.
"""
pass
@staticmethod
def _get_single_block_row_attention(
block_id,
to_start_block_id,
to_end_block_id,
num_rand_blocks,
window_block_left=1,
window_block_right=1,
global_block_left=1,
global_block_right=1,
):
"""
For a single row block, get random row attention.
Args:
block_id: int. Block ID of the row.
to_start_block_id: int. Start ID of the target blocks for random attention.
to_end_block_id: int. End ID of the target blocks for random attention.
num_rand_blocks: int. Number of random blocks to be selected.
window_block_left: int. Number of blocks of window to the left of a block.
window_block_right: int. Number of blocks of window to the right of a block.
global_block_left: int. Number of blocks globally used to the left.
global_block_right: int. Number of blocks globally used to the right.
Returns:
Array containing the selected random attention vector of size num_rand_blocks.
"""
to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32)
perm_block = np.random.permutation(to_block_list)
illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1))
illegal_blocks.extend(list(range(global_block_left)))
illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id)))
if block_id == 1:
illegal_blocks.append(to_end_block_id - 2)
if block_id == to_end_block_id - 2:
illegal_blocks.append(1)
selected_random_blocks = []
for i in range(to_end_block_id - to_start_block_id):
if perm_block[i] not in illegal_blocks:
selected_random_blocks.append(perm_block[i])
if len(selected_random_blocks) == num_rand_blocks:
break
return np.array(selected_random_blocks, dtype=np.int32)
class BigBirdPegasusEncoderAttention(nn.Module):
def __init__(self, config, seed=None):
super().__init__()
self.config = config
self.seed = seed
self.attention_type = config.attention_type
if self.attention_type == "original_full":
self.self = BigBirdPegasusSelfAttention(config)
elif self.attention_type == "block_sparse":
self.self = BigBirdPegasusBlockSparseAttention(config, seed)
else:
raise ValueError(
f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}"
)
self.output = nn.Linear(config.hidden_size, config.hidden_size, bias=config.use_bias)
def set_attention_type(self, value: str):
if value not in ["original_full", "block_sparse"]:
raise ValueError(
f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
)
if value == self.attention_type:
return
self.attention_type = value
if value == "original_full":
attn_weights = BigBirdPegasusSelfAttention(self.config)
else:
attn_weights = BigBirdPegasusBlockSparseAttention(self.config, self.seed)
attn_weights.query = self.self.query
attn_weights.value = self.self.value
attn_weights.key = self.self.key
self.self = attn_weights
self.attention_type = value
if not self.training:
self.self.eval()
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
past_key_value=None,
output_attentions=False,
band_mask=None,
from_mask=None,
to_mask=None,
from_blocked_mask=None,
to_blocked_mask=None,
):
head_mask = head_mask.reshape(1, -1, 1, 1) if head_mask is not None else None
if self.config.attention_type == "original_full":
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
)
else:
self_outputs = self.self(
hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions
)
attention_output = self.output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
class BigBirdPegasusDecoderAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BigBirdPegasusConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
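# Illustrative check, not library code: `_shape` turns a (batch, seq_len, embed_dim) tensor into
# (batch, num_heads, seq_len, head_dim) so attention scores can be computed per head.
import torch
bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
x = torch.randn(bsz, seq_len, num_heads * head_dim)
per_head = x.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(per_head.shape)  # torch.Size([2, 4, 5, 8])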
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attention_outputs = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=layer_head_mask,
output_attentions=output_attentions,
band_mask=band_mask,
from_mask=from_mask,
to_mask=to_mask,
from_blocked_mask=from_blocked_mask,
to_blocked_mask=to_blocked_mask,
)
hidden_states = self_attention_outputs[0]
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attention_outputs[1],)
return outputs
def set_attention_type(self, value: str):
if value not in ["original_full", "block_sparse"]:
raise ValueError(
f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
)
if value == self.attention_type:
return
self.attention_type = value
self.self_attn.set_attention_type(value)
class BigBirdPegasusDecoderLayer(nn.Module):
def __init__(self, config: BigBirdPegasusConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BigBirdPegasusDecoderAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
bias=config.use_bias,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = BigBirdPegasusDecoderAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
bias=config.use_bias,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class BigBirdPegasusClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
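# Illustrative usage, not part of the source: the head maps a pooled hidden state of size
# `input_dim` to `num_classes` logits (dropout -> dense -> tanh -> dropout -> projection).
import torch
head = BigBirdPegasusClassificationHead(input_dim=16, inner_dim=16, num_classes=3, pooler_dropout=0.1)
logits = head(torch.randn(2, 16))  # pooled representations for a batch of 2 sentences
print(logits.shape)  # torch.Size([2, 3])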
class BigBirdPegasusPreTrainedModel(PreTrainedModel):
config_class = BigBirdPegasusConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["BigBirdPegasusEncoderLayer", "BigBirdPegasusDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
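# Illustrative check, not library code, with a hypothetical pad id of 1: `dummy_inputs` builds the
# attention mask by comparing token ids against the pad id, so real tokens map to 1 and padding to 0.
import torch
pad_token = 1
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]])
print(input_ids.ne(pad_token).long())  # [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]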
BIGBIRD_PEGASUS_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, etc.).
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BigBirdPegasusConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r"""
Summarization example:
```
>>> from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration
>>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> ARTICLE_TO_SUMMARIZE = (
... "The dominant sequence transduction models are based on complex recurrent or convolutional neural "
... "networks in an encoder-decoder configuration. The best performing models also connect the encoder "
... "and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, "
... "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
... "Experiments on two machine translation tasks show these models to be superior in quality "
... "while being more parallelizable and requiring significantly less time to train."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True)
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=15)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'dominant sequence models are based on recurrent or convolutional neural networks .'
```
"""
BIGBIRD_PEGASUS_INPUTS_DOCSTRING = r"""
Placeholder for documenting inputs for BigBirdPegasus models.
"""
BIGBIRD_PEGASUS_STANDALONE_INPUTS_DOCSTRING = r"""
Placeholder for documenting standalone inputs for BigBirdPegasus models.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`BigBirdPegasusEncoderLayer`].
Args:
config: BigBirdPegasusConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.attention_type = config.attention_type
self.block_size = config.block_size
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def set_attention_type(self, value: str):
if value not in ["original_full", "block_sparse"]:
raise ValueError(
f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
)
if value == self.attention_type:
return
self.attention_type = value
for layer in self.layers:
layer.set_attention_type(value)
@staticmethod
def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int):
batch_size, seq_length = attention_mask.size()
if seq_length % block_size != 0:
raise ValueError(
f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block"
f" size is {block_size}."
)
def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
"""
Create a 3D attention mask from a 2D tensor mask.
Args:
from_blocked_mask: 2D tensor mask of shape [batch_size, from_seq_length//from_block_size, from_block_size].
to_blocked_mask: int tensor mask of shape [batch_size, to_seq_length//to_block_size, to_block_size].
Returns:
Float tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, 3*to_block_size].
"""
exp_blocked_to_pad = torch.cat(
[to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
)
band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
band_mask.unsqueeze_(1)
return band_mask
blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask)
from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
return blocked_encoder_mask, band_mask, from_mask, to_mask
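# Illustrative sketch, not library code, assuming the static helper above is defined on
# BigBirdPegasusEncoder (as its later usage suggests): shapes for a toy all-ones mask with
# seq_length = 128 and block_size = 16, i.e. 8 blocks per sequence.
import torch
attention_mask = torch.ones(2, 128)
blocked, band, from_mask, to_mask = BigBirdPegasusEncoder.create_masks_for_block_sparse_attn(attention_mask, block_size=16)
print(blocked.shape)    # torch.Size([2, 8, 16])
print(band.shape)       # torch.Size([2, 1, 4, 16, 48]) -- 8 blocks minus the 4 edge blocks
print(from_mask.shape)  # torch.Size([2, 1, 128, 1])
print(to_mask.shape)    # torch.Size([2, 1, 1, 128])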
def _pad_to_block_size(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor):
"""A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention."""
block_size = self.config.block_size
batch_size, seq_len = hidden_states.shape[:2]
padding_len = (block_size - seq_len % block_size) % block_size
if padding_len > 0:
logger.warning_once(
f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
f"`config.block_size`: {block_size}"
)
pad_id = self.config.pad_token_id
device = hidden_states.device
input_ids_padding = torch.ones((batch_size, padding_len), dtype=torch.long, device=device) * pad_id
inputs_embeds_padding = self.embed_tokens(input_ids_padding)
hidden_states = torch.cat([hidden_states, inputs_embeds_padding], dim=-2)
attention_mask = nn.functional.pad(
attention_mask, (0, padding_len), value=0
)
return padding_len, hidden_states, attention_mask
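# Illustrative arithmetic, not library code: how `_pad_to_block_size` computes the padding length.
# With block_size = 64 and seq_len = 100, 28 pad tokens bring the sequence to the next multiple.
block_size, seq_len = 64, 100
padding_len = (block_size - seq_len % block_size) % block_size
print(padding_len)  # 28
print((seq_len + padding_len) % block_size)  # 0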
class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BigBirdPegasusDecoderLayer`]
Args:
config: BigBirdPegasusConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([BigBirdPegasusDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
@add_start_docstrings(
"The bare BigBirdPegasus Model outputting raw hidden-states without any specific head on top.",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BigBirdPegasusConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = BigBirdPegasusEncoder(config, self.shared)
self.decoder = BigBirdPegasusDecoder(config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@add_start_docstrings(
"The BigBirdPegasus Model with a language modeling head. Can be used for summarization.",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
def __init__(self, config: BigBirdPegasusConfig):
super().__init__(config)
self.model = BigBirdPegasusModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
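# Illustrative sketch, not library code: enlarging the vocabulary pads `final_logits_bias`
# with zeros for the new tokens; shrinking would simply truncate the buffer.
import torch
final_logits_bias = torch.zeros((1, 4))
new_num_tokens = 6
extra_bias = torch.zeros((1, new_num_tokens - final_logits_bias.shape[-1]))
print(torch.cat([final_logits_bias, extra_bias], dim=1).shape)  # torch.Size([1, 6])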
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BIGBIRD_PEGASUS_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
`Seq2SeqLMOutput` when `return_dict=True`, otherwise a plain tuple with the same fields.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0])
lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
masked_lm_loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
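# Illustrative sketch, not library code: the loss above is a token-level cross entropy in which
# positions labelled -100 are ignored, matching the `labels` docstring.
import torch
from torch.nn import CrossEntropyLoss
vocab_size = 10
lm_logits = torch.randn(1, 3, vocab_size)  # (batch, seq_len, vocab)
labels = torch.tensor([[4, 7, -100]])      # the last position is masked out of the loss
loss = CrossEntropyLoss()(lm_logits.view(-1, vocab_size), labels.view(-1))
print(loss)  # averaged over the two unmasked positions only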
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
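# Illustrative check, not library code: with a cache of 3 already-processed tokens, only the
# decoder tokens that were not seen yet are kept, mirroring the prefix-trimming logic above.
import torch
decoder_input_ids = torch.tensor([[2, 14, 27, 31]])
past_length = 3
print(decoder_input_ids[:, past_length:])  # tensor([[31]])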
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
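# Illustrative sketch, not library code: beam search permutes the batch/beam dimension of every
# cached key/value tensor with `index_select`, exactly what `_reorder_cache` does above.
import torch
past_state = torch.arange(6.0).view(3, 1, 2, 1)  # (beams, heads, seq, head_dim)
beam_idx = torch.tensor([2, 0, 1])
print(past_state.index_select(0, beam_idx).flatten())  # tensor([4., 5., 0., 1., 2., 3.])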
@add_start_docstrings(
"""
BigBirdPegasus model with a sequence classification head on top (a linear layer on top of the pooled output), e.g.
for GLUE tasks.
""",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BigBirdPegasusConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = BigBirdPegasusModel(config)
self.classification_head = BigBirdPegasusClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for the BigBirdPegasusForSequenceClassification model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs. Defaults to None.
attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
decoder_input_ids (torch.LongTensor, optional): Decoder input token IDs. Defaults to None.
decoder_attention_mask (torch.LongTensor, optional): Decoder attention mask. Defaults to None.
head_mask (torch.Tensor, optional): Head mask. Defaults to None.
decoder_head_mask (torch.Tensor, optional): Decoder head mask. Defaults to None.
cross_attn_head_mask (torch.Tensor, optional): Cross-attention head mask. Defaults to None.
encoder_outputs (List[torch.FloatTensor], optional): Encoder outputs. Defaults to None.
inputs_embeds (torch.FloatTensor, optional): Embedded inputs. Defaults to None.
decoder_inputs_embeds (torch.FloatTensor, optional): Embedded decoder inputs. Defaults to None.
labels (torch.LongTensor, optional): Labels for classification. Defaults to None.
use_cache (bool, optional): Whether to use cache. Defaults to None.
output_attentions (bool, optional): Whether to output attentions. Defaults to None.
output_hidden_states (bool, optional): Whether to output hidden states. Defaults to None.
return_dict (bool, optional): Whether to return a `Seq2SeqSequenceClassifierOutput` instead of a plain tuple. Defaults to None.
Returns:
Seq2SeqSequenceClassifierOutput or tuple: the sequence classification output (a tuple when `return_dict=False`).
"""
pass
@add_start_docstrings(
"""
BigBirdPegasus Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = BigBirdPegasusModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = BigBirdPegasusDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = BigBirdPegasusDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for the BigBirdPegasusForCausalLM model.
"""
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
"""
Prepare inputs for generation based on the BigBirdPegasusForCausalLM model.
"""
...
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
input_ids = input_ids[:, -1:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past