Transformers Source Code Analysis (55)
.\models\gptsan_japanese\tokenization_gptsan_japanese.py
"""GPTSANJapanese 的标记化类"""
import collections
import json
import os
import re
from typing import List, Optional, Tuple, Union
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import (
BatchEncoding,
PreTokenizedInput,
PreTokenizedInputPair,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"Tanrei/GPTSAN-japanese": "https://huggingface.co/Tanrei/GPTSAN-japanese/blob/main/vocab.txt",
},
"emoji_file": {
"Tanrei/GPTSAN-japanese": "https://huggingface.co/Tanrei/GPTSAN-japanese/blob/main/emoji.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"Tanrei/GPTSAN-japanese": 1280,
}
def load_vocab_and_emoji(vocab_file, emoji_file):
"""加载词汇文件和表情符号文件到字典中。"""
with open(emoji_file, "r", encoding="utf-8") as f:
emoji = json.loads(f.read())
vocab = collections.OrderedDict()
raw_vocab = collections.OrderedDict()
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
for wd in b:
vocab[wd] = idx
return vocab, raw_vocab, ids_to_tokens, emoji
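The parsing above groups comma-separated spelling variants on one vocab line under a single id, while a line consisting of only "," is kept as the literal comma token. A minimal standalone sketch of that behaviour, using hypothetical vocab lines rather than the real vocab.txt shipped with Tanrei/GPTSAN-japanese:

```python
import collections

# Hypothetical vocab lines; the real file ships with the "Tanrei/GPTSAN-japanese" checkpoint.
lines = ["こんにちは\n", "丼,どんぶり\n", ",\n"]  # the last line is the literal "," token

token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in lines]
vocab = collections.OrderedDict()
for idx, variants in enumerate(token):
    for wd in variants:
        vocab[wd] = idx  # every spelling variant maps to the same id

print(vocab)  # OrderedDict([('こんにちは', 0), ('丼', 1), ('どんぶり', 1), (',', 2)])
```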
class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
- Decodes byte0~byte255 tokens correctly
- Adds bagofword token handling
- Returns token_type_ids for Prefix-LM models
The bagofword token represents a repetition of the previous token and is converted into three consecutive tokens when
decoding. In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
(https://github.com/tanreinama/Japanese-BPEEncoder_V2). token_type_ids is a mask indicating the prefix input positions.
Example:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>>
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
[35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>>
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
'吾輩は猫である🐯。実は慶応(慶応)大学出身'
Example for Prefix-LM:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
[35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>>
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Example for batch encode:
>>> from transformers import GPTSanJapaneseTokenizer
Import the GPTSanJapaneseTokenizer class from the transformers library
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
Initialize a tokenizer from the pretrained "Tanrei/GPTSAN-japanese" checkpoint
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
[[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]]
>>>
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
[[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]]
>>>
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
[[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]
Args:
vocab_file (`str`):
File containing the vocabulary.
emoji_file (`str`):
File containing the emoji.
unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
The token used for unknown characters.
pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
The token used for padding.
bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
A special token to separate tokens into prefix and general input parts.
do_clean_text (`bool`, *optional*, defaults to `False`):
Whether or not to clean text for URLs, emails, telephone numbers, Japanese dates, and Japanese prices.
"""
# Define constants for files related to vocabulary and model configurations
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
def __init__(
self,
vocab_file,
emoji_file,
unk_token="<|nottoken|>",
pad_token="<|separator|>",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
sep_token="<|segmenter|>",
do_clean_text=False,
**kwargs,
):
# Check if vocabulary file exists; raise an error if not found
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Check if emoji file exists; raise an error if not found
if not os.path.isfile(emoji_file):
raise ValueError(
f"Can't find an emoji file at path '{emoji_file}'. To load the emoji information from a Google"
" pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Initialize the tokenizer with the provided parameters
self.do_clean_text = do_clean_text
self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
# Initialize the superclass (PreTrainedTokenizer) with the tokenizer-specific parameters
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
do_clean_text=do_clean_text,
**kwargs,
)
@property
# Property to get the size of the vocabulary
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
def vocab_size(self):
# The vocab_size property returns the length of the raw_vocab, which contains character variations unique to Japanese
return len(self.raw_vocab)
# Build and return the full vocabulary dict from raw_vocab plus any added tokens
def get_vocab(self):
return dict(self.raw_vocab, **self.added_tokens_encoder)
# Tokenize the text with the subword_tokenizer, optionally cleaning it first
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
# Look up a token's id in the vocab, falling back to the id of unk_token
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Look up the token string for a given id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.subword_tokenizer.convert_id_to_token(index)
# Convert a sequence of tokens back into a single string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
words = []
byte_tokens = []
for word in tokens:
if word[:6] == "<|byte" and word[-2:] == "|>":
byte_tokens.append(int(word[6:-2]))
else:
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
byte_tokens = []
if word[:7] == "<|emoji" and word[-2:] == "|>":
words.append(self.emoji["emoji_inv"][word])
elif word == "<SP>":
words.append(" ")
elif word == "<BR>":
words.append("\n")
elif word == "<TAB>":
words.append("\t")
elif word == "<BLOCK>":
words.append("▀")
elif word == "<KIGOU>":
words.append("ǀ")
elif word == "<U2000U2BFF>":
words.append("‖")
elif word == "<|bagoftoken|>":
if len(words) > 0:
words.append(words[-1])
words.append(words[-1])
words.append(words[-1])
elif word.startswith("<|") and word.endswith("|>"):
words.append("")
else:
words.append(word)
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
text = "".join(words)
return text
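The byte fallback above is what lets arbitrary UTF-8 sequences survive a decode round trip: consecutive `<|byteNN|>` tokens are buffered and decoded together. A tiny standalone illustration (the token strings are made up here, but they follow the format handled above):

```python
# Collect the numeric payload of consecutive "<|byteNN|>" tokens and decode them as UTF-8.
byte_words = ["<|byte240|>", "<|byte159|>", "<|byte144|>", "<|byte175|>"]
byte_tokens = [int(w[6:-2]) for w in byte_words]
print(bytearray(byte_tokens).decode("utf-8", errors="replace"))  # 🐯
```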
# Default chat template that adds the standard BOS, SEP and EOS tokens between messages and discards role information
def default_chat_template(self):
"""
A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role
information.
"""
# Warn that no chat template is defined for this tokenizer and that the class default is being used
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
# Return the Jinja chat template string
return (
"{% for message in messages %}"
"{% if not loop.first %}{{ bos_token}}{% endif %}"
"{{ sep_token }}{{ message.content }} {{ eos_token }}"
"{% endfor %}"
)
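To see what this template produces, it can be rendered directly; the following is a rough sketch (assuming `jinja2` is installed, with placeholder messages) of the string that `tokenizer.apply_chat_template` would build from it:

```python
from jinja2 import Template

template = Template(
    "{% for message in messages %}"
    "{% if not loop.first %}{{ bos_token}}{% endif %}"
    "{{ sep_token }}{{ message.content }} {{ eos_token }}"
    "{% endfor %}"
)
print(template.render(
    messages=[{"content": "こんにちは"}, {"content": "元気ですか"}],
    bos_token="<|startoftext|>", sep_token="<|segmenter|>", eos_token="<|endoftext|>",
))
# <|segmenter|>こんにちは <|endoftext|><|startoftext|><|segmenter|>元気ですか <|endoftext|>
```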
# Copied from GPTNeoXJapaneseTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Running index used to detect gaps in the vocabulary
index = 0
# Check whether the save target is an existing directory
if os.path.isdir(save_directory):
# Build the vocabulary file path and the emoji file path inside the directory
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
)
else:
# Otherwise treat save_directory as a filename prefix rather than a directory
vocab_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
)
# Write the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as writer:
# Iterate over the id-to-token mapping, writing one line per id
for token_index, token in self.ids_to_tokens.items():
if index != token_index:
# Warn if the vocabulary indices are not consecutive
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
# Join the spelling variants of this id with commas and write them as one line
writer.write(",".join(token) + "\n")
index += 1
# Write the emoji file as JSON
with open(emoji_file, "w", encoding="utf-8") as writer:
json.dump(self.emoji, writer)
# Return the paths of the saved vocabulary and emoji files
return vocab_file, emoji_file
# Build token_type_ids from token_ids_0 and token_ids_1
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
# docstyle-ignore
"""
The tokenizer returns token_type_ids as separators between the Prefix part and the rest.
token_type_ids is 1 for the Prefix part and 0 for the rest of the token.
Example:
```
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> x_token = tokenizer("アイウエ")
>>>
>>>
>>> x_token = tokenizer("", prefix_text="アイウエ")
>>>
>>>
>>> x_token = tokenizer("ウエ", prefix_text="アイ")
>>>
>>>
```"""
# The prefix length defaults to 0 (no prefix part)
prefix_len = 0
# Only look for the separator if it exists in the vocab
if self.sep_token in self.vocab:
# Id of the separator token
segid = self.vocab[self.sep_token]
# If the separator appears in token_ids_0
if segid in token_ids_0:
# everything before it belongs to the prefix
prefix_len = token_ids_0.index(segid)
# The total length is len(token_ids_0) when there is no second sequence
if token_ids_1 is None:
total_len = len(token_ids_0)
else:
# otherwise it is the combined length of both sequences
total_len = len(token_ids_0 + token_ids_1)
# Return prefix_len ones followed by (total_len - prefix_len) zeros
return prefix_len * [1] + (total_len - prefix_len) * [0]
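In other words, the mask is just `prefix_len` ones followed by zeros, where `prefix_len` is the position of the first separator id. A toy illustration (the ids are made up; 35998 stands in for the separator here):

```python
token_ids = [35993, 111, 222, 35998, 333, 444]  # 35998 plays the role of the sep_token id
prefix_len = token_ids.index(35998)
print(prefix_len * [1] + (len(token_ids) - prefix_len) * [0])  # [1, 1, 1, 0, 0, 0]
```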
def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
# On top of the SOT token inserted for text generation, GPTSAN inserts a SEP token for Prefix-LM:
# SOT at the beginning of the text, and SEP between the prefix part and the rest.
if add_sep_token is None:
# insert SEP unless the text already contains one at a non-prefix position
add_sep_token = self.sep_token not in text
# Start the prepared text with the BOS token (or an empty string if it is not in the vocab)
prepared = self.bos_token if self.bos_token in self.vocab else ""
# Append the prefix text, if any
prepared += prefix_text if prefix_text is not None else ""
# Append the SEP token if requested
if add_sep_token:
prepared += self.sep_token if self.sep_token in self.vocab else ""
# Finally append the text itself
prepared += text
# Return the prepared text together with the remaining keyword arguments
return (prepared, kwargs)
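So for a Prefix-LM call the string handed to the subword tokenizer is simply BOS + prefix_text + SEP + text. A small sketch of the assembled string (assuming the special tokens are in the vocab, as they are for the released checkpoint):

```python
bos, sep = "<|startoftext|>", "<|segmenter|>"
prefix_text, text = "吾輩は猫である。", "実は慶応大学出身"
print(bos + prefix_text + sep + text)
# <|startoftext|>吾輩は猫である。<|segmenter|>実は慶応大学出身
```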
# _batch_encode_plus encodes a batch of texts or text pairs
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
# This tokenizer converts an input text pair into a prefix input and a subsequent input
if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list):
# For text pairs, merge each pair into a single text: prefix + sep_token + text
batch_prefix_texts = []
for pref, txt in batch_text_or_text_pairs:
batch_prefix_texts.append(pref + self.sep_token + txt)
batch_text_or_text_pairs = batch_prefix_texts
# Delegate to the parent class's _batch_encode_plus with all arguments
return super()._batch_encode_plus(
batch_text_or_text_pairs,
add_special_tokens,
padding_strategy,
truncation_strategy,
max_length,
stride,
is_split_into_words,
pad_to_multiple_of,
return_tensors,
return_token_type_ids,
return_attention_mask,
return_overflowing_tokens,
return_special_tokens_mask,
return_offsets_mapping,
return_length,
verbose,
)
# SubWordJapaneseTokenizer performs the Japanese subword tokenization; it is based on GPTNeoXJapaneseTokenizer with the modifications listed below
class SubWordJapaneseTokenizer(object):
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
- Decoding byte0~byte255 tokens correctly
- Added bagofword token handling
https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
original repository.
MIT License
Copyright (c) 2020 tanreinama
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of
the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__init__
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab  # same as "swe" in the original repository
self.ids_to_tokens = ids_to_tokens  # same as "bpe" in the original repository
self.emoji = emoji  # emoji dictionary
self.maxlen = np.max([len(w) for w in self.vocab.keys()])  # length of the longest vocabulary entry
# Regular expressions used to match URLs, e-mails, phone numbers, dates and prices in the text
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
self.content_repatter4 = re.compile(
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter5 = re.compile(
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter6 = re.compile(
r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*"
)
keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})  # translation table mapping box-drawing and block characters to "<BLOCK>"
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__len__
def __len__(self):
return len(self.ids_to_tokens)  # the number of ids defines the length of the tokenizer
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.clean_text
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)  # replace URLs with "<URL>"
content = self.content_repatter2.sub("<EMAIL>", content)  # replace e-mail addresses with "<EMAIL>"
content = self.content_repatter3.sub("<TEL>", content)  # replace telephone numbers with "<TEL>"
content = self.content_repatter4.sub("<DATE>", content)  # replace dates with "<DATE>"
content = self.content_repatter5.sub("<DATE>", content)  # replace Japanese-era dates with "<DATE>"
content = self.content_repatter6.sub("<PRICE>", content)  # replace prices with "<PRICE>"
content = content.translate(self.content_trans1)  # map box-drawing/block characters to "<BLOCK>"
while "<BLOCK><BLOCK>" in content:
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")  # collapse consecutive "<BLOCK>" markers into one
return content
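A quick standalone check of the kind of substitution `clean_text` performs; the URL and e-mail patterns below are simplified re-creations of `content_repatter1`/`content_repatter2` for illustration, not the exact regexes above:

```python
import re

url_re = re.compile(r"(https?|ftp)(://[-_.!~*'()a-zA-Z0-9;/?:@&=+$,%#]+)")
mail_re = re.compile(r"[A-Za-z0-9._+]*@[-_0-9A-Za-z]+(\.[A-Za-z]+)*")

content = "詳細は https://example.com まで。連絡先: foo@example.com"
content = url_re.sub("<URL>", content)
content = mail_re.sub("<EMAIL>", content)
print(content)  # 詳細は <URL> まで。連絡先: <EMAIL>
```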
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.tokenize
def tokenize(self, text, clean=False):
# Replace half-width spaces with "<SP>"
text = text.replace(" ", "<SP>")
# Replace full-width spaces with "<SP>"
text = text.replace("　", "<SP>")
# Replace Windows line breaks "\r\n" with "<BR>"
text = text.replace("\r\n", "<BR>")
# Replace line feeds "\n" with "<BR>"
text = text.replace("\n", "<BR>")
# Replace carriage returns "\r" with "<BR>"
text = text.replace("\r", "<BR>")
# Replace tabs "\t" with "<TAB>"
text = text.replace("\t", "<TAB>")
# Normalize "—" to "ー"
text = text.replace("—", "ー")
# Normalize "−" to "ー"
text = text.replace("−", "ー")
# Replace every emoji found in the text with its mapped value from the emoji dictionary
for k, v in self.emoji["emoji"].items():
if k in text:
text = text.replace(k, v)
# Optionally run the clean_text substitutions defined above
if clean:
text = self.clean_text(text)
# Helper that checks whether a single character is a symbol in certain 2-byte UTF-8 ranges
def check_simbol(x):
e = x.encode()
# Only consider single characters whose UTF-8 encoding is 2 bytes long
if len(x) == 1 and len(e) == 2:
c = (int(e[0]) << 8) + int(e[1])
# Check whether the encoded value falls into the symbol ranges below
if (
(c >= 0xC2A1 and c <= 0xC2BF)
or (c >= 0xC780 and c <= 0xC783)
or (c >= 0xCAB9 and c <= 0xCBBF)
or (c >= 0xCC80 and c <= 0xCDA2)
):
return True
return False
# Helper that checks whether a single character falls into the U+2000-U+2BFF block (3-byte UTF-8)
def checku2e(x):
e = x.encode()
# Only consider single characters whose UTF-8 encoding is 3 bytes long
if len(x) == 1 and len(e) == 3:
c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
# Check whether the encoded value falls into the U+2000-U+2BFF range
if c >= 0xE28080 and c <= 0xE2B07F:
return True
return False
# Start scanning from the beginning of the text
pos = 0
# Collected output tokens
result = []
# Walk through the text until it is fully consumed
while pos < len(text):
# For "<" the search window extends to maxlen + 1 characters (special tokens); otherwise it is 3 characters
end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
# Candidate matches found in the window
candidates = []  # (token_id, token, pos)
# Try the longest substring first, shrinking towards the current position
for e in range(end, pos, -1):
# Substring from the current position to the candidate end
wd = text[pos:e]
# Only substrings that exist in the vocab are candidates
if wd in self.vocab:
# A special token like "<...>" is taken immediately as the only candidate
if wd[0] == "<" and len(wd) > 2:
candidates = [(self.vocab[wd], wd, e)]
break
else:
candidates.append((self.vocab[wd], wd, e))
# If at least one candidate was found
if len(candidates) > 0:
# pick the candidate with the smallest token id
_, wd, e = sorted(candidates, key=lambda x: x[0])[0]
# append it to the result
result.append(wd)
# and continue scanning right after it
pos = e
else:
# No vocab match: handle the single character at the current position
end = pos + 1
wd = text[pos:end]
# Symbol characters become "<KIGOU>"
if check_simbol(wd):
result.append("<KIGOU>")
# Characters in the U+2000-U+2BFF block become "<U2000U2BFF>"
elif checku2e(wd):
result.append("<U2000U2BFF>")
else:
# Anything else is emitted byte by byte as "<|byte%d|>" tokens
for i in wd.encode("utf-8"):
result.append("<|byte%d|>" % i)
# Advance past the character
pos = end
# Return the list of tokens
return result
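The loop above is a greedy longest-match search: every substring starting at `pos` (up to three characters, or up to `maxlen + 1` for `<`-prefixed special tokens) is looked up in the vocab, ties are broken by the smallest token id, and anything unmatched falls back to per-byte tokens. A miniature re-run of that idea on a toy vocab (not the real one):

```python
vocab = {"猫": 10, "猫で": 5, "で": 20}  # toy entries with made-up ids
text = "猫で🐯"
pos, result = 0, []
while pos < len(text):
    end = min(pos + 3, len(text))
    candidates = [(vocab[text[pos:e]], text[pos:e], e) for e in range(end, pos, -1) if text[pos:e] in vocab]
    if candidates:
        _, wd, e = sorted(candidates)[0]  # smallest token id wins
        result.append(wd)
        pos = e
    else:
        result.extend("<|byte%d|>" % b for b in text[pos].encode("utf-8"))  # byte fallback
        pos += 1
print(result)  # ['猫で', '<|byte240|>', '<|byte159|>', '<|byte144|>', '<|byte175|>']
```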
.\models\gptsan_japanese\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_gptsan_japanese": ["GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTSanJapaneseConfig"],
"tokenization_gptsan_japanese": ["GPTSanJapaneseTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gptsan_japanese"] = [
"GPTSAN_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTSanJapaneseForConditionalGeneration",
"GPTSanJapaneseModel",
"GPTSanJapanesePreTrainedModel",
]
_import_structure["tokenization_gptsan_japanese"] = [
"GPTSanJapaneseTokenizer",
]
if TYPE_CHECKING:
from .configuration_gptsan_japanese import GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTSanJapaneseConfig
from .tokenization_gptsan_japanese import GPTSanJapaneseTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gptsan_japanese import (
GPTSAN_JAPANESE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTSanJapaneseForConditionalGeneration,
GPTSanJapaneseModel,
GPTSanJapanesePreTrainedModel,
)
from .tokenization_gptsan_japanese import GPTSanJapaneseTokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_bigcode\configuration_gpt_bigcode.py
"""
GPTBigCode configuration
This module contains the configuration for the GPTBigCode model, specifying how to instantiate and customize it.
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bigcode/gpt_bigcode-santacoder": "https://huggingface.co/bigcode/gpt_bigcode-santacoder/resolve/main/config.json",
}
class GPTBigCodeConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GPTBigCodeModel`]. It is used to instantiate a
GPTBigCode model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPTBigCode
[gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50257):
Vocabulary size of the GPT_BIGCODE model, defining the number of different tokens that can be represented.
n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large, e.g. 512, 1024 or 2048.
n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (`int`, *optional*, defaults to None):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd.
activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new", "gelu_pytorch_tanh"]`.
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
Whether to call the fused softmax in float32.
scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
Whether to scale the attention softmax in float32.
attention_type (`bool`, *optional*, defaults to `True`):
Whether to use Multi-Query Attention (`True`) or Multi-Head Attention (`False`).
Example:
```
>>> from transformers import GPTBigCodeConfig, GPTBigCodeModel
>>> # Initializing a GPTBigCode configuration
>>> configuration = GPTBigCodeConfig()
>>>
>>> # Initializing a model (with random weights) from the configuration
>>> model = GPTBigCodeModel(configuration)
>>>
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "gpt_bigcode"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_pytorch_tanh",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
attention_softmax_in_fp32=True,
scale_attention_softmax_in_fp32=True,
multi_query=True,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
self.multi_query = multi_query
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
.\models\gpt_bigcode\modeling_gpt_bigcode.py
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_2_2
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bigcode/gpt_bigcode-santacoder"
_CONFIG_FOR_DOC = "GPTBigCodeConfig"
GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bigcode/gpt_bigcode-santacoder",
]
@torch.jit.script
def upcast_masked_softmax(
x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
):
input_dtype = x.dtype
x = x.to(softmax_dtype) * scale
x = torch.where(mask, x, mask_value)
x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
return x
@torch.jit.script
def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
input_dtype = x.dtype
x = x.to(softmax_dtype) * scale
x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
return x
@torch.jit.script
def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
x = torch.where(mask, x, mask_value)
x = torch.nn.functional.softmax(x, dim=-1)
return x
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
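A toy check (requires `torch`, run in the context of this module) of what `_get_unpad_data` returns for a padded batch: the flat indices of the real tokens, the cumulative sequence lengths that flash-attn's varlen kernels expect, and the longest sequence length in the batch:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = _get_unpad_data(attention_mask)
print(indices)     # tensor([0, 1, 2, 4, 5])
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32)
print(max_len)     # 3
```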
class GPTBigCodeAttention(nn.Module):
def __init__(self, config, is_cross_attention=False, layer_idx=None):
super().__init__()
self.config = config
self.mask_value = None
self.multi_query = config.multi_query
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.kv_heads = 1 if self.multi_query else self.num_heads
self.kv_dim = self.kv_heads * self.head_dim
self.split_size = self.embed_dim
self.is_causal = True
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale_attn_weights = config.scale_attn_weights
self.is_cross_attention = is_cross_attention
self.layer_idx = layer_idx
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = (
config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
)
self.attn_pdrop = config.attn_pdrop
if self.is_cross_attention:
if self.multi_query:
raise NotImplementedError("Multi-Query Attention not supported for cross_attention")
self.c_attn = nn.Linear(self.embed_dim, 2 * self.embed_dim)
self.q_attn = nn.Linear(self.embed_dim, self.embed_dim)
else:
self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def _get_mask_value(self, device, dtype):
if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
return self.mask_value
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn") or not self.is_cross_attention:
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
)
query = self.q_attn(hidden_states)
key_value = self.c_attn(encoder_hidden_states)
attention_mask = encoder_attention_mask
elif self.multi_query:
query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
else:
query, key_value = (
self.c_attn(hidden_states)
.view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
.transpose(1, 2)
.split((self.head_dim, 2 * self.head_dim), dim=3)
)
if layer_past is not None:
key_value = torch.cat((layer_past, key_value), dim=-2)
present = key_value if use_cache else None
key, value = key_value.split((self.head_dim, self.head_dim), dim=-1)
attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask)
if not self.multi_query:
attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape)
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
if self.multi_query:
attn_weights = attn_weights.transpose(1, 2)
outputs += (attn_weights,)
return outputs
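A shape sketch (hypothetical sizes, standalone) of the multi-query split performed in `forward` above: a single `c_attn` projection yields full-width queries plus one shared key/value head, which is what keeps this model's KV cache small:

```python
import torch

batch, seq, embed_dim, num_heads = 2, 5, 768, 12
head_dim = embed_dim // num_heads      # 64
kv_dim = 1 * head_dim                  # multi_query -> a single KV head
c_attn = torch.nn.Linear(embed_dim, embed_dim + 2 * kv_dim)

hidden_states = torch.randn(batch, seq, embed_dim)
query, key_value = c_attn(hidden_states).split((embed_dim, 2 * kv_dim), dim=2)
print(query.shape, key_value.shape)  # torch.Size([2, 5, 768]) torch.Size([2, 5, 128])
```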
class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
"""
GPTBigCode flash attention module. This module inherits from `GPTBigCodeAttention` as the weights of the module
stays untouched. The only required change would be on the forward pass where it needs to correctly call the public
API of flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class GPTBigCodeSdpaAttention(GPTBigCodeAttention):
def forward(
self,
hidden_states: torch.Tensor,
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Optional[torch.Tensor]],
Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
]:
pass
class GPTBigCodeMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = nn.Linear(embed_dim, intermediate_size)
self.c_proj = nn.Linear(intermediate_size, embed_dim)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
GPTBIGCODE_ATTENTION_CLASSES = {
"eager": GPTBigCodeAttention,
"flash_attention_2": GPTBigCodeFlashAttention2,
"sdpa": GPTBigCodeSdpaAttention,
}
class GPTBigCodeBlock(nn.Module):
def __init__(self, config, layer_idx=None):
super().__init__()
hidden_size = config.hidden_size
self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
if config.add_cross_attention:
if config.multi_query:
raise NotImplementedError("Cross-attention not implemented for MQA")
self.crossattention = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](
config, is_cross_attention=True, layer_idx=layer_idx
)
self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPTBigCodeMLP(self.inner_dim, config)
def forward(
self,
hidden_states: Optional[Tuple[torch.Tensor]],
layer_past: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states
hidden_states = self.ln_cross_attn(hidden_states)
cross_attn_outputs = self.crossattention(
hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
attn_output = cross_attn_outputs[0]
hidden_states = residual + attn_output
outputs = outputs + cross_attn_outputs[2:]
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
class GPTBigCodePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTBigCodeConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTBigCodeBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (GPTBigCodeMLP, GPTBigCodeAttention)):
module.c_proj.weight.data.normal_(
mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
)
module.c_proj._is_hf_initialized = True
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GPT_BIGCODE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Parameters:
config ([`GPTBigCodeConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_BIGCODE_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT_BIGCODE Model transformer outputting raw hidden-states without any specific head on top.",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeModel(GPTBigCodePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.multi_query = config.multi_query  # whether multi-query attention is used
self.embed_dim = config.hidden_size  # embedding dimension
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)  # token embedding layer
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)  # position embedding layer
self.drop = nn.Dropout(config.embd_pdrop)  # embedding dropout
self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])  # stack of GPTBigCodeBlock layers
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)  # final layer normalization
max_positions = config.max_position_embeddings
self.register_buffer(
"bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
)  # lower-triangular boolean causal mask registered as a non-persistent buffer
self.gradient_checkpointing = False  # gradient checkpointing is off by default
self._use_sdpa = config._attn_implementation == "sdpa"  # whether the SDPA attention implementation is selected
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"  # whether FlashAttention-2 is selected
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.wte # 返回输入嵌入层对象
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings # 设置新的输入嵌入层对象
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# The detailed forward implementation is omitted here
pass
@add_start_docstrings(
"""
The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]  # weights tied to the input embeddings
def __init__(self, config):
# Initialize the parent class with the configuration
super().__init__(config)
# Backbone GPTBigCodeModel built from the configuration
self.transformer = GPTBigCodeModel(config)
# Linear layer producing the language-modeling logits
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Return the language modeling head (the linear layer)
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# Replace the output embeddings
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
# If past_key_values are provided, drop the tokens already covered by the cache
if past_key_values:
if self.config.multi_query:
past_length = past_key_values[0].shape[1]
else:
past_length = past_key_values[0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
# Keep only the tokens that still need to be processed
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
# If an attention_mask is given but position_ids are not, create position_ids on the fly for batched generation
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
# If inputs_embeds are passed, only use them for the first generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Assemble the remaining model inputs
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
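The cumulative-sum trick above derives position ids that ignore left padding. A standalone illustration (requires `torch`):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1], [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```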
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass of the causal language model, used for both inference and training
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token ids
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,  # cached key/value states from previous steps
attention_mask: Optional[torch.Tensor] = None,  # mask marking which positions should be ignored
token_type_ids: Optional[torch.Tensor] = None,  # token type ids, e.g. to distinguish sentence A from sentence B
position_ids: Optional[torch.Tensor] = None,  # position ids for each token
head_mask: Optional[torch.Tensor] = None,  # mask selecting which attention heads to disable
inputs_embeds: Optional[torch.Tensor] = None,  # precomputed embeddings used instead of input_ids
encoder_hidden_states: Optional[torch.Tensor] = None,  # encoder hidden states for encoder-decoder setups
encoder_attention_mask: Optional[torch.Tensor] = None,  # attention mask for the encoder states
labels: Optional[torch.Tensor] = None,  # target labels used to compute the loss
use_cache: Optional[bool] = None,  # whether to cache key/value states to speed up decoding
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# Use the passed return_dict if given, otherwise fall back to self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer backbone on the inputs
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states produced by the transformer
hidden_states = transformer_outputs[0]
# Project the hidden states to vocabulary logits with lm_head
lm_logits = self.lm_head(hidden_states)
# Loss defaults to None
loss = None
# Compute the loss if labels are provided
if labels is not None:
# Shift so that logits at position i predict the token at position i + 1
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
# Flatten predictions and labels and compute the cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# If return_dict is False, return a plain tuple
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a CausalLMOutputWithCrossAttentions object
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
cross_attentions=transformer_outputs.cross_attentions,
)
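A minimal illustration (requires `torch`) of the label shifting used in the loss above: position `i` of the logits is trained to predict token `i + 1`, so the last logit and the first label are dropped before the cross entropy:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 7
lm_logits = torch.randn(1, 4, vocab_size)  # (batch, seq, vocab)
labels = torch.tensor([[3, 5, 1, 6]])      # labels == input_ids

shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions for positions 0..2
shift_labels = labels[..., 1:].contiguous()         # targets are tokens 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss)  # a scalar tensor
```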
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
# Reorder the past_key_values cache according to beam_idx so each generation step sees the right beams
return tuple(layer_past.index_select(0, beam_idx.to(layer_past.device)) for layer_past in past_key_values)
"""
The GPTBigCode Model transformer with a sequence classification head on top (linear layer).
[`GPTBigCodeForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
GPT_BIGCODE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTBigCodeModel(config)
# Determine the dropout rate for the classifier based on the configuration
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
# Define the linear classifier layer for token classification
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the config default when return_dict is not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer backbone with all arguments
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states returned by the transformer
hidden_states = transformer_outputs[0]
# Apply dropout to the hidden states
hidden_states = self.dropout(hidden_states)
# Feed the hidden states through the classifier to get per-token logits
logits = self.classifier(hidden_states)
# Loss defaults to None
loss = None
# Compute the loss if labels are provided
if labels is not None:
# Cross-entropy loss for token classification
loss_fct = CrossEntropyLoss()
# Flatten logits and labels before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1).to(logits.device))
# If a plain tuple is requested, assemble and return it
if not return_dict:
output = (logits,) + transformer_outputs[2:]  # prepend the logits to the remaining outputs
return ((loss,) + output) if loss is not None else output
# Otherwise return a TokenClassifierOutput with loss, logits, hidden states and attentions
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
.\models\gpt_bigcode\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_bigcode"] = [
"GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTBigCodeForSequenceClassification",
"GPTBigCodeForTokenClassification",
"GPTBigCodeForCausalLM",
"GPTBigCodeModel",
"GPTBigCodePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_bigcode import (
GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTBigCodeForCausalLM,
GPTBigCodeForSequenceClassification,
GPTBigCodeForTokenClassification,
GPTBigCodeModel,
GPTBigCodePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt_neo\configuration_gpt_neo.py
class GPTNeoConfig(PretrainedConfig):
model_type = "gpt_neo"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=50257,
max_position_embeddings=2048,
hidden_size=2048,
num_layers=24,
attention_types=[[["global", "local"], 12]],
num_heads=16,
intermediate_size=None,
window_size=256,
activation_function="gelu_new",
resid_dropout=0.0,
embed_dropout=0.0,
attention_dropout=0.0,
classifier_dropout=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_heads = num_heads
self.intermediate_size = intermediate_size
self.window_size = window_size
self.activation_function = activation_function
self.resid_dropout = resid_dropout
self.embed_dropout = embed_dropout
self.attention_dropout = attention_dropout
self.classifier_dropout = classifier_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.attention_types = attention_types
self.attention_layers = self.expand_attention_types_params(attention_types)
if len(self.attention_layers) != self.num_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.attention_layers)` == `config.num_layers` "
f"but is `len(config.attention_layers) = {len(self.attention_layers)}`, "
f"`config.num_layers = {self.num_layers}`. "
"`config.attention_layers` is prepared using `config.attention_types`. "
"Please verify the value of `config.attention_types` argument."
)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@staticmethod
def expand_attention_types_params(attention_types):
attentions = []
for item in attention_types:
for _ in range(item[1]):
attentions.extend(item[0])
return attentions
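The interaction between `attention_types` and `attention_layers` is easiest to see by calling the static helper directly. With the default `[[["global", "local"], 12]]`, the pattern `("global", "local")` is repeated 12 times, giving exactly one entry per layer for the 24 default layers:

```python
from transformers import GPTNeoConfig

layers = GPTNeoConfig.expand_attention_types_params([[["global", "local"], 12]])
print(len(layers))    # 24 == default num_layers
print(layers[:4])     # ['global', 'local', 'global', 'local']
```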
def custom_unfold(input, dimension, size, step):
"""Custom torch.Tensor.unfold implementation to enable the export to ONNX."""
import torch
shape = input.size()
rank = len(shape)
sizedim = shape[dimension]
low_indices = torch.arange(0, sizedim, step)
min_length = torch.div(sizedim - size, step, rounding_mode="floor") + 1
indices = torch.arange(size) + low_indices[:min_length][:, None]
s = [slice(None)] * rank
s[dimension] = indices
sliced = input[s]
perm = list(range(0, rank + 1))
perm.append(perm.pop(dimension + 1))
return sliced.permute(perm)
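For reference, this is the behavior that `custom_unfold` reproduces, shown here with the built-in `torch.Tensor.unfold` (the custom version exists only because the built-in op cannot be exported to ONNX): sliding windows of length `size` taken every `step` positions along the chosen dimension.

```python
import torch

x = torch.arange(12).reshape(2, 6)
windows = x.unfold(1, 3, 2)        # dimension=1, size=3, step=2
print(windows.shape)               # torch.Size([2, 2, 3]): two windows of length 3 per row
print(windows[0])                  # tensor([[0, 1, 2], [2, 3, 4]])
```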
def custom_get_block_length_and_num_blocks(seq_length, window_size):
"""
Custom implementation for GPTNeoAttentionMixin._get_block_length_and_num_blocks to enable the export to ONNX as
original implementation uses Python variables and control flow.
"""
import torch
candidates = torch.arange(1, window_size)
remainders = torch.remainder(seq_length, candidates)
divisor_indices = remainders == 0
divisors = candidates[divisor_indices]
largest_divisor = torch.max(divisors)
return largest_divisor, torch.div(seq_length, largest_divisor, rounding_mode="floor")
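A quick sanity check of the helper (passing `seq_length` as a tensor so every `torch` op receives tensor arguments): the block length is the largest divisor of the sequence length that is smaller than the window size, and the number of blocks is `seq_length // block_length`. The import path below follows the file shown above.

```python
import torch
from transformers.models.gpt_neo.configuration_gpt_neo import custom_get_block_length_and_num_blocks

block_length, num_blocks = custom_get_block_length_and_num_blocks(torch.tensor(12), window_size=5)
print(block_length, num_blocks)    # tensor(4) tensor(3): 12 = 4 * 3 and 4 < 5
```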
class GPTNeoOnnxConfig(OnnxConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_attention_heads(self) -> int:
return self._config.num_heads
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
past_shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
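A standalone sketch (plain PyTorch, hypothetical sizes) of the dummy `past_key_values` layout produced by `generate_dummy_inputs` above: one `(key, value)` pair per layer, each of shape `(batch, num_heads, past_sequence_length, head_dim)`, with the attention mask extended to also cover the past positions.

```python
import torch

batch, seq_len = 2, 8
num_heads, hidden_size, num_layers = 16, 2048, 24            # GPT-Neo 1.3B-like sizes, for illustration
past_len = seq_len + 2                                        # same "+ 2" offset as generate_dummy_inputs
past_shape = (batch, num_heads, past_len, hidden_size // num_heads)
past_key_values = [(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_layers)]

attention_mask = torch.ones(batch, seq_len, dtype=torch.int64)
attention_mask = torch.cat([attention_mask, torch.ones(batch, past_len, dtype=torch.int64)], dim=1)
print(past_key_values[0][0].shape, attention_mask.shape)      # torch.Size([2, 16, 10, 128]) torch.Size([2, 18])
```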
.\models\gpt_neo\convert_gpt_neo_mesh_tf_to_pytorch.py
import argparse
import json
from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config_json = json.load(open(config_file, "r"))
config = GPTNeoConfig(
hidden_size=config_json["n_embd"],
num_layers=config_json["n_layer"],
num_heads=config_json["n_head"],
attention_types=config_json["attention_types"],
max_position_embeddings=config_json["n_positions"],
resid_dropout=config_json["res_dropout"],
embed_dropout=config_json["embed_dropout"],
attention_dropout=config_json["attn_dropout"],
)
print(f"Building PyTorch model from configuration: {config}")
model = GPTNeoForCausalLM(config)
load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained mesh-tf model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
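The script is normally invoked from the command line with the three required arguments; equivalently, the function defined above can be called directly. The paths below are placeholders, not real files:

```python
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="/path/to/mesh_tf_checkpoint",     # hypothetical Mesh-TF checkpoint
    config_file="/path/to/gpt_neo_config.json",           # hypothetical mesh-tf config json
    pytorch_dump_path="/path/to/output_dir",              # hypothetical output directory
)
```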
.\models\gpt_neo\modeling_flax_gpt_neo.py
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_gpt_neo import GPTNeoConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GPTNeoConfig"
_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
GPT_NEO_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
    Parameters:
        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
            inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
            **Note that this only specifies the dtype of the computation and does not influence the dtype of the
            model parameters.** If you wish to change the dtype of the model parameters, see
            [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
输入序列标记的索引数组,形状为 `(batch_size, input_ids_length)`。
可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 的详细说明。
[什么是输入 ID?](../glossary
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
注意力遮罩,用于避免对填充标记索引进行注意力计算。遮罩值在 `[0, 1]` 范围内:
- 对于不被遮罩的标记,值为 1,
- 对于被遮罩的标记,值为 0。
[什么是注意力遮罩?](../glossary
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
输入序列标记在位置嵌入中的位置索引数组。取值范围是 `[0, config.max_position_embeddings - 1]`。
past_key_values (`Dict[str, np.ndarray]`, *optional*, 由 `init_cache` 返回或传入先前的 `past_key_values`):
预先计算的隐藏状态字典(用于注意力模块中的键和值)。预计算的键和值的形状为 `[batch_size, max_length]`。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。更多细节参见返回张量中的 `attentions` 字段。
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态。更多细节参见返回张量中的 `hidden_states` 字段。
return_dict (`bool`, *optional*):
是否返回 `~utils.ModelOutput` 而不是普通的元组。
"""
class FlaxGPTNeoSelfAttention(nn.Module):
# FlaxGPTNeoSelfAttention 类,继承自 nn.Module
config: GPTNeoConfig
# GPTNeoConfig 类型的 config 属性
attention_type: str
# 注意力类型字符串属性
dtype: jnp.dtype = jnp.float32
# 数据类型,默认为 jnp.float32
def setup(self):
# 从配置中获取参数
config = self.config
# 设置嵌入维度为隐藏大小
self.embed_dim = config.hidden_size
# 设置注意力头的数量
self.num_heads = config.num_attention_heads
# 计算每个注意力头的维度
self.head_dim = self.embed_dim // self.num_heads
# 检查 embed_dim 是否能被 num_heads 整除
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and "
f"`num_heads`: {self.num_heads})."
)
# 初始化注意力和残差的 dropout 层
self.attn_dropout = nn.Dropout(config.attention_dropout)
self.resid_dropout = nn.Dropout(config.resid_dropout)
# 部分应用带有特定初始化的 Dense 层函数
dense = partial(
nn.Dense,
self.embed_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 初始化 query、key、value 的投影层
self.q_proj, self.k_proj, self.v_proj = dense(use_bias=False), dense(use_bias=False), dense(use_bias=False)
# 初始化输出投影层
self.out_proj = dense()
# 创建因果遮蔽掩码
self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
# 如果注意力类型为局部注意力,则修改因果遮蔽掩码
if self.attention_type == "local":
self.causal_mask = self.causal_mask ^ jnp.tril(self.causal_mask, -config.window_size)
# 将隐藏状态按头分割
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# 将分割的头合并为隐藏状态
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# 使用 nn.compact 装饰器定义紧凑模块
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slighly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# 检测是否正在初始化,通过检查缓存数据是否存在来判断
is_initialized = self.has_variable("cache", "cached_key")
# 获取或创建缓存的键(key)和值(value)变量,如果不存在则初始化为全零数组
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# 获取或创建缓存的索引(index)变量,如果不存在则初始化为整数 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# 获取缓存键(key)的形状,并提取批次维度、最大长度、头数、每头深度
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# 使用新的一维空间切片更新键(key)和值(value)的缓存
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# 更新缓存中的键(key)和值(value)
cached_key.value = key
cached_value.value = value
# 计算更新的缓存向量数,并更新缓存索引(index)
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# 用于缓存解码器自注意力的因果掩码:我们的单个查询位置应该仅注意到已生成并缓存的键位置,而不是剩余的零元素。
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# 组合并更新注意力掩码(mask)
attention_mask = combine_masks(pad_mask, attention_mask)
# 返回更新后的键(key)、值(value)和注意力掩码(mask)
return key, value, attention_mask
    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
# 计算查询向量,乘以 sqrt(head_dim),并转换为指定数据类型
query = self.q_proj(hidden_states) * jnp.sqrt(self.head_dim).astype(self.dtype)
# 计算键向量
key = self.k_proj(hidden_states)
# 计算值向量
value = self.v_proj(hidden_states)
# 将查询向量拆分成多个头部
query = self._split_heads(query)
# 将键向量拆分成多个头部
key = self._split_heads(key)
# 将值向量拆分成多个头部
value = self._split_heads(value)
# 获取查询向量和键向量的长度
query_length, key_length = query.shape[1], key.shape[1]
# 如果存在缓存的键,则创建一个因果遮罩
if self.has_variable("cache", "cached_key"):
mask_shift = self.variables["cache"]["cache_index"]
max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
# 从因果遮罩中动态切片出部分用于当前查询和键的长度
causal_mask = lax.dynamic_slice(
self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
)
else:
# 否则使用整个因果遮罩
causal_mask = self.causal_mask[:, :, :query_length, :key_length]
# 获取批次大小
batch_size = hidden_states.shape[0]
# 将因果遮罩广播到与注意力头部匹配的形状
causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
# 将注意力遮罩广播到与因果遮罩相同的形状
attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
# 合并注意力遮罩和因果遮罩
attention_mask = combine_masks(attention_mask, causal_mask)
dropout_rng = None
if not deterministic and self.config.attention_dropout > 0.0:
# 如果不是确定性的且注意力 dropout 大于 0,则创建一个随机数生成器用于 dropout
dropout_rng = self.make_rng("dropout")
# 在快速自回归解码期间,我们逐步输入一个位置,并逐步缓存键和值
if self.has_variable("cache", "cached_key") or init_cache:
# 如果存在缓存的键或者需要初始化缓存,则将键、值和注意力遮罩连接到缓存中
key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
# 将布尔类型的注意力遮罩转换为浮点数类型的注意力偏置
attention_bias = lax.select(
attention_mask > 0,
jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
)
# 常规的点积注意力计算
attn_weights = dot_product_attention_weights(
query,
key,
bias=attention_bias,
dropout_rng=dropout_rng,
dropout_rate=self.config.attention_dropout,
deterministic=deterministic,
dtype=self.dtype,
precision=None,
)
# 使用 einsum 执行加权求和得到注意力输出
attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
# 合并多头注意力的输出
attn_output = self._merge_heads(attn_output)
# 对输出应用输出投影
attn_output = self.out_proj(attn_output)
# 应用残差连接中的 dropout
attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
# 根据需要返回注意力输出和注意力权重
outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
return outputs
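The "local" variant of the causal mask built in `setup()` is easiest to understand on a tiny example: start from the full lower-triangular mask and XOR away everything more than `window_size` positions in the past, leaving a banded mask. Sizes below are chosen purely for display.

```python
import jax.numpy as jnp
from flax.linen import make_causal_mask

window_size = 2
causal_mask = make_causal_mask(jnp.ones((1, 6), dtype="bool"), dtype="bool")   # shape (1, 1, 6, 6)
local_mask = causal_mask ^ jnp.tril(causal_mask, -window_size)
print(local_mask[0, 0].astype(int))
# [[1 0 0 0 0 0]
#  [1 1 0 0 0 0]
#  [0 1 1 0 0 0]
#  [0 0 1 1 0 0]
#  [0 0 0 1 1 0]
#  [0 0 0 0 1 1]]
```

Each position attends to itself and to at most `window_size - 1` previous positions.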
class FlaxGPTNeoAttention(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
layer_id: int = 0 # 类变量,表示当前层的索引,默认为0
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
attention_type = self.config.attention_layers[self.layer_id]
self.attention = FlaxGPTNeoSelfAttention(self.config, attention_type, dtype=self.dtype)
# 根据配置和注意力类型创建自注意力层对象
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
return self.attention(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 调用自注意力层对象处理输入隐藏状态,并返回处理后的结果
class FlaxGPTNeoMLP(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
intermediate_size: int # 类变量,表示中间隐藏层的大小
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
embed_dim = self.config.hidden_size
kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
self.c_fc = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
# 创建全连接层,用于变换隐藏状态的维度
self.c_proj = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)
# 创建全连接层,将变换后的隐藏状态映射回原始维度
self.act = ACT2FN[self.config.activation_function]
# 根据配置选择激活函数
self.dropout = nn.Dropout(rate=self.config.resid_dropout)
# 创建Dropout层,用于随机置零输入张量的元素,以防止过拟合
def __call__(self, hidden_states, deterministic: bool = True):
hidden_states = self.c_fc(hidden_states)
# 使用全连接层变换隐藏状态
hidden_states = self.act(hidden_states)
# 应用激活函数
hidden_states = self.c_proj(hidden_states)
# 使用全连接层将变换后的隐藏状态映射回原始维度
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 应用Dropout层
return hidden_states
# 返回处理后的隐藏状态
class FlaxGPTNeoBlock(nn.Module):
config: GPTNeoConfig # 类变量,存储模型配置信息
layer_id: int = 0 # 类变量,表示当前层的索引,默认为0
dtype: jnp.dtype = jnp.float32 # 类变量,指定数据类型为32位浮点数
def setup(self):
hidden_size = self.config.hidden_size
inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
# 根据配置确定内部维度大小
self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 创建LayerNorm层,用于归一化隐藏状态
self.attn = FlaxGPTNeoAttention(self.config, layer_id=self.layer_id, dtype=self.dtype)
# 创建注意力层对象
self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 创建LayerNorm层,用于归一化注意力输出
self.mlp = FlaxGPTNeoMLP(self.config, inner_dim, dtype=self.dtype)
# 创建MLP对象,用于处理注意力输出
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
# 通过注意力层和MLP层处理输入
# 保存原始的隐藏状态,用于残差连接
residual = hidden_states
# LayerNormalization层,对隐藏状态进行归一化处理
hidden_states = self.ln_1(hidden_states)
# 注意力机制,处理隐藏状态并返回输出
outputs = self.attn(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 从注意力输出中获取注意力层的输出
attn_output = outputs[0]
# 执行残差连接,更新隐藏状态
hidden_states = attn_output + residual
# 保存当前隐藏状态作为残差
residual = hidden_states
# LayerNormalization层,再次对隐藏状态进行归一化处理
hidden_states = self.ln_2(hidden_states)
# 多层感知机(MLP),对隐藏状态进行前馈神经网络处理
feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
# 执行残差连接,更新隐藏状态
hidden_states = residual + feed_forward_hidden_states
# 返回更新后的隐藏状态以及可能的额外输出
return (hidden_states,) + outputs[1:]
# FlaxGPTNeoPreTrainedModel 类定义,继承自 FlaxPreTrainedModel,用于处理权重初始化以及预训练模型下载和加载的抽象类
class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 指定配置类为 GPTNeoConfig
config_class = GPTNeoConfig
# 指定基础模型前缀为 "transformer"
base_model_prefix = "transformer"
# 模块类变量初始化为 None
module_class: nn.Module = None
def __init__(
self,
config: GPTNeoConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用给定的配置和参数初始化模块对象
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类构造函数初始化模型
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
attention_mask = jnp.ones_like(input_ids)
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# 划分随机数生成器
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 使用初始化的参数生成随机参数
random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
# 如果存在预定义的参数,将随机参数与预定义参数合并
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length))
attention_mask = jnp.ones_like(input_ids)
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用初始化的变量生成缓存
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
return unfreeze(init_variables["cache"])
# 添加模型输入文档字符串到模型前向方法
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
def __call__(
self,
input_ids,
attention_mask=None,
position_ids=None,
params: dict = None,
past_key_values: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 设置输出注意力权重的选项,如果未指定,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态的选项,如果未指定,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置返回字典的选项,如果未指定,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 获取输入张量的批次大小和序列长度
batch_size, sequence_length = input_ids.shape
# 如果未提供位置编码,则根据序列长度和批次大小创建默认位置编码
if position_ids is None:
if past_key_values is not None:
raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 如果未提供注意力掩码,则创建一个全为1的默认注意力掩码
if attention_mask is None:
attention_mask = jnp.ones((batch_size, sequence_length))
# 处理任何需要的伪随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
inputs = {"params": params or self.params}
# 如果传递了过去的键值,则初始化缓存,并确保缓存是可变的
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 调用模块的应用方法来处理输入,并传递必要的参数和选项
outputs = self.module.apply(
inputs,
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(position_ids, dtype="i4"),
not train,
False,
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
mutable=mutable,
)
# 如果传递了过去的键值并且需要返回字典,则将更新后的缓存添加到模型输出中
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# 如果传递了过去的键值但不需要返回字典,则更新缓存后将其添加到模型输出中
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# 返回最终的模型输出
return outputs
class FlaxGPTNeoBlockCollection(nn.Module):
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 初始化方法,设置模块内的各个子块
def setup(self):
# 创建一个由多个 FlaxGPTNeoBlock 实例组成的列表,每个块对应模型的一个隐藏层
self.blocks = [
FlaxGPTNeoBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
# 调用方法,接受输入并依次经过每个块的处理
def __call__(
self,
hidden_states,
attention_mask=None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果需要输出注意力矩阵,则初始化空的元组用于存储每个块的注意力矩阵
all_attentions = () if output_attentions else None
# 如果需要输出隐藏状态,则初始化空的元组用于存储每个块的隐藏状态
all_hidden_states = () if output_hidden_states else None
# 对每个块进行迭代处理
for block in self.blocks:
# 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前块的处理方法,得到该块的输出
layer_outputs = block(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# 更新当前隐藏状态为当前块的输出的第一个元素(通常是下一层的隐藏状态)
hidden_states = layer_outputs[0]
# 如果需要输出注意力矩阵,则将当前块的注意力矩阵添加到 all_attentions 中
if output_attentions:
all_attentions += (layer_outputs[1],)
# 组装最终输出元组,包含最终的隐藏状态、所有块的隐藏状态序列和所有块的注意力矩阵序列
outputs = (hidden_states, all_hidden_states, all_attentions)
return outputs
class FlaxGPTNeoModule(nn.Module):
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 初始化方法,设置模块内的各个子模块和变量
def setup(self):
# 设置词嵌入维度
self.embed_dim = self.config.hidden_size
# 使用正态分布初始化词嵌入矩阵
embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
# 创建词嵌入层
self.wte = nn.Embed(
self.config.vocab_size,
self.embed_dim,
embedding_init=embedding_init,
)
# 创建位置嵌入层
self.wpe = nn.Embed(
self.config.max_position_embeddings,
self.embed_dim,
embedding_init=embedding_init,
)
# 创建 Dropout 层,用于随机断开输入的连接,防止过拟合
self.dropout = nn.Dropout(rate=self.config.embed_dropout)
# 创建 FlaxGPTNeoBlockCollection 实例,用于处理模型的多层块
self.h = FlaxGPTNeoBlockCollection(self.config, dtype=self.dtype)
# 创建 Layer Normalization 层,用于对最后的隐藏状态进行归一化处理
self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# 调用方法,接受输入并依次经过模型内的各个子模块处理
def __call__(
self,
input_ids,
attention_mask,
position_ids,
deterministic=True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 将输入的词嵌入转换为特定数据类型的张量,并传递给词嵌入层处理
input_embeds = self.wte(input_ids.astype("i4"))
# 将位置编码转换为特定数据类型的张量,并传递给位置编码层处理
position_embeds = self.wpe(position_ids.astype("i4"))
# 将输入的词嵌入张量和位置编码张量相加得到隐藏状态张量
hidden_states = input_embeds + position_embeds
# 对隐藏状态张量应用dropout操作,用于模型训练中的随机失活
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 调用Transformer模型中的H层进行前向传播计算
outputs = self.h(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取H层输出的第一个张量,即隐藏状态张量
hidden_states = outputs[0]
# 对隐藏状态张量应用LN_F层,进行Layer Normalization处理
hidden_states = self.ln_f(hidden_states)
# 如果需要输出所有隐藏状态,则将当前隐藏状态张量添加到所有隐藏状态的列表中
if output_hidden_states:
all_hidden_states = outputs[1] + (hidden_states,)
# 更新输出元组,将所有隐藏状态列表添加到输出元组中
outputs = (hidden_states, all_hidden_states) + outputs[2:]
else:
# 更新输出元组,将当前隐藏状态张量添加到输出元组中
outputs = (hidden_states,) + outputs[1:]
# 如果不需要返回字典类型结果,则返回所有非空元素的元组
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 返回模型输出结果的FlaxBaseModelOutput对象,包括最终隐藏状态、所有隐藏状态和注意力分数
return FlaxBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=outputs[1],
attentions=outputs[-1],
)
# 添加起始文档字符串,描述该类是一个不带特定输出头部的原始隐藏状态的 GPTNeo 模型转换器。
@add_start_docstrings(
"The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.",
GPT_NEO_START_DOCSTRING,
)
class FlaxGPTNeoModel(FlaxGPTNeoPreTrainedModel):
# 模块类属性指定为 FlaxGPTNeoModule
module_class = FlaxGPTNeoModule
# 添加调用样本文档字符串的方法,指定 FlaxGPTNeoModel 的一些文档化信息
append_call_sample_docstring(FlaxGPTNeoModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
# 定义用于有因果语言建模的 GPTNeo 模型的模块
class FlaxGPTNeoForCausalLMModule(nn.Module):
# 使用 GPTNeoConfig 作为配置,并指定默认数据类型为 jnp.float32
config: GPTNeoConfig
dtype: jnp.dtype = jnp.float32
# 设置方法,在模块初始化时调用,初始化 transformer 和 lm_head
def setup(self):
self.transformer = FlaxGPTNeoModule(self.config, dtype=self.dtype)
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
# 使用正态分布初始化器初始化权重
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 调用方法,实现模块的前向传播
def __call__(
self,
input_ids,
attention_mask,
position_ids,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 使用 transformer 模型进行前向传播
outputs = self.transformer(
input_ids,
attention_mask,
position_ids,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# 如果配置要求词嵌入权重共享,则共享 wte 参数的嵌入权重,并应用到 lm_head
if self.config.tie_word_embeddings:
shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
else:
# 否则直接将隐藏状态传递给 lm_head
lm_logits = self.lm_head(hidden_states)
# 如果不返回字典,则返回元组形式的结果
if not return_dict:
return (lm_logits,) + outputs[1:]
# 返回具有自定义输出的 FlaxCausalLMOutput 对象
return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
# 添加起始文档字符串,描述带有语言建模头部的 GPTNeo 模型转换器
@add_start_docstrings(
"""
The GPTNeo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_NEO_START_DOCSTRING,
)
class FlaxGPTNeoForCausalLM(FlaxGPTNeoPreTrainedModel):
module_class = FlaxGPTNeoForCausalLMModule
# 为生成准备输入数据,接受输入的token ids、最大长度和可选的注意力掩码
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# 初始化缓存
batch_size, seq_length = input_ids.shape
# 使用输入的batch_size和max_length初始化缓存
past_key_values = self.init_cache(batch_size, max_length)
# 注意:通常需要在attention_mask的超出input_ids.shape[-1]和小于cache_length的位置放置0。
# 但由于GPTNeo使用因果掩码,这些位置已经被掩盖了。
# 因此,我们可以在这里创建一个静态的attention_mask,对编译更有效。
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# 如果提供了attention_mask,则根据它计算position_ids
if attention_mask is not None:
position_ids = attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# 否则,使用默认的位置ids:从0到seq_length的广播
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# 返回一个包含past_key_values、extended_attention_mask和position_ids的字典
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# 更新生成时的输入数据,接受模型输出和模型参数字典
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# 将模型输出中的past_key_values更新到模型参数字典中
model_kwargs["past_key_values"] = model_outputs.past_key_values
# 更新position_ids,只保留最后一个位置并加1
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
# 返回更新后的模型参数字典
return model_kwargs
# 导入自动生成文档字符串所需的示例函数的模块,并添加到 FlaxGPTNeoForCausalLM 类的文档中
# _CHECKPOINT_FOR_DOC 是用于文档的检查点
# FlaxCausalLMOutput 是 FlaxGPTNeoForCausalLM 的输出类
# _CONFIG_FOR_DOC 是 FlaxGPTNeoForCausalLM 的配置类
append_call_sample_docstring(FlaxGPTNeoForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
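An end-to-end usage sketch for the Flax causal-LM head, assuming Flax weights are published for the `EleutherAI/gpt-neo-125M` checkpoint (if only PyTorch weights exist, `from_pt=True` can convert them on the fly). This downloads the model, so it is illustrative rather than something this file runs:

```python
from transformers import AutoTokenizer, FlaxGPTNeoForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

inputs = tokenizer("GPT-Neo is", return_tensors="np")
outputs = model.generate(inputs["input_ids"], max_length=20, do_sample=False)
print(tokenizer.decode(outputs.sequences[0], skip_special_tokens=True))
```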
.\models\gpt_neo\modeling_gpt_neo.py
import os
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
is_torch_fx_available,
logging,
)
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
if is_torch_fx_available():
if not is_torch_greater_or_equal_than_1_13:
import torch.fx
_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GPTNeoConfig"
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"EleutherAI/gpt-neo-1.3B",
]
_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
    )
def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path):
try:
import re
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt_neo_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
if "global_step" not in name and "adam" not in name:
array = tf.train.load_variable(tf_path, name)
array = tf.dtypes.cast(array.squeeze(), tf.float32).numpy()
name = name.replace("attn/q", "attn/attention/q_proj/w")
name = name.replace("attn/k", "attn/attention/k_proj/w")
name = name.replace("attn/v", "attn/attention/v_proj/w")
name = name.replace("attn/o", "attn/attention/out_proj/w")
name = name.replace("norm_1", "ln_1")
name = name.replace("norm_2", "ln_2")
name = name.replace("attn/compute_output_bias/o_b", "attn/attention/out_proj/b")
name = name.replace("conv1d_main/c_fc/kernel", "c_fc/w")
name = name.replace("conv1d_main/c_fc/bias", "c_fc/b")
name = name.replace("conv1d_main/c_proj/kernel", "c_proj/w")
name = name.replace("conv1d_main/c_proj/bias", "c_proj/b")
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name[5:]
name = name.split("/")
pointer = model.transformer
for m_name in name:
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if name[-1] == "w" and name[-2] in ["out_proj", "k_proj", "q_proj", "v_proj", "c_proj", "c_fc"]:
array = array.transpose()
if name == ["wte"]:
array = array[: config.vocab_size]
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}")
print(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
embs = model.transformer.wte.weight
lin = nn.Linear(embs.size()[1], embs.size()[0], bias=False)
lin.weight = embs
model.set_output_embeddings(lin)
return model
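The scope-name handling in the loading loop above hinges on two small regex calls; their behavior on typical Mesh-TF variable scopes is shown below (names like `h0` carry a layer index, names like `ln_1` or `w` do not):

```python
import re

for m_name in ["h0", "ln_1", "w"]:
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)   # keep the digits as a separate group
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)
# h0   -> ['h', '0', '']
# ln_1 -> ['ln_1']
# w    -> ['w']
```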
class GPTNeoSelfAttention(nn.Module):
def __init__(self, config, attention_type):
super().__init__()
self.config = config
max_positions = config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=bool)).view(
1, 1, max_positions, max_positions
)
if attention_type == "local":
bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
self.register_buffer("bias", bias, persistent=False)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self.attn_dropout = nn.Dropout(float(config.attention_dropout))
self.resid_dropout = nn.Dropout(float(config.resid_dropout))
self.is_causal = True
self.embed_dim = config.hidden_size
self.num_heads = config.num_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
f"{self.num_heads})."
)
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
return tensor.view(new_shape)
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
query = query.to(torch.float32)
key = key.to(torch.float32)
attn_weights = torch.matmul(query, key.transpose(-1, -2))
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def forward(
self,
hidden_states,
attention_mask=None,
layer_past=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
query = self.q_proj(hidden_states)
key = self.k_proj(hidden_states)
value = self.v_proj(hidden_states)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
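A shape walk-through of `_split_heads` / `_merge_heads` as used in the forward pass above, written with standalone tensors and hypothetical sizes: `(batch, seq, hidden)` is reshaped to `(batch, num_heads, seq, head_dim)` for attention and then merged back losslessly.

```python
import torch

batch, seq_len, num_heads, head_dim = 2, 5, 4, 8
hidden = torch.randn(batch, seq_len, num_heads * head_dim)

split = hidden.view(batch, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)                 # _split_heads
merged = split.permute(0, 2, 1, 3).contiguous().view(batch, seq_len, num_heads * head_dim)   # _merge_heads
print(split.shape)                   # torch.Size([2, 4, 5, 8])
print(torch.equal(merged, hidden))   # True
```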
class GPTNeoFlashAttention2(GPTNeoSelfAttention):
"""
GPTNeo flash attention module. This module inherits from `GPTNeoSelfAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states,
attention_mask=None,
layer_past=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
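Concrete values make the unpadding bookkeeping easier to follow. The snippet below recomputes what `_get_unpad_data` returns for a tiny, hypothetical padding mask: per-row lengths, the flat indices of non-padding tokens, and the cumulative sequence lengths that the varlen flash-attention kernel expects.

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 0], [1, 1, 1]])   # row 0 ends with one padding position
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(seqlens_in_batch)   # tensor([2, 3], dtype=torch.int32)
print(indices)            # tensor([0, 1, 3, 4, 5])
print(cu_seqlens)         # tensor([0, 2, 5], dtype=torch.int32)
```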
GPT_NEO_ATTENTION_CLASSES = {
"eager": GPTNeoSelfAttention,
"flash_attention_2": GPTNeoFlashAttention2,
}
class GPTNeoAttention(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.layer_id = layer_id
self.attention_layers = config.attention_layers
self.attention_type = self.attention_layers[layer_id]
if self.attention_type in ["global", "local"]:
self.attention = GPT_NEO_ATTENTION_CLASSES[config._attn_implementation](config, self.attention_type)
else:
raise NotImplementedError(
"Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
f"{config.attention_layers}. Select attn layer types from ['global', 'local'] only."
)
def forward(
self,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
return self.attention(
hidden_states,
attention_mask=attention_mask,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
class GPTNeoMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = nn.Linear(embed_dim, intermediate_size)
self.c_proj = nn.Linear(intermediate_size, embed_dim)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(float(config.resid_dropout))
def forward(self, hidden_states):
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GPTNeoBlock(nn.Module):
def __init__(self, config, layer_id):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTNeoAttention(config, layer_id)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPTNeoMLP(inner_dim, config)
def forward(
self,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
class GPTNeoPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPTNeoConfig
load_tf_weights = load_tf_weights_in_gpt_neo
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTNeoBlock"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear,)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GPT_NEO_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT Neo Model transformer outputting raw hidden-states without any specific head on top.",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoModel(GPTNeoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embed_dim = config.hidden_size
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
self.drop = nn.Dropout(float(config.embed_dropout))
self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
@add_start_docstrings(
"""
The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = GPTNeoModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
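The `position_ids` trick above (cumulative sum of the attention mask, minus one, with padded slots filled by a dummy value) is worth seeing on concrete numbers, here for a left-padded row:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])       # two left-padding tokens
position_ids = attention_mask.long().cumsum(-1) - 1    # tensor([[-1, -1, 0, 1, 2]])
position_ids.masked_fill_(attention_mask == 0, 1)      # padded positions get a dummy value of 1
print(position_ids)                                    # tensor([[1, 1, 0, 1, 2]])
```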
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
    def forward(
        self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
lm_logits = lm_logits.to(torch.float32)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
lm_logits = lm_logits.to(hidden_states.dtype)
loss = loss.to(hidden_states.dtype)
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
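The note in the docstring that labels "are shifted inside the model" corresponds to the two `contiguous()` slices above; on toy tensors the alignment looks like this (logits at position `i` are scored against the token at position `i + 1`):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 10, 4
lm_logits = torch.randn(1, seq_len, vocab_size)
labels = torch.tensor([[2, 5, 7, 1]])

shift_logits = lm_logits[..., :-1, :].contiguous()    # predictions for positions 0..2
shift_labels = labels[..., 1:].contiguous()           # targets are tokens 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```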
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
"""
This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
[`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"""
The GPTNeo Model transformer with a sequence classification head on top (linear layer).
[`GPTNeoForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
"""
GPTNeo model for sequence classification tasks.
Inherits from `GPTNeoPreTrainedModel` and adds a linear classification layer for sequence classification.
"""
def __init__(self, config):
"""
Initializes the GPTNeoForSequenceClassification model.
Args:
config (:class:`~transformers.GPTNeoConfig`):
The configuration object that defines the model architecture.
Attributes:
num_labels (int):
Number of labels for sequence classification.
transformer (:class:`~transformers.GPTNeoModel`):
The GPTNeoModel transformer instance.
score (:class:`~torch.nn.Linear`):
Linear layer for computing scores for each label.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass of the GPTNeoForSequenceClassification model.
Args:
input_ids (torch.Tensor, optional):
The input token IDs. Shape [batch_size, sequence_length].
            past_key_values (Tuple[torch.FloatTensor], optional):
                Tuple with one entry per layer, each containing the cached key and value tensors from previous
                forward passes (used to speed up sequential decoding).
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens. Shape [batch_size, sequence_length].
token_type_ids (torch.Tensor, optional):
Segment token indices to differentiate sequences in batch. Shape [batch_size, sequence_length].
position_ids (torch.Tensor, optional):
Indices of positions of each input token in the sequence. Shape [batch_size, sequence_length].
head_mask (torch.Tensor, optional):
Mask to nullify selected heads of the attention modules. Shape [num_heads] or [num_layers, num_heads].
inputs_embeds (torch.Tensor, optional):
Optionally provided embeddings instead of input_ids. Shape [batch_size, sequence_length, hidden_size].
labels (torch.Tensor, optional):
Labels for computing the sequence classification loss. Shape [batch_size].
use_cache (bool, optional):
Whether to use cache for the attention mechanism.
output_attentions (bool, optional):
Whether to output the attentions tensors.
output_hidden_states (bool, optional):
Whether to output the hidden states tensors.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.SequenceClassifierOutputWithPast`.
Returns:
:class:`~transformers.file_utils.SequenceClassifierOutputWithPast`:
Sequence classifier output consisting of loss, logits, past key values, attentions, and hidden states.
"""
return super().forward(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
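The docstring above says classification is done on the last non-padding token of each row. As a minimal, hypothetical sketch (the `pad_token_id` and tensors below are made up for illustration, not taken from the library), the index of that token can be recovered from `input_ids` like this:

```python
import torch

# Hypothetical toy batch: pad_token_id and tensors are illustrative only.
pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0, 0], [3, 4, 6, 8, 2]])
logits = torch.randn(2, 5, 3)  # [batch_size, seq_len, num_labels]

# Index of the first pad token minus one; rows without padding wrap to the last position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

# Pick the logits of the last real token in each row.
pooled_logits = logits[torch.arange(logits.shape[0]), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```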
@add_start_docstrings(
"""
GPT Neo model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForTokenClassification(GPTNeoPreTrainedModel):
"""
GPTNeo model for token classification tasks.
Inherits from `GPTNeoPreTrainedModel` and adds a linear classification layer for token classification.
"""
def __init__(self, config):
"""
Initializes the GPTNeoForTokenClassification model.
Args:
config (:class:`~transformers.GPTNeoConfig`):
The configuration object that defines the model architecture.
Attributes:
num_labels (int):
Number of labels for token classification.
transformer (:class:`~transformers.GPTNeoModel`):
The GPTNeoModel transformer instance.
dropout (:class:`~torch.nn.Dropout`):
Dropout layer for regularization.
classifier (:class:`~torch.nn.Linear`):
Linear layer for computing scores for each token label.
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.dropout = nn.Dropout(config.classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="EleutherAI/gpt-neo-125m",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass of the GPTNeoForTokenClassification model.
Args:
input_ids (torch.Tensor, optional):
The input token IDs. Shape [batch_size, sequence_length].
past_key_values (Tuple[torch.FloatTensor], optional):
                Tuple of tuples (one per layer) containing the cached key and value tensors from earlier decoding steps.
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens. Shape [batch_size, sequence_length].
token_type_ids (torch.Tensor, optional):
Segment token indices to differentiate sequences in batch. Shape [batch_size, sequence_length].
position_ids (torch.Tensor, optional):
Indices of positions of each input token in the sequence. Shape [batch_size, sequence_length].
head_mask (torch.Tensor, optional):
Mask to nullify selected heads of the attention modules. Shape [num_heads] or [num_layers, num_heads].
inputs_embeds (torch.Tensor, optional):
Optionally provided embeddings instead of input_ids. Shape [batch_size, sequence_length, hidden_size].
labels (torch.Tensor, optional):
Labels for computing the token classification loss. Shape [batch_size, sequence_length].
use_cache (bool, optional):
Whether to use cache for the attention mechanism.
output_attentions (bool, optional):
Whether to output the attentions tensors.
output_hidden_states (bool, optional):
Whether to output the hidden states tensors.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.TokenClassifierOutput`.
Returns:
:class:`~transformers.file_utils.TokenClassifierOutput`:
Token classifier output consisting of loss, logits, attentions, and hidden states.
"""
return super().forward(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
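For reference, a hedged usage sketch of the token-classification head (the checkpoint name matches the `add_code_sample_docstrings` decorator above; the classifier layer itself is freshly initialized unless fine-tuned, so the predictions are arbitrary):

```python
import torch
from transformers import AutoTokenizer, GPTNeoForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = GPTNeoForTokenClassification.from_pretrained("EleutherAI/gpt-neo-125m")

inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # [batch_size, seq_len, num_labels]

# One predicted label id per input token.
print(logits.argmax(-1))
```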
@add_start_docstrings(
"""
The GPT-Neo Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPTNeoModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_CHECKPOINT_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
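A hedged end-to-end sketch of how the `start_logits`/`end_logits` produced above are typically turned into an answer span. The checkpoint name is only an assumption for illustration, and the QA head is randomly initialized unless fine-tuned:

```python
import torch
from transformers import AutoTokenizer, GPTNeoForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = GPTNeoForQuestionAnswering.from_pretrained("EleutherAI/gpt-neo-125m")

question, context = "Who wrote the book?", "The book was written by Ada."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Greedy span decoding: argmax over the start and end logits.
start = outputs.start_logits.argmax(-1).item()
end = outputs.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```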
.\models\gpt_neo\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {
"configuration_gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig", "GPTNeoOnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_gpt_neo"] = [
"GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTNeoForCausalLM",
"GPTNeoForQuestionAnswering",
"GPTNeoForSequenceClassification",
"GPTNeoForTokenClassification",
"GPTNeoModel",
"GPTNeoPreTrainedModel",
"load_tf_weights_in_gpt_neo",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_gpt_neo"] = [
"FlaxGPTNeoForCausalLM",
"FlaxGPTNeoModel",
"FlaxGPTNeoPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig, GPTNeoOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_gpt_neo import (
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTNeoForCausalLM,
GPTNeoForQuestionAnswering,
GPTNeoForSequenceClassification,
GPTNeoForTokenClassification,
GPTNeoModel,
GPTNeoPreTrainedModel,
load_tf_weights_in_gpt_neo,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
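The `_LazyModule` indirection above means nothing from `modeling_gpt_neo` (and therefore nothing from torch) is imported until an attribute is actually requested. A minimal sketch of what this enables:

```python
# Importing the config alone only pulls in configuration_gpt_neo; the heavy
# modeling submodules are resolved lazily, on first attribute access.
from transformers.models.gpt_neo import GPTNeoConfig

config = GPTNeoConfig()
print(config.model_type, config.num_layers)  # gpt_neo 24
```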
.\models\gpt_neox\configuration_gpt_neox.py
""" GPTNeoX 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
}
class GPTNeoXConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate a
    GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the default arguments will yield a configuration similar to that of the GPTNeoX
    [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    ```
    >>> from transformers import GPTNeoXConfig, GPTNeoXModel
    >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
    >>> configuration = GPTNeoXConfig()
    >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
    >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
    >>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
"""
model_type = "gpt_neox"
def __init__(
self,
vocab_size=50432,
hidden_size=6144,
num_hidden_layers=44,
num_attention_heads=64,
intermediate_size=24576,
hidden_act="gelu",
rotary_pct=0.25,
rotary_emb_base=10000,
attention_dropout=0.0,
hidden_dropout=0.0,
classifier_dropout=0.1,
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
bos_token_id=0,
eos_token_id=2,
tie_word_embeddings=False,
use_parallel_residual=True,
rope_scaling=None,
attention_bias=True,
**kwargs,
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
self.classifier_dropout = classifier_dropout
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.tie_word_embeddings = tie_word_embeddings
self.use_parallel_residual = use_parallel_residual
        self.attention_bias = attention_bias
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
)
def _rope_scaling_validation(self):
"""
        Validate that the `rope_scaling` configuration is well formed.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
.\models\gpt_neox\modeling_gpt_neox.py
""" PyTorch GPTNeoX 模型。"""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from ...activations import ACT2FN
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from .configuration_gpt_neox import GPTNeoXConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "trl-internal-testing/tiny-random-GPTNeoXForCausalLM"
_REAL_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neox-20b"
_CONFIG_FOR_DOC = "GPTNeoXConfig"
GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
"EleutherAI/gpt-neox-20b",
]
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
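A small worked example of what `_get_unpad_data` computes for a padded batch (the toy mask below is chosen purely for illustration):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 4, 5]) -> flat positions of the real (non-pad) tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> cumulative boundaries for flash_attn_varlen_func
```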
class GPTNeoXPreTrainedModel(PreTrainedModel):
"""
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
"""
config_class = GPTNeoXConfig
base_model_prefix = "gpt_neox"
supports_gradient_checkpointing = True
_no_split_modules = ["GPTNeoXLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class GPTNeoXAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.num_attention_heads = config.num_attention_heads
self.hidden_size = config.hidden_size
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size is not divisble by the number of attention heads! Make sure to update them"
)
self.head_size = self.hidden_size // self.num_attention_heads
self.rotary_ndims = int(self.head_size * config.rotary_pct)
self._init_bias(config.max_position_embeddings)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self._init_rope()
self.norm_factor = self.head_size ** -0.5
self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.attention_bias)
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
self.attention_dropout = nn.Dropout(config.attention_dropout)
self.is_causal = True
def _init_bias(self, max_positions, device=None):
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
if device is not None:
self.bias = self.bias.to(device)
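The `bias` buffer registered above is nothing more than a lower-triangular boolean matrix marking which key positions each query may attend to; for example, with `max_positions=4`:

```python
import torch

bias = torch.tril(torch.ones((4, 4), dtype=torch.bool)).view(1, 1, 4, 4)
print(bias[0, 0])
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])
```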
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = GPTNeoXRotaryEmbedding(
self.rotary_ndims, self.config.max_position_embeddings, base=self.config.rotary_emb_base
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = GPTNeoXLinearScalingRotaryEmbedding(
self.rotary_ndims,
self.config.max_position_embeddings,
base=self.config.rotary_emb_base,
scaling_factor=scaling_factor,
)
elif scaling_type == "dynamic":
self.rotary_emb = GPTNeoXDynamicNTKScalingRotaryEmbedding(
self.rotary_ndims,
self.config.max_position_embeddings,
base=self.config.rotary_emb_base,
scaling_factor=scaling_factor,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
position_ids: torch.LongTensor,
head_mask: Optional[torch.FloatTensor] = None,
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
padding_mask: Optional[torch.Tensor] = None,
):
has_layer_past = layer_past is not None
qkv = self.query_key_value(hidden_states)
new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
qkv = qkv.view(*new_qkv_shape)
query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
query_rot = query[..., : self.rotary_ndims]
query_pass = query[..., self.rotary_ndims :]
key_rot = key[..., : self.rotary_ndims]
key_pass = key[..., self.rotary_ndims :]
seq_len = key.shape[-2]
if has_layer_past:
seq_len += layer_past[0].shape[-2]
cos, sin = self.rotary_emb(value, seq_len=seq_len)
query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
query = torch.cat((query, query_pass), dim=-1)
key = torch.cat((key, key_pass), dim=-1)
if has_layer_past:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
present = (key, value) if use_cache else None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
attn_output = self.dense(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
@classmethod
def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Splits hidden dim into attn_head_size and num_attention_heads
"""
new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
tensor = tensor.view(new_shape)
tensor = tensor.permute(0, 2, 1, 3)
return tensor
@classmethod
def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
return tensor
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
batch_size, num_attention_heads, query_length, attn_head_size = query.size()
key_length = key.size(-2)
if key_length > self.bias.shape[-1]:
self._init_bias(key_length, device=key.device)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
attn_scores = torch.zeros(
batch_size * num_attention_heads,
query_length,
key_length,
dtype=query.dtype,
device=key.device,
)
attn_scores = torch.baddbmm(
attn_scores,
query,
key.transpose(1, 2),
beta=1.0,
alpha=self.norm_factor,
)
attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)
mask_value = torch.finfo(attn_scores.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device)
attn_scores = torch.where(causal_mask, attn_scores, mask_value)
if attention_mask is not None:
attn_scores = attn_scores + attention_mask
attn_weights = nn.functional.softmax(attn_scores, dim=-1)
attn_weights = attn_weights.to(value.dtype)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_weights = self.attention_dropout(attn_weights)
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
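Stripping away the reshaping and caching, the score computation in `_attn` boils down to a scaled dot product whose future (upper-triangular) entries are pushed to the dtype minimum before the softmax. A self-contained sketch with toy shapes:

```python
import torch

q = torch.randn(1, 1, 4, 8)  # [batch, heads, query_len, head_size]
k = torch.randn(1, 1, 4, 8)
v = torch.randn(1, 1, 4, 8)
norm_factor = 8 ** -0.5      # 1 / sqrt(head_size)

scores = torch.matmul(q, k.transpose(-1, -2)) * norm_factor
causal_mask = torch.tril(torch.ones(4, 4, dtype=torch.bool))
scores = scores.masked_fill(~causal_mask, torch.finfo(scores.dtype).min)

weights = torch.softmax(scores, dim=-1)
out = torch.matmul(weights, v)
print(weights[0, 0])  # row i only attends to positions <= i
```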
class GPTNeoXFlashAttention2(GPTNeoXAttention):
"""
    GPTNeoX flash attention module. This module inherits from `GPTNeoXAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
position_ids: torch.LongTensor,
head_mask: Optional[torch.FloatTensor] = None,
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
):
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
def attention_mask_func(attention_scores, ltor_mask):
attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min)
return attention_scores
class GPTNeoXRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def forward(self, x, seq_len=None):
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len],
self.sin_cached[:seq_len],
)
class GPTNeoXLinearScalingRotaryEmbedding(GPTNeoXRotaryEmbedding):
"""GPTNeoXRotaryEmbedding扩展,添加了线性缩放功能。鸣谢Reddit用户/u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
t = t / self.scaling_factor
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
class GPTNeoXDynamicNTKScalingRotaryEmbedding(GPTNeoXRotaryEmbedding):
"""GPTNeoXRotaryEmbedding扩展,增加了动态NTK缩放功能。由Reddit用户/u/bloc97和/u/emozilla贡献"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos(), persistent=False)
self.register_buffer("sin_cached", emb.sin(), persistent=False)
def rotate_half(x):
"""将输入的一半隐藏维度进行旋转。"""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""将旋转位置嵌入应用到查询和键张量上。"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
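A tiny numeric check of `rotate_half` and of the rotary update `q' = q*cos + rotate_half(q)*sin` (the helper is re-declared only so the snippet is self-contained):

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(rotate_half(q))  # tensor([-3., -4.,  1.,  2.])

theta = torch.tensor(0.5)
q_rot = q * torch.cos(theta) + rotate_half(q) * torch.sin(theta)
print(q_rot)  # the pairs (q0, q2) and (q1, q3) are each rotated by the angle theta
```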
class GPTNeoXMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
hidden_states = self.dense_h_to_4h(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dense_4h_to_h(hidden_states)
return hidden_states
class GPTNeoXLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.use_parallel_residual = config.use_parallel_residual
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
self.attention = GPT_NEOX_ATTENTION_CLASSES[config._attn_implementation](config)
self.mlp = GPTNeoXMLP(config)
def forward(
self,
hidden_states: Optional[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
layer_past: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
):
attention_layer_outputs = self.attention(
self.input_layernorm(hidden_states),
attention_mask=attention_mask,
position_ids=position_ids,
layer_past=layer_past,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attention_layer_outputs[0]
attn_output = self.post_attention_dropout(attn_output)
outputs = attention_layer_outputs[1:]
if self.use_parallel_residual:
mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
mlp_output = self.post_mlp_dropout(mlp_output)
hidden_states = mlp_output + attn_output + hidden_states
else:
attn_output = attn_output + hidden_states
mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
mlp_output = self.post_mlp_dropout(mlp_output)
hidden_states = mlp_output + attn_output
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
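Schematically, the two branches of `GPTNeoXLayer.forward` differ only in where the residual stream is read. A minimal sketch, with plain callables standing in for the sub-modules above:

```python
def parallel_residual(h, attn, mlp, ln1, ln2):
    # use_parallel_residual=True: attention and MLP both read the *same* input h.
    return h + attn(ln1(h)) + mlp(ln2(h))

def sequential_residual(h, attn, mlp, ln1, ln2):
    # use_parallel_residual=False: classic pre-LN, the MLP sees the attention output.
    a = h + attn(ln1(h))
    return a + mlp(ln2(a))
```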
GPT_NEOX_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~GPTNeoXConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT_NEOX_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare GPTNeoX Model transformer outputting raw hidden-states without any specific head on top.",
    GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXModel(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
self.emb_dropout = nn.Dropout(config.hidden_dropout)
self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_in
def set_input_embeddings(self, value):
self.embed_in = value
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        pass
@add_start_docstrings(
"""GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
)
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):
super().__init__(config)
self.gpt_neox = GPTNeoXModel(config)
self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.embed_out
def set_output_embeddings(self, new_embeddings):
self.embed_out = new_embeddings
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Model forward method for transformer-like models.
Args:
input_ids (Optional[torch.LongTensor]): Input token IDs.
attention_mask (Optional[torch.FloatTensor]): Mask to avoid performing attention on padding tokens.
position_ids (Optional[torch.LongTensor]): Position IDs for positional embeddings.
inputs_embeds (Optional[torch.FloatTensor]): Optional input embeddings directly provided instead of input_ids.
head_mask (Optional[torch.FloatTensor]): Mask for attention heads.
past_key_values (Optional[Tuple[Tuple[torch.FloatTensor]]]): Cached key-value states for fast autoregressive decoding.
labels (Optional[torch.LongTensor]): Target labels for training.
use_cache (Optional[bool]): Whether to use the cached past key-values for generation.
output_attentions (Optional[bool]): Whether to return attention weights.
output_hidden_states (Optional[bool]): Whether to return hidden states.
return_dict (Optional[bool]): Whether to return a dictionary.
        Returns:
            `CausalLMOutputWithPast` or `tuple`: the language-modeling loss (when `labels` is provided), the logits,
            past key values, hidden states, and attentions, depending on `return_dict`.
        """
pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
"""
Prepares inputs for generation by adjusting input_ids and other necessary tensors.
Args:
input_ids (torch.Tensor): Input token IDs.
past_key_values (Optional[Tuple[Tuple[torch.FloatTensor]]]): Cached key-value states from previous decoding steps.
attention_mask (torch.Tensor): Mask to avoid attending to padding tokens.
inputs_embeds (torch.Tensor): Optional input embeddings.
**kwargs: Additional keyword arguments.
Returns:
model_inputs (Dict[str, torch.Tensor]): Dictionary containing prepared model inputs.
"""
input_shape = input_ids.shape
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"position_ids": position_ids,
}
)
return model_inputs
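A worked example of the `position_ids` bookkeeping above for a left-padded prompt (toy mask for illustration):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])  # two pad tokens on the left

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)  # tensor([[1, 1, 0, 1, 2]]) -> the real tokens get positions 0, 1, 2
```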
def _reorder_cache(self, past_key_values, beam_idx):
"""
Reorders cached past key-values according to beam search index.
Args:
past_key_values (Tuple[Tuple[torch.FloatTensor]]): Cached key-value states.
beam_idx (torch.Tensor): Index tensor for reordering.
Returns:
reordered_past (Tuple[Tuple[torch.FloatTensor]]): Reordered past key-values.
"""
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
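What `_reorder_cache` does per cached tensor is a plain `index_select` on the batch/beam dimension; a toy illustration:

```python
import torch

past_key = torch.arange(4).view(4, 1, 1, 1).float()  # pretend [num_beams, heads, seq, head_dim]
beam_idx = torch.tensor([2, 2, 0, 1])                 # beams surviving this decoding step

print(past_key.index_select(0, beam_idx).flatten())   # tensor([2., 2., 0., 1.])
```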
"""
The GPTNeoX Model transformer with a sequence classification head on top (linear layer).
[`GPTNeoXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
The GPTNeoX Model transformer with a token classification head on top (linear layer).
This model uses the GPTNeoX architecture and adds a linear layer on top for token classification tasks.
It includes dropout and a linear classifier for the token classification layer.
    Since the classifier is applied to the hidden state of every position independently, no last-token pooling or
    `pad_token_id` lookup is needed, and `inputs_embeds` can be used in place of `input_ids` without restriction.
    Note that the documented code sample below uses the "LarsJonasson/pythia-410m-deduped-sft-swedish" checkpoint.
""",
GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.gpt_neox = GPTNeoXModel(config)
self.dropout = nn.Dropout(config.classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="LarsJonasson/pythia-410m-deduped-sft-swedish",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.gpt_neox(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
The GPT-NeoX Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
)
class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.gpt_neox = GPTNeoXModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.gpt_neox(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1).to(start_logits.device)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1).to(end_logits.device)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
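Finally, a small illustration of the clamping trick used in the span loss above: gold positions that fall outside the sequence are clamped to `ignored_index` (the sequence length) and then skipped by `CrossEntropyLoss(ignore_index=...)`. The tensors below are toy values for illustration only:

```python
import torch
from torch.nn import CrossEntropyLoss

start_logits = torch.randn(2, 6)         # [batch_size, seq_len]
start_positions = torch.tensor([3, 50])  # the second target lies outside the 6-token sequence

ignored_index = start_logits.size(1)                       # 6
start_positions = start_positions.clamp(0, ignored_index)  # tensor([3, 6])

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
print(loss_fct(start_logits, start_positions))  # only the first example contributes to the loss
```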