Transformers 源码解析(二十七)
.\models\clvp\number_normalizer.py
"""English Normalizer class for CLVP."""
import re
class EnglishNormalizer:
def __init__(self):
self._abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
self.teens = [
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
]
self.tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
def number_to_words(self, num: int) -> str:
"""
Converts numbers(`int`) to words(`str`).
Please note that it only supports up to - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
"""
if num == 0:
return "zero"
elif num < 0:
return "minus " + self.number_to_words(abs(num))
elif num < 10:
return self.ones[num]
elif num < 20:
return self.teens[num - 10]
elif num < 100:
return self.tens[num // 10] + ("-" + self.number_to_words(num % 10) if num % 10 != 0 else "")
elif num < 1000:
return (
self.ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100) if num % 100 != 0 else "")
)
elif num < 1_000_000:
return (
self.number_to_words(num // 1000)
+ " thousand"
+ (", " + self.number_to_words(num % 1000) if num % 1000 != 0 else "")
)
elif num < 1_000_000_000:
return (
self.number_to_words(num // 1_000_000)
+ " million"
+ (", " + self.number_to_words(num % 1_000_000) if num % 1_000_000 != 0 else "")
)
elif num < 1_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000)
+ " billion"
+ (", " + self.number_to_words(num % 1_000_000_000) if num % 1_000_000_000 != 0 else "")
)
elif num < 1_000_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000_000)
+ " trillion"
+ (", " + self.number_to_words(num % 1_000_000_000_000) if num % 1_000_000_000_000 != 0 else "")
)
elif num < 1_000_000_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000_000_000)
+ " quadrillion"
+ (
", " + self.number_to_words(num % 1_000_000_000_000_000)
if num % 1_000_000_000_000_000 != 0
else ""
)
)
else:
return "number out of range"
def convert_to_ascii(self, text: str) -> str:
"""
Converts unicode to ascii
"""
return text.encode("ascii", "ignore").decode("utf-8")
def _expand_dollars(self, m: str) -> str:
"""
This method is used to expand numerical dollar values into spoken words.
"""
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars"
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _remove_commas(self, m: str) -> str:
"""
This method is used to remove commas from sentences.
"""
return m.group(1).replace(",", "")
def _expand_decimal_point(self, m: str) -> str:
"""
This method is used to expand '.' into spoken word ' point '.
"""
return m.group(1).replace(".", " point ")
def _expand_ordinal(self, num: str) -> str:
"""
This method is used to expand ordinals such as '1st', '2nd' into spoken words.
"""
ordinal_suffixes = {1: "st", 2: "nd", 3: "rd"}
num = int(num.group(0)[:-2])
if 10 <= num % 100 and num % 100 <= 20:
suffix = "th"
else:
suffix = ordinal_suffixes.get(num % 10, "th")
return self.number_to_words(num) + suffix
def _expand_number(self, m: str) -> str:
"""
This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
link :
https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
"""
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + self.number_to_words(num % 100)
elif num % 100 == 0:
return self.number_to_words(num // 100) + " hundred"
else:
return self.number_to_words(num)
else:
return self.number_to_words(num)
这段代码包含了几个用于文本处理和数字转换的方法,每个方法都有详细的注释说明其功能和实现逻辑。
def normalize_numbers(self, text: str) -> str:
text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
return text
def expand_abbreviations(self, text: str) -> str:
for regex, replacement in self._abbreviations:
text = re.sub(regex, replacement, text)
return text
def collapse_whitespace(self, text: str) -> str:
return re.sub(re.compile(r"\s+"), " ", text)
def __call__(self, text):
text = self.convert_to_ascii(text)
text = text.lower()
text = self.normalize_numbers(text)
text = self.expand_abbreviations(text)
text = self.collapse_whitespace(text)
text = text.replace('"', "")
return text
.\models\clvp\processing_clvp.py
"""
Processor class for CLVP
"""
from ...processing_utils import ProcessorMixin
class ClvpProcessor(ProcessorMixin):
"""
Constructs a CLVP processor which wraps a CLVP Feature Extractor and a CLVP Tokenizer into a single processor.
[`ClvpProcessor`] offers all the functionalities of [`ClvpFeatureExtractor`] and [`ClvpTokenizer`]. See the
[`~ClvpProcessor.__call__`], [`~ClvpProcessor.decode`] and [`~ClvpProcessor.batch_decode`] for more information.
Args:
feature_extractor (`ClvpFeatureExtractor`):
An instance of [`ClvpFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`ClvpTokenizer`):
An instance of [`ClvpTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "ClvpFeatureExtractor"
tokenizer_class = "ClvpTokenizer"
model_input_names = [
"input_ids",
"input_features",
"attention_mask",
]
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, *args, **kwargs):
"""
Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
argument to [`~ClvpTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
information.
"""
raw_speech = kwargs.pop("raw_speech", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if raw_speech is None and text is None:
raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")
if raw_speech is not None:
inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif raw_speech is None:
return encodings
else:
inputs["input_ids"] = encodings["input_ids"]
inputs["attention_mask"] = encodings["attention_mask"]
return inputs
def batch_decode(self, *args, **kwargs):
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
return self.tokenizer.decode(*args, **kwargs)
.\models\clvp\tokenization_clvp.py
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNormalizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/vocab.json",
},
"merges_file": {
"clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"clvp_dev": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其对应的 Unicode 字符映射。避免将空白字符和控制字符映射到 BPE 代码会出错的地方。
可逆的 BPE 编码适用于 Unicode 字符串。这意味着如果要避免 UNKs,你需要在词汇表中包含大量的 Unicode 字符。
当你处理像 10B 令牌数据集时,你最终需要大约 5K 个 Unicode 字符才能获得良好的覆盖率。
这在你正常使用的 32K BPE 词汇表中占据了显著的比例。为了避免这种情况,我们需要 utf-8 字节和 Unicode 字符串之间的查找表。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
返回单词中的符号对集合。
单词被表示为符号元组(符号是长度可变的字符串)。
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class ClvpTokenizer(PreTrainedTokenizer):
"""
构建一个 CLVP 分词器。基于字节级字节对编码。
"""
pass
class CLVPTokenizer(PreTrainedTokenizer):
def __init__(
self,
vocab_file: str,
merges_file: str,
errors: str = "replace",
unk_token: str = "[UNK]",
bos_token: str = "<|endoftext|>",
eos_token: str = "[STOP]",
pad_token: str = "[STOP]",
add_prefix_space: bool = False,
add_bos_token: bool = False,
add_eos_token: bool = False,
):
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
)
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = [
"input_ids",
"attention_mask",
]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="[UNK]",
bos_token="<|endoftext|>",
eos_token="[STOP]",
pad_token="[STOP]",
add_prefix_space=False,
add_bos_token=False,
add_eos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self._normalizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
@property
def normalizer(self):
if self._normalizer is None:
self._normalizer = EnglishNormalizer()
return self._normalizer
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if not self.add_bos_token:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0))
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
text = self.normalizer(text)
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(
"[SPACE]" if bpe_token == "\u0120" and "[SPACE]" in self.encoder.keys() else bpe_token
for bpe_token in self.bpe(token).split(" ")
)
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) into a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def clean_up_tokenization(self, text):
text = "".join(text)
vocab_tokens = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
text = text.replace("[SPACE]", " ") if "[SPACE]" in vocab_tokens else text
text = text.replace("[STOP]", " ") if "[STOP]" in vocab_tokens else text
text = text.replace(self.unk_token, "").replace(" ", " ").replace(" ", " ")
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
.\models\clvp\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_clvp": [
"CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"ClvpConfig",
"ClvpDecoderConfig",
"ClvpEncoderConfig",
],
"feature_extraction_clvp": ["ClvpFeatureExtractor"],
"processing_clvp": ["ClvpProcessor"],
"tokenization_clvp": ["ClvpTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_clvp"] = [
"CLVP_PRETRAINED_MODEL_ARCHIVE_LIST",
"ClvpModelForConditionalGeneration",
"ClvpForCausalLM",
"ClvpModel",
"ClvpPreTrainedModel",
"ClvpEncoder",
"ClvpDecoder",
]
if TYPE_CHECKING:
from .configuration_clvp import (
CLVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
ClvpConfig,
ClvpDecoderConfig,
ClvpEncoderConfig,
)
from .feature_extraction_clvp import ClvpFeatureExtractor
from .processing_clvp import ClvpProcessor
from .tokenization_clvp import ClvpTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_clvp import (
CLVP_PRETRAINED_MODEL_ARCHIVE_LIST,
ClvpDecoder,
ClvpEncoder,
ClvpForCausalLM,
ClvpModel,
ClvpModelForConditionalGeneration,
ClvpPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\codegen\configuration_codegen.py
""" CodeGen model configuration"""
from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from ... import PreTrainedTokenizer, TensorType, is_torch_available
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import logging
logger = logging.get_logger(__name__)
CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Salesforce/codegen-350M-nl": "https://huggingface.co/Salesforce/codegen-350M-nl/resolve/main/config.json",
"Salesforce/codegen-350M-multi": "https://huggingface.co/Salesforce/codegen-350M-multi/resolve/main/config.json",
"Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/config.json",
"Salesforce/codegen-2B-nl": "https://huggingface.co/Salesforce/codegen-2B-nl/resolve/main/config.json",
"Salesforce/codegen-2B-multi": "https://huggingface.co/Salesforce/codegen-2B-multi/resolve/main/config.json",
"Salesforce/codegen-2B-mono": "https://huggingface.co/Salesforce/codegen-2B-mono/resolve/main/config.json",
"Salesforce/codegen-6B-nl": "https://huggingface.co/Salesforce/codegen-6B-nl/resolve/main/config.json",
"Salesforce/codegen-6B-multi": "https://huggingface.co/Salesforce/codegen-6B-multi/resolve/main/config.json",
"Salesforce/codegen-6B-mono": "https://huggingface.co/Salesforce/codegen-6B-mono/resolve/main/config.json",
"Salesforce/codegen-16B-nl": "https://huggingface.co/Salesforce/codegen-16B-nl/resolve/main/config.json",
"Salesforce/codegen-16B-multi": "https://huggingface.co/Salesforce/codegen-16B-multi/resolve/main/config.json",
"Salesforce/codegen-16B-mono": "https://huggingface.co/Salesforce/codegen-16B-mono/resolve/main/config.json",
}
class CodeGenConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the CodeGen
[Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration objects
"""
def __init__(
self,
**kwargs
):
super().__init__(**kwargs)
class CodeGenConfig(PretrainedConfig):
def __init__(
self,
vocab_size=50400,
n_positions=2048,
n_ctx=2048,
n_embd=4096,
n_layer=28,
n_head=16,
rotary_dim=64,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.0,
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=1e-05,
initializer_range=0.02,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
tie_word_embeddings=False,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_ctx = n_ctx
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.rotary_dim = rotary_dim
self.n_inner = n_inner if n_inner is not None else 4 * n_embd
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings
>>> from transformers import CodeGenConfig, CodeGenModel
>>> configuration = CodeGenConfig()
>>> model = CodeGenModel(configuration)
>>> configuration = model.config
```
class CodeGenOnnxConfig(OnnxConfigWithPast):
def __init__(
self,
config: PretrainedConfig,
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
if not getattr(self._config, "pad_token_id", None):
self._config.pad_token_id = 0
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_layers(self) -> int:
return self._config.n_layer
@property
def num_attention_heads(self) -> int:
return self._config.n_head
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
past_shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
.\models\codegen\modeling_codegen.py
""" PyTorch CodeGen model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_codegen import CodeGenConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/codegen-2B-mono"
_CONFIG_FOR_DOC = "CodeGenConfig"
CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/codegen-350M-nl",
"Salesforce/codegen-350M-multi",
"Salesforce/codegen-350M-mono",
"Salesforce/codegen-2B-nl",
"Salesforce/codegen-2B-multi",
"Salesforce/codegen-2B-mono",
"Salesforce/codegen-6B-nl",
"Salesforce/codegen-6B-multi",
"Salesforce/codegen-6B-mono",
"Salesforce/codegen-16B-nl",
"Salesforce/codegen-16B-multi",
"Salesforce/codegen-16B-mono",
]
def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)
def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
x1 = x[:, :, :, ::2]
x2 = x[:, :, :, 1::2]
x = torch.stack((-x2, x1), dim=-1)
return x.flatten(-2)
def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
return (tensor * cos) + (rotate_every_two(tensor) * sin)
class CodeGenAttention(nn.Module):
def __init__(self, config):
super().__init__()
max_positions = config.max_position_embeddings
self.register_buffer(
"causal_mask",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.embed_dim = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_attention_heads
if self.head_dim * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
f" `num_attention_heads`: {self.num_attention_heads})."
)
self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.rotary_dim = config.rotary_dim
pos_embd_dim = self.rotary_dim or self.embed_dim
self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)
def _split_heads(self, x, n_head, dim_head, mp_num):
reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
return reshaped
def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into n_ctx
"""
if len(tensor.shape) == 5:
tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
elif len(tensor.shape) == 4:
tensor = tensor.permute(0, 2, 1, 3).contiguous()
else:
raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
return tensor.view(new_shape)
def _attn(
self,
query,
key,
value,
attention_mask=None,
head_mask=None,
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length]
query = query.to(torch.float32)
key = key.to(torch.float32)
attn_weights = torch.matmul(query, key.transpose(-1, -2))
attn_weights = attn_weights / self.scale_attn
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.Softmax(dim=-1)(attn_weights)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
]:
qkv = self.qkv_proj(hidden_states)
mp_num = 4
qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))
local_dim = self.head_dim * self.num_attention_heads // mp_num
query, value, key = torch.split(qkv_split, local_dim, dim=-1)
query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)
value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
value = value.permute(0, 2, 1, 3)
embed_positions = self.embed_positions
if embed_positions.device != position_ids.device:
embed_positions = embed_positions.to(position_ids.device)
self.embed_positions = embed_positions
sincos = embed_positions[position_ids]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
if self.rotary_dim is not None:
k_rot = key[:, :, :, : self.rotary_dim]
k_pass = key[:, :, :, self.rotary_dim :]
q_rot = query[:, :, :, : self.rotary_dim]
q_pass = query[:, :, :, self.rotary_dim :]
k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
q_rot = apply_rotary_pos_emb(q_rot, sin, cos)
key = torch.cat([k_rot, k_pass], dim=-1)
query = torch.cat([q_rot, q_pass], dim=-1)
else:
key = apply_rotary_pos_emb(key, sin, cos)
query = apply_rotary_pos_emb(query, sin, cos)
key = key.permute(0, 2, 1, 3)
query = query.permute(0, 2, 1, 3)
if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key.to(hidden_states.dtype), value)
else:
present = None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
class CodeGenMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.n_embd
self.fc_in = nn.Linear(embed_dim, intermediate_size)
self.fc_out = nn.Linear(intermediate_size, embed_dim)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
hidden_states = self.fc_in(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.fc_out(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class CodeGenBlock(nn.Module):
def __init__(self, config):
super().__init__()
inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.attn = CodeGenAttention(config)
self.mlp = CodeGenMLP(inner_dim, config)
def forward(
self,
hidden_states: Optional[torch.FloatTensor],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states=hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = attn_output + feed_forward_hidden_states + residual
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
_skip_keys_device_placement = "past_key_values"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear,)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
CODEGEN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CODEGEN_INPUTS_DOCSTRING = r"""
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.
[What are position IDs?](../glossary
head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
# 是否返回一个`ModelOutput`对象而不是普通元组
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.",
CODEGEN_START_DOCSTRING,
)
"""
class CodeGenModel(CodeGenPreTrainedModel):
"""
Implementing a transformer model for code generation without additional task-specific heads.
Args:
config: The configuration class for model initialization.
Attributes:
embed_dim (int): Dimensionality of the embedding layer.
vocab_size (int): Size of the vocabulary.
wte (nn.Embedding): Embedding layer to convert input tokens to embeddings.
drop (nn.Dropout): Dropout layer for regularization.
h (nn.ModuleList): List of CodeGenBlock modules representing transformer layers.
ln_f (nn.LayerNorm): Layer normalization for final layer.
rotary_dim (int): Dimension for rotary position encodings.
gradient_checkpointing (bool): Whether to use gradient checkpointing during training.
Methods:
get_input_embeddings(): Returns the input embedding layer.
set_input_embeddings(new_embeddings): Sets new input embeddings for the model.
forward(...): Performs forward pass through the model with various input tensors.
"""
def __init__(self, config):
super().__init__(config)
self.embed_dim = config.n_embd
self.vocab_size = config.vocab_size
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([CodeGenBlock(config) for _ in range(config.n_layer)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
"""
The CodeGen Model transformer with a language modeling head on top.
""",
CODEGEN_START_DOCSTRING,
)
"""
class CodeGenForCausalLM(CodeGenPreTrainedModel):
"""
Extended transformer model for code generation with an added language modeling head.
Args:
config: The configuration class for model initialization.
Attributes:
transformer (CodeGenModel): Instance of the base CodeGenModel transformer.
lm_head (nn.Linear): Linear layer for language modeling predictions.
Methods:
get_output_embeddings(): Returns the output embedding layer.
set_output_embeddings(new_embeddings): Sets new output embeddings for the model.
"""
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = CodeGenModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
# 从 kwargs 中获取 token_type_ids,如果不存在则为 None
token_type_ids = kwargs.get("token_type_ids", None)
# 如果 past_key_values 不为 None,则执行以下操作
if past_key_values:
# 获取 past_key_values 的长度信息
past_length = past_key_values[0][0].shape[2]
# 如果 input_ids 的第二维度大于 past_length,则移除前缀长度为 past_length
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 否则默认行为:保留最后一个输入 ID
remove_prefix_length = input_ids.shape[1] - 1
# 截取 input_ids 的第二维度,保留从 remove_prefix_length 到结尾的部分
input_ids = input_ids[:, remove_prefix_length:]
# 如果 token_type_ids 不为 None,则截取与 input_ids 相同长度的部分
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
# 从 kwargs 中获取 attention_mask 和 position_ids
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
# 如果 attention_mask 不为 None 且 position_ids 为 None,则执行以下操作
if attention_mask is not None and position_ids is None:
# 动态生成 position_ids 以用于批量生成
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
# 如果 past_key_values 不为 None,则截取与 input_ids 相同长度的部分
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# 返回准备好的输入字典
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
@add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# 如果未指定 return_dict,则使用配置中的设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 Transformer 处理输入的各种参数,并获取输出
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取 Transformer 输出中的隐藏状态
hidden_states = transformer_outputs[0]
# 确保在 fp16 下的采样工作正常,并且在 fp32 下计算损失,以与 mesh-tf 版本保持一致
# 参考链接: https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
# 将隐藏状态传递给语言模型头部,转换为 torch.float32 类型
lm_logits = self.lm_head(hidden_states).to(torch.float32)
# 初始化损失为 None
loss = None
if labels is not None:
# 将标签移动到正确的设备上以启用模型并行处理
labels = labels.to(lm_logits.device)
# 移动 logits 以便让 tokens < n 预测 n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# 展平 tokens
loss_fct = CrossEntropyLoss()
# 计算损失
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# 将损失转换为与隐藏状态类型相同的类型
loss = loss.to(hidden_states.dtype)
# 如果不使用 return_dict,则输出的格式为元组
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果使用 return_dict,则返回 CausalLMOutputWithPast 类型的对象,包含损失、logits 和其他 Transformer 的输出
return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
"""
如果调用了 `PretrainedModel.beam_search` 或 `PretrainedModel.beam_sample`,则使用此函数重新排序 `past_key_values` 缓存。
这是为了确保在每一代生成步骤中,`past_key_values` 与正确的 `beam_idx` 匹配。
返回一个元组,其中包含重新排序后的 `past_key_values`,每个元素都是一个元组,每个元组包含一组 `torch.Tensor` 对象。
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
.\models\codegen\tokenization_codegen.py
"""Tokenization classes for CodeGen"""
import json
import os
from functools import lru_cache
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np
import regex as re
from ...utils import is_tf_available, is_torch_available, logging, to_py_obj
if TYPE_CHECKING:
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
},
"merges_file": {
"Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"Salesforce/codegen-350M-mono": 2048,
}
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class CodeGenTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
pad_token=None,
add_prefix_space=False,
add_bos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if is_split_into_words or add_prefix_space:
text = " " + text
return (text, kwargs)
def decode(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
truncate_before_pattern: Optional[List[str]] = None,
**kwargs,
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
A list of regular expression strings that will be used to truncate the returned string. This can be
used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`str`: The decoded sentence.
"""
token_ids = to_py_obj(token_ids)
decoded_text = super()._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
decoded_text = self.truncate(decoded_text, truncate_before_pattern)
return decoded_text
def truncate(self, completion, truncate_before_pattern):
"""
Truncates `completion` text based on specified `truncate_before_pattern`.
Args:
completion (str): The text to truncate.
truncate_before_pattern (List[str]): List of regular expressions to determine truncation points.
Returns:
str: The truncated text.
"""
def find_re(string, pattern, start_pos):
"""
Helper function to find the position of a pattern in a string.
Args:
string (str): The string to search within.
pattern (Pattern): The compiled regular expression pattern.
start_pos (int): The starting position of the search.
Returns:
int: The position of the pattern in the string, or -1 if not found.
"""
m = pattern.search(string, start_pos)
return m.start() if m else -1
terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
prints = list(re.finditer("^print", completion, re.MULTILINE))
if len(prints) > 1:
completion = completion[: prints[1].start()]
defs = list(re.finditer("^def", completion, re.MULTILINE))
if len(defs) > 1:
completion = completion[: defs[1].start()]
start_pos = 0
terminals_pos = [
pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
]
if len(terminals_pos) > 0:
return completion[: min(terminals_pos)]
else:
return completion
.\models\codegen\tokenization_codegen_fast.py
import json
import re
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np
from ...utils import is_tf_available, is_torch_available, logging
if TYPE_CHECKING:
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
from tokenizers import pre_tokenizers
from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_codegen import CodeGenTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
},
"merges_file": {
"Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
},
"tokenizer_file": {
"Salesforce/codegen-350M-mono": (
"https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"Salesforce/codegen-350M-mono": 2048,
}
class CodeGenTokenizerFast(PreTrainedTokenizerFast):
"""
构建一个“快速”CodeGen分词器(由HuggingFace的*tokenizers*库支持)。基于字节级的 Byte-Pair-Encoding。
这个分词器经过训练,将空格视为标记的一部分(类似于sentencepiece),因此一个单词的编码方式会因其是否位于句子开头而不同(没有空格或有空格):
```
>>> from transformers import CodeGenTokenizerFast
>>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
>>> tokenizer("Hello world")["input_ids"]
[15496, 995]
>>> tokenizer(" Hello world")["input_ids"]
[18435, 995]
```
如果在实例化分词器时传入 `add_prefix_space=True`,可以避免这种行为,但由于模型未以这种方式进行预训练,可能会降低性能。
<Tip>
当 `is_split_into_words=True` 时,需要使用 `add_prefix_space=True` 实例化这个分词器。
"""
This tokenizer inherits from `PreTrainedTokenizerFast` which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`, *optional*):
Path to the vocabulary file.
词汇表文件的路径。
merges_file (`str`, *optional*):
Path to the merges file.
merges 文件的路径。
tokenizer_file (`str`, *optional*):
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
tokenizers 文件的路径,通常为 .json 扩展名,包含加载 tokenizer 所需的全部信息。
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
未知标记。不在词汇表中的标记无法转换为 ID,并将设置为此标记。
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token.
序列的开始标记。
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
序列的结束标记。
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CodeGen tokenizer detect beginning of words by the preceding space).
是否在输入前添加一个初始空格。这样可以将前导单词视为任何其他单词。(CodeGen tokenizer 通过前导空格检测单词的开始)。
"""
# 定义常量,指定词汇表文件名列表
vocab_files_names = VOCAB_FILES_NAMES
# 预训练模型的词汇表文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 预训练模型的最大输入尺寸
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 模型输入的名称列表
model_input_names = ["input_ids", "attention_mask"]
# 慢速 tokenizer 的类为 CodeGenTokenizer
slow_tokenizer_class = CodeGenTokenizer
# 初始化方法
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs,
):
):
# 调用父类的构造函数,初始化一个新的实例
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
# 如果在参数 kwargs 中设置了 "add_bos_token",则抛出错误
if kwargs.pop("add_bos_token", False):
model_id = kwargs.pop("name_or_path", "")
raise ValueError(
"Currenty GPT2's fast tokenizer does NOT support adding a BOS token. "
"Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
"This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005."
" so that the fast tokenizer works correctly."
)
# 获取当前预处理器的状态并将其转换为 JSON 格式
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
# 如果预处理器的 "add_prefix_space" 参数与当前实例中的 add_prefix_space 不一致,则更新预处理器的状态
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
# 设置实例的 add_prefix_space 属性
self.add_prefix_space = add_prefix_space
# 重写父类的 _batch_encode_plus 方法,返回 BatchEncoding 对象
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
# 获取是否已经将输入拆分为单词的参数,默认为 False
is_split_into_words = kwargs.get("is_split_into_words", False)
# 断言如果 add_prefix_space 为 True 或者未将输入拆分为单词,则抛出错误
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
# 调用父类的 _batch_encode_plus 方法并返回结果
return super()._batch_encode_plus(*args, **kwargs)
# 重写父类的 _encode_plus 方法,返回 BatchEncoding 对象
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
# 获取是否已经将输入拆分为单词的参数,默认为 False
is_split_into_words = kwargs.get("is_split_into_words", False)
# 断言如果 add_prefix_space 为 True 或者未将输入拆分为单词,则抛出错误
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
# 调用父类的 _encode_plus 方法并返回结果
return super()._encode_plus(*args, **kwargs)
# 保存词汇表到指定目录
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 调用 Tokenizer 的 model.save 方法保存模型文件到指定目录,并返回文件名的元组
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
# 解码 token_ids 到原始文本
def decode(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
truncate_before_pattern: Optional[List[str]] = None,
**kwargs,
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
A list of regular expression strings that will be used to truncate the returned string. This can be
used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
`str`: The decoded sentence.
"""
# 使用继承自父类的方法 `decode` 对 token_ids 进行解码
decoded_text = super().decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# 如果指定了 `truncate_before_pattern`,则根据正则表达式列表进行截断
if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
decoded_text = self.truncate(decoded_text, truncate_before_pattern)
# 返回解码后的文本
return decoded_text
def truncate(self, completion, truncate_before_pattern):
# 内部函数,用于在字符串中查找正则表达式的位置
def find_re(string, pattern, start_pos):
m = pattern.search(string, start_pos)
return m.start() if m else -1
# 编译正则表达式列表为多行模式的正则对象
terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
# 查找代码字符串中以 "^print" 开头的所有位置
prints = list(re.finditer("^print", completion, re.MULTILINE))
# 如果找到多于一个 "^print" 开头的位置,则截断字符串到第二个 "^print" 之前
if len(prints) > 1:
completion = completion[: prints[1].start()]
# 查找代码字符串中以 "^def" 开头的所有位置
defs = list(re.finditer("^def", completion, re.MULTILINE))
# 如果找到多于一个 "^def" 开头的位置,则截断字符串到第二个 "^def" 之前
if len(defs) > 1:
completion = completion[: defs[1].start()]
start_pos = 0
# 查找代码字符串中所有 `truncate_before_pattern` 匹配的位置
terminals_pos = [
pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
]
# 如果找到任何一个 `truncate_before_pattern` 的位置,则截断字符串到最小的位置处
if len(terminals_pos) > 0:
return completion[: min(terminals_pos)]
else:
return completion
.\models\codegen\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenOnnxConfig"],
"tokenization_codegen": ["CodeGenTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_codegen_fast"] = ["CodeGenTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_codegen"] = [
"CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST",
"CodeGenForCausalLM",
"CodeGenModel",
"CodeGenPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenOnnxConfig
from .tokenization_codegen import CodeGenTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_codegen_fast import CodeGenTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_codegen import (
CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST,
CodeGenForCausalLM,
CodeGenModel,
CodeGenPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\code_llama\tokenization_code_llama.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-code-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-code-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-code-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
prefix_token="▁<PRE>",
middle_token="▁<MID>",
suffix_token="▁<SUF>",
eot_token="▁<EOT>",
fill_token="<FILL_ME>",
suffix_first=False,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
additional_special_tokens=None,
use_default_system_prompt=False,
**kwargs,
):
requires_backends(self, "protobuf")
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
self.use_default_system_prompt = use_default_system_prompt
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self._prefix_token = prefix_token
self._middle_token = middle_token
self._suffix_token = suffix_token
self._eot_token = eot_token
self.fill_token = fill_token
self.suffix_first = suffix_first
self.sp_model = self.get_spm_processor()
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
prefix_token=prefix_token,
middle_token=middle_token,
suffix_token=suffix_token,
eot_token=eot_token,
fill_token=fill_token,
sp_model_kwargs=self.sp_model_kwargs,
suffix_first=suffix_first,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
additional_special_tokens=additional_special_tokens,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
@property
def unk_token_length(self):
return len(self.sp_model.encode(str(self.unk_token)))
def get_spm_processor(self):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
with open(self.vocab_file, "rb") as f:
sp_model = f.read()
model_pb2 = import_protobuf()
model = model_pb2.ModelProto.FromString(sp_model)
normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec.add_dummy_prefix = False
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer
@property
def prefix_token(self):
return self._prefix_token
@property
def prefix_id(self):
if self._prefix_token is None:
return None
return self.convert_tokens_to_ids(self.prefix_token)
@property
def middle_token(self):
return self._middle_token
@property
def middle_id(self):
if self._middle_token is None:
return None
return self.convert_tokens_to_ids(self.middle_token)
@property
def suffix_token(self):
return self._suffix_token
@property
def suffix_id(self):
if self._suffix_token is None:
return None
return self.convert_tokens_to_ids(self.suffix_token)
@property
def eot_token(self):
return self._eot_token
@property
def eot_id(self):
if self._eot_token is None:
return None
return self.convert_tokens_to_ids(self.eot_token)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
if self.fill_token is not None and self.fill_token in prefix and suffix is None:
prefix, suffix = prefix.split(self.fill_token)
if len(prefix) > 0:
prefix = SPIECE_UNDERLINE + prefix.replace(SPIECE_UNDERLINE, " ")
if suffix is None or len(suffix) < 1:
tokens = super().tokenize(prefix, **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
prefix_tokens = self._tokenize(prefix)
if None in (self.prefix_id, self.middle_id, self.suffix_id):
raise ValueError(
"The input either includes a `prefix` and a `suffix` used for the infilling task,"
f" or can be split on the {self.fill_token} token, creating a suffix and prefix,"
" but the model does not support `infilling`."
)
suffix_tokens = self._tokenize(suffix)
suffix_first = suffix_first if suffix_first is not None else self.suffix_first
if suffix_first:
return [self.prefix_token, self.suffix_token] + suffix_tokens + [self.middle_token] + prefix_tokens
else:
return [self.prefix_token] + prefix_tokens + [self.suffix_token] + suffix_tokens + [self.middle_token]
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
if tokens[0].startswith(SPIECE_UNDERLINE):
tokens[0] = tokens[0][1:]
current_sub_tokens = []
out_string = ""
for _, token in enumerate(tokens):
if token in self.all_special_tokens:
out_string += self.sp_model.decode(current_sub_tokens) + token
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (
bos_token_id
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
@property
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
.\models\code_llama\tokenization_code_llama_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import normalizers, processors
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
from ...utils.versions import require_version
require_version("tokenizers>=0.13.3")
if is_sentencepiece_available():
from .tokenization_code_llama import CodeLlamaTokenizer
else:
CodeLlamaTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
SPIECE_UNDERLINE = "▁"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
"""
构建 Llama 快速分词器。基于字节级别的字节对编码。
这里特别使用了 ByteFallback 和没有规范化。
```
>>> from transformers import CodeLlamaTokenizerFast
>>> tokenizer = CodeLlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```
如果要更改 `bos_token` 或 `eos_token`,请在初始化模型时指定它们,或调用 `tokenizer.update_post_processor()` 来确保后处理正确执行
(否则编码序列的第一个标记和最后一个标记的值将不正确)。有关更多详细信息,请查看 [后处理器文档](https://huggingface.co/docs/tokenizers/api/post-processors)。
该分词器继承自 [`PreTrainedTokenizerFast`],其中包含大多数主要方法。用户应该
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class = CodeLlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
prefix_token="▁<PRE>",
middle_token="▁<MID>",
suffix_token="▁<SUF>",
eot_token="▁<EOT>",
fill_token="<FILL_ME>",
additional_special_tokens=None,
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
**kwargs,
):
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []
self.use_default_system_prompt = use_default_system_prompt
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
additional_special_tokens=additional_special_tokens,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
prefix_token=prefix_token,
middle_token=middle_token,
suffix_token=suffix_token,
eot_token=eot_token,
fill_token=fill_token,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self._add_bos_token = add_bos_token
self._add_eos_token = add_eos_token
self.update_post_processor()
self.vocab_file = vocab_file
self._prefix_token = prefix_token
self._middle_token = middle_token
self._suffix_token = suffix_token
self._eot_token = eot_token
self.fill_token = fill_token
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
"""
bos = self.bos_token
bos_token_id = self.bos_token_id
if bos is None and self.add_bos_token:
raise ValueError("add_bos_token = True but bos_token = None")
eos = self.eos_token
eos_token_id = self.eos_token_id
if eos is None and self.add_eos_token:
raise ValueError("add_eos_token = True but eos_token = None")
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
special_tokens = []
if self.add_bos_token:
special_tokens.append((bos, bos_token_id))
if self.add_eos_token:
special_tokens.append((eos, eos_token_id))
self._tokenizer.post_processor = processors.TemplateProcessing(
single=single, pair=pair, special_tokens=special_tokens
)
@property
def prefix_token(self):
return self._prefix_token
@property
def prefix_id(self):
if self._prefix_token is None:
return None
return self.convert_tokens_to_ids(self.prefix_token)
@property
def middle_token(self):
return self._middle_token
@property
def middle_id(self):
if self._middle_token is None:
return None
return self.convert_tokens_to_ids(self.middle_token)
@property
def suffix_token(self):
return self._suffix_token
@property
def suffix_id(self):
if self._suffix_token is None:
return None
return self.convert_tokens_to_ids(self.suffix_token)
@property
def eot_id(self):
if self._eot_token is None:
return None
return self.convert_tokens_to_ids(self.eot_token)
@property
def eot_token(self):
return self._eot_token
@property
def add_eos_token(self):
return self._add_eos_token
@property
def add_bos_token(self):
return self._add_bos_token
@add_eos_token.setter
def add_eos_token(self, value):
self._add_eos_token = value
self.update_post_processor()
@add_bos_token.setter
def add_bos_token(self, value):
self._add_bos_token = value
self.update_post_processor()
def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
"""
Updates the normalizer to ensure the prompt format for `infilling` is respected. The infilling format is as follows:
if `suffix_first`:
" <PRE> <SUF>{suf} <MID> {pre}"
else:
" <PRE> {pre} <SUF>{suf} <MID>"
If `reset` is `True`, resets `normalizer` and `post_processor` to their default behaviors:
normalizer adds a prefix space, post_processor adds a `bos_token`.
Args:
reset (bool): Indicates whether to reset the processors.
suffix_first (bool, optional): Flag indicating the order of suffix and prefix in the format.
add_special_tokens (bool, optional): Whether to add special tokens.
Returns:
None
"""
if reset:
self._tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Prepend(prepend="▁"),
normalizers.Replace(pattern=" ", content="▁"),
]
)
self.update_post_processor()
return
self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
if suffix_first:
pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
special_tokens += [
(self.prefix_token, self.prefix_id),
(self.suffix_token, self.suffix_id),
(self.middle_token, self.middle_id),
]
else:
pair += [self.prefix_token, "$A", self.suffix_token, "$B", self.middle_token]
special_tokens += [
(self.prefix_token, self.prefix_id),
(self.suffix_token, self.suffix_id),
(self.middle_token, self.middle_id),
]
if self.add_eos_token and add_special_tokens:
pair += [self.eos_token]
special_tokens += [(self.eos_token, self.eos_token_id)]
self._tokenizer.post_processor = processors.TemplateProcessing(
single="$A", pair=pair, special_tokens=special_tokens
)
def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_tokens=True, **kwargs):
text_pair = kwargs.pop("suffix", text_pair)
if self.fill_token is not None and self.fill_token in text and text_pair is None:
text, text_pair = text.split(self.fill_token)
if text_pair is None or len(text_pair) < 1:
return super().encode_plus(text, text_pair, add_special_tokens=add_special_tokens, **kwargs)
if None in (self.prefix_id, self.middle_id, self.suffix_id):
raise ValueError(
"Then input includes a `prefix` and a `suffix` used for the infilling task,"
" the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
)
self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=add_special_tokens)
tokens = super().encode_plus(" " + text, text_pair=text_pair, add_special_tokens=True, **kwargs)
self.set_infilling_processor(True)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
@property
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
def set_lang(self, src_lang_code: int, tgt_lang_code: int = None) -> List[int]:
"""
通过连接和添加特殊标记,从序列或序列对构建用于序列分类任务的模型输入。特殊标记取决于调用 set_lang。
对于 NLLB 序列,其格式如下,其中 `X` 表示序列:
- `input_ids`(用于编码器):`X [eos, src_lang_code]`
- `decoder_input_ids`(用于解码器):`X [eos, tgt_lang_code]`
BOS 永远不会被使用。序列对不是预期的使用情况,但将会在没有分隔符的情况下处理。
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个 ID 列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入 ID 列表。
"""
if token_ids_1 is None:
return self.bos_token_id + token_ids_0 + self.eos_token_id
return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id
.\models\code_llama\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_code_llama"] = ["CodeLlamaTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_code_llama_fast"] = ["CodeLlamaTokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_code_llama import CodeLlamaTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_code_llama_fast import CodeLlamaTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\cohere\configuration_cohere.py
""" Cohere model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class CohereConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
model according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
```
>>> from transformers import CohereModel, CohereConfig
>>> # Initializing a Cohere model configuration
>>> configuration = CohereConfig()
>>> # Initializing a model from the Cohere configuration
>>> model = CohereModel(configuration) # doctest: +SKIP
>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
"""
model_type = "cohere"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=256000,
hidden_size=8192,
intermediate_size=22528,
logit_scale=0.0625,
num_hidden_layers=40,
num_attention_heads=64,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=8192,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
pad_token_id=0,
bos_token_id=5,
eos_token_id=255001,
tie_word_embeddings=True,
rope_theta=10000.0,
attention_bias=False,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.logit_scale = logit_scale
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings
self.rope_theta = rope_theta
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.logit_scale = logit_scale
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
.\models\cohere\modeling_cohere.py
class CohereLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-5, bias=False):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
mean = hidden_states.mean(-1, keepdim=True)
variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
hidden_states = self.weight.to(torch.float32) * hidden_states
if self.bias is not None:
hidden_states = hidden_states + self.bias.to(torch.float32)
return hidden_states.to(input_dtype)
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
class CohereRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
super().__init__()
self.scaling_factor = scaling_factor
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
@torch.no_grad()
def forward(self, x, position_ids):
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.repeat_interleave(freqs, 2, dim=-1)
cos = emb.cos()
sin = emb.sin()
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
return rot_x
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`, *optional*):
Deprecated and unused.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
# 将 cos 张量在指定维度上扩展维度
cos = cos.unsqueeze(unsqueeze_dim)
# 将 sin 张量在指定维度上扩展维度
sin = sin.unsqueeze(unsqueeze_dim)
# 计算查询向量的旋转位置嵌入,使用 cos 和 sin 对应元素加权
q_embed = (q * cos) + (rotate_half(q) * sin)
# 计算键向量的旋转位置嵌入,使用 cos 和 sin 对应元素加权
k_embed = (k * cos) + (rotate_half(k) * sin)
# 返回旋转后的查询向量和键向量作为元组
return q_embed, k_embed
# 从 transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere 复制过来的类,用于定义 CohereMLP 模型
class CohereMLP(nn.Module):
# 初始化函数,接受一个配置对象 config 作为参数
def __init__(self, config):
super().__init__()
# 将配置对象保存到实例中
self.config = config
# 设置隐藏层大小为配置中的 hidden_size
self.hidden_size = config.hidden_size
# 设置中间层大小为配置中的 intermediate_size
self.intermediate_size = config.intermediate_size
# 创建一个线性层,用于门控投影,输入维度为 hidden_size,输出维度为 intermediate_size,没有偏置项
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
# 创建一个线性层,用于上投影,输入维度为 hidden_size,输出维度为 intermediate_size,没有偏置项
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
# 创建一个线性层,用于下投影,输入维度为 intermediate_size,输出维度为 hidden_size,没有偏置项
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
# 根据配置中的激活函数类型选择对应的激活函数,保存到实例中
self.act_fn = ACT2FN[config.hidden_act]
# 前向传播函数,接受输入张量 x
def forward(self, x):
# 计算门控投影后的结果,应用激活函数,再与上投影结果相乘,然后应用下投影
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# 返回下投影的结果
return down_proj
# 从 transformers.models.llama.modeling_llama.repeat_kv 复制过来的函数
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
这是等效于 torch.repeat_interleave(x, dim=1, repeats=n_rep) 的函数。
将隐藏状态从 (batch, num_key_value_heads, seqlen, head_dim) 扩展为 (batch, num_attention_heads, seqlen, head_dim)
"""
# 获取隐藏状态张量的形状信息
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
# 如果 n_rep 等于 1,直接返回原始隐藏状态张量
if n_rep == 1:
return hidden_states
# 扩展隐藏状态张量的维度,重复 n_rep 次
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
# 重新整形张量,以得到期望的形状 (batch, num_attention_heads * n_rep, seqlen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
# 从 transformers.models.llama.modeling_llama.LlamaAttention Llama->Cohere 复制过来的类,用于定义 CohereAttention 模型
class CohereAttention(nn.Module):
"""来自 'Attention Is All You Need' 论文的多头注意力机制"""
# 类的文档字符串,描述其为来自论文的多头注意力机制
# 初始化函数,用于创建一个新的实例
def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
# 调用父类的初始化函数
super().__init__()
# 将传入的配置信息和层索引保存到对象中
self.config = config
self.layer_idx = layer_idx
# 如果未传入层索引,则记录警告信息,建议在使用缓存时传入层索引以避免错误
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
# 从配置中获取注意力机制的丢弃率、隐藏层大小、注意力头数等信息
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
# 检查隐藏层大小是否可以被注意力头数整除,否则抛出数值错误
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
# 初始化线性投影层,用于将隐藏状态映射到注意力头和维度上
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
# 初始化旋转嵌入,这是一个辅助函数,用于增强模型在序列位置上的表示能力
self._init_rope()
# 辅助函数,用于初始化旋转嵌入
def _init_rope(self):
self.rotary_emb = CohereRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
# 前向传播函数,接受输入的隐藏状态并计算输出
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# 获取输入张量的尺寸信息
bsz, q_len, _ = hidden_states.size()
# 使用线性投影层对隐藏状态进行变换,得到查询、键、值的状态
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# 将查询、键、值的状态按照指定维度重新组织,并进行维度转置
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
# 获取已存储的过去键值对信息,若存在则更新当前键值对状态
past_key_value = getattr(self, "past_key_value", past_key_value)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# 对于已存储的过去键值对,根据 RoPE 模型的特定要求进行更新
# 需要提供 sin 和 cos 参数以及缓存位置信息
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# 将键值对状态分组复制以便多头注意力计算
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
# 计算注意力权重
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
# 如果存在注意力掩码,则将其应用于注意力权重
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# 将注意力权重归一化并进行 dropout 处理
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
# 检查注意力输出的尺寸是否符合预期
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
# 调整注意力输出张量的维度顺序,并保证连续性
attn_output = attn_output.transpose(1, 2).contiguous()
# 将注意力输出张量重新调整为隐藏状态的形状
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
# 使用输出投影层处理最终的注意力输出
attn_output = self.o_proj(attn_output)
# 如果不需要输出注意力权重,则将其置为 None
if not output_attentions:
attn_weights = None
# 返回注意力输出张量、注意力权重张量以及更新后的过去键值对信息(如果有)
return attn_output, attn_weights, past_key_value
# 从 `transform
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
# Determine if causal masking is required based on the configuration
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# Temporary workaround for an issue with Flash Attention on RoCm
# Remove this check once the issue is resolved in future versions
causal = self.is_causal and query_length != 1
# Check if there are any padding tokens in the input sequence
if attention_mask is not None:
batch_size = query_states.shape[0]
# Unpad the input based on the attention mask
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
# Compute attention scores for variable-length sequences
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
# Pad the computed attention scores to match the original input length
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
# Compute attention scores without considering padding
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
# 从注意力掩码中获取未填充的数据的索引、当前序列长度和批次中最大序列长度
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
# 获取输入张量 key_layer 的形状信息
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
# 根据获取的索引重新排列 key_layer 张量的第一个轴,以对应未填充的数据
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
# 根据获取的索引重新排列 value_layer 张量的第一个轴,以对应未填充的数据
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
# 根据查询长度调整查询层的数据
if query_length == kv_seq_len:
# 若查询长度等于 key_value 序列长度,则根据获取的索引重新排列查询层的数据
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
# 更新当前查询序列长度和批次中最大查询序列长度
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
# 若查询长度为1,则对应的当前查询序列长度为1,且索引为批次索引
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # 这里有一个内存拷贝,这样做效率很低。
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# 如果查询长度不等于 key_value 序列长度且不等于1,则假设存在左填充情况,根据注意力掩码和查询层调整输入数据
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
# 返回调整后的查询层、键层、值层、查询索引、当前序列长度元组和最大序列长度元组
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
# 定义一个名为 CohereSdpaAttention 的类,它继承自 CohereAttention 类
# 该类用于实现 Cohere 模型的自注意力机制,使用 torch.nn.functional.scaled_dot_product_attention
class CohereSdpaAttention(CohereAttention):
"""
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`CohereAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
# 重写 forward 方法,适应 SDPA API
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
):
# forward 方法的具体实现在子类中完成,本处不直接提供具体实现
pass
# 定义一个字典 COHERE_ATTENTION_CLASSES,包含不同的 CohereAttention 类型作为值,以字符串为键
COHERE_ATTENTION_CLASSES = {
"eager": CohereAttention,
"flash_attention_2": CohereFlashAttention2,
"sdpa": CohereSdpaAttention, # 将 CohereSdpaAttention 类作为 sdpa 类型的实现
}
class CohereDecoderLayer(nn.Module):
# CohereDecoderLayer 类,继承自 nn.Module
def __init__(self, config: CohereConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
# 初始化 self_attn 属性,根据配置选择不同的注意力机制实现类
self.self_attn = COHERE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
# 初始化 mlp 属性为 CohereMLP 类的实例
self.mlp = CohereMLP(config)
# 初始化 input_layernorm 属性为 CohereLayerNorm 类的实例,用于层归一化
self.input_layernorm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义 forward 方法,实现模型的前向传播
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
):
# forward 方法的具体实现在子类中完成,本处不直接提供具体实现
pass
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*):
attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
query_sequence_length, key_sequence_length)` if default attention is used.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
if "padding_mask" in kwargs:
# 发出警告信息,提示 `padding_mask` 参数已经不推荐使用
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
# 将输入状态作为残差连接的起始点
residual = hidden_states
# Layer normalization,对输入状态进行归一化处理
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
# 调用自注意力机制进行处理,获取注意力加权的输出、注意力权重及可能的过去键值状态
hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
# Fully Connected Layer
# 通过多层感知机(MLP)进行全连接层的处理
hidden_states_mlp = self.mlp(hidden_states)
# 将残差、注意力加权输出和MLP输出相加得到最终的隐藏状态表示
hidden_states = residual + hidden_states_attention + hidden_states_mlp
# 构建输出元组
outputs = (hidden_states,)
# 如果需要返回注意力权重,则添加到输出元组中
if output_attentions:
outputs += (self_attn_weights,)
# 如果需要返回缓存状态,则添加到输出元组中
if use_cache:
outputs += (present_key_value,)
# 返回最终的输出元组
return outputs
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`CohereConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
COHERE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Cohere
class CoherePreTrainedModel(PreTrainedModel):
config_class = CohereConfig # 设置模型配置类为CohereConfig
base_model_prefix = "model" # 模型基本前缀为"model"
supports_gradient_checkpointing = True # 支持梯度检查点
_no_split_modules = ["CohereDecoderLayer"] # 不分割的模块列表,包括"CohereDecoderLayer"
_skip_keys_device_placement = ["past_key_values"] # 跳过设备放置的键列表,包括"past_key_values"
_supports_flash_attn_2 = True # 支持flash attention 2
_supports_sdpa = True # 支持sdpa
_supports_cache_class = True # 支持缓存类操作
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std) # 初始化线性层权重为正态分布
if module.bias is not None:
module.bias.data.zero_() # 如果有偏置,初始化为零
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) # 初始化嵌入层权重为正态分布
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_() # 如果有填充索引,对应位置初始化为零
def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = None):
if self.config._attn_implementation == "flash_attention_2" and cache_cls == StaticCache:
raise ValueError(
"`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
"make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
)
for layer in self.model.layers:
device = layer.input_layernorm.weight.device
if hasattr(self.config, "_pre_quantization_dtype"):
dtype = self.config._pre_quantization_dtype
else:
dtype = layer.self_attn.o_proj.weight.dtype
layer.self_attn.past_key_value = cache_cls(
self.config, max_batch_size, max_cache_len, device=device, dtype=dtype
)
def _reset_cache(self):
for layer in self.model.layers:
layer.self_attn.past_key_value = None
# CohereModel 类定义,继承自 CoherePreTrainedModel
class CohereModel(CoherePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
Args:
config: CohereConfig
"""
# 初始化函数,接受一个 config 对象作为参数
def __init__(self, config: CohereConfig):
# 调用父类的初始化函数
super().__init__(config)
# 将配置中的 pad_token_id 赋给当前实例的 padding_idx 属性
self.padding_idx = config.pad_token_id
# 将配置中的 vocab_size 赋给当前实例的 vocab_size 属性
self.vocab_size = config.vocab_size
# 创建一个词嵌入层,参数为 vocab_size(词汇表大小)、hidden_size(隐藏层大小)、padding_idx(填充标记的索引)
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
# 使用列表推导式创建包含多个 CohereDecoderLayer 对象的层列表,数量为 config.num_hidden_layers
self.layers = nn.ModuleList(
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
# 创建一个 CohereLayerNorm 层,参数为 hidden_size(隐藏层大小)、eps(层归一化的 epsilon 值)
self.norm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 设置梯度检查点为 False
self.gradient_checkpointing = False
# 创建一个因果遮罩(causal mask),用于区分因果和填充遮罩的创建。注意:这不利于 TorchScript、ONNX 和大型 max_position_embeddings 的序列化导出。
# causal_mask 是一个二维的布尔类型张量,形状为 (config.max_position_embeddings, config.max_position_embeddings)
causal_mask = torch.full(
(config.max_position_embeddings, config.max_position_embeddings), fill_value=True, dtype=torch.bool
)
# 将上三角矩阵部分设置为 False,保留下三角矩阵和对角线为 True
self.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False)
# 调用初始化后的处理函数
self.post_init()
# 返回词嵌入层 embed_tokens
def get_input_embeddings(self):
return self.embed_tokens
# 设置词嵌入层 embed_tokens 的值
def set_input_embeddings(self, value):
self.embed_tokens = value
# forward 函数重写,对模型进行前向传播
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
):
# TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
# KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
# (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
# `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
# 从 transformers.models.llama.modeling_llama.LlamaForCausalLM 复制而来,将 Llama 替换为 Cohere
class CohereForCausalLM(CoherePreTrainedModel):
# 定义与 lm_head 权重相关的键列表,用于权重共享
_tied_weights_keys = ["lm_head.weight"]
# 初始化方法,接受一个配置对象 config
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 创建 CohereModel 的实例,并保存到 self.model 中
self.model = CohereModel(config)
# 从 config 中获取词汇表大小,并保存到 self.vocab_size 中
self.vocab_size = config.vocab_size
# 创建一个线性层,用于 LM 头部,将隐藏大小转换为词汇表大小,无偏置
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 从 config 中获取 logit_scale,并保存到 self.logit_scale 中
self.logit_scale = config.logit_scale
# 从 config 中获取 tie_word_embeddings,并保存到 self.tie_word_embeddings 中
self.tie_word_embeddings = config.tie_word_embeddings
# 调用后处理初始化方法
self.post_init()
# 返回模型的输入嵌入层对象
def get_input_embeddings(self):
return self.model.embed_tokens
# 设置模型的输入嵌入层对象为指定的值
def set_input_embeddings(self, value):
self.model.embed_tokens = value
# 返回模型的输出嵌入层对象
def get_output_embeddings(self):
return self.lm_head
# 设置模型的输出嵌入层对象为指定的新嵌入层
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 设置解码器为指定的 decoder 对象
def set_decoder(self, decoder):
self.model = decoder
# 获取当前使用的解码器对象
def get_decoder(self):
return self.model
# 前向传播方法,接收多个输入参数,详见装饰器中的 COHERE_INPUTS_DOCSTRING 描述
# 返回类型为 CausalLMOutputWithPast,详见配置类 _CONFIG_FOR_DOC
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
):
# 前向传播逻辑在后续实现中定义,具体实现细节参考具体模型文档
# 为生成准备输入数据的静态方法,处理生成需要的输入参数
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
):
# 输入数据准备逻辑在后续实现中定义,具体实现细节参考具体模型文档
# 静态方法,重新排序缓存中的过去键值,以适应 beam search 中的索引变化
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\cohere\tokenization_cohere_fast.py
import pickle
from typing import Dict, List, Literal, Union
from tokenizers import processors
from ...pipelines.conversational import Conversation
from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from ...utils.versions import require_version
require_version("tokenizers>=0.13.3")
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"tokenizer_file": {
"Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json",
},
}
DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere."
DEFAULT_RAG_PREAMBLE = """## Task and Context
You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
## Style Guide
Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."""
class CohereTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding.
This uses notably ByteFallback and NFC normalization.
```
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
>>> tokenizer.encode("Hello this is a test")
[5, 28339, 2075, 1801, 1671, 3282]
```
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
[post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = None
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<UNK>",
bos_token="<BOS_TOKEN>",
eos_token="<|END_OF_TURN_TOKEN|>",
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
add_prefix_space=False,
**kwargs,
):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = None
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<UNK>",
bos_token="<BOS_TOKEN>",
eos_token="<|END_OF_TURN_TOKEN|>",
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
add_prefix_space=False,
**kwargs,
):
):
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
use_default_system_prompt=use_default_system_prompt,
add_prefix_space=add_prefix_space,
**kwargs,
)
self._add_bos_token = add_bos_token
self._add_eos_token = add_eos_token
self.update_post_processor()
self.use_default_system_prompt = use_default_system_prompt
self.vocab_file = vocab_file
self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
self.tool_use_template = kwargs.pop("tool_use_template", None)
pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
if not (self.add_prefix_space or not is_split_into_words):
raise Exception(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
" pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
if not (self.add_prefix_space or not is_split_into_words):
raise Exception(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
" pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def update_post_processor(self):
"""
更新底层的后处理器,使用当前的 `bos_token` 和 `eos_token`。
"""
bos = self.bos_token
bos_token_id = self.bos_token_id
if bos is None and self.add_bos_token:
raise ValueError("add_bos_token = True but bos_token = None")
eos = self.eos_token
eos_token_id = self.eos_token_id
if eos is None and self.add_eos_token:
raise ValueError("add_eos_token = True but eos_token = None")
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
special_tokens = []
if self.add_bos_token:
special_tokens.append((bos, bos_token_id))
if self.add_eos_token:
special_tokens.append((eos, eos_token_id))
self._tokenizer.post_processor = processors.TemplateProcessing(
single=single, pair=pair, special_tokens=special_tokens
)
@property
def add_eos_token(self):
return self._add_eos_token
@property
def add_bos_token(self):
return self._add_bos_token
@add_eos_token.setter
def add_eos_token(self, value):
self._add_eos_token = value
self.update_post_processor()
@add_bos_token.setter
def add_bos_token(self, value):
self._add_bos_token = value
self.update_post_processor()
@property
def apply_tool_use_template(
self,
conversation: Union[List[Dict[str, str]], "Conversation"],
tools: List[Dict],
**kwargs,
):
"""
应用工具使用模板到给定对话和工具列表中的工具。
"""
def apply_grounded_generation_template(
self,
conversation: Union[List[Dict[str, str]], "Conversation"],
documents: List[Dict],
citation_mode: Literal["fast", "accurate"] = "accurate",
**kwargs,
):
"""
应用基于文档生成的模板到给定对话和文档列表中的文档。
"""
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
构建包含特殊标记的输入序列。
Args:
- token_ids_0: 第一个输入序列的 token IDs
- token_ids_1: 可选,第二个输入序列的 token IDs
Returns:
- 包含特殊标记的输入序列
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
.\models\cohere\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_cohere": ["COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP", "CohereConfig"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_cohere_fast"] = ["CohereTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_cohere"] = [
"CohereForCausalLM",
"CohereModel",
"CoherePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_cohere import COHERE_PRETRAINED_CONFIG_ARCHIVE_MAP, CohereConfig
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_cohere_fast import CohereTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_cohere import (
CohereForCausalLM,
CohereModel,
CoherePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\conditional_detr\configuration_conditional_detr.py
""" Conditional DETR model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/conditional-detr-resnet-50": (
"https://huggingface.co/microsoft/conditional-detr-resnet-50/resolve/main/config.json"
),
}
class ConditionalDetrConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ConditionalDetrModel`]. It is used to instantiate
a Conditional DETR model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Conditional DETR
[microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import ConditionalDetrConfig, ConditionalDetrModel
>>> # Initializing a Conditional DETR microsoft/conditional-detr-resnet-50 style configuration
>>> configuration = ConditionalDetrConfig()
>>> # Initializing a model (with random weights) from the microsoft/conditional-detr-resnet-50 style configuration
>>> model = ConditionalDetrModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "conditional_detr"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
use_timm_backbone=True,
backbone_config=None,
num_channels=3,
num_queries=300,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=2048,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
use_pretrained_backbone=True,
backbone_kwargs=None,
dilation=False,
class_cost=2,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
cls_loss_coefficient=2,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
focal_alpha=0.25,
**kwargs,
):
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
class ConditionalDetrOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("pixel_mask", {0: "batch"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
@property
def default_onnx_opset(self) -> int:
return 12
.\models\conditional_detr\convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
"""Convert Conditional DETR checkpoints."""
import argparse
import json
from collections import OrderedDict
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
ConditionalDetrConfig,
ConditionalDetrForObjectDetection,
ConditionalDetrForSegmentation,
ConditionalDetrImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
rename_keys = []
for i in range(6):
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight",
f"decoder.layers.{i}.encoder_attn.out_proj.weight",
)
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias",
f"decoder.layers.{i}.encoder_attn.out_proj.bias",
)
)
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
)
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight")
)
rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight")
)
rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias")
)
rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias")
)
rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias"))
rename_keys.append(
(f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias")
)
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"),
("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"),
("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"),
("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"),
("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"),
("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"),
("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"),
("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"),
("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"),
("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"),
]
)
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def rename_backbone_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if "backbone.0.body" in key:
new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
new_state_dict[new_key] = value
else:
new_state_dict[key] = value
return new_state_dict
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "conditional_detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path):
"""
将模型权重复制/粘贴/调整到我们的 CONDITIONAL_DETR 结构中。
"""
config = ConditionalDetrConfig()
if "resnet101" in model_name:
config.backbone = "resnet101"
if "dc5" in model_name:
config.dilation = True
is_panoptic = "panoptic" in model_name
if is_panoptic:
config.num_labels = 250
else:
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
format = "coco_panoptic" if is_panoptic else "coco_detection"
image_processor = ConditionalDetrImageProcessor(format=format)
img = prepare_img()
encoding = image_processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
logger.info(f"Converting model {model_name}...")
conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval()
state_dict = conditional_detr.state_dict()
for src, dest in rename_keys:
if is_panoptic:
src = "conditional_detr." + src
rename_key(state_dict, src, dest)
state_dict = rename_backbone_keys(state_dict)
read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
prefix = "conditional_detr.model." if is_panoptic else "model."
for key in state_dict.copy().keys():
if is_panoptic:
if (
key.startswith("conditional_detr")
and not key.startswith("class_labels_classifier")
and not key.startswith("bbox_predictor")
):
val = state_dict.pop(key)
state_dict["conditional_detr.model" + key[4:]] = val
elif "class_labels_classifier" in key or "bbox_predictor" in key:
val = state_dict.pop(key)
state_dict["conditional_detr." + key] = val
elif key.startswith("bbox_attention") or key.startswith("mask_head"):
continue
else:
val = state_dict.pop(key)
state_dict[prefix + key] = val
else:
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model")
original_outputs = conditional_detr(pixel_values)
outputs = model(pixel_values)
assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
if is_panoptic:
assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="conditional_detr_resnet50",
type=str,
help="Name of the CONDITIONAL_DETR model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)