Transformers Source Code Analysis (125)
.\models\whisper\tokenization_whisper_fast.py
import json
import os
import re
import warnings
from functools import lru_cache
from typing import List, Optional, Tuple
import numpy as np
from tokenizers import AddedToken, pre_tokenizers, processors
from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
from .tokenization_whisper import LANGUAGES, TASK_IDS, TO_LANGUAGE_CODE, WhisperTokenizer, _decode_asr
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"tokenizer_file": "tokenizer.json",
"merges_file": "merges.txt",
"normalizer_file": "normalizer.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/vocab.json",
"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
"openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/vocab.json",
"openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/vocab.json",
"openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/vocab.json",
"openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/vocab.json",
"openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/vocab.json",
"openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/vocab.json",
"openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/vocab.json",
},
    "merges_file": {
"openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/merges.txt",
"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges.txt",
"openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/merges.txt",
"openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/merges.txt",
"openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/merges.txt",
"openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/merges.txt",
"openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/merges.txt",
"openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/merges.txt",
"openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/merges.txt",
},
"tokenizer_file": {
"openai/whisper-tiny": "https://huggingface.co/openai/whisper-tiny/resolve/main/tokenizer.json",
"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/tokenizer.json",
"openai/whisper-small": "https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json",
"openai/whisper-medium": "https://huggingface.co/openai/whisper-medium/resolve/main/tokenizer.json",
"openai/whisper-large": "https://huggingface.co/openai/whisper-large/resolve/main/tokenizer.json",
"openai/whisper-tiny.en": "https://huggingface.co/openai/whisper-tiny.en/resolve/main/tokenizer.json",
"openai/whisper-base.en": "https://huggingface.co/openai/whisper-base.en/resolve/main/tokenizer.json",
"openai/whisper-small.en": "https://huggingface.co/openai/whisper-small.en/resolve/main/tokenizer.json",
"openai/whisper-medium.en": "https://huggingface.co/openai/whisper-medium.en/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openai/whisper-tiny": 1500,
"openai/whisper-base": 1500,
"openai/whisper-small": 1500,
"openai/whisper-medium": 1500,
"openai/whisper-large": 1500,
"openai/whisper-tiny.en": 1500,
"openai/whisper-base.en": 1500,
"openai/whisper-small.en": 1500,
"openai/whisper-medium.en": 1500,
}
class WhisperTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Whisper tokenizer (backed by HuggingFace's *tokenizers* library).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`, *optional*):
Path to the vocabulary file.
merges_file (`str`, *optional*):
Path to the merges file.
normalizer_file (`str`, *optional*):
Path to the normalizer_file file.
tokenizer_file (`str`, *optional*):
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
`"<|startoftranscript|>"` when generating.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This makes it possible to treat the leading word just
            as any other word. (The Whisper tokenizer detects the beginning of words by the preceding space.)
language (`str`, *optional*):
The language of the transcription text. The corresponding language id token is appended to the start of the
sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
`"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
task (`str`, *optional*):
            Task identifier to append at the start of sequence (if any). This should be used for multilingual
            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
        predict_timestamps (`bool`, *optional*, defaults to `False`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence (i.e. whether the model is
            expected to predict timestamps).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = WhisperTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
normalizer_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
language=None,
task=None,
predict_timestamps=False,
**kwargs,
):
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
if isinstance(unk_token, str)
else unk_token
)
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.add_bos_token = kwargs.pop("add_bos_token", False)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
if normalizer_file is not None:
with open(normalizer_file, encoding="utf-8") as vocab_handle:
self.english_spelling_normalizer = json.load(vocab_handle)
else:
self.english_spelling_normalizer = None
self.add_prefix_space = add_prefix_space
self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")
self.language = language
self.task = task
self.predict_timestamps = predict_timestamps
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_precision=0.02) -> str:
"""
Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
        given tokens with timestamp tokens annotated, e.g. "<|1.08|>".
"""
timestamp_begin = self.all_special_ids[-1] + 1
outputs = [[]]
cur_max_timestamp = 0.0
prev_segments_len = 0.0
for token in token_ids:
if token >= timestamp_begin:
timestamp = float((token - timestamp_begin) * time_precision)
if timestamp < cur_max_timestamp:
prev_segments_len += cur_max_timestamp
cur_max_timestamp = timestamp
outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
outputs.append([])
else:
outputs[-1].append(token)
outputs = [
s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
]
return "".join(outputs)
def _compute_offsets(self, token_ids, time_precision=0.02):
"""
Compute offsets for a given tokenized input
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
time_precision (`float`, `optional`, defaults to 0.02):
The time ratio to convert from token to time.
"""
offsets = []
if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
token_ids = token_ids.cpu()
token_ids = np.array(token_ids)
if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
raise ValueError("Can only process a single input at a time")
timestamp_begin = self.all_special_ids[-1] + 1
timestamp_tokens = token_ids >= timestamp_begin
consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
return []
elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)
last_slice = np.where(timestamp_tokens)[0][0]
for current_slice in consecutive:
sliced_tokens = token_ids[last_slice:current_slice]
start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin
sliced_tokens = self._preprocess_token_ids(sliced_tokens)
text = self._decode(sliced_tokens)
text = self._filter_timestamp_ids(text)
offsets.append(
{
"text": text,
"timestamp": (
start_timestamp_position * time_precision,
end_timestamp_position * time_precision,
),
}
)
last_slice = current_slice
return offsets
def timestamp_ids(self, time_precision=0.02):
return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
if skip_special_tokens:
prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)
return token_ids
def _filter_timestamp_ids(self, token_ids):
return re.sub(self.timestamp_pat, "", token_ids)
def decode(
self,
token_ids,
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
output_offsets: bool = False,
time_precision: float = 0.02,
decode_with_timestamps: bool = False,
normalize: bool = False,
basic_normalize: bool = False,
remove_diacritics: bool = False,
**kwargs,
):
"""
        Decode a list of token ids into a string of text.
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
Tokenized input ids list.
skip_special_tokens (`bool`, optional, defaults to `False`):
Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
removed.
clean_up_tokenization_spaces (`bool`, optional):
Whether or not to clean up tokenization spaces in the output text.
output_offsets (`bool`, optional):
Whether to return the token-level offsets in the original input text.
time_precision (`float`, optional, defaults to 0.02):
The time precision used for decoding timestamps.
decode_with_timestamps (`bool`, optional):
Whether to decode timestamps along with the tokens.
normalize (`bool`, optional):
Whether to normalize the decoded text.
basic_normalize (`bool`, optional):
Whether to perform basic normalization on the decoded text.
remove_diacritics (`bool`, optional):
Whether to remove diacritics from the decoded text.
Returns:
            The decoded text as a string; when `output_offsets=True`, a dict with `"text"` and `"offsets"` is returned instead.
"""
        filtered_ids = self._preprocess_token_ids(token_ids, skip_special_tokens=skip_special_tokens)
        text = super().decode(
            filtered_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        if decode_with_timestamps:
            text = self._decode_with_timestamps(
                filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
            )
        else:
            text = self._filter_timestamp_ids(text)
        if output_offsets:
            return {"text": text, "offsets": self._compute_offsets(token_ids, time_precision=time_precision)}
        if normalize:
            return self.normalize(text)
        elif basic_normalize:
            return self.basic_normalize(text, remove_diacritics=remove_diacritics)
        return text
def _decode(
self, *args, normalize: bool = False, basic_normalize: bool = False, remove_diacritics: bool = False, **kwargs
) -> str:
"""
解码操作的内部实现。
Args:
*args: 传递给超类的参数。
normalize (`bool`, optional):
是否对解码后的文本进行规范化处理。
basic_normalize (`bool`, optional):
是否对解码后的文本进行基础规范化处理。
remove_diacritics (`bool`, optional):
是否移除解码后文本中的变音符号。
Returns:
解码后的字符串。
"""
text = super()._decode(*args, **kwargs)
if normalize:
clean_text = self._normalize(text)
return clean_text
elif basic_normalize:
clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics)
return clean_text
else:
return text
def _normalize(self, text):
warnings.warn(
"The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper English normalizer using the `normalize` method."
)
return self.normalize(text)
def _basic_normalize(self, text, remove_diacritics=False):
warnings.warn(
"The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
)
return self.basic_normalize(text, remove_diacritics=remove_diacritics)
def normalize(self, text):
normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
return normalizer(text)
@staticmethod
def basic_normalize(text, remove_diacritics=False):
normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
return normalizer(text)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
normalizer_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
)
if self.english_spelling_normalizer is not None:
with open(normalizer_file, "w", encoding="utf-8") as f:
f.write(
json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
)
return tuple(files) + (normalizer_file,)
def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
"""
Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
update the prefix tokens as required when fine-tuning. Example:
```
>>> # instantiate the tokenizer and set the prefix token to Spanish
>>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
>>> # now switch the prefix token from Spanish to French
>>> tokenizer.set_prefix_tokens(language="french")
```
Args:
language (`str`, *optional*, defaults to `None`):
The language of the transcription text.
task (`str`, *optional*, defaults to `None`):
Task identifier to append at the start of sequence (if any).
predict_timestamps (`bool`, *optional*, defaults to `None`):
Whether to omit the `<|notimestamps|>` token at the start of the sequence.
"""
self.language = language if language is not None else self.language
self.task = task if task is not None else self.task
self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps
prefix_token_ids = self.prefix_tokens
prefixes = self.convert_ids_to_tokens(prefix_token_ids)
eos = self.eos_token
eos_token_id = self.eos_token_id
prefix_template = " ".join([f"{token}:0" for token in prefixes])
self.backend_tokenizer.post_processor = processors.TemplateProcessing(
single=f"{prefix_template} $A:0 {eos}:0",
pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
special_tokens=[
(eos, eos_token_id),
*zip(prefixes, prefix_token_ids),
],
)
@property
def prefix_tokens(self) -> List[int]:
bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
translate_token_id = self.convert_tokens_to_ids("<|translate|>")
transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
langs = tuple(LANGUAGES.keys())
if self.language is not None:
self.language = self.language.lower()
if self.language in TO_LANGUAGE_CODE:
language_id = TO_LANGUAGE_CODE[self.language]
elif self.language in TO_LANGUAGE_CODE.values():
language_id = self.language
else:
is_language_code = len(self.language) == 2
raise ValueError(
f"Unsupported language: {self.language}. Language should be one of:"
f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
)
if self.task is not None:
if self.task not in TASK_IDS:
raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")
bos_sequence = [bos_token_id]
if self.language is not None:
bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
if self.task is not None:
bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
if not self.predict_timestamps:
bos_sequence.append(notimestamps_token_id)
return bos_sequence
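For illustration, a hedged sketch of how `prefix_tokens` assembles the decoder prefix for English transcription without timestamps; the ids below are assumed placeholders, the real values come from the tokenizer vocabulary:

```python
# Sketch of the prefix assembly (ids are assumed, not read from a real vocab).
bos_token_id = 50258           # "<|startoftranscript|>" (assumed)
transcribe_token_id = 50359    # "<|transcribe|>" (assumed)
notimestamps_token_id = 50363  # "<|notimestamps|>" (assumed)
langs = ("en", "zh", "de")     # truncated stand-in for LANGUAGES.keys()

language_id = "en"
bos_sequence = [
    bos_token_id,
    bos_token_id + 1 + langs.index(language_id),  # language token sits right after <|startoftranscript|>
    transcribe_token_id,
    notimestamps_token_id,                        # appended because predict_timestamps is False
]
print(bos_sequence)  # [50258, 50259, 50359, 50363]
```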
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""Build model inputs from a sequence by appending eos_token_id."""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1]
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
@property
def default_chat_template(self):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
forced_tokens = self.prefix_tokens[1:]
forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
return forced_decoder_ids
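A hedged usage sketch of `get_decoder_prompt_ids` with generation; loading real checkpoints and computing `input_features` from audio are assumed to happen elsewhere, so the `generate` call is left commented:

```python
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Sketch: force French transcription by turning the prefix tokens into forced decoder ids.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

forced_decoder_ids = processor.tokenizer.get_decoder_prompt_ids(language="french", task="transcribe")
print(forced_decoder_ids)  # [(1, lang_id), (2, task_id), (3, notimestamps_id)]

# input_features would be a log-mel spectrogram from WhisperFeatureExtractor (not shown here):
# generated_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
```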
def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
return _decode_asr(
self,
model_outputs,
return_timestamps=return_timestamps,
return_language=return_language,
time_precision=time_precision,
)
def get_prompt_ids(self, text: str, return_tensors="np"):
"""Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)
prompt_text_ids = batch_encoding["input_ids"][1:]
special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
if special_token_id is not None:
token = self.convert_ids_to_tokens(special_token_id)
raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")
batch_encoding.convert_to_tensors(tensor_type=return_tensors)
return batch_encoding["input_ids"]
@staticmethod
def _strip_prompt(token_ids: List[int], prompt_token_id: int, decoder_start_token_id: int):
has_prompt = isinstance(token_ids, list) and token_ids and token_ids[0] == prompt_token_id
if has_prompt:
if decoder_start_token_id in token_ids:
return token_ids[token_ids.index(decoder_start_token_id) :]
else:
return []
return token_ids
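To close out this file, a hedged end-to-end sketch of the fast tokenizer; it assumes network access to download the `openai/whisper-tiny` files and only exercises methods defined above:

```python
from transformers import WhisperTokenizerFast

# Sketch: encode/decode round trip plus a conditioning prompt.
tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")

ids = tokenizer("Hello world").input_ids          # prefix tokens + text tokens + <|endoftext|>
print(tokenizer.decode(ids, skip_special_tokens=True))

prompt_ids = tokenizer.get_prompt_ids("Glossary: HuggingFace", return_tensors="np")
print(prompt_ids[:2])                             # starts with the "<|startofprev|>" id
```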
.\models\whisper\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig", "WhisperOnnxConfig"],
"feature_extraction_whisper": ["WhisperFeatureExtractor"],
"processing_whisper": ["WhisperProcessor"],
"tokenization_whisper": ["WhisperTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_whisper_fast"] = ["WhisperTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_whisper"] = [
"WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
"WhisperForCausalLM",
"WhisperForConditionalGeneration",
"WhisperModel",
"WhisperPreTrainedModel",
"WhisperForAudioClassification",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_whisper"] = [
"TF_WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFWhisperForConditionalGeneration",
"TFWhisperModel",
"TFWhisperPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_whisper"] = [
"FlaxWhisperForConditionalGeneration",
"FlaxWhisperModel",
"FlaxWhisperPreTrainedModel",
"FlaxWhisperForAudioClassification",
]
if TYPE_CHECKING:
from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig, WhisperOnnxConfig
from .feature_extraction_whisper import WhisperFeatureExtractor
from .processing_whisper import WhisperProcessor
from .tokenization_whisper import WhisperTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_whisper_fast import WhisperTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_whisper import (
WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
WhisperForAudioClassification,
WhisperForCausalLM,
WhisperForConditionalGeneration,
WhisperModel,
WhisperPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_whisper import (
TF_WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFWhisperForConditionalGeneration,
TFWhisperModel,
TFWhisperPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_whisper import (
FlaxWhisperForAudioClassification,
FlaxWhisperForConditionalGeneration,
FlaxWhisperModel,
FlaxWhisperPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
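A short hedged sketch of what the lazy module buys: importing the package is cheap, and the heavy submodules are only imported when an attribute is first accessed (the torch-backed classes resolve only if torch is installed):

```python
# Sketch: attribute access on the lazy module triggers the real import.
from transformers.models import whisper

tokenizer_cls = whisper.WhisperTokenizer   # imports tokenization_whisper on first access
# whisper.WhisperModel resolves only when torch is available (see the guarded branch above)
```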
.\models\xglm\configuration_xglm.py
""" XGLM model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/config.json",
}
class XGLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XGLMModel`]. It is used to instantiate an XGLM
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the XGLM
[facebook/xglm-564M](https://huggingface.co/facebook/xglm-564M) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "xglm"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "attention_heads",
"hidden_size": "d_model",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
vocab_size=256008,
max_position_embeddings=2048,
d_model=1024,
ffn_dim=4096,
num_layers=24,
attention_heads=16,
activation_function="gelu",
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
layerdrop=0.0,
init_std=0.02,
scale_embedding=True,
use_cache=True,
decoder_start_token_id=2,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.ffn_dim = ffn_dim
self.num_layers = num_layers
self.attention_heads = attention_heads
self.activation_function = activation_function
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.init_std = init_std
self.scale_embedding = scale_embedding
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
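For reference, a hedged sketch of instantiating the configuration; with the defaults above it mirrors the `facebook/xglm-564M` architecture, and the `attribute_map` makes the generic attribute names resolve to the XGLM-specific ones:

```python
from transformers import XGLMConfig, XGLMModel

config = XGLMConfig()                                  # defaults: vocab_size=256008, d_model=1024, num_layers=24
model = XGLMModel(config)                              # randomly initialised PyTorch model
print(config.hidden_size, config.num_hidden_layers)    # 1024 24, resolved through attribute_map
```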
.\models\xglm\convert_xglm_original_ckpt_to_trfms.py
import argparse
from argparse import Namespace
import torch
from torch import nn
from transformers import XGLMConfig, XGLMForCausalLM
def remove_ignore_keys_(state_dict):
ignore_keys = [
"decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def convert_fairseq_xglm_checkpoint_from_disk(checkpoint_path):
checkpoint = torch.load(checkpoint_path, map_location="cpu")
args = Namespace(**checkpoint["cfg"]["model"])
state_dict = checkpoint["model"]
remove_ignore_keys_(state_dict)
vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
state_dict = {key.replace("decoder", "model"): val for key, val in state_dict.items()}
config = XGLMConfig(
vocab_size=vocab_size,
max_position_embeddings=args.max_target_positions,
num_layers=args.decoder_layers,
attention_heads=args.decoder_attention_heads,
ffn_dim=args.decoder_ffn_embed_dim,
d_model=args.decoder_embed_dim,
layerdrop=args.decoder_layerdrop,
dropout=args.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_function="gelu",
scale_embedding=not args.no_scale_embedding,
tie_word_embeddings=args.share_decoder_input_output_embed,
)
model = XGLMForCausalLM(config)
missing = model.load_state_dict(state_dict, strict=False)
print(missing)
model.lm_head = make_linear_from_emb(model.model.embed_tokens)
return model
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.")
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
args = parser.parse_args()
model = convert_fairseq_xglm_checkpoint_from_disk(args.fairseq_path)
model.save_pretrained(args.pytorch_dump_folder_path)
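A hedged usage sketch of the conversion entry point; the checkpoint path below is a placeholder for a local fairseq `model.pt`:

```python
# Sketch: programmatic use of the converter defined above (path is a placeholder).
model = convert_fairseq_xglm_checkpoint_from_disk("/path/to/fairseq/model.pt")
model.save_pretrained("./xglm-converted")
# The dump folder can then be reloaded with XGLMForCausalLM.from_pretrained("./xglm-converted").
```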
.\models\xglm\modeling_flax_xglm.py
""" Flax XGLM model."""
import math
import random
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xglm import XGLMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"
_CONFIG_FOR_DOC = "XGLMConfig"
XGLM_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
"""
XGLM_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def create_sinusoidal_positions(n_pos, dim, padding_idx=1):
# Calculate half of the dimension for sinusoidal embedding
half_dim = dim // 2
# Compute the exponential term for sinusoidal embedding
emb = math.log(10000) / (half_dim - 1)
emb = np.exp(np.arange(half_dim) * -emb)
# Expand dimensions to perform element-wise multiplication
emb = np.expand_dims(np.arange(n_pos), 1) * np.expand_dims(emb, 0)
# Concatenate sine and cosine transformations of embeddings
emb = np.concatenate([np.sin(emb), np.cos(emb)], 1)
# Reshape the embedding to match desired dimensions
emb = np.reshape(emb, (n_pos, dim))
# If padding index is specified, zero out its embedding
if padding_idx is not None:
emb[padding_idx, :] = 0
# Convert embedding to JAX array
return jnp.array(emb)
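A quick hedged check of the helper above with small, arbitrary sizes; it only relies on the function as defined and confirms that the row at `padding_idx` is zeroed out:

```python
import numpy as np

emb = create_sinusoidal_positions(n_pos=6, dim=8, padding_idx=1)
print(emb.shape)           # (6, 8)
print(np.asarray(emb[1]))  # all zeros: the padding position carries no positional signal
```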
class FlaxXGLMAttention(nn.Module):
config: XGLMConfig
embed_dim: int
num_heads: int
dropout: float = 0.0
causal: bool = False
bias: bool = True
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self) -> None:
# 计算每个头部的维度
self.head_dim = self.embed_dim // self.num_heads
# 检查 embed_dim 是否能被 num_heads 整除,否则抛出数值错误
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} "
f"and `num_heads`: {self.num_heads})."
)
# 定义部分应用了部分参数的 Dense 层构造函数
dense = partial(
nn.Dense,
self.embed_dim,
use_bias=self.bias,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# 初始化查询、键、值、输出投影层
self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
self.out_proj = dense()
# 初始化 Dropout 层
self.dropout_layer = nn.Dropout(rate=self.dropout)
# 如果需要引入因果注意力机制,则创建对应的因果掩码
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
def _split_heads(self, hidden_states):
# 将隐藏状态张量按头部分割
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
def _merge_heads(self, hidden_states):
# 将分割后的头部重新合并
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# 检测是否正在初始化,通过检查是否存在缓存数据来判断
is_initialized = self.has_variable("cache", "cached_key")
# 获取或创建缓存的键值对应的变量,如果不存在则创建一个全零数组
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
# 获取或创建缓存的值对应的变量,如果不存在则创建一个全零数组
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# 获取或创建缓存索引对应的变量,如果不存在则创建一个值为0的整数数组
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# 提取当前缓存的维度信息,包括批次维度、最大长度、头数、每头深度
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# 使用新的一维空间切片更新键和值的缓存
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# 更新缓存中的键和值
cached_key.value = key
cached_value.value = value
# 更新缓存索引,增加已更新的缓存向量数目
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# 用于缓存的因果掩码:我们的单个查询位置应该只关注已生成和缓存的键位置,而不是剩余的零元素。
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# 将因果掩码和传入的注意力掩码结合起来
attention_mask = combine_masks(pad_mask, attention_mask)
# 返回更新后的键、值和注意力掩码
return key, value, attention_mask
# 定义一个 FlaxXGLMDecoderLayer 类,继承自 nn.Module
class FlaxXGLMDecoderLayer(nn.Module):
# 类变量:XGLMConfig 类型的 config 变量
config: XGLMConfig
# 类变量:jnp.float32 类型的 dtype,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 初始化方法,无返回值
def setup(self) -> None:
# 实例变量:self.embed_dim 等于 config.d_model
self.embed_dim = self.config.d_model
# 实例变量:self.self_attn 是一个 FlaxXGLMAttention 实例
# 根据给定的 config 参数进行初始化
self.self_attn = FlaxXGLMAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.attention_heads,
dropout=self.config.attention_dropout,
causal=True,
dtype=self.dtype,
)
# 实例变量:self.self_attn_layer_norm 是一个 LayerNorm 实例
# 根据 dtype 参数进行初始化
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 实例变量:self.dropout_layer 是一个 Dropout 层实例
# 根据 config.dropout 参数进行初始化
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 实例变量:self.activation_fn 是一个激活函数,根据 config.activation_function 选择
self.activation_fn = ACT2FN[self.config.activation_function]
# 实例变量:self.activation_dropout_layer 是一个 Dropout 层实例
# 根据 config.activation_dropout 参数进行初始化
self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
# 如果 config.add_cross_attention 为 True,则初始化下面的变量
if self.config.add_cross_attention:
# 实例变量:self.encoder_attn 是一个 FlaxXGLMAttention 实例
# 根据给定的 config 参数进行初始化
self.encoder_attn = FlaxXGLMAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.decoder_attention_heads,
dropout=self.config.attention_dropout,
dtype=self.dtype,
)
# 实例变量:self.encoder_attn_layer_norm 是一个 LayerNorm 实例
# 根据 dtype 参数进行初始化
self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        # self.fc1: a Dense layer projecting the hidden states from embed_dim up to config.ffn_dim,
        # initialised from a normal distribution with std config.init_std
self.fc1 = nn.Dense(
self.config.ffn_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
        # self.fc2: a Dense layer projecting from config.ffn_dim back down to embed_dim,
        # initialised from a normal distribution with std config.init_std
self.fc2 = nn.Dense(
self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
)
# 实例变量:self.final_layer_norm 是一个 LayerNorm 实例
# 根据 dtype 参数进行初始化
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 重写 __call__ 方法,用于实例调用时的行为
# 可以接收多种输入参数并处理
# 来自 transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer.__call__
def __call__(
self,
hidden_states: jnp.ndarray, # 输入的隐藏状态,类型为 jnp.ndarray
attention_mask: jnp.ndarray, # 注意力掩码,类型为 jnp.ndarray
encoder_hidden_states: Optional[jnp.ndarray] = None, # 编码器的隐藏状态,可选参数,默认为 None
encoder_attention_mask: Optional[jnp.ndarray] = None, # 编码器的注意力掩码,可选参数,默认为 None
init_cache: bool = False, # 是否初始化缓存,类型为布尔值,默认为 False
output_attentions: bool = True, # 是否输出注意力权重,类型为布尔值,默认为 True
deterministic: bool = True, # 是否确定性计算,类型为布尔值,默认为 True
        # Returns a tuple whose first element is the updated hidden states; the self- and cross-attention
        # weights are appended when output_attentions=True.
    ) -> Tuple[jnp.ndarray]:
# 保存残差连接(Residual Connection)的输入隐藏状态
residual = hidden_states
# 应用自注意力机制前的层归一化
hidden_states = self.self_attn_layer_norm(hidden_states)
# 自注意力机制
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
)
# 应用 dropout
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 添加残差连接
hidden_states = residual + hidden_states
# 交叉注意力块
cross_attn_weights = None
if encoder_hidden_states is not None:
# 保存残差连接
residual = hidden_states
# 应用编码器注意力块前的层归一化
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# 应用编码器注意力机制
hidden_states, cross_attn_weights = self.encoder_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
)
# 应用 dropout
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 添加残差连接
hidden_states = residual + hidden_states
# 全连接层
residual = hidden_states
# 应用最终层归一化
hidden_states = self.final_layer_norm(hidden_states)
# 应用激活函数
hidden_states = self.activation_fn(self.fc1(hidden_states))
# 应用激活函数后的 dropout
hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
# 应用最后的线性变换
hidden_states = self.fc2(hidden_states)
# 应用 dropout
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 添加残差连接
hidden_states = residual + hidden_states
# 准备输出
outputs = (hidden_states,)
# 如果需要输出注意力权重,则添加到输出中
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
return outputs
class FlaxXGLMDecoderLayerCollection(nn.Module):
config: XGLMConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化所有的解码器层,并根据配置添加到层列表中
self.layers = [
FlaxXGLMDecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_layers)
]
# 设置层间隔概率(LayerDrop)
self.layerdrop = self.config.layerdrop
def __call__(
self,
hidden_states,
attention_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果需要输出隐藏状态,则初始化存储所有隐藏状态的元组
all_hidden_states = () if output_hidden_states else None
# 如果需要输出注意力权重,则初始化存储所有自注意力权重的元组
all_self_attns = () if output_attentions else None
# 如果需要输出交叉注意力权重且存在编码器隐藏状态,则初始化存储所有交叉注意力权重的元组
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# 遍历所有解码器层
for decoder_layer in self.layers:
if output_hidden_states:
# 如果需要输出隐藏状态,则将当前隐藏状态添加到存储所有隐藏状态的元组中
all_hidden_states += (hidden_states,)
# 添加层间隔概率(LayerDrop),详见论文 https://arxiv.org/abs/1909.11556
dropout_probability = random.uniform(0, 1)
if not deterministic and (dropout_probability < self.layerdrop):
# 如果不是确定性计算且随机丢弃概率小于层间隔概率,则设置层输出为None
layer_outputs = (None, None, None)
else:
# 否则,调用当前解码器层进行前向计算
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
deterministic=deterministic,
)
# 更新当前隐藏状态为解码器层的输出的第一个元素
hidden_states = layer_outputs[0]
if output_attentions:
# 如果需要输出注意力权重,则将当前解码器层的自注意力权重添加到存储所有自注意力权重的元组中
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
# 如果存在编码器隐藏状态,则将当前解码器层的交叉注意力权重添加到存储所有交叉注意力权重的元组中
all_cross_attentions += (layer_outputs[2],)
# 添加来自最后一个解码器层的隐藏状态
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 构建模型输出,根据需要返回不同的数据结构
outputs = (hidden_states, all_hidden_states, all_self_attns, all_cross_attentions)
if not return_dict:
# 如果不需要返回字典形式的输出,则只返回非空的元组元素
return tuple(v for v in outputs if v is not None)
# 否则,返回包含各类注意力权重和隐藏状态的字典形式的输出
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
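A standalone hedged sketch of the LayerDrop rule applied in the loop above (https://arxiv.org/abs/1909.11556): during training each decoder layer is skipped independently with probability `layerdrop`:

```python
import random

layerdrop = 0.1
skipped = [random.uniform(0, 1) < layerdrop for _ in range(24)]   # one draw per decoder layer
print(sum(skipped), "of 24 layers dropped this training step")
```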
class FlaxXGLMModule(nn.Module):
config: XGLMConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
# 设置模型的初始配置
def setup(self):
# 初始化 dropout 层
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 获取嵌入维度、填充索引、最大目标位置和嵌入缩放因子的配置信息
embed_dim = self.config.d_model
self.padding_idx = self.config.pad_token_id
self.max_target_positions = self.config.max_position_embeddings
self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0
# 创建词嵌入矩阵,指定词汇表大小和嵌入维度,使用正态分布初始化
self.embed_tokens = nn.Embed(
self.config.vocab_size,
embed_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
)
# XGLM 模型的特殊设置:如果指定了填充索引,将嵌入 id 偏移 2,并相应调整 num_embeddings
# 其他模型不需要此调整
self.offset = 2
# 创建 sinusoidal 位置嵌入,考虑偏移量和嵌入维度
self.embed_positions = create_sinusoidal_positions(
self.config.max_position_embeddings + self.offset, embed_dim
)
# 初始化 XGLM 解码器层集合
self.layers = FlaxXGLMDecoderLayerCollection(self.config, self.dtype)
# 初始化 LayerNorm 层,设置类型和 epsilon 值
self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 定义模型调用方法
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# 获取输入张量的形状
input_shape = input_ids.shape
# 将输入张量重新整形为二维张量,保留最后一个维度不变
input_ids = input_ids.reshape(-1, input_shape[-1])
# 使用模型的词嵌入层对输入张量进行嵌入,并乘以嵌入缩放因子
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# 嵌入位置信息
position_ids = position_ids + self.offset
positions = jnp.take(self.embed_positions, position_ids, axis=0)
# 将词嵌入和位置嵌入相加得到隐藏状态
hidden_states = inputs_embeds + positions
# 使用 dropout 层对隐藏状态进行处理,根据 deterministic 参数确定是否使用确定性的 dropout
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 将隐藏状态传入模型的层中进行处理
outputs = self.layers(
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的最后一个隐藏状态,并进行层归一化处理
last_hidden_states = outputs[0]
last_hidden_states = self.layer_norm(last_hidden_states)
hidden_states = None
# 如果需要输出所有隐藏状态,则将其从模型输出中提取并添加最后一个隐藏状态
if output_hidden_states:
hidden_states = outputs[1]
hidden_states = hidden_states[:-1] + (last_hidden_states,)
# 根据 return_dict 决定如何返回模型输出
if not return_dict:
# 如果不需要返回字典形式的结果,则根据需要组合输出
outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
# 过滤掉空值并返回元组形式的结果
return tuple(v for v in outputs if v is not None)
# 如果需要返回字典形式的结果,则构建 FlaxBaseModelOutputWithPastAndCrossAttentions 对象
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=last_hidden_states,
hidden_states=hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
# 定义 FlaxXGLMPreTrainedModel 类,继承自 FlaxPreTrainedModel 类
class FlaxXGLMPreTrainedModel(FlaxPreTrainedModel):
# 指定配置类为 XGLMConfig
config_class = XGLMConfig
# 指定基础模型前缀为 "model"
base_model_prefix: str = "model"
# 模块类默认为空
module_class: nn.Module = None
# 初始化方法,接受配置、输入形状、种子、数据类型等参数
def __init__(
self,
config: XGLMConfig,
input_shape: Tuple[int] = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用模块类和其他参数初始化模块
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类的初始化方法,传入配置、模块、输入形状、种子、数据类型等参数
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 初始化权重方法,接受随机数种子、输入形状和参数字典等参数
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
# 创建与 input_ids 类型相同的全1张量作为 attention_mask
attention_mask = jnp.ones_like(input_ids)
# 根据 input_ids 的形状广播生成位置编码张量
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# 切分随机数种子为 params_rng 和 dropout_rng
params_rng, dropout_rng = jax.random.split(rng)
# 创建随机数字典 rngs,用于参数和 dropout
rngs = {"params": params_rng, "dropout": dropout_rng}
# 如果配置中包含跨注意力机制
if self.config.add_cross_attention:
# 创建与 input_shape 和配置的嵌入维度大小相同的全0隐藏状态张量
encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
# 将 attention_mask 用作编码器的注意力掩码
encoder_attention_mask = attention_mask
# 使用模块的初始化方法进行初始化,传入随机数字典、input_ids、attention_mask、position_ids、隐藏状态张量及其注意力掩码
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# 否则,只使用 input_ids、attention_mask、position_ids 进行模块的初始化
module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
# 获取随机初始化的模型参数
random_params = module_init_outputs["params"]
# 如果提供了预定义的参数,将随机参数与已有参数进行合并
if params is not None:
# 展平并解冻随机参数和已有参数
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
# 将随机参数中缺失的键加入已有参数中
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# 冻结并重新构造参数字典
return freeze(unflatten_dict(params))
else:
# 否则,直接返回随机初始化的参数
return random_params
# 初始化缓存方法,用于快速自回归解码
def init_cache(self, batch_size, max_length):
"""
Args:
batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized cache.
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
# 创建与 input_ids 类型相同的全1张量作为 attention_mask
attention_mask = jnp.ones_like(input_ids, dtype="i4")
# 根据 input_ids 的形状广播生成位置编码张量
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用模块的初始化方法初始化变量,包括 input_ids、attention_mask、position_ids,并请求返回缓存
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# 返回解冻后的初始化缓存
return unfreeze(init_variables["cache"])
# 将模型的前向传播方法装饰为添加文档字符串,用于模型输入参数的说明
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
# 定义模型的调用方法,接受多个参数作为输入
def __call__(
self,
input_ids: jnp.ndarray, # 输入的token IDs,作为模型的输入
attention_mask: Optional[jnp.ndarray] = None, # 可选的注意力掩码,指示哪些token需要注意
position_ids: Optional[jnp.ndarray] = None, # 可选的位置IDs,用于指示token的位置信息
encoder_hidden_states: Optional[jnp.ndarray] = None, # 可选的编码器隐藏状态,用于encoder-decoder模型
encoder_attention_mask: Optional[jnp.ndarray] = None, # 可选的编码器注意力掩码
output_attentions: Optional[bool] = None, # 是否输出注意力权重
output_hidden_states: Optional[bool] = None, # 是否输出所有层的隐藏状态
return_dict: Optional[bool] = None, # 是否以字典形式返回结果
train: bool = False, # 是否处于训练模式
params: dict = None, # 模型参数字典
past_key_values: dict = None, # 过去的键值,用于存储前一次的状态信息
dropout_rng: PRNGKey = None, # 随机数生成器,用于Dropout层的随机掩码
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
if encoder_hidden_states is not None and encoder_attention_mask is None:
batch_size, sequence_length = encoder_hidden_states.shape[:2]
encoder_attention_mask = jnp.ones((batch_size, sequence_length))
# 准备编码器的输入
# 如果 attention_mask 为空,则使用与 input_ids 相同形状的全 1 数组
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 如果 position_ids 为空,则广播形状为 (batch_size, sequence_length) 的序列长度数组
if position_ids is None:
batch_size, sequence_length = input_ids.shape
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 如果需要处理任何伪随机数生成器 (PRNG),则构建相应的字典
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
inputs = {"params": params or self.params}
# 如果 past_key_values 被传递,则初始化了缓存,并传递一个私有标志 init_cache 以确保使用缓存。
# 必须确保缓存被标记为可变,以便 FlaxXGLMAttention 模块可以更改它。
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 调用模块的 apply 方法,传递输入参数
outputs = self.module.apply(
inputs,
input_ids=jnp.array(input_ids, dtype="i4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
position_ids=jnp.array(position_ids, dtype="i4"),
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
rngs=rngs,
mutable=mutable,
)
# 将更新后的缓存添加到模型输出中
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# 返回模型输出
return outputs
# 为了给 FlaxXGLMModel 类添加文档字符串,指定它输出原始隐藏状态而没有特定的顶部头部。
@add_start_docstrings(
"The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
XGLM_START_DOCSTRING,
)
class FlaxXGLMModel(FlaxXGLMPreTrainedModel):
module_class = FlaxXGLMModule
# 添加调用示例的文档字符串给 FlaxXGLMModel 类
append_call_sample_docstring(
FlaxXGLMModel,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPastAndCrossAttentions,
_CONFIG_FOR_DOC,
)
class FlaxXGLMForCausalLMModule(nn.Module):
config: XGLMConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
# 使用配置和数据类型初始化 FlaxXGLMModule 模型
self.model = FlaxXGLMModule(self.config, self.dtype)
# 初始化语言模型头部,是一个全连接层,不使用偏置
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# 调用模型进行前向传播
outputs = self.model(
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# 如果配置要求词嵌入共享,则使用共享的嵌入层参数进行计算
if self.config.tie_word_embeddings:
shared_embedding = self.model.variables["params"]["embed_tokens"]["embedding"]
lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# 否则直接使用语言模型头部进行计算
lm_logits = self.lm_head(hidden_states)
# 如果不需要返回字典格式,则返回元组
if not return_dict:
return (lm_logits,) + outputs[1:]
# 返回带有交叉注意力输出的 FlaxCausalLMOutputWithCrossAttentions 对象
return FlaxCausalLMOutputWithCrossAttentions(
logits=lm_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
# 为 FlaxXGLMForCausalLM 类添加文档字符串,描述其为带有语言建模头部的 XGLM 模型变换器
@add_start_docstrings(
"""
The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
XGLM_START_DOCSTRING,
)
class FlaxXGLMForCausalLM(FlaxXGLMPreTrainedModel):
module_class = FlaxXGLMForCausalLMModule
# 为生成准备输入数据,初始化缓存
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# 获取输入张量的批量大小和序列长度
batch_size, seq_length = input_ids.shape
# 使用 self.init_cache 方法初始化过去键值对
past_key_values = self.init_cache(batch_size, max_length)
# 创建一个扩展的注意力掩码,初始化为全1数组
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# 如果给定了 attention_mask,则根据其累积和更新位置 ID,并将 attention_mask 的值复制到扩展的注意力掩码中对应位置
if attention_mask is not None:
position_ids = attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# 否则,根据序列长度广播生成位置 ID
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# 返回包含过去键值对、扩展注意力掩码和位置 ID 的字典
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# 更新生成的输入数据,将模型输出的过去键值对和更新后的位置 ID 存入 model_kwargs 中
def update_inputs_for_generation(self, model_outputs, model_kwargs):
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
# Attach a call example docstring to FlaxXGLMForCausalLM, using the documented checkpoint, output class and config.
append_call_sample_docstring(
    FlaxXGLMForCausalLM,
    _CHECKPOINT_FOR_DOC,
    FlaxCausalLMOutputWithCrossAttentions,
    _CONFIG_FOR_DOC,
)
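To wrap up the Flax file, a hedged generation sketch; it assumes the `facebook/xglm-564M` weights can be downloaded and uses the standard Flax generation API rather than anything specific to this module:

```python
from transformers import AutoTokenizer, FlaxXGLMForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
model = FlaxXGLMForCausalLM.from_pretrained("facebook/xglm-564M")

inputs = tokenizer("Today is a beautiful day and", return_tensors="np")
outputs = model.generate(inputs.input_ids, max_length=20)          # uses init_cache / prepare_inputs_for_generation
print(tokenizer.decode(outputs.sequences[0], skip_special_tokens=True))
```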
.\models\xglm\modeling_tf_xglm.py
""" TF 2.0 XGLM model."""
from __future__ import annotations
import math
import random
from typing import Any, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions, TFCausalLMOutputWithCrossAttentions
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSharedEmbeddings,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import logging
from .configuration_xglm import XGLMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"
_CONFIG_FOR_DOC = "XGLMConfig"
TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/xglm-564M",
]
LARGE_NEGATIVE = -1e8
def create_sinusoidal_positions(num_positions: int, embedding_dim: int, padding_idx: Optional[int]) -> tf.Tensor:
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = tf.exp(tf.range(half_dim, dtype=tf.float32) * -emb)
emb = tf.expand_dims(tf.range(num_positions, dtype=tf.float32), axis=1) * tf.expand_dims(emb, axis=0)
emb = tf.reshape(tf.concat([tf.sin(emb), tf.cos(emb)], axis=1), (num_positions, -1))
if embedding_dim % 2 == 1:
emb = tf.concat([emb, tf.zeros((num_positions, 1))], axis=1)
if padding_idx is not None:
_padding_mask = tf.concat(
[
tf.ones((padding_idx, shape_list(emb)[1])),
tf.zeros((1, shape_list(emb)[1])),
tf.ones((shape_list(emb)[0] - padding_idx - 1, shape_list(emb)[1])),
],
axis=0,
)
emb *= _padding_mask
return tf.constant(emb, name="embed_positions")
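# [Editor's note] A quick sanity-check sketch (not in the source): the table returned above has one row
# per position, the first half of each row holds sin features and the second half cos features, and the
# row at `padding_idx` is zeroed by the padding mask. The sizes are illustrative.
_pos_table = create_sinusoidal_positions(num_positions=10, embedding_dim=8, padding_idx=1)
print(_pos_table.shape)                                # (10, 8)
print(float(tf.reduce_sum(tf.abs(_pos_table[1]))))     # 0.0 -> the padding row is masked out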
def _create_position_ids_from_input_ids(
input_ids: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int]
) -> tf.Tensor:
"""
根据输入的token IDs创建位置 IDs
Args:
input_ids (tf.Tensor): 输入的token IDs
past_key_values_length (int): 过去key values的长度
padding_idx (Optional[int]): 填充的索引位置
Returns:
tf.Tensor: 对应的位置 IDs
"""
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
"""
mask = tf.where(input_ids != padding_idx, 1, 0)
incremental_indices = (tf.cast(tf.cumsum(mask, axis=1), dtype=mask.dtype) + past_key_values_length) * mask
return tf.cast(incremental_indices, dtype=tf.int64) + padding_idx
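# [Editor's note] Worked example (illustrative, not in the source): with padding_idx = 1, padded slots
# keep the padding index while real tokens are numbered padding_idx + 1, padding_idx + 2, ...
_ids = tf.constant([[5, 6, 7, 1, 1]])                  # here 1 plays the role of the padding token
print(_create_position_ids_from_input_ids(_ids, past_key_values_length=0, padding_idx=1))
# -> [[2, 3, 4, 1, 1]]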
def _create_position_ids_from_inputs_embeds(
inputs_embeds: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int]
) -> tf.Tensor:
"""
Args:
inputs_embeds: 直接提供的嵌入向量张量
Returns:
tf.Tensor: 生成的位置ID张量
"""
input_shape = shape_list(inputs_embeds)[:-1]
sequence_length = input_shape[1]
position_ids = tf.range(padding_idx + 1, sequence_length + padding_idx + 1, dtype=tf.int64)
return tf.broadcast_to(tf.expand_dims(position_ids, axis=0), input_shape) + past_key_values_length
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
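# [Editor's note] Illustrative check (not in the source): for a (batch, target length) of (2, 3), the
# mask is 0 on and below the diagonal and LARGE_NEGATIVE above it, broadcast to shape
# (bsz, 1, tgt_len, tgt_len) so it can be added directly to the attention scores.
_causal = _make_causal_mask(tf.TensorShape([2, 3]))
print(_causal.shape)                                   # (2, 1, 3, 3)
print(_causal[0, 0].numpy())
# roughly:
# [[ 0.e+00 -1.e+08 -1.e+08]
#  [ 0.e+00  0.e+00 -1.e+08]
#  [ 0.e+00  0.e+00  0.e+00]]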
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands the attention mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFXGLMAttention(keras.layers.Layer):
"""来自"Attention Is All You Need"的多头注意力"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
):
pass  # the attention forward pass is elided in this walkthrough
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFXGLMDecoderLayer(keras.layers.Layer):
def __init__(self, config: XGLMConfig, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFXGLMAttention(
embed_dim=self.embed_dim,
num_heads=config.attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
name="self_attn",
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
if config.add_cross_attention:
self.encoder_attn = TFXGLMAttention(
embed_dim=self.embed_dim,
num_heads=config.attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
name="encoder_attn",
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(
epsilon=1e-5, name="encoder_attn_layer_norm"
)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
training: Optional[bool] = False,
) -> tf.Tensor:
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFXGLMMainLayer(keras.layers.Layer):
config_class = XGLMConfig
def __init__(
self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs, **kwargs: Any
) -> None:
super().__init__(*inputs, **kwargs)
self.config = config
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = TFSharedEmbeddings(
config.vocab_size, config.d_model, self.padding_idx, name="embed_tokens"
)
self.offset = 2
self._embed_positions_weights = create_sinusoidal_positions(
num_positions=config.max_position_embeddings + self.offset,
embedding_dim=config.d_model,
padding_idx=config.pad_token_id,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.layers = [TFXGLMDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_layers)]
self.layerdrop = config.layerdrop
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def get_input_embeddings(self) -> TFSharedEmbeddings:
return self.embed_tokens
def set_input_embeddings(self, value: TFSharedEmbeddings) -> None:
self.embed_tokens = value
def _prepare_decoder_attention_mask(
self,
attention_mask: tf.Tensor | None,
input_shape: tf.TensorShape,
past_key_values_length: int,
) -> tf.Tensor:
combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length)
combined_attention_mask = tf.cond(
input_shape[-1] > 1, lambda: combined_attention_mask, lambda: tf.ones_like(combined_attention_mask)
)
if attention_mask is None:
return combined_attention_mask
expand_attention_mask = _expand_mask(attention_mask, tgt_len=input_shape[-1])
return expand_attention_mask + combined_attention_mask
def embed_positions(self, position_ids: np.ndarray | tf.Tensor | None = None) -> tf.Tensor:
position_ids += self.offset
positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
return positions
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs: Any,
):
pass  # the main decoder-stack forward pass is elided in this walkthrough
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFXGLMPreTrainedModel(TFPreTrainedModel):
config_class = XGLMConfig
base_model_prefix = "model"
XGLM_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`XGLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
XGLM_START_DOCSTRING,
)
class TFXGLMModel(TFXGLMPreTrainedModel):
"""
Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`TFXGLMDecoderLayer`]
"""
"""
初始化函数,设置模型的配置和嵌入层参数,继承父类的初始化方法。
Args:
config: XGLMConfig 类型的配置对象
embed_tokens: 可选的 TFSharedEmbeddings 类型的嵌入层参数
*inputs: 可变数量的输入参数
**kwargs: 可变数量的关键字参数
"""
super().__init__(config, *inputs, **kwargs)
self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model")
@unpack_inputs
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs: Any,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
"""
调用方法,用于模型的前向推断。
Args:
input_ids: TFModelInputType 类型或 None,输入的 token IDs
attention_mask: np.ndarray 或 tf.Tensor 或 None,注意力遮罩
position_ids: np.ndarray 或 tf.Tensor 或 None,位置 IDs
encoder_hidden_states: np.ndarray 或 tf.Tensor 或 None,编码器隐藏状态
encoder_attention_mask: np.ndarray 或 tf.Tensor 或 None,编码器注意力遮罩
head_mask: np.ndarray 或 tf.Tensor 或 None,注意力头部遮罩
cross_attn_head_mask: np.ndarray 或 tf.Tensor 或 None,跨注意力头部遮罩
past_key_values: 可选的 Tuple[Tuple[Union[np.ndarray, tf.Tensor]]],过去的键值
inputs_embeds: np.ndarray 或 tf.Tensor 或 None,输入的嵌入
use_cache: 可选的 bool 类型,是否使用缓存
output_attentions: 可选的 bool 类型,是否输出注意力权重
output_hidden_states: 可选的 bool 类型,是否输出隐藏状态
return_dict: 可选的 bool 类型,是否返回字典格式的输出
training: 可选的 bool 类型,默认为 False,是否处于训练模式
**kwargs: 其他关键字参数
Returns:
模型输出,可以是 TFBaseModelOutputWithPastAndCrossAttentions 类型或 tf.Tensor 的元组
"""
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
"""
构建方法,用于建立模型的层次结构。
Args:
input_shape: 可选的输入形状信息
"""
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"""
The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
XGLM_START_DOCSTRING,
)
class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"model.embed_positions.weights",
r"lm_head.weight",
]
_keys_to_ignore_on_save = [
r"model.embed_positions.weights",
]
def __init__(
self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs: Any, **kwargs: Any
) -> None:
super().__init__(config, *inputs, **kwargs)
self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model")
self.lm_head = keras.layers.Dense(
config.vocab_size,
use_bias=False,
kernel_initializer=get_initializer(config.init_std),
name="lm_head",
)
self.config = config
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
position_ids = kwargs.get("position_ids", None)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None and position_ids is None:
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
if past_key_values:
position_ids = tf.expand_dims(position_ids[:, -1], -1)
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@unpack_inputs
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs: Any,
) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
r"""
labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = tf.concat(
[labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))],
axis=-1,
)
loss = self.hf_compute_loss(labels, lm_logits)
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.hidden_size])
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "lm_head.weight":
return tf_weight, "model.embed_tokens.weight"
else:
return (tf_weight,)
.\models\xglm\modeling_xglm.py
""" PyTorch XGLM model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xglm import XGLMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/xglm-564M"
_CONFIG_FOR_DOC = "XGLMConfig"
XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/xglm-564M",
]
XGLM_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`XGLMConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
XGLM_INPUTS_DOCSTRING = r"""
"""
class XGLMSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
bsz, seq_len = position_ids.size()
position_ids += self.offset
max_pos = 2 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
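# [Editor's note] Small usage sketch (not in the source): the module above is buffer-only; given position
# ids of shape (bsz, seq_len) it returns embeddings of shape (bsz, seq_len, embedding_dim). The sizes are
# illustrative; `.clone()` avoids the in-place `position_ids += self.offset` mutating the caller's tensor.
_pos_emb = XGLMSinusoidalPositionalEmbedding(num_positions=16, embedding_dim=8, padding_idx=1)
_position_ids = torch.arange(4).unsqueeze(0)           # shape (1, 4)
print(_pos_emb(_position_ids.clone()).shape)           # torch.Size([1, 4, 8])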
class XGLMAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class XGLMDecoderLayer(nn.Module):
def __init__(self, config: XGLMConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = XGLMAttention(
embed_dim=self.embed_dim,
num_heads=config.attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
if config.add_cross_attention:
self.encoder_attn = XGLMAttention(
embed_dim=self.embed_dim,
num_heads=config.attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
class XGLMPreTrainedModel(PreTrainedModel):
config_class = XGLMConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["XGLMDecoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@add_start_docstrings(
"The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.",
XGLM_START_DOCSTRING,
)
class XGLMModel(XGLMPreTrainedModel):
"""
Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`XGLMDecoderLayer`]
Args:
config: XGLMConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: XGLMConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = XGLMSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
config.pad_token_id,
)
self.layers = nn.ModuleList([XGLMDecoderLayer(config) for _ in range(config.num_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
pass  # the decoder forward body is elided in this walkthrough
@add_start_docstrings(
"""
The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
XGLM_START_DOCSTRING,
)
class XGLMForCausalLM(XGLMPreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = XGLMModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
logits = self.lm_head(outputs[0])
loss = None
if labels is not None:
shift_labels = labels.new_zeros(labels.shape)
shift_labels[:, :-1] = labels[:, 1:].clone()
shift_labels[:, -1] = self.config.pad_token_id
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
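# [Editor's note] Usage sketch, not part of the original file: generating with the PyTorch causal LM.
# `generate` calls prepare_inputs_for_generation above, and _reorder_cache is used when beam search
# reorders the cached key/values. Checkpoint name and generation arguments are illustrative.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
inputs = tokenizer("Today is a nice day and", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=16, num_beams=2)
print(tokenizer.decode(out[0], skip_special_tokens=True))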
.\models\xglm\tokenization_xglm.py
"""Tokenization classes for ."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/xglm-564M": 2048,
}
class XGLMTokenizer(PreTrainedTokenizer):
"""
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# Vocabulary file names, taken from the module-level constant
vocab_files_names = VOCAB_FILES_NAMES
# Mapping from pretrained checkpoints to their vocabulary file URLs
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum model input sizes, taken from the pretrained positional embedding sizes
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the model inputs
model_input_names = ["input_ids", "attention_mask"]
# Initializer: create a new tokenizer instance
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# If sp_model_kwargs is None, default it to an empty dict
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# Compatibility with the original tokenizer
self.num_madeup_words = 7
madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]
# Get the additional_special_tokens list from kwargs, creating an empty list if it is missing
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
# Append every madeup word that is not already in additional_special_tokens
kwargs["additional_special_tokens"] += [
word for word in madeup_words if word not in kwargs["additional_special_tokens"]
]
# Initialize the SentencePieceProcessor with the given keyword arguments
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# Load the vocabulary file into self.sp_model
self.sp_model.Load(str(vocab_file))
# Remember the vocabulary file path
self.vocab_file = vocab_file
# The original fairseq vocabulary and the spm vocabulary must be "aligned":
# Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
# -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
# fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
# spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
# The offset aligns the two vocabularies: the first "real" token "," sits at position 4 in fairseq and at
# position 3 in the spm vocabulary
self.fairseq_offset = 1
# Mimic fairseq's token-to-id alignment for the first 4 tokens
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
# Size of the spm vocabulary
sp_size = len(self.sp_model)
# Map the madeup words to positions after the fairseq vocabulary
madeup_words = {f"<madeupword{i}>": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
self.fairseq_tokens_to_ids.update(madeup_words)
# Build the reverse mapping, from token id to token
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
# Call the parent initializer with the corresponding arguments
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
# Called when pickling the object; returns its serializable state
def __getstate__(self):
state = self.__dict__.copy()
# Set self.sp_model to None because it cannot be pickled directly
state["sp_model"] = None
# Store the serialized model proto of self.sp_model in the state
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
# Called when unpickling the object; restores its state
def __setstate__(self, d):
# Restore the object's state
self.__dict__ = d
# Backwards-compatibility handling
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# Re-initialize self.sp_model from the saved sp_model_proto
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从序列或序列对构建用于序列分类任务的模型输入,通过连接并添加特殊标记。XLM-RoBERTa 序列的格式如下:
- 单序列: `<s> X </s>`
- 序列对: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个 ID 列表,用于序列对。
Returns:
`List[int]`: 带有适当特殊标记的输入 ID 列表。
"""
if token_ids_1 is None:
# 如果只有一个序列,返回带有 SEP 特殊标记的 token_ids_0
return [self.sep_token_id] + token_ids_0
sep = [self.sep_token_id]
# 如果有两个序列,返回连接的序列,每个序列末尾带有两个 SEP 特殊标记
return sep + token_ids_0 + sep + sep + token_ids_1
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从没有添加特殊标记的 token 列表中检索序列 ID。在使用 tokenizer 的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个 ID 列表,用于序列对。
already_has_special_tokens (`bool`, *optional*, 默认为 `False`):
token 列表是否已经使用特殊标记格式化为模型。
Returns:
`List[int]`: 整数列表,范围为 [0, 1]:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
# 如果已经有特殊标记,调用父类方法获取特殊标记掩码
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
# 如果只有一个序列,返回一个序列首部带有特殊标记的掩码
return [1] + ([0] * len(token_ids_0))
# 如果有两个序列,返回连接的序列,每个序列首尾带有特殊标记的掩码
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Define a separator token list containing `self.sep_token_id`
sep = [self.sep_token_id]
# Check if token_ids_1 is None; if so, return a list of zeros based on the length of `sep + token_ids_0`
if token_ids_1 is None:
return len(sep + token_ids_0) * [0]
# If token_ids_1 is provided, return a list of zeros based on the extended length of tokens including separators
return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]
@property
def vocab_size(self):
# Calculate and return the total vocabulary size, including fairseq offsets and made-up words
return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words
def get_vocab(self):
# Create a dictionary mapping from token strings to their corresponding IDs within the vocabulary
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder) # Update with additional tokens from `added_tokens_encoder`
return vocab
def _tokenize(self, text: str) -> List[str]:
# Tokenize the input `text` using `sp_model` and return a list of token strings
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
# Return the offset ID for unknown tokens if SP model returns 0 (indicating unknown token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string."""
# Concatenate tokens into a single string, replacing SPIECE_UNDERLINE with spaces and stripping leading/trailing spaces
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Ensure `save_directory` exists; if not, log an error and return None
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path based on `save_directory` and `filename_prefix`
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current `vocab_file` path is different from `out_vocab_file` and exists, copy `vocab_file` to `out_vocab_file`
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# If `vocab_file` does not exist, write `sp_model.serialized_model_proto()` content to `out_vocab_file`
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
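# [Editor's note] Worked example (not in the source) of the fairseq/spm alignment handled in __init__
# above: control tokens resolve through fairseq_tokens_to_ids, while regular pieces get their
# SentencePiece id shifted by fairseq_offset. The expected values follow the vocabulary table quoted in
# the comments of __init__; the checkpoint name is an assumption.
_tok = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
print(_tok.convert_tokens_to_ids("<pad>"))             # 1, taken from fairseq_tokens_to_ids
print(_tok.convert_tokens_to_ids(","))                 # 4, i.e. spm id 3 + fairseq_offset 1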
.\models\xglm\tokenization_xglm_fast.py
"""Tokenization classes for XGLM."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_xglm import XGLMTokenizer
else:
XGLMTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/xglm-564M": 2048,
}
class XGLMTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
and [`XLNetTokenizer`]. Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = XGLMTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
**kwargs,
"""
):
"""
Compatibility with the original tokenizer.
Set the number of made-up words to 7 and generate a list of made-up words.
Append any new made-up words to the 'additional_special_tokens' in kwargs.
Initialize the superclass with various parameters including vocab_file and additional_special_tokens.
"""
self.num_madeup_words = 7
madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
word for word in madeup_words if word not in kwargs["additional_special_tokens"]
]
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
**kwargs,
)
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
"""
Check if the vocab_file exists to determine if the slow tokenizer can be saved.
Returns True if vocab_file exists, False otherwise.
"""
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs for sequence classification tasks by adding special tokens.
Formats sequences according to XLM-RoBERTa standards.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of IDs for the second sequence (optional).
Returns:
`List[int]`: List of input IDs with special tokens added.
"""
if token_ids_1 is None:
return [self.sep_token_id] + token_ids_0
sep = [self.sep_token_id]
return sep + token_ids_0 + sep + sep + token_ids_1
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs for sequence-pair classification tasks.
Always returns a list of zeros as XLM-RoBERTa does not use token type IDs.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of IDs for the second sequence (optional).
Returns:
`List[int]`: List of zeros (indicating no distinction in token types).
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
return len(sep + token_ids_0) * [0]
return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]
# Save the vocabulary to a file in the given directory
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If this fast tokenizer lacks the information needed to save a slow tokenizer, raise a ValueError
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# If the target directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
# Build the output vocabulary file path, including the optional filename prefix
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocabulary file differs from the output path, copy it there
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return a tuple containing the output vocabulary file path
return (out_vocab_file,)
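# [Editor's note] Illustrative check (not in the source): the fast tokenizer prepends the </s> separator,
# mirroring build_inputs_with_special_tokens above, so encoded sequences start with sep_token_id. This
# relies on the post-processor stored in the hosted tokenizer.json; the checkpoint name is an assumption.
_fast_tok = XGLMTokenizerFast.from_pretrained("facebook/xglm-564M")
print(_fast_tok("Hello world")["input_ids"][0] == _fast_tok.sep_token_id)   # expected: True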
.\models\xglm\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_xglm": ["XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XGLMConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xglm"] = ["XGLMTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xglm_fast"] = ["XGLMTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xglm"] = [
"XGLM_PRETRAINED_MODEL_ARCHIVE_LIST",
"XGLMForCausalLM",
"XGLMModel",
"XGLMPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_xglm"] = [
"FlaxXGLMForCausalLM",
"FlaxXGLMModel",
"FlaxXGLMPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_xglm"] = [
"TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFXGLMForCausalLM",
"TFXGLMModel",
"TFXGLMPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xglm import XGLMTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xglm_fast import XGLMTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xglm import XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, XGLMForCausalLM, XGLMModel, XGLMPreTrainedModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_xglm import (
TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXGLMForCausalLM,
TFXGLMModel,
TFXGLMPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
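# [Editor's note] Behaviour sketch (not in the source, meant to be run from user code): with _LazyModule,
# importing the package is cheap; backend-specific submodules are only imported when a symbol is first
# accessed, and a missing optional dependency only raises at that access, not at import time.
import transformers.models.xglm as xglm_pkg
cfg_cls = xglm_pkg.XGLMConfig           # triggers the import of configuration_xglm only
model_cls = xglm_pkg.XGLMForCausalLM    # triggers the import of modeling_xglm (requires torch)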
.\models\xlm\configuration_xlm.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/config.json",
"FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/config.json",
"FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/config.json",
"FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/config.json",
"FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/config.json",
"FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/config.json",
"FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/config.json",
"FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/config.json",
"FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/config.json",
"FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/config.json",
}
class XLMConfig(PretrainedConfig):
"""
    This is the configuration class to store the configuration of an [`XLMModel`] or a [`TFXLMModel`]. It is used to
    instantiate an XLM model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the
    [FacebookAI/xlm-mlm-en-2048](https://huggingface.co/FacebookAI/xlm-mlm-en-2048) architecture.
    The configuration object inherits from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation of [`PretrainedConfig`] for more information.
    Examples:
    ```
    >>> from transformers import XLMConfig, XLMModel

    >>> # Initializing an XLM configuration
    >>> configuration = XLMConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
model_type = "xlm"
attribute_map = {
"hidden_size": "emb_dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
"n_words": "vocab_size",
}
def __init__(
self,
vocab_size=30145,
emb_dim=2048,
n_layers=12,
n_heads=16,
dropout=0.1,
attention_dropout=0.1,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=1,
use_lang_emb=True,
max_position_embeddings=512,
embed_init_std=2048**-0.5,
layer_norm_eps=1e-12,
init_std=0.02,
bos_index=0,
eos_index=1,
pad_index=2,
unk_index=3,
mask_index=5,
is_encoder=True,
summary_type="first",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
start_n_top=5,
end_n_top=5,
mask_token_id=0,
lang_id=0,
pad_token_id=2,
bos_token_id=0,
**kwargs,
):
"""Constructs XLMConfig."""
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.n_layers = n_layers
self.n_heads = n_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.gelu_activation = gelu_activation
self.sinusoidal_embeddings = sinusoidal_embeddings
self.causal = causal
self.asm = asm
self.n_langs = n_langs
self.use_lang_emb = use_lang_emb
self.layer_norm_eps = layer_norm_eps
self.bos_index = bos_index
self.eos_index = eos_index
self.pad_index = pad_index
self.unk_index = unk_index
self.mask_index = mask_index
self.is_encoder = is_encoder
self.max_position_embeddings = max_position_embeddings
self.embed_init_std = embed_init_std
self.init_std = init_std
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_proj_to_labels = summary_proj_to_labels
self.summary_first_dropout = summary_first_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
self.mask_token_id = mask_token_id
self.lang_id = lang_id
if "n_words" in kwargs:
self.n_words = kwargs["n_words"]
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
class XLMOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
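A quick sketch of what the `inputs` property above returns for the default task; `XLMOnnxConfig` is the class just defined, and the printed mapping is the dynamic-axes spec consumed by the ONNX export utilities:

```python
from transformers import XLMConfig
from transformers.models.xlm.configuration_xlm import XLMOnnxConfig

onnx_config = XLMOnnxConfig(XLMConfig(), task="default")
# Each entry maps an input name to its dynamic axes: batch and sequence here,
# plus a "choice" axis when task == "multiple-choice".
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'}),
#              ('token_type_ids', {0: 'batch', 1: 'sequence'})])
```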
.\models\xlm\convert_xlm_original_pytorch_checkpoint_to_pytorch.py
"""Convert OpenAI GPT checkpoint."""
import argparse
import json
import numpy
import torch
from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
logging.set_verbosity_info()
def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
state_dict = chkpt["model"]
two_levels_state_dict = {}
for k, v in state_dict.items():
if "pred_layer" in k:
two_levels_state_dict[k] = v
else:
two_levels_state_dict["transformer." + k] = v
config = chkpt["params"]
config = {n: v for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))}
vocab = chkpt["dico_word2id"]
vocab = {s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""): i for s, i in vocab.items()}
pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
torch.save(two_levels_state_dict, pytorch_weights_dump_path)
pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(json.dumps(config, indent=2) + "\n")
pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
f.write(json.dumps(vocab, indent=2) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
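Illustrative use of the conversion entry point; the paths below are placeholders, and the function writes the weights (`pytorch_model.bin`), `config.json` and the vocabulary file into the dump folder:

```python
# Hypothetical paths -- point them at a downloaded original XLM checkpoint and
# an empty output directory before running.
convert_xlm_checkpoint_to_pytorch(
    xlm_checkpoint_path="/path/to/xlm_original_checkpoint.pth",
    pytorch_dump_folder_path="/path/to/pytorch_dump_folder",
)
```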
.\models\xlm\modeling_tf_xlm.py
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2]))
def get_masks(slen, lengths, causal, padding_mask=None):
bs = shape_list(lengths)[0]
if padding_mask is not None:
mask = padding_mask
else:
alen = tf.range(slen, dtype=lengths.dtype)
mask = alen < tf.expand_dims(lengths, axis=1)
if causal:
attn_mask = tf.less_equal(
tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)
else:
attn_mask = mask
tf.debugging.assert_equal(shape_list(mask), [bs, slen])
if causal:
tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
return mask, attn_mask
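A small sketch of the two masks produced by `get_masks` above (it relies on the module's own `shape_list` helper; shapes assume a batch of 2 sequences padded to length 4):

```python
import tensorflow as tf

lengths = tf.constant([2, 4])                       # true lengths for a padded 2 x 4 batch
mask, attn_mask = get_masks(slen=4, lengths=lengths, causal=True)
print(mask.shape)       # (2, 4): True while the position is before the sequence end
print(attn_mask.shape)  # (2, 4, 4): per-row lower-triangular mask because causal=True
```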
class TFXLMMultiHeadAttention(keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
super().__init__(**kwargs)
self.layer_id = next(TFXLMMultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0
self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
self.dropout = keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
self.dim = dim
def prune_heads(self, heads):
raise NotImplementedError
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.dim])
class TFXLMTransformerFFN(keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
super().__init__(**kwargs)
self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
self.dropout = keras.layers.Dropout(config.dropout)
self.in_dim = in_dim
self.dim_hidden = dim_hidden
def call(self, input, training=False):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.in_dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.dim_hidden])
@keras_serializable
class TFXLMMainLayer(keras.layers.Layer):
config_class = XLMConfig
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if self.n_langs > 1 and self.use_lang_emb:
with tf.name_scope("lang_embeddings"):
self.lang_embeddings = self.add_weight(
name="embeddings",
shape=[self.n_langs, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "layer_norm_emb", None) is not None:
with tf.name_scope(self.layer_norm_emb.name):
self.layer_norm_emb.build([None, None, self.dim])
for layer in self.attentions:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm1:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
for layer in self.ffns:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm2:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
pass
class TFXLMPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLMConfig
base_model_prefix = "transformer"
@property
def dummy_inputs(self):
inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": inputs_list,
"attention_mask": attns_list,
"langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
}
else:
return {"input_ids": inputs_list, "attention_mask": attns_list}
@dataclass
class TFXLMWithLMHeadModelOutput(ModelOutput):
"""
    Base class for outputs of [`TFXLMWithLMHeadModel`].
    Args:
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention SoftMax, used to compute the weighted average in the self-attention heads.
"""
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
XLM_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving a model, resizing the input embeddings,
    pruning heads etc.).
    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as
    a regular TF 2.0 Keras model and refer to the TF 2.0 documentation for everything related to general usage and
    behavior.
    <Tip>
    TensorFlow models and layers in `transformers` accept two formats as input:
    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.
    The second format is supported because Keras methods prefer it when passing inputs to models and layers. Thanks to
    this support, when using methods like `model.fit()` you can simply pass your inputs and labels in any format that
    `model.fit()` supports. If, however, you want to use the second format outside of Keras methods (for instance when
    creating your own layers or models with the Keras `Functional` API), there are three possibilities you can use to
    gather all the input tensors in the first positional argument:
    - a single tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input tensors, in the order given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input tensors associated with the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) you don't need to worry about
    any of this, as you can just pass inputs like you would to any other Python function!
    </Tip>
    Parameters:
        config ([`XLMConfig`]): Model configuration class with all the parameters of the model. Initializing with a
            config file does not load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING,
)
class TFXLMModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: tf.Tensor | None = None,
langs: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
lengths: tf.Tensor | None = None,
cache: Dict[str, tf.Tensor] | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFBaseModelOutput | Tuple[tf.Tensor]:
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFXLMPredLayer(keras.layers.Layer):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
if config.asm is False:
self.input_embeddings = input_embeddings
else:
raise NotImplementedError
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
"""
The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
@add_start_docstrings(
"""
The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
XLM_START_DOCSTRING,
)
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
self.supports_xla_generation = False
def get_lm_head(self):
return self.pred_layer
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.pred_layer.name
def prepare_inputs_for_generation(self, inputs, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = inputs.shape[0]
mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
inputs = tf.concat([inputs, mask_token], axis=1)
if lang_id is not None:
langs = tf.ones_like(inputs) * lang_id
else:
langs = None
return {"input_ids": inputs, "langs": langs}
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFXLMWithLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
    ) -> Union[TFXLMWithLMHeadModelOutput, Tuple[tf.Tensor]]:
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output)
if not return_dict:
return (outputs,) + transformer_outputs[1:]
return TFXLMWithLMHeadModelOutput(
logits=outputs,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "pred_layer", None) is not None:
with tf.name_scope(self.pred_layer.name):
self.pred_layer.build(None)
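Illustrative shapes only: `prepare_inputs_for_generation` appends one `mask_token_id` per sequence and, since `config.lang_id` is set, builds a matching `langs` tensor. The checkpoint name below is just an example of a loaded `TFXLMWithLMHeadModel`:

```python
import tensorflow as tf
from transformers import TFXLMWithLMHeadModel

lm_model = TFXLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-mlm-en-2048")  # checkpoint name is illustrative
inputs = tf.constant([[12, 47, 9], [31, 8, 2]], dtype=tf.int32)                # (batch=2, seq=3), arbitrary ids
prepared = lm_model.prepare_inputs_for_generation(inputs)
print(prepared["input_ids"].shape)  # (2, 4): one mask_token_id appended per row
print(prepared["langs"].shape)      # (2, 4): filled with config.lang_id
```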
"""
XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
"""
class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
"""
XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
self.logits_proj = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@property
def dummy_inputs(self):
"""
Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
"langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
else:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
"attention mask instead.",
)
lengths = None
transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
flat_langs,
flat_token_type_ids,
flat_position_ids,
lengths,
cache,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.num_labels])
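A sketch of the multiple-choice input layout this head expects: `(batch_size, num_choices, seq_length)` ids are flattened so the shared transformer runs once over `batch_size * num_choices` sequences, and the per-choice scores are reshaped back at the end. The tensors below are stand-ins, not real model outputs:

```python
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 7
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# Flatten choices into the batch dimension before the transformer call.
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
print(flat_input_ids.shape)                                 # (8, 7)

logits = tf.random.normal((batch_size * num_choices, 1))    # stand-in for logits_proj output
print(tf.reshape(logits, (-1, num_choices)).shape)          # (2, 4)
```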
@add_start_docstrings(
"""
XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLM_START_DOCSTRING,
)
class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = keras.layers.Dropout(config.dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_START_DOCSTRING,
)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
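Illustrative only: the QA head projects each token to two values, which are split into start and end logits of shape `(batch_size, seq_length)`, mirroring the `tf.split` / `tf.squeeze` steps in `call` above. The logits tensor below is a random stand-in for the `qa_outputs` projection:

```python
import tensorflow as tf

logits = tf.random.normal((2, 7, 2))                     # (batch, seq, 2), stand-in for qa_outputs output
start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (batch, seq, 1) tensors
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
print(start_logits.shape, end_logits.shape)              # (2, 7) (2, 7)
```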