Transformers Source Code Analysis (Part 79)
.\models\mpnet\tokenization_mpnet_fast.py
"""Fast Tokenization classes for MPNet."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_mpnet import MPNetTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/mpnet-base": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/mpnet-base": {"do_lower_case": True},
}
class MPNetTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = MPNetTokenizer
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="[UNK]",
pad_token="<pad>",
mask_token="<mask>",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
self.do_lower_case = do_lower_case
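As an aside, the normalizer re-sync performed just above can be reproduced in isolation. The sketch below uses a throwaway WordPiece tokenizer instead of a real MPNet checkpoint (the one-entry vocabulary is made up purely for illustration):

```python
import json
from tokenizers import Tokenizer, models, normalizers

# Build a dummy backend tokenizer with a BertNormalizer, mimicking what a saved tokenizer.json holds.
backend = Tokenizer(models.WordPiece({"[UNK]": 0}, unk_token="[UNK]"))
backend.normalizer = normalizers.BertNormalizer(lowercase=True)

# The serialized state is a JSON document with a "type" key plus the normalizer's arguments.
state = json.loads(backend.normalizer.__getstate__())
print(state["type"], state["lowercase"])  # BertNormalizer True

# Rebuild the normalizer with a different casing behaviour, exactly like the constructor above
# does when the saved state disagrees with the do_lower_case argument.
state["lowercase"] = False
normalizer_class = getattr(normalizers, state.pop("type"))
backend.normalizer = normalizer_class(**state)
```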
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on MPNet.
"""
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Builds model inputs by adding special tokens: a single sequence becomes `<s> token_ids_0 </s>`, and a
sequence pair becomes `<s> token_ids_0 </s></s> token_ids_1 </s>`.
Args:
token_ids_0 (list of int):
List of input token IDs.
token_ids_1 (list of int, optional):
Optional second list of token IDs for sequence pairs.
Returns:
list of int: Combined list of token IDs with special tokens.
"""
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
make use of token type IDs, therefore a list of zeros is returned.
Args:
token_ids_0 (List[int]):
List of token IDs.
token_ids_1 (List[int], optional):
Optional second list of token IDs for sequence pairs.
Returns:
List[int]: List of zeros indicating token type IDs.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the vocabulary and related model files to the specified directory.
Args:
save_directory (str):
Directory where the vocabulary will be saved.
filename_prefix (str, optional):
Optional prefix for the saved vocabulary files.
Returns:
Tuple[str]: Tuple containing the paths of the saved files.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
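A short usage sketch of the class above. It assumes network access to download the `microsoft/mpnet-base` files referenced earlier; the token IDs 10, 11, 12 are placeholders:

```python
from transformers import MPNetTokenizerFast

tokenizer = MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")

# Single sequences are wrapped as <s> ... </s>; pairs as <s> A </s></s> B </s>.
ids = tokenizer.build_inputs_with_special_tokens([10, 11], [12])
print(ids[0] == tokenizer.bos_token_id, ids.count(tokenizer.eos_token_id))  # True 3

# token_type_ids are all zeros because MPNet does not use them.
print(tokenizer.create_token_type_ids_from_sequences([10, 11], [12]))  # [0, 0, 0, 0, 0, 0, 0]
```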
.\models\mpnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig"],
"tokenization_mpnet": ["MPNetTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mpnet_fast"] = ["MPNetTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mpnet"] = [
"MPNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"MPNetForMaskedLM",
"MPNetForMultipleChoice",
"MPNetForQuestionAnswering",
"MPNetForSequenceClassification",
"MPNetForTokenClassification",
"MPNetLayer",
"MPNetModel",
"MPNetPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mpnet"] = [
"TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFMPNetEmbeddings",
"TFMPNetForMaskedLM",
"TFMPNetForMultipleChoice",
"TFMPNetForQuestionAnswering",
"TFMPNetForSequenceClassification",
"TFMPNetForTokenClassification",
"TFMPNetMainLayer",
"TFMPNetModel",
"TFMPNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig
from .tokenization_mpnet import MPNetTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mpnet_fast import MPNetTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mpnet import (
MPNET_PRETRAINED_MODEL_ARCHIVE_LIST,
MPNetForMaskedLM,
MPNetForMultipleChoice,
MPNetForQuestionAnswering,
MPNetForSequenceClassification,
MPNetForTokenClassification,
MPNetLayer,
MPNetModel,
MPNetPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mpnet import (
TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFMPNetEmbeddings,
TFMPNetForMaskedLM,
TFMPNetForMultipleChoice,
TFMPNetForQuestionAnswering,
TFMPNetForSequenceClassification,
TFMPNetForTokenClassification,
TFMPNetMainLayer,
TFMPNetModel,
TFMPNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mpt\configuration_mpt.py
"""
Mpt configuration
"""
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING:
pass
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"mosaicml/mpt-7b": "https://huggingface.co/mosaicml/mpt-7b/resolve/main/config.json",
}
class MptAttentionConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`MptAttention`] class. It is used to instantiate
attention layers according to the specified arguments, defining the layers architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MPT
[mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b) architecture. Most of the arguments are kept for backward
compatibility with previous MPT models that are hosted on the Hub (previously with `trust_remote_code=True`).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
attn_pdrop (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers.
attn_impl (`str`, *optional*, defaults to `"torch"`):
The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
clip_qkv (`float`, *optional*):
If not `None`, clip the queries, keys, and values in the attention layer to this value.
softmax_scale (`float`, *optional*, defaults to `None`):
If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
`1/sqrt(hidden_size)`.
prefix_lm (`bool`, *optional*, defaults to `False`):
Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
bi-directionally. Tokens outside the prefix use causal attention.
qk_ln (`bool`, *optional*, defaults to `False`):
Whether to apply layer normalization to the queries and keys in the attention layer.
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
alibi (`bool`, *optional*, defaults to `True`):
Whether or not to use the alibi bias instead of positional embedding.
alibi_bias_max (`int`, *optional*, defaults to 8):
The maximum value of the alibi bias.
"""
def __init__(
self,
attn_type="multihead_attention",
attn_pdrop=0,
attn_impl="torch",
clip_qkv=None,
softmax_scale=None,
prefix_lm=False,
qk_ln=False,
attn_uses_sequence_id=False,
alibi=True,
alibi_bias_max=8,
**kwargs,
):
# Call the parent class's initializer
super().__init__()
# Type of attention mechanism to use
self.attn_type = attn_type
# Dropout probability inside the attention layers
self.attn_pdrop = attn_pdrop
# Attention implementation to use
self.attn_impl = attn_impl
# If set, clip the queries, keys and values in the attention layer to this value
self.clip_qkv = clip_qkv
# If set, scale the softmax inside the attention layer by this value
self.softmax_scale = softmax_scale
# Whether to run the model as a Prefix LM, which requires an extra prefix_mask argument
self.prefix_lm = prefix_lm
# Whether to apply layer normalization to the queries and keys in the attention layer
self.qk_ln = qk_ln
# Whether to restrict attention to tokens sharing the same token_type_ids
self.attn_uses_sequence_id = attn_uses_sequence_id
# Whether to use the alibi bias instead of positional embeddings
self.alibi = alibi
# Maximum value of the alibi bias
self.alibi_bias_max = alibi_bias_max
# Reject unsupported attention types with a ValueError
if attn_type not in ["multihead_attention", "multiquery_attention"]:
raise ValueError(
f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
)
@classmethod
# Load the attention configuration from a pretrained model name or path
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
# Propagate the auth token through kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the configuration dict and the remaining kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# When loading from a full MPT config, pick out the nested attention config
if config_dict.get("model_type") == "mpt":
config_dict = config_dict["attn_config"]
# Warn if the config's model_type does not match this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the configuration object from the dict
return cls.from_dict(config_dict, **kwargs)
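A small sketch of how the attention sub-configuration behaves, constructed directly here rather than loaded from the Hub (the rejected `attn_type` string is made up to trigger the validation above):

```python
from transformers.models.mpt.configuration_mpt import MptAttentionConfig

# Defaults follow the arguments documented above.
attn_config = MptAttentionConfig(attn_pdrop=0.1)
print(attn_config.attn_type, attn_config.alibi)  # multihead_attention True

# Unsupported attention types are rejected by the check in __init__.
try:
    MptAttentionConfig(attn_type="sliding_window_attention")
except ValueError as err:
    print(err)
```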
class MptConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`MptModel`]. It is used to instantiate a Mpt model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to the Mpt-7b architecture
[mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# 定义 Transformer 编码器的配置参数
Args:
d_model (`int`, *optional*, defaults to 2048):
# 嵌入和隐藏状态的维度。
Dimensionality of the embeddings and hidden states.
n_heads (`int`, *optional*, defaults to 16):
# 每个注意力层中的注意力头数量。
Number of attention heads for each attention layer in the Transformer encoder.
n_layers (`int`, *optional*, defaults to 24):
# Transformer 编码器中隐藏层的数量。
Number of hidden layers in the Transformer encoder.
expansion_ratio (`int`, *optional*, defaults to 4):
# MLP 中上/下扩展比率。
The ratio of the up/down scale in the MLP.
max_seq_len (`int`, *optional*, defaults to 2048):
# 模型的最大序列长度。
The maximum sequence length of the model.
vocab_size (`int`, *optional*, defaults to 50368):
# Mpt 模型的词汇量大小。定义了在调用 `MptModel` 时可以表示的不同标记的最大数量。
Vocabulary size of the Mpt model. Defines the maximum number of different tokens that can be represented by
the `inputs_ids` passed when calling [`MptModel`]. Check [this
discussion](https://huggingface.co/bigscience/mpt/discussions/120#633d28389addb8530b406c2a) on how the
`vocab_size` has been defined.
resid_pdrop (`float`, *optional*, defaults to 0.0):
# 在与残差结合之前应用于注意力输出的 dropout 概率。
The dropout probability applied to the attention output before combining with residual.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
# 层归一化层中使用的 epsilon。
The epsilon to use in the layer normalization layers.
emb_pdrop (`float`, *optional*, defaults to 0.0):
# 嵌入层的 dropout 概率。
The dropout probability for the embedding layer.
learned_pos_emb (`bool`, *optional*, defaults to `True`):
# 是否使用学习的位置编码。
Whether to use learned positional embeddings.
attn_config (`dict`, *optional*):
# 用于配置模型注意力模块的字典。
A dictionary used to configure the model's attention module.
init_device (`str`, *optional*, defaults to `"cpu"`):
# 用于参数初始化的设备。为了向后兼容而定义。
The device to use for parameter initialization. Defined for backward compatibility
logit_scale (`float`, *optional*):
# 如果不为 None,则缩放 logits 的值。
If not None, scale the logits by this value.
no_bias (`bool`, *optional*, defaults to `True`):
# 是否在所有线性层中使用偏置。
Whether to use bias in all linear layers.
verbose (`int`, *optional*, defaults to 0):
# 用于日志记录的详细级别。在先前版本的 MPT 模型中用于日志记录。此参数已弃用。
The verbosity level to use for logging. Used in the previous versions of MPT models for logging. This
argument is deprecated.
embedding_fraction (`float`, *optional*, defaults to 1.0):
# 缩放嵌入层梯度的比例。
The fraction to scale the gradients of the embedding layer by.
norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`):
# 要使用的层归一化类型。所有 MPT 模型使用相同的层归一化实现。为了向后兼容而定义。
Type of layer norm to use. All MPT models uses the same layer norm implementation. Defined for backward
compatibility.
use_cache (`bool`, *optional*, defaults to `False`):
# 模型是否应返回最后的 key/values 注意力(并非所有模型都使用)。
Whether or not the model should return the last key/values attentions (not used by all models).
initializer_range (`float`, *optional*, defaults to 0.02):
# 用于初始化所有权重矩阵的截断正态初始化器的标准差。
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example:
```
# 导入 transformers 库中的 MptConfig 和 MptModel 类
>>> from transformers import MptConfig, MptModel
# 初始化一个 MptConfig 实例
>>> # Initializing a Mpt configuration
>>> configuration = MptConfig()
# 使用配置初始化一个模型实例(权重随机生成)
>>> # Initializing a model (with random weights) from the configuration
>>> model = MptModel(configuration)
# 获取模型的配置信息
>>> # Accessing the model configuration
>>> configuration = model.config
# 设定模型类型为 "mpt"
model_type = "mpt"
# Map generic configuration attribute names onto the MPT-specific ones
attribute_map = {
"num_attention_heads": "n_heads",  # number of attention heads maps to n_heads
"hidden_size": "d_model",  # hidden size maps to d_model
"num_hidden_layers": "n_layers",  # number of hidden layers maps to n_layers
}
# MptConfig constructor
def __init__(
self,
d_model: int = 2048,
n_heads: int = 16,
n_layers: int = 24,
expansion_ratio: int = 4,
max_seq_len: int = 2048,
vocab_size: int = 50368,
resid_pdrop: float = 0.0,
layer_norm_epsilon: float = 1e-5,
emb_pdrop: float = 0.0,
learned_pos_emb: bool = True,
attn_config: MptAttentionConfig = None,
init_device: str = "cpu",
logit_scale: Optional[Union[float, str]] = None,
no_bias: bool = True,
verbose: int = 0,
embedding_fraction: float = 1.0,
norm_type: str = "low_precision_layernorm",
use_cache: bool = False,
initializer_range=0.02,
**kwargs,
):
# If no attn_config is given, fall back to a default MptAttentionConfig
if attn_config is None:
self.attn_config = MptAttentionConfig()
# If attn_config is a plain dict, build an MptAttentionConfig from its entries
elif isinstance(attn_config, dict):
self.attn_config = MptAttentionConfig(**attn_config)
else:
self.attn_config = attn_config
# Store the remaining hyper-parameters
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
self.expansion_ratio = expansion_ratio
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.resid_pdrop = resid_pdrop
self.emb_pdrop = emb_pdrop
self.learned_pos_emb = learned_pos_emb
self.init_device = init_device
self.logit_scale = logit_scale
self.no_bias = no_bias
self.verbose = verbose
self.embedding_fraction = embedding_fraction
self.norm_type = norm_type
self.layer_norm_epsilon = layer_norm_epsilon
self.use_cache = use_cache
self.initializer_range = initializer_range
# Forward any remaining keyword arguments to the parent constructor
super().__init__(**kwargs)
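Two details of this constructor are worth checking by hand: a dict `attn_config` is converted to an `MptAttentionConfig`, and `attribute_map` makes the generic attribute names resolve to the MPT-specific ones. A minimal sketch (sizes chosen arbitrarily):

```python
from transformers.models.mpt.configuration_mpt import MptConfig

config = MptConfig(d_model=1024, n_heads=8, attn_config={"attn_pdrop": 0.1})
print(type(config.attn_config).__name__)               # MptAttentionConfig
print(config.hidden_size, config.num_attention_heads)  # 1024 8  (aliases for d_model / n_heads)
```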
.\models\mpt\modeling_mpt.py
"""PyTorch MPT model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_mpt import MptConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "mosaicml/mpt-7b"
_CONFIG_FOR_DOC = "MptConfig"
MPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mosaicml/mpt-7b",
"mosaicml/mpt-7b-storywriter",
"mosaicml/mpt-7b-instruct",
"mosaicml/mpt-7b-8k",
"mosaicml/mpt-7b-8k-instruct",
"mosaicml/mpt-7b-8k-chat",
"mosaicml/mpt-30b",
"mosaicml/mpt-30b-instruct",
"mosaicml/mpt-30b-chat",
]
def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
r"""
Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
"""
alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))
base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
base = base * (alibi_bias_max / num_heads_power_of_2)
slopes = 1.0 / torch.pow(2, base)
slopes = slopes.view(1, num_heads_power_of_2, 1, 1)
if num_heads_power_of_2 != num_heads:
slopes = torch.cat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]
alibi = alibi * slopes
return alibi.squeeze(0)
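A quick shape and value check of the function above, assuming the default `alibi_bias_max=8`. With `num_heads=6` (not a power of two) the slopes are first computed for 8 heads and then re-interleaved, and the result has shape `(num_heads, 1, sequence_length)`:

```python
alibi = build_mpt_alibi_tensor(num_heads=6, sequence_length=5)
print(alibi.shape)  # torch.Size([6, 1, 5])
print(alibi[0, 0])  # tensor([-1.0000, -0.7500, -0.5000, -0.2500, 0.0000]) -> slope 0.25 for head 0
```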
class MptAttention(nn.Module):
"""Multi-head self attention.
Using torch or triton attention implementation enables user to also use additive bias.
多头自注意力模块,使用 torch 或 triton 实现的注意力机制,允许用户使用附加偏置。
"""
def __init__(self, config: MptConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.n_heads = config.n_heads
self.max_seq_length = config.max_seq_len
self.head_dim = self.hidden_size // self.n_heads
self.softmax_scale = config.attn_config.softmax_scale
if self.softmax_scale is None:
self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)
self.attn_dropout_p = config.attn_config.attn_pdrop
self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
def forward(
self,
hidden_states: torch.Tensor,
position_bias: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
):
batch_size, seq_length = hidden_states.shape[:2]
mixed_qkv = self.Wqkv(hidden_states)
query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
if past_key_value is not None:
if len(past_key_value) != 0:
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states)
else:
past_key_value = (key_states, value_states)
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale
query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2]
if position_bias is not None:
if len(position_bias.shape) != 3:
raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
key_length = key_states.shape[-2]
position_bias_query_index = max(0, position_bias.size(1) - query_length)
position_bias_key_index = max(0, position_bias.size(2) - key_length)
position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]
attention_scores = attention_scores + position_bias
if attention_mask is not None:
attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)
attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)
context_states = torch.matmul(attn_weights, value_states)
context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
attn_output = self.out_proj(context_states)
return attn_output, attn_weights, past_key_value
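The tensor reshaping in `forward` can be followed with plain tensors. This is only a shape walk-through with made-up sizes (hidden_size=8, n_heads=2, batch_size=1, seq_length=3); it mirrors the chunk/reshape/softmax pipeline but not the `Wqkv`/`out_proj` projections or the alibi bias:

```python
import torch

batch_size, seq_length, hidden_size, n_heads = 1, 3, 8, 2
head_dim = hidden_size // n_heads

mixed_qkv = torch.randn(batch_size, seq_length, 3 * hidden_size)   # stand-in for Wqkv output
query, key, value = mixed_qkv.chunk(3, dim=2)                       # each (1, 3, 8)
query = query.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)  # (1, 2, 3, 4)
key = key.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)
value = value.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)

softmax_scale = 1 / (hidden_size / n_heads) ** 0.5                   # default 1/sqrt(head_dim)
scores = torch.matmul(query, key.transpose(-1, -2)) * softmax_scale  # (1, 2, 3, 3)
weights = torch.softmax(scores.float(), dim=-1).to(value.dtype)
context = torch.matmul(weights, value)                                # (1, 2, 3, 4)
context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
print(context.shape)  # torch.Size([1, 3, 8])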
class MptMLP(nn.Module):
def __init__(self, config: MptConfig):
super().__init__()
hidden_size = config.hidden_size
self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
self.act = nn.GELU(approximate="none")
self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
self.hidden_dropout = config.attn_config.attn_pdrop
def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
hidden_states = self.act(self.up_proj(hidden_states))
intermediate_output = self.down_proj(hidden_states)
output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
output = output + residual
return output
class MptBlock(nn.Module):
def __init__(self, config: MptConfig):
super().__init__()
hidden_size = config.hidden_size
self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.num_heads = config.n_heads
self.attn = MptAttention(config)
self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.ffn = MptMLP(config)
self.dropout_rate = config.attn_config.attn_pdrop
self.resid_attn_dropout = nn.Dropout(self.dropout_rate)
def forward(
self,
hidden_states: torch.Tensor,
position_bias: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
layernorm_output = self.norm_1(hidden_states)
residual = hidden_states
attn_outputs, attn_weights, past_key_value = self.attn(
layernorm_output,
position_bias=position_bias,
attention_mask=attention_mask,
past_key_value=layer_past,
)
hidden_states = self.resid_attn_dropout(attn_outputs) + residual
layernorm_output = self.norm_2(hidden_states)
residual = hidden_states
output = self.ffn(layernorm_output, residual)
outputs = (output,)
if use_cache:
outputs += (past_key_value,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MptPreTrainedModel(PreTrainedModel):
config_class = MptConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["MptBlock"]
_keys_to_ignore_on_load_missing = [r"lm_head.*."]
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module: nn.Module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, LayerNorm):
if module.bias is not None:
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@staticmethod
def _convert_to_mpt_cache(
past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
"""
Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
"""
batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
batch_size_times_num_heads = batch_size * num_heads
return tuple(
(
layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
)
for layer_past in past_key_value
)
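The cache conversion only flattens the batch and head dimensions; a toy example with dummy shapes (batch_size=2, num_heads=4, head_dim=8, seq_length=5) makes the layout explicit:

```python
import torch

past = (
    (torch.randn(2, 4, 8, 5),   # key:   [batch, heads, head_dim, seq]
     torch.randn(2, 4, 5, 8)),  # value: [batch, heads, seq, head_dim]
)
converted = MptPreTrainedModel._convert_to_mpt_cache(past)
print(converted[0][0].shape)  # torch.Size([8, 8, 5])  -> [batch * heads, head_dim, seq]
print(converted[0][1].shape)  # torch.Size([8, 5, 8])  -> [batch * heads, seq, head_dim]
```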
MPT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MptConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MPT_INPUTS_DOCSTRING = r"""
# Input arguments accepted by the MPT model's forward pass
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
`input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
(`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
`input_ids`.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
`past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
Each element of `past_key_values` is a tuple (past_key, past_value):
- past_key: [batch_size * num_heads, head_dim, kv_length]
- past_value: [batch_size * num_heads, kv_length, head_dim]
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
`past_key_values`).
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.",
MPT_START_DOCSTRING,
)
"""
class MptModel(MptPreTrainedModel):
"""
MPT Model class inheriting from MptPreTrainedModel, initializing the model with given configuration.
Args:
config (MptConfig): The configuration class defining model parameters.
Attributes:
hidden_size (int): Size of the hidden layers.
num_heads (int): Number of attention heads.
wte (nn.Embedding): Word token embeddings.
blocks (nn.ModuleList): List of transformer blocks.
norm_f (LayerNorm): Final layer normalization.
gradient_checkpointing (bool): Flag for gradient checkpointing.
Methods:
get_input_embeddings(): Returns the input embeddings.
build_mpt_alibi_tensor(): Builds alibi tensor for MPT.
set_input_embeddings(new_embeddings): Sets new input embeddings.
forward(): Performs forward pass through the model.
"""
def __init__(self, config: MptConfig):
super().__init__(config)
self.hidden_size = config.hidden_size
self.num_heads = config.n_heads
# Embedding + LN Embedding
self.wte = nn.Embedding(config.vocab_size, self.hidden_size)
# Transformer blocks
self.blocks = nn.ModuleList([MptBlock(config) for _ in range(config.n_layers)])
# Final Layer Norm
self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
# backward compatibility with weights on the Hub
self.norm_f.bias = None
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
"""
Returns:
nn.Embedding: The input word token embeddings.
"""
return self.wte
def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
"""
Builds an alibi tensor for MPT.
Args:
num_heads (int): Number of attention heads.
sequence_length (int): Length of the input sequence.
alibi_bias_max (int, optional): Maximum bias value for alibi tensor. Defaults to 8.
device (torch.device, optional): Device to place alibi tensor on. Defaults to None.
Returns:
torch.Tensor: Alibi tensor for MPT.
"""
return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)
def set_input_embeddings(self, new_embeddings: torch.Tensor):
"""
Sets new input embeddings.
Args:
new_embeddings (torch.Tensor): New input embeddings to be set.
"""
self.wte = new_embeddings
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass through the MPT model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs.
past_key_values (Tuple[Tuple[torch.Tensor, torch.Tensor], ...], optional): Past key-value states for fast decoding.
attention_mask (torch.Tensor, optional): Mask to avoid attention on padding tokens.
inputs_embeds (torch.LongTensor, optional): Optional input embeddings.
use_cache (bool, optional): Whether to use cached key-value states.
output_attentions (bool, optional): Whether to output attention weights.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary as output.
Returns:
BaseModelOutputWithPastAndCrossAttentions: Model output including past and cross attentions.
"""
# Implementation of forward pass is omitted here for brevity
pass
"""
@add_start_docstrings(
"""
The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
MPT_START_DOCSTRING,
)
"""
class MptForCausalLM(MptPreTrainedModel):
"""
MPT Model for Causal Language Modeling, inheriting from MptPreTrainedModel.
Args:
config (MptConfig): The configuration class defining model parameters.
Attributes:
transformer (MptModel): The MPT base model transformer.
lm_head (nn.Linear): Language modeling head.
Methods:
get_output_embeddings(): Returns the output embeddings.
set_output_embeddings(new_embeddings): Sets new output embeddings.
"""
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: MptConfig):
super().__init__(config)
self.transformer = MptModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
"""
Returns:
nn.Linear: The language modeling head.
"""
return self.lm_head
def set_output_embeddings(self, new_embeddings: torch.Tensor):
"""
Sets new output embeddings.
Args:
new_embeddings (torch.Tensor): New output embeddings to be set.
"""
self.lm_head = new_embeddings
def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs,
) -> dict:
# If past_key_values are provided, keep only the new part of input_ids
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# If `inputs_embeds` are passed and there is no cache yet, only use them in the first generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Add past_key_values, use_cache and attention_mask to the model inputs
model_inputs.update(
{
"past_key_values": past_key_values,  # NITS should it be layer_past?
"use_cache": use_cache,
"attention_mask": attention_mask,
}
)
return model_inputs
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
# Forward pass: run the base transformer, project the hidden states to vocabulary logits with
# the LM head, and (if labels are provided) compute the causal language-modeling loss.
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# Determine whether to return a dictionary of outputs
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass input through the transformer model
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from transformer outputs
hidden_states = transformer_outputs[0]
# Generate logits from the language model head
lm_logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
# Calculate loss if labels are provided
if labels is not None:
# Move labels to the same device as logits for model parallelism
labels = labels.to(lm_logits.device)
# Shift logits and labels to align predictions and targets
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
batch_size, seq_length, vocab_size = shift_logits.shape
# Flatten the logits and labels to compute loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
)
# Prepare the output depending on return_dict flag
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Return structured output using CausalLMOutputWithCrossAttentions class
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
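The label shift performed above is the standard causal-LM trick: the logits at position t are scored against the label at position t+1, so the last logit and the first label are dropped. A toy version with random tensors (made-up sizes):

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_length, vocab_size = 1, 5, 11
lm_logits = torch.randn(batch_size, seq_length, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_length))

shift_logits = lm_logits[..., :-1, :].contiguous()  # (1, 4, 11)
shift_labels = labels[..., 1:].contiguous()          # (1, 4)
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```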
def _reorder_cache(
self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
Output shares the same memory storage as `past`.
"""
# Map each cached tensor's device to beam_idx on that device, so every past state can be gathered locally
device_to_beam_idx = {
past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
}
# Re-order every layer's cached keys and values along the batch dimension according to beam_idx
reordered_past = tuple(
(
layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
)
for layer_past in past
)
# Return the re-ordered cache (it shares storage with the input `past`)
return reordered_past
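What the re-ordering does can be seen on a single dummy layer; this mimics the `index_select` above on made-up tensors rather than calling the method itself:

```python
import torch

# Three "beams" of cached states; after a beam-search step that keeps beams 2, 2 and 0,
# every cached tensor is re-indexed along its first dimension with beam_idx.
layer_past = (torch.arange(6.0).view(3, 1, 2), torch.arange(6.0).view(3, 2, 1))
beam_idx = torch.tensor([2, 2, 0])
reordered = tuple(t.index_select(0, beam_idx) for t in layer_past)
print(reordered[0][:, 0, 0])  # tensor([4., 4., 0.])
```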
"""
The MPT Model transformer with a sequence classification head on top (linear layer).
[`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MPT_START_DOCSTRING,
)
class MptForTokenClassification(MptPreTrainedModel):
def __init__(self, config: MptConfig):
super().__init__(config)
self.num_labels = config.num_labels
# Initialize the MPT transformer model with the provided configuration
self.transformer = MptModel(config)
# Determine the dropout rate for the classifier layer based on the provided configuration
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
# Apply dropout regularization to the classifier layer
self.dropout = nn.Dropout(classifier_dropout)
# Create a linear layer for the classification task with output size as specified in the configuration
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
batch_size, seq_length = labels.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
)
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings(
"""
The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MPT_START_DOCSTRING,
)
class MptForQuestionAnswering(MptPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = MptModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
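A toy version of the start-position loss above, showing how the span head's output is split and how out-of-range gold positions are neutralized by clamping them to `ignored_index` (shapes are made up):

```python
import torch

logits = torch.randn(2, 7, 2)                           # (batch, seq, 2) from qa_outputs
start_logits = logits.split(1, dim=-1)[0].squeeze(-1)   # (2, 7)

ignored_index = start_logits.size(1)                     # 7
start_positions = torch.tensor([3, 99]).clamp(0, ignored_index)  # tensor([3, 7])
loss = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)(start_logits, start_positions)
print(loss.item())  # only the first example contributes; the clamped position 7 is ignored
```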
.\models\mpt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_mpt": ["MPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MptConfig", "MptOnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mpt"] = [
"MPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"MptForCausalLM",
"MptModel",
"MptPreTrainedModel",
"MptForSequenceClassification",
"MptForTokenClassification",
"MptForQuestionAnswering",
]
if TYPE_CHECKING:
from .configuration_mpt import MPT_PRETRAINED_CONFIG_ARCHIVE_MAP, MptConfig, MptOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mpt import (
MPT_PRETRAINED_MODEL_ARCHIVE_LIST,
MptForCausalLM,
MptForQuestionAnswering,
MptForSequenceClassification,
MptForTokenClassification,
MptModel,
MptPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mra\configuration_mra.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/mra-base-512-4": "https://huggingface.co/uw-madison/mra-base-512-4/resolve/main/config.json",
}
class MraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MraModel`]. It is used to instantiate an MRA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mra
[uw-madison/mra-base-512-4](https://huggingface.co/uw-madison/mra-base-512-4) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Mra 模型的词汇表大小,定义了在调用 [`MraModel`] 时输入 `inputs_ids` 可以表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度大小。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer 编码器中隐藏层的数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer 编码器中每个注意力层的注意头数量。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer 编码器中“中间”(即前馈)层的维度大小。
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。如果是字符串,支持 `"gelu"`, `"relu"`, `"selu"` 和 `"gelu_new"`。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 512):
该模型可能使用的最大序列长度。通常设置一个大值(例如 512、1024 或 2048)以防万一。
type_vocab_size (`int`, *optional*, defaults to 1):
在调用 [`MraModel`] 时传递的 `token_type_ids` 的词汇表大小。
initializer_range (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
层归一化层使用的 epsilon。
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
位置嵌入的类型。选择 `"absolute"`, `"relative_key"`, `"relative_key_query"` 之一。
block_per_row (`int`, *optional*, defaults to 4):
用于设置高分辨率比例的预算。
approx_mode (`str`, *optional*, defaults to `"full"`):
控制是否使用低分辨率和高分辨率的逼近。设置为 `"full"` 表示同时使用低分辨率和高分辨率,设置为 `"sparse"` 表示仅使用低分辨率。
initial_prior_first_n_blocks (`int`, *optional*, defaults to 0):
最初使用高分辨率的块数。
initial_prior_diagonal_n_blocks (`int`, *optional*, defaults to 0):
使用高分辨率的对角块数。
Example:
>>> from transformers import MraConfig, MraModel
>>> configuration = MraConfig()
>>> model = MraModel(configuration)
>>> configuration = model.config
.\models\mra\convert_mra_pytorch_to_pytorch.py
import argparse
import torch
from transformers import MraConfig, MraForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff.0" in orig_key:
orig_key = orig_key.replace("ff.0", "intermediate.dense")
if "ff.2" in orig_key:
orig_key = orig_key.replace("ff.2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "backbone.backbone.encoders" in orig_key:
orig_key = orig_key.replace("backbone.backbone.encoders", "encoder.layer")
if "cls" not in orig_key:
orig_key = "mra." + orig_key
return orig_key
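Tracing one hypothetical checkpoint key through `rename_key` shows how the replacements compose:

```python
# "model.transformer_0.mha.W_q.weight"
#   strip "model."                     -> "transformer_0.mha.W_q.weight"
#   transformer_0 -> encoder.layer.0   -> "encoder.layer.0.mha.W_q.weight"
#   mha -> attention                   -> "encoder.layer.0.attention.W_q.weight"
#   W_q -> self.query                  -> "encoder.layer.0.attention.self.query.weight"
#   no "cls" in the key, so add "mra." -> "mra.encoder.layer.0.attention.self.query.weight"
print(rename_key("model.transformer_0.mha.W_q.weight"))
```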
def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["mra.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2
return orig_state_dict
def convert_mra_checkpoint(checkpoint_path, mra_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = MraConfig.from_json_file(mra_config_file)
model = MraForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)
print(model.load_state_dict(new_state_dict))
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to Mra pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for Mra model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_mra_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
.\models\mra\modeling_mra.py
""" PyTorch MRA model."""
import math
from pathlib import Path
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.cpp_extension import load
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_ninja_available,
is_torch_cuda_available,
logging,
)
from .configuration_mra import MraConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uw-madison/mra-base-512-4"
_CONFIG_FOR_DOC = "MraConfig"
_TOKENIZER_FOR_DOC = "AutoTokenizer"
MRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uw-madison/mra-base-512-4",
]
mra_cuda_kernel = None
def load_cuda_kernels():
global mra_cuda_kernel
src_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "mra"
def append_root(files):
return [src_folder / file for file in files]
src_files = append_root(["cuda_kernel.cu", "cuda_launch.cu", "torch_extension.cpp"])
mra_cuda_kernel = load("cuda_kernel", src_files, verbose=True)
def sparse_max(sparse_qk_prod, indices, query_num_block, key_num_block):
"""
Computes maximum values for softmax stability.
"""
if len(sparse_qk_prod.size()) != 4:
raise ValueError("sparse_qk_prod must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if sparse_qk_prod.size(2) != 32:
raise ValueError("The size of the second dimension of sparse_qk_prod must be 32.")
if sparse_qk_prod.size(3) != 32:
raise ValueError("The size of the third dimension of sparse_qk_prod must be 32.")
index_vals = sparse_qk_prod.max(dim=-2).values.transpose(-1, -2)
index_vals = index_vals.contiguous()
indices = indices.int()
indices = indices.contiguous()
max_vals, max_vals_scatter = mra_cuda_kernel.index_max(index_vals, indices, query_num_block, key_num_block)
max_vals_scatter = max_vals_scatter.transpose(-1, -2)[:, :, None, :]
return max_vals, max_vals_scatter
def sparse_mask(mask, indices, block_size=32):
if len(mask.size()) != 2:
raise ValueError("mask must be a 2-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if mask.shape[0] != indices.shape[0]:
raise ValueError("mask and indices must have the same size in the zero-th dimension.")
batch_size, seq_len = mask.shape
num_block = seq_len // block_size
batch_idx = torch.arange(indices.size(0), dtype=torch.long, device=indices.device)
mask = mask.reshape(batch_size, num_block, block_size)
mask = mask[batch_idx[:, None], (indices % num_block).long(), :]
return mask
def mm_to_sparse(dense_query, dense_key, indices, block_size=32):
batch_size, query_size, dim = dense_query.size()
_, key_size, dim = dense_key.size()
if query_size % block_size != 0:
raise ValueError("query_size (size of first dimension of dense_query) must be divisible by block_size.")
if key_size % block_size != 0:
raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")
dense_query = dense_query.reshape(batch_size, query_size // block_size, block_size, dim).transpose(-1, -2)
dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)
if len(dense_query.size()) != 4:
raise ValueError("dense_query must be a 4-dimensional tensor.")
if len(dense_key.size()) != 4:
raise ValueError("dense_key must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if dense_query.size(3) != 32:
raise ValueError("The third dimension of dense_query must be 32.")
if dense_key.size(3) != 32:
raise ValueError("The third dimension of dense_key must be 32.")
dense_query = dense_query.contiguous()
dense_key = dense_key.contiguous()
indices = indices.int().contiguous()
return mra_cuda_kernel.mm_to_sparse(dense_query, dense_key, indices.int())
def sparse_dense_mm(sparse_query, indices, dense_key, query_num_block, block_size=32):
batch_size, key_size, dim = dense_key.size()
if key_size % block_size != 0:
raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")
if sparse_query.size(2) != block_size:
raise ValueError("The size of the second dimension of sparse_query must be equal to the block_size.")
if sparse_query.size(3) != block_size:
raise ValueError("The size of the third dimension of sparse_query must be equal to the block_size.")
dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)
if len(sparse_query.size()) != 4:
raise ValueError("sparse_query must be a 4-dimensional tensor.")
if len(dense_key.size()) != 4:
raise ValueError("dense_key must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if dense_key.size(3) != 32:
raise ValueError("The size of the third dimension of dense_key must be 32.")
sparse_query = sparse_query.contiguous()
indices = indices.int()
indices = indices.contiguous()
dense_key = dense_key.contiguous()
dense_qk_prod = mra_cuda_kernel.sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
dense_qk_prod = dense_qk_prod.transpose(-1, -2).reshape(batch_size, query_num_block * block_size, dim)
return dense_qk_prod
def transpose_indices(indices, dim_1_block, dim_2_block):
return ((indices % dim_2_block) * dim_1_block + torch.div(indices, dim_2_block, rounding_mode="floor")).long()
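A quick sanity check of `transpose_indices`: for a 2 x 3 grid of blocks stored row-major, the helper returns the flat index of the same block in the transposed 3 x 2 grid. The values below are worked out by hand.

```
import torch

# Flat indices of a 2 x 3 block grid, row-major: index = row * 3 + col
indices = torch.tensor([[0, 1, 2, 3, 4, 5]])
# In the transposed 3 x 2 grid the same blocks sit at index = col * 2 + row
print(transpose_indices(indices, dim_1_block=2, dim_2_block=3))
# tensor([[0, 2, 4, 1, 3, 5]])
```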
class MraSampledDenseMatMul(torch.autograd.Function):
@staticmethod
def forward(ctx, dense_query, dense_key, indices, block_size):
sparse_qk_prod = mm_to_sparse(dense_query, dense_key, indices, block_size)
ctx.save_for_backward(dense_query, dense_key, indices)
ctx.block_size = block_size
return sparse_qk_prod
@staticmethod
def backward(ctx, grad):
dense_query, dense_key, indices = ctx.saved_tensors
block_size = ctx.block_size
query_num_block = dense_query.size(1) // block_size
key_num_block = dense_key.size(1) // block_size
indices_T = transpose_indices(indices, query_num_block, key_num_block)
grad_key = sparse_dense_mm(grad.transpose(-1, -2), indices_T, dense_query, key_num_block)
grad_query = sparse_dense_mm(grad, indices, dense_key, query_num_block)
return grad_query, grad_key, None, None
@staticmethod
def operator_call(dense_query, dense_key, indices, block_size=32):
return MraSampledDenseMatMul.apply(dense_query, dense_key, indices, block_size)
class MraSparseDenseMatMul(torch.autograd.Function):
@staticmethod
def forward(ctx, sparse_query, indices, dense_key, query_num_block):
sparse_qk_prod = sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
ctx.save_for_backward(sparse_query, indices, dense_key)
ctx.query_num_block = query_num_block
return sparse_qk_prod
@staticmethod
def backward(ctx, grad):
sparse_query, indices, dense_key = ctx.saved_tensors
query_num_block = ctx.query_num_block
key_num_block = dense_key.size(1) // sparse_query.size(-1)
indices_T = transpose_indices(indices, query_num_block, key_num_block)
grad_key = sparse_dense_mm(sparse_query.transpose(-1, -2), indices_T, grad, key_num_block)
grad_query = mm_to_sparse(grad, dense_key, indices)
return grad_query, None, grad_key, None
@staticmethod
def operator_call(sparse_query, indices, dense_key, query_num_block):
return MraSparseDenseMatMul.apply(sparse_query, indices, dense_key, query_num_block)
class MraReduceSum:
@staticmethod
def operator_call(sparse_query, indices, query_num_block, key_num_block):
batch_size, num_block, block_size, _ = sparse_query.size()
if len(sparse_query.size()) != 4:
raise ValueError("sparse_query must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
_, _, block_size, _ = sparse_query.size()
batch_size, num_block = indices.size()
sparse_query = sparse_query.sum(dim=2).reshape(batch_size * num_block, block_size)
batch_idx = torch.arange(indices.size(0), dtype=torch.long, device=indices.device)
global_idxes = (
torch.div(indices, key_num_block, rounding_mode="floor").long() + batch_idx[:, None] * query_num_block
).reshape(batch_size * num_block)
temp = torch.zeros(
(batch_size * query_num_block, block_size), dtype=sparse_query.dtype, device=sparse_query.device
)
output = temp.index_add(0, global_idxes, sparse_query).reshape(batch_size, query_num_block, block_size)
output = output.reshape(batch_size, query_num_block * block_size)
return output
def get_low_resolution_logit(query, key, block_size, mask=None, value=None):
    """
    Compute low resolution approximation.
    """
    batch_size, seq_len, head_dim = query.size()
num_block_per_row = seq_len // block_size
value_hat = None
if mask is not None:
token_count = mask.reshape(batch_size, num_block_per_row, block_size).sum(dim=-1)
query_hat = query.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
key_hat = key.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
if value is not None:
value_hat = value.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
else:
token_count = block_size * torch.ones(batch_size, num_block_per_row, dtype=torch.float, device=query.device)
query_hat = query.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
key_hat = key.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
if value is not None:
value_hat = value.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
low_resolution_logit = torch.matmul(query_hat, key_hat.transpose(-1, -2)) / math.sqrt(head_dim)
low_resolution_logit_row_max = low_resolution_logit.max(dim=-1, keepdims=True).values
if mask is not None:
low_resolution_logit = (
low_resolution_logit - 1e4 * ((token_count[:, None, :] * token_count[:, :, None]) < 0.5).float()
)
return low_resolution_logit, token_count, low_resolution_logit_row_max, value_hat
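The function above is pure PyTorch, so its output shapes can be checked without the CUDA kernel. A minimal sketch with illustrative sizes, calling it without an attention mask or value tensor (they default to `None`, as at the sparse-mode call site below):

```
import torch

block_size = 32
query = torch.randn(2, 4 * block_size, 32)  # (meta_batch, seq_len, head_dim)
key = torch.randn(2, 4 * block_size, 32)
logit, token_count, row_max, value_hat = get_low_resolution_logit(query, key, block_size)
print(logit.shape)        # torch.Size([2, 4, 4]): one logit per (query block, key block) pair
print(token_count.shape)  # torch.Size([2, 4]): tokens per block (block_size when no mask is given)
print(value_hat)          # None, since no value tensor was passed
```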
indices = top_k_vals.indices
if approx_mode == "full":
threshold = top_k_vals.values.min(dim=-1).values
high_resolution_mask = (low_resolution_logit >= threshold[:, None, None]).float()
elif approx_mode == "sparse":
high_resolution_mask = None
else:
raise ValueError(f"{approx_mode} is not a valid approx_model value.")
return indices, high_resolution_mask
"""
使用 Mra 来近似自注意力机制。
"""
if mra_cuda_kernel is None:
return torch.zeros_like(query).requires_grad_()
batch_size, num_head, seq_len, head_dim = query.size()
meta_batch = batch_size * num_head
if seq_len % block_size != 0:
raise ValueError("sequence length must be divisible by the block_size.")
num_block_per_row = seq_len // block_size
query = query.reshape(meta_batch, seq_len, head_dim)
key = key.reshape(meta_batch, seq_len, head_dim)
value = value.reshape(meta_batch, seq_len, head_dim)
if mask is not None:
query = query * mask[:, :, None]
key = key * mask[:, :, None]
value = value * mask[:, :, None]
if approx_mode == "full":
low_resolution_logit, token_count, low_resolution_logit_row_max, value_hat = get_low_resolution_logit(
query, key, block_size, mask, value
)
elif approx_mode == "sparse":
with torch.no_grad():
low_resolution_logit, token_count, low_resolution_logit_row_max, _ = get_low_resolution_logit(
query, key, block_size, mask
)
else:
raise Exception('approx_mode must be "full" or "sparse"')
with torch.no_grad():
low_resolution_logit_normalized = low_resolution_logit - low_resolution_logit_row_max
indices, high_resolution_mask = get_block_idxes(
low_resolution_logit_normalized,
num_blocks,
approx_mode,
initial_prior_first_n_blocks,
initial_prior_diagonal_n_blocks,
)
high_resolution_logit = MraSampledDenseMatMul.operator_call(
query, key, indices, block_size=block_size
) / math.sqrt(head_dim)
max_vals, max_vals_scatter = sparse_max(high_resolution_logit, indices, num_block_per_row, num_block_per_row)
high_resolution_logit = high_resolution_logit - max_vals_scatter
if mask is not None:
high_resolution_logit = high_resolution_logit - 1e4 * (1 - sparse_mask(mask, indices)[:, :, :, None])
high_resolution_attn = torch.exp(high_resolution_logit)
high_resolution_attn_out = MraSparseDenseMatMul.operator_call(
high_resolution_attn, indices, value, num_block_per_row
)
high_resolution_normalizer = MraReduceSum.operator_call(
high_resolution_attn, indices, num_block_per_row, num_block_per_row
)
if approx_mode == "full":
low_resolution_attn = (
torch.exp(low_resolution_logit - low_resolution_logit_row_max - 1e4 * high_resolution_mask)
* token_count[:, None, :]
)
low_resolution_attn_out = (
torch.matmul(low_resolution_attn, value_hat)[:, :, None, :]
.repeat(1, 1, block_size, 1)
.reshape(meta_batch, seq_len, head_dim)
)
low_resolution_normalizer = (
low_resolution_attn.sum(dim=-1)[:, :, None].repeat(1, 1, block_size).reshape(meta_batch, seq_len)
)
log_correction = low_resolution_logit_row_max.repeat(1, 1, block_size).reshape(meta_batch, seq_len) - max_vals
if mask is not None:
log_correction = log_correction * mask
low_resolution_corr = torch.exp(log_correction * (log_correction <= 0).float())
low_resolution_attn_out = low_resolution_attn_out * low_resolution_corr[:, :, None]
low_resolution_normalizer = low_resolution_normalizer * low_resolution_corr
high_resolution_corr = torch.exp(-log_correction * (log_correction > 0).float())
high_resolution_attn_out = high_resolution_attn_out * high_resolution_corr[:, :, None]
high_resolution_normalizer = high_resolution_normalizer * high_resolution_corr
context_layer = (high_resolution_attn_out + low_resolution_attn_out) / (
high_resolution_normalizer[:, :, None] + low_resolution_normalizer[:, :, None] + 1e-6
)
elif approx_mode == "sparse":
context_layer = high_resolution_attn_out / (high_resolution_normalizer[:, :, None] + 1e-6)
else:
raise Exception('config.approx_mode must be "full" or "sparse"')
if mask is not None:
context_layer = context_layer * mask[:, :, None]
context_layer = context_layer.reshape(batch_size, num_head, seq_len, head_dim)
return context_layer
class MraEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
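A minimal sketch of the embedding module in isolation, using a small hypothetical `MraConfig` (sizes are illustrative). Note that the position ids are offset by 2, which is why the position embedding table is created with `max_position_embeddings + 2` rows.

```
import torch
from transformers import MraConfig

# Small illustrative configuration
config = MraConfig(vocab_size=100, hidden_size=32, num_attention_heads=2, num_hidden_layers=1,
                   intermediate_size=64, max_position_embeddings=64)
embeddings = MraEmbeddings(config)
input_ids = torch.randint(0, 100, (2, 16))
print(embeddings(input_ids=input_ids).shape)  # torch.Size([2, 16, 32])
```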
class MraSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
kernel_loaded = mra_cuda_kernel is not None
if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
try:
load_cuda_kernels()
except Exception as e:
logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = (
position_embedding_type if position_embedding_type is not None else config.position_embedding_type
)
self.num_block = (config.max_position_embeddings // 32) * config.block_per_row
self.num_block = min(self.num_block, int((config.max_position_embeddings // 32) ** 2))
self.approx_mode = config.approx_mode
self.initial_prior_first_n_blocks = config.initial_prior_first_n_blocks
self.initial_prior_diagonal_n_blocks = config.initial_prior_diagonal_n_blocks
def transpose_for_scores(self, layer):
new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
layer = layer.view(*new_layer_shape)
return layer.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask=None):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
batch_size, num_heads, seq_len, head_dim = query_layer.size()
attention_mask = 1.0 + attention_mask / 10000.0
attention_mask = (
attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int()
)
gpu_warp_size = 32
if head_dim < gpu_warp_size:
pad_size = batch_size, num_heads, seq_len, gpu_warp_size - head_dim
query_layer = torch.cat([query_layer, torch.zeros(pad_size, device=query_layer.device)], dim=-1)
key_layer = torch.cat([key_layer, torch.zeros(pad_size, device=key_layer.device)], dim=-1)
value_layer = torch.cat([value_layer, torch.zeros(pad_size, device=value_layer.device)], dim=-1)
context_layer = mra2_attention(
query_layer.float(),
key_layer.float(),
value_layer.float(),
attention_mask.float(),
self.num_block,
approx_mode=self.approx_mode,
initial_prior_first_n_blocks=self.initial_prior_first_n_blocks,
initial_prior_diagonal_n_blocks=self.initial_prior_diagonal_n_blocks,
)
if head_dim < gpu_warp_size:
context_layer = context_layer[:, :, :, :head_dim]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer,)
return outputs
class MraSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MraAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = MraSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = MraSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None):
self_outputs = self.self(hidden_states, attention_mask)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MraIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MraOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MraLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = MraAttention(config)
self.add_cross_attention = config.add_cross_attention
self.intermediate = MraIntermediate(config)
self.output = MraOutput(config)
def forward(self, hidden_states, attention_mask=None):
self_attention_outputs = self.attention(hidden_states, attention_mask)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class MraEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([MraLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask)
hidden_states = layer_outputs[0]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
)
class MraPredictionHeadTransform(nn.Module):
    def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class MraLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = MraPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class MraOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = MraLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class MraPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MraConfig
base_model_prefix = "mra"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
"""
Parameters:
config ([`MraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
定义一个多层次的文档字符串,描述了模型输入的各种参数和返回的内容。
"""
@add_start_docstrings(
"The bare MRA Model transformer outputting raw hidden-states without any specific head on top.",
MRA_START_DOCSTRING,
)
class MraModel(MraPreTrainedModel):
"""
MRA模型类,继承自MraPreTrainedModel,用于输出未经任何特定头部处理的原始隐藏状态。
Args:
config (MraConfig): 包含模型配置信息的配置对象。
Attributes:
config (MraConfig): 模型的配置信息对象。
embeddings (MraEmbeddings): MRA模型的嵌入层。
encoder (MraEncoder): MRA模型的编码器层。
"""
def __init__(self, config):
"""
初始化方法,设置模型的各个组件。
Args:
config (MraConfig): 包含模型配置信息的配置对象。
"""
super().__init__(config)
self.config = config
self.embeddings = MraEmbeddings(config)
self.encoder = MraEncoder(config)
self.post_init()
def get_input_embeddings(self):
"""
返回模型的嵌入层的词嵌入。
Returns:
torch.nn.Embedding: 返回模型的嵌入层的词嵌入。
"""
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings("""MRA Model with a `language modeling` head on top.""", MRA_START_DOCSTRING)
class MraForMaskedLM(MraPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.mra = MraModel(config)
self.cls = MraOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
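A minimal sketch of a masked-LM forward pass with a small, randomly initialized configuration (sizes are illustrative). On a machine without the custom CUDA kernel the attention falls back to the zero tensor returned by `mra2_attention`, so the numbers are only useful for checking shapes.

```
import torch
from transformers import MraConfig

config = MraConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2, num_attention_heads=2,
                   intermediate_size=128, max_position_embeddings=256)
model = MraForMaskedLM(config)
input_ids = torch.randint(0, 1000, (2, 64))
outputs = model(input_ids=input_ids, labels=input_ids.clone())
print(outputs.loss, outputs.logits.shape)  # scalar loss, torch.Size([2, 64, 1000])
```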
class MraClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
MRA_START_DOCSTRING,
)
class MraForSequenceClassification(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.classifier = MraClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
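The `problem_type` dispatch above can be summarized in isolation: the loss is chosen from the number of labels and the label dtype. A standalone sketch with made-up labels:

```
import torch

def pick_problem_type(num_labels, labels):
    if num_labels == 1:
        return "regression"                      # MSELoss
    if num_labels > 1 and labels.dtype in (torch.long, torch.int):
        return "single_label_classification"     # CrossEntropyLoss
    return "multi_label_classification"          # BCEWithLogitsLoss

print(pick_problem_type(1, torch.tensor([0.7, -1.2])))        # regression
print(pick_problem_type(3, torch.tensor([0, 2])))             # single_label_classification
print(pick_problem_type(3, torch.tensor([[1.0, 0.0, 1.0]])))  # multi_label_classification
```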
@add_start_docstrings(
"""MRA Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
MRA_START_DOCSTRING,
)
class MraForMultipleChoice(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mra = MraModel(config)
self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_state = outputs[0]
pooled_output = hidden_state[:, 0]
pooled_output = self.pre_classifier(pooled_output)
pooled_output = nn.ReLU()(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
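The reshaping in the multiple-choice head is easiest to see with concrete, illustrative sizes: every choice is flattened into its own row before the encoder, and the per-choice scores are folded back into `(batch, num_choices)` at the end.

```
import torch

batch, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 16): one row per (example, choice)
logits = torch.randn(batch * num_choices, 1)             # one score per flattened row
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): scores grouped back per example
print(flat_input_ids.shape, reshaped_logits.shape)
```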
@add_start_docstrings(
"""MRA Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
MRA_START_DOCSTRING,
)
class MraForTokenClassification(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
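The active-loss masking above simply rewrites the labels of padded positions to `ignore_index` so they do not contribute to the token-classification loss. A standalone sketch with made-up values:

```
import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss()
logits = torch.randn(1, 4, 3)                  # (batch, seq_len, num_labels)
labels = torch.tensor([[2, 0, 1, 1]])
attention_mask = torch.tensor([[1, 1, 0, 0]])  # last two tokens are padding
active_labels = torch.where(
    attention_mask.view(-1) == 1, labels.view(-1), torch.tensor(loss_fct.ignore_index)
)
print(active_labels)                           # tensor([   2,    0, -100, -100])
print(loss_fct(logits.view(-1, 3), active_labels))
```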
@add_start_docstrings(
"""MRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
MRA_START_DOCSTRING,
)
class MraForQuestionAnswering(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\mra\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {"configuration_mra": ["MRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MraConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mra"] = [
"MRA_PRETRAINED_MODEL_ARCHIVE_LIST",
"MraForMaskedLM",
"MraForMultipleChoice",
"MraForQuestionAnswering",
"MraForSequenceClassification",
"MraForTokenClassification",
"MraLayer",
"MraModel",
"MraPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mra import MRA_PRETRAINED_CONFIG_ARCHIVE_MAP, MraConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mra import (
MRA_PRETRAINED_MODEL_ARCHIVE_LIST,
MraForMaskedLM,
MraForMultipleChoice,
MraForQuestionAnswering,
MraForSequenceClassification,
MraForTokenClassification,
MraLayer,
MraModel,
MraPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\mt5\configuration_mt5.py
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxSeq2SeqConfigWithPast
from ...utils import logging
logger = logging.get_logger(__name__)
class MT5Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the mT5
[google/mt5-small](https://huggingface.co/google/mt5-small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "mt5"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=250112,
d_model=512,
d_kv=64,
d_ff=1024,
num_layers=8,
num_decoder_layers=None,
num_heads=6,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
feed_forward_proj="gated-gelu",
is_encoder_decoder=True,
use_cache=True,
tokenizer_class="T5Tokenizer",
tie_word_embeddings=False,
pad_token_id=0,
eos_token_id=1,
decoder_start_token_id=0,
classifier_dropout=0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.d_model = d_model
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_decoder_layers = (
num_decoder_layers if num_decoder_layers is not None else self.num_layers
)
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.dropout_rate = dropout_rate
self.classifier_dropout = classifier_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
self.feed_forward_proj = feed_forward_proj
self.use_cache = use_cache
act_info = self.feed_forward_proj.split("-")
self.dense_act_fn = act_info[-1]
self.is_gated_act = act_info[0] == "gated"
if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
raise ValueError(
f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
"Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
"'gated-gelu' or 'relu'"
)
if feed_forward_proj == "gated-gelu":
self.dense_act_fn = "gelu_new"
super().__init__(
is_encoder_decoder=is_encoder_decoder,
tokenizer_class=tokenizer_class,
tie_word_embeddings=tie_word_embeddings,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
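The `feed_forward_proj` parsing above derives two fields from a single string. A small sketch, checked against the logic shown in `__init__` (including the `"gated-gelu"` special case):

```
from transformers import MT5Config

config = MT5Config(feed_forward_proj="gated-gelu")
print(config.is_gated_act, config.dense_act_fn)  # True gelu_new  ("gated-gelu" is special-cased)
config = MT5Config(feed_forward_proj="relu")
print(config.is_gated_act, config.dense_act_fn)  # False relu
```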
class MT5OnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = {
"input_ids": {0: "batch", 1: "encoder_sequence"},
"attention_mask": {0: "batch", 1: "encoder_sequence"},
}
if self.use_past:
common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence"
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
return common_inputs
@property
def default_onnx_opset(self) -> int:
return 13
@property
def atol_for_validation(self) -> float:
return 5e-4
.\models\mt5\modeling_flax_mt5.py
""" Flax mT5 model."""
import jax.numpy as jnp
from ...utils import logging
from ..t5.modeling_flax_t5 import FlaxT5EncoderModel, FlaxT5ForConditionalGeneration, FlaxT5Model
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "T5Config"
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
"""
Shift input ids one token to the right.
"""
shifted_input_ids = jnp.zeros_like(input_ids)
shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1])
shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id)
shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
return shifted_input_ids
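A small worked example of `shift_tokens_right`: the decoder input starts with `decoder_start_token_id`, drops the last token, and any `-100` label-padding value would be replaced by `pad_token_id` (the values below are illustrative).

```
import jax.numpy as jnp

input_ids = jnp.array([[5, 6, 7, 8]])
print(shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=0))
# [[0 5 6 7]]
```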
class FlaxMT5Model(FlaxT5Model):
r"""
This class overrides [`FlaxT5Model`]. Please check the superclass for the appropriate documentation alongside usage
examples.
Examples:
```
>>> from transformers import FlaxMT5Model, AutoTokenizer
>>> model = FlaxMT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="np")
>>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
>>> hidden_states = outputs.last_hidden_state
```"""
model_type = "mt5"
config_class = MT5Config
class FlaxMT5EncoderModel(FlaxT5EncoderModel):
r"""
This class overrides [`FlaxT5EncoderModel`]. Please check the superclass for the appropriate documentation
alongside usage examples.
Examples:
```
>>> from transformers import FlaxT5EncoderModel, AutoTokenizer
>>> model = FlaxT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
    ```"""

    # Model type identifier: "mt5"
    model_type = "mt5"
    # Configuration class associated with this model
    config_class = MT5Config
# FlaxMT5ForConditionalGeneration for conditional generation, inheriting from FlaxT5ForConditionalGeneration.
# Please check the superclass for the appropriate documentation and usage examples.
class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
    # Model type identifier: "mt5"
    model_type = "mt5"
    # Configuration class: MT5Config
    config_class = MT5Config
.\models\mt5\modeling_mt5.py
""" PyTorch mT5 model."""
import copy
import math
import os
import warnings
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
MT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/mt5-small",
"google/mt5-base",
"google/mt5-large",
"google/mt5-xl",
"google/mt5-xxl",
]
PARALLELIZE_DOCSTRING = r"""
This is an experimental feature and is a subject to change at a moment's notice.
Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
it will evenly distribute blocks across all devices.
"""
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
following number of attention modules:
- mt5-small: 6
- mt5-base: 12
- mt5-large: 24
- mt5-xl: 24
- mt5-xxl: 24
Example:
```
model = MT5ForConditionalGeneration.from_pretrained("mt5-xl")
创建一个 MT5 模型实例,使用预训练的 "mt5-xl" 模型
device_map = {
0: [0, 1, 2],
将 attention 模块映射到四个 GPU 设备上的示例映射表
1: [3, 4, 5, 6, 7, 8, 9],
2: [10, 11, 12, 13, 14, 15, 16],
3: [17, 18, 19, 20, 21, 22, 23],
}
使用给定的设备映射表将模型并行化处理
model.parallelize(device_map)
```
"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
Example:
```
model = MT5ForConditionalGeneration.from_pretrained("Mt5-xl")
device_map = {
0: [0, 1, 2],
1: [3, 4, 5, 6, 7, 8, 9],
2: [10, 11, 12, 13, 14, 15, 16],
3: [17, 18, 19, 20, 21, 22, 23],
}
model.parallelize(device_map)
model.deparallelize()
```
"""
# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->MT5
class MT5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the MT5 style. No bias and no subtraction of mean.
"""
super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # initialize the weight parameter to all ones
        self.variance_epsilon = eps  # epsilon added to the variance for numerical stability
def forward(self, hidden_states):
# MT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
# w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
# half-precision inputs is done in fp32
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)  # compute the variance of the input tensor
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)  # normalize using the variance (RMS-style layer norm)
# convert into half-precision if necessary
if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)  # if the weight is in half precision, cast the hidden states to match
        return self.weight * hidden_states  # return the hidden states scaled by the learned weight
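Because the weight is initialized to ones, a freshly constructed `MT5LayerNorm` is exactly the RMS normalization described in the comments above. A quick check with illustrative sizes:

```
import torch

layer_norm = MT5LayerNorm(hidden_size=8)     # eps defaults to 1e-6
x = torch.randn(2, 3, 8)
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
print(torch.allclose(layer_norm(x), manual))  # True
```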
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->MT5
class MT5DenseActDense(nn.Module):
def __init__(self, config: MT5Config):
super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)  # first linear projection, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)  # second linear projection, without bias
        self.dropout = nn.Dropout(config.dropout_rate)  # dropout layer with the configured rate
        self.act = ACT2FN[config.dense_act_fn]  # activation function taken from the configuration
def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)  # first linear projection
        hidden_states = self.act(hidden_states)  # apply the activation function
        hidden_states = self.dropout(hidden_states)  # apply dropout
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
            hidden_states = hidden_states.to(self.wo.weight.dtype)  # match the dtype of the output projection weight
        hidden_states = self.wo(hidden_states)  # second linear projection
return hidden_states
# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->MT5
class MT5DenseGatedActDense(nn.Module):
    # Initializer, taking an MT5Config object as argument
    def __init__(self, config: MT5Config):
        # Call the parent class initializer
        super().__init__()
        # Linear layer from config.d_model to config.d_ff, without bias
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer from config.d_model to config.d_ff, without bias
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer from config.d_ff back to config.d_model, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # Dropout layer with the configured dropout rate
        self.dropout = nn.Dropout(config.dropout_rate)
        # Pick the activation function named in the configuration
        self.act = ACT2FN[config.dense_act_fn]
    # Forward pass, taking hidden_states as input
    def forward(self, hidden_states):
        # Pass hidden_states through wi_0 and the activation to get the gating branch
        hidden_gelu = self.act(self.wi_0(hidden_states))
        # Pass hidden_states through wi_1 to get the linear branch
        hidden_linear = self.wi_1(hidden_states)
        # Multiply the two branches element-wise
        hidden_states = hidden_gelu * hidden_linear
        # Apply dropout
        hidden_states = self.dropout(hidden_states)
        # To make 8-bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
        # See https://github.com/huggingface/transformers/issues/20287
        # Also make sure the weight is not `int8`, in case users force `_keep_in_fp32_modules` to `None`
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            # Cast hidden_states to the dtype of self.wo.weight
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        # Project back to d_model with self.wo
        hidden_states = self.wo(hidden_states)
        # Return the final hidden_states
        return hidden_states
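A minimal shape check of the gated feed-forward block with a tiny, hypothetical configuration (dropout disabled so the call is deterministic):

```
import torch
from transformers import MT5Config

config = MT5Config(d_model=8, d_ff=16, dropout_rate=0.0)  # feed_forward_proj defaults to "gated-gelu"
ff = MT5DenseGatedActDense(config)
x = torch.randn(2, 4, 8)
print(ff(x).shape)  # torch.Size([2, 4, 8])
```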
# 从 transformers.models.t5.modeling_t5.T5LayerFF 复制并改为 T5->MT5
class MT5LayerFF(nn.Module):
# 初始化函数,接受一个 MT5Config 对象作为参数
def __init__(self, config: MT5Config):
super().__init__()
# 根据配置选择不同的 DenseReluDense 模块
if config.is_gated_act:
self.DenseReluDense = MT5DenseGatedActDense(config)
else:
self.DenseReluDense = MT5DenseActDense(config)
# 初始化 LayerNorm 模块,设定 epsilon 值
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化 Dropout 模块,设定 dropout 率
self.dropout = nn.Dropout(config.dropout_rate)
# 前向传播函数,接受隐藏状态作为输入,返回更新后的隐藏状态
def forward(self, hidden_states):
# 对隐藏状态进行 LayerNorm 处理
forwarded_states = self.layer_norm(hidden_states)
# 通过 DenseReluDense 模块处理规范化后的隐藏状态
forwarded_states = self.DenseReluDense(forwarded_states)
# 使用 Dropout 处理得到的前向传播状态,并与原始隐藏状态相加
hidden_states = hidden_states + self.dropout(forwarded_states)
# 返回更新后的隐藏状态
return hidden_states
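# 补充示意(非 modeling_mt5.py 源码):MT5LayerFF 采用 Pre-LayerNorm 残差结构 ——
# 先归一化、再做前馈变换和 dropout,最后与未归一化的输入相加。这里用标准 LayerNorm
# 和一个简单的两层前馈代替 MT5 的实现,仅说明残差的组织方式,维度均为假设值。
import torch
import torch.nn as nn

norm = nn.LayerNorm(8)
ffn = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 8))
dropout = nn.Dropout(0.1)

x = torch.randn(2, 3, 8)
out = x + dropout(ffn(norm(x)))   # 对应 MT5LayerFF.forward 中 hidden_states + dropout(forwarded_states)
print(out.shape)                  # torch.Size([2, 3, 8])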
# 从 transformers.models.t5.modeling_t5.T5Attention 复制并改为 T5->MT5
class MT5Attention(nn.Module):
# 初始化函数,接受一个 MT5Config 对象和是否包含相对注意力偏置的标志作为参数
def __init__(self, config: MT5Config, has_relative_attention_bias=False):
super().__init__()
# 是否为解码器
self.is_decoder = config.is_decoder
# 是否包含相对注意力偏置
self.has_relative_attention_bias = has_relative_attention_bias
# 相对注意力偏置的桶数
self.relative_attention_num_buckets = config.relative_attention_num_buckets
# 相对注意力的最大距离
self.relative_attention_max_distance = config.relative_attention_max_distance
# 模型的隐藏状态维度
self.d_model = config.d_model
# 键值投影维度
self.key_value_proj_dim = config.d_kv
# 注意力头的数量
self.n_heads = config.num_heads
# Dropout 率
self.dropout = config.dropout_rate
# 内部维度,即头数乘以键值投影维度
self.inner_dim = self.n_heads * self.key_value_proj_dim
# 初始化查询、键、值和输出的线性变换层,无偏置
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
# 如果有相对注意力偏置,初始化相对注意力偏置的嵌入层
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
# 初始化被剪枝的注意力头集合和梯度检查点标志
self.pruned_heads = set()
self.gradient_checkpointing = False
# 剪枝指定的注意力头
def prune_heads(self, heads):
if len(heads) == 0:
return
# 找到可剪枝的注意力头和对应索引
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
# 剪枝线性层
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
# 更新超参数
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - 相对位置,表示从当前位置到关注位置的距离
bidirectional: a boolean - 是否为双向注意力
num_buckets: an integer - 桶的数量,用于将相对位置映射到桶编号
max_distance: an integer - 最大距离,超过此距离的相对位置映射到同一个桶
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
返回一个与 relative_position 形状相同的张量,包含范围在 [0, num_buckets) 内的整数值
"""
relative_buckets = 0 # 初始化相对位置桶号为0
# 如果是双向注意力,则将桶数减半,并根据 relative_position 的正负分别计算桶号偏移
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
# 如果是单向注意力,将 relative_position 转换为非负的数值
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
# 现在 relative_position 范围在 [0, inf)
# 小于 max_exact 的相对位置使用线性增量的桶
max_exact = num_buckets // 2
is_small = relative_position < max_exact
# 大于 max_exact 的相对位置使用对数增量的桶,映射到 [max_exact, num_buckets-1] 范围内
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
# 根据相对位置大小选择合适的桶号
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets # 返回计算得到的相对位置桶号张量
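# 补充示意(非 modeling_mt5.py 源码):直接对一小段相对位置调用该静态方法,
# 观察"小距离线性分桶、大距离对数分桶、正负方向各占一半桶"的效果;假设 MT5Attention 已在当前作用域定义。
import torch

relative_position = torch.arange(-10, 11).unsqueeze(0)   # 形状 (1, 21) 的相对位置样例
buckets = MT5Attention._relative_position_bucket(
    relative_position, bidirectional=True, num_buckets=32, max_distance=128
)
print(buckets)   # 正负方向映射到不同的桶区间,绝对距离越大桶编号增长越慢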
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
# 如果未指定设备,则使用相对注意力偏置权重张量的设备
if device is None:
device = self.relative_attention_bias.weight.device
# 创建表示上下文位置的张量,范围为[0, query_length-1]
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
# 创建表示记忆位置的张量,范围为[0, key_length-1]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
# 计算相对位置偏差,形状为(query_length, key_length)
relative_position = memory_position - context_position
# 将相对位置映射到桶中,返回形状为(query_length, key_length)的桶索引张量
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
# 使用相对位置桶索引获取相对注意力偏置值,形状为(query_length, key_length, num_heads)
values = self.relative_attention_bias(relative_position_bucket)
# 调整张量维度顺序以匹配Transformer的注意力头结构,形状为(1, num_heads, query_length, key_length)
values = values.permute([2, 0, 1]).unsqueeze(0)
# 返回相对位置注意力偏置张量
return values
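# 补充示意(非 modeling_mt5.py 源码):compute_bias 的输出形状为 (1, num_heads, query_length, key_length),
# 在注意力打分后与 (batch, num_heads, query_length, key_length) 的得分广播相加,再做 softmax;形状均为假设值。
import torch

batch, n_heads, q_len, k_len = 2, 4, 5, 5
scores = torch.randn(batch, n_heads, q_len, k_len)      # 示意:Q @ K^T 得到的注意力得分
position_bias = torch.randn(1, n_heads, q_len, k_len)   # 示意:compute_bias 输出的形状
attn_weights = torch.softmax(scores + position_bias, dim=-1)   # 偏置沿 batch 维广播后相加
print(attn_weights.shape)   # torch.Size([2, 4, 5, 5])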
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5
class MT5LayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
# 初始化自注意力层对象,使用MT5Attention进行自注意力计算
self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
# 初始化层归一化对象,用于规范化隐藏状态
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化Dropout层,用于随机失活以防止过拟合
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
# 对输入的隐藏状态进行层归一化处理
normed_hidden_states = self.layer_norm(hidden_states)
# 使用SelfAttention对象计算自注意力,得到注意力输出
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
# 将原始隐藏状态与注意力输出相加,并且应用Dropout
hidden_states = hidden_states + self.dropout(attention_output[0])
# 准备输出,如果需要返回注意力权重,则包含在输出中
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->MT5
class MT5LayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化跨注意力层对象,使用MT5Attention进行编码-解码注意力计算
self.EncDecAttention = MT5Attention(config, has_relative_attention_bias=False)
# 初始化层归一化对象,用于规范化隐藏状态
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化Dropout层,用于随机失活以防止过拟合
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
# 对输入的隐藏状态进行层归一化处理
normed_hidden_states = self.layer_norm(hidden_states)
# 使用EncDecAttention对象计算编码-解码注意力,得到注意力输出
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
# 将原始隐藏状态与注意力输出相加,并且应用Dropout
layer_output = hidden_states + self.dropout(attention_output[0])
# 准备输出,如果需要返回注意力权重,则包含在输出中
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.t5.modeling_t5.T5Block with T5->MT5
class MT5Block(nn.Module):
# 初始化方法,用于创建一个 MT5Block 的实例
def __init__(self, config, has_relative_attention_bias=False):
# 调用父类的初始化方法
super().__init__()
# 根据配置设置是否为解码器
self.is_decoder = config.is_decoder
# 创建一个空的模块列表用于存储层的组件
self.layer = nn.ModuleList()
# 向模块列表中添加自注意力层,并传入配置和是否有相对注意力偏置的参数
self.layer.append(MT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
# 如果是解码器,再向模块列表中添加跨注意力层
if self.is_decoder:
self.layer.append(MT5LayerCrossAttention(config))
# 向模块列表中添加前馈神经网络层
self.layer.append(MT5LayerFF(config))
# 前向传播方法,用于计算模型的输出
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
# 以下片段属于将 TensorFlow 检查点权重加载进 PyTorch 模型的辅助逻辑:首先导入必要的模块和库
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
# 如果导入失败,记录错误信息并抛出异常
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
# 获取 TensorFlow checkpoint 文件的绝对路径
tf_path = os.path.abspath(tf_checkpoint_path)
# 打印日志,显示正在转换的 TensorFlow checkpoint 的路径
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# 从 TensorFlow 模型中加载权重
init_vars = tf.train.list_variables(tf_path)
names = []
tf_weights = {}
# 遍历初始化变量列表,加载每个权重并存储到字典中
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
tf_weights[name] = array
# 打印日志,显示未复制到 PyTorch 模型的权重名称
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
# 返回加载权重后的 PyTorch 模型
return model
# 定义一个方法 `_shift_right`,接受一个输入的张量 `input_ids`
def _shift_right(self, input_ids):
# 从配置中获取解码器起始标记的 ID
decoder_start_token_id = self.config.decoder_start_token_id
# 从配置中获取填充标记的 ID
pad_token_id = self.config.pad_token_id
# 如果解码器起始标记的 ID 未定义,则抛出数值错误
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In MT5 it is usually set to the pad_token_id. "
"See MT5 docs for more information."
)
# 将输入向右移动一位
if is_torch_fx_proxy(input_ids):
# 对于 Torch FX 代理,不支持原生的项目赋值
# 创建一个填充解码器起始标记 ID 的张量,并拼接在去掉最后一个 token 的输入张量之前
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
# 使用 `new_zeros` 创建与输入张量相同形状的零张量
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
# 将输入张量向右移动一位,并将解码器起始标记 ID 放在开头
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
# 如果填充标记 ID 未定义,则抛出数值错误
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# 将标签中可能存在的 -100 值替换为 `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
# 返回向右移动后的输入张量
return shifted_input_ids
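# 补充示意(非 modeling_mt5.py 源码):用一个很小的标签张量演示 _shift_right 的效果 ——
# 在开头放入 decoder_start_token_id,丢弃最后一个 token,并把 -100 替换为 pad_token_id(MT5 中二者通常都是 0)。
import torch

labels = torch.tensor([[13, -100, 42, 6]])
decoder_start_token_id, pad_token_id = 0, 0

shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = decoder_start_token_id
shifted.masked_fill_(shifted == -100, pad_token_id)
print(shifted)   # tensor([[ 0, 13,  0, 42]])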
# Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5
class MT5Stack(MT5PreTrainedModel):
def __init__(self, config, embed_tokens=None):
super().__init__(config)
# 初始化 MT5Stack 类的实例
self.embed_tokens = embed_tokens # 嵌入令牌,用于输入的嵌入表示
self.is_decoder = config.is_decoder # 是否为解码器模式
# 创建由多个 MT5Block 组成的模块列表,仅第一个块带有相对注意力偏置
self.block = nn.ModuleList(
[MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) # 最终的层归一化
self.dropout = nn.Dropout(config.dropout_rate) # 随机失活率
# 初始化权重并应用最终处理
self.post_init()
# 模型并行化相关设置
self.model_parallel = False # 模型是否并行化
self.device_map = None # 设备映射表
self.gradient_checkpointing = False # 梯度检查点
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`MT5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
" with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
" 'block.1': 1, ...}",
FutureWarning,
)
# 检查设备映射的有效性
self.device_map = (
get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
)
assert_device_map(self.device_map, len(self.block)) # 断言设备映射合法性
self.model_parallel = True # 开启模型并行化
self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
self.last_device = "cuda:" + str(max(self.device_map.keys()))
# 将每个块加载到指定设备
for k, v in self.device_map.items():
for layer in v:
cuda_device = "cuda:" + str(k)
self.block[layer] = self.block[layer].to(cuda_device)
# 将嵌入令牌加载到第一个设备
self.embed_tokens = self.embed_tokens.to(self.first_device)
# 将最终层归一化加载到最后一个设备
self.final_layer_norm = self.final_layer_norm.to(self.last_device)
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.model_parallel = False # 关闭模型并行化
self.device_map = None # 清空设备映射表
self.first_device = "cpu" # 第一个设备设置为 CPU
self.last_device = "cpu" # 最后一个设备设置为 CPU
# 将每个块加载到 CPU
for i in range(len(self.block)):
self.block[i] = self.block[i].to("cpu")
self.embed_tokens = self.embed_tokens.to("cpu") # 将嵌入令牌加载到 CPU
self.final_layer_norm = self.final_layer_norm.to("cpu") # 将最终层归一化加载到 CPU
torch.cuda.empty_cache() # 清空 CUDA 缓存
def get_input_embeddings(self):
return self.embed_tokens # 返回嵌入令牌
# 设置模型输入的嵌入向量
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
# 定义模型的前向传播函数,接收多个参数用于推理或训练
def forward(
self,
input_ids=None, # 输入的token IDs
attention_mask=None, # 注意力掩码,指示模型在计算注意力时忽略某些token
encoder_hidden_states=None, # 编码器的隐藏状态,用于注意力机制
encoder_attention_mask=None, # 编码器的注意力掩码,指示编码器在计算注意力时忽略某些token
inputs_embeds=None, # 替代input_ids的嵌入向量输入
head_mask=None, # 头部掩码,用于遮蔽某些注意力头部的输出
cross_attn_head_mask=None, # 用于跨注意力的头部掩码
past_key_values=None, # 用于存储过去的键值对,以便支持自回归生成
use_cache=None, # 控制是否使用缓存
output_attentions=None, # 是否输出注意力权重
output_hidden_states=None, # 是否输出所有隐藏状态
return_dict=None, # 是否以字典形式返回输出
# MT5_START_DOCSTRING 是一个长字符串,用来描述 MT5 模型的相关信息和特性,包括其论文引用、模型结构等详细信息。
MT5_START_DOCSTRING = r"""
The MT5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
text-to-text denoising generative setting.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MT5Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# MT5_INPUTS_DOCSTRING 用于描述 MT5 模型前向传播的输入参数,此处为空字符串。
MT5_INPUTS_DOCSTRING = r"""
"""
# MT5_ENCODER_INPUTS_DOCSTRING 用于描述 MT5 编码器模型前向传播的输入参数。
MT5_ENCODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
# 输入序列标记在词汇表中的索引。MT5 模型具有相对位置嵌入,因此可以在右侧和左侧都进行填充。
# 可以使用 [`AutoTokenizer`] 获取索引。详见 [`PreTrainedTokenizer.encode`] 和 [`PreTrainedTokenizer.__call__`]。
# 想要了解如何为预训练准备 `input_ids`,请参考 [MT5 Training](./mt5#training)。
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
# 遮盖掩码,避免在填充标记索引上执行注意力操作。遮盖值在 `[0, 1]` 中选择:
# - 1 表示**未遮盖**的标记,
# - 0 表示**遮盖**的标记。
# [什么是注意力遮盖?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 遮头掩码,用于使自注意力模块的特定头部失效。遮盖值在 `[0, 1]` 中选择:
# - 1 表示头部**未遮盖**,
# - 0 表示头部**遮盖**。
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
# 可选地,您可以直接传递嵌入表示,而不是传递 `input_ids`。如果您希望更多控制如何将 `input_ids` 索引转换为关联向量,
# 则这很有用,而不是使用模型的内部嵌入查找矩阵。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关详细信息,请参见返回张量下的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关详细信息,请参见返回张量下的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通元组。
# FutureWarning 使用的警告消息:head_mask 参数已拆分为 head_mask 和 decoder_head_mask 两个参数
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
# 定义 MT5Model 类,用于输出没有特定输出头的原始隐藏状态
@add_start_docstrings(
"The bare MT5 Model transformer outputting raw hidden-states without any specific head on top.",
MT5_START_DOCSTRING,
)
class MT5Model(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5Model, AutoTokenizer
>>> model = MT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="pt")
>>> labels = tokenizer(text_target=summary, return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```
"""
# 模型类型为 "mt5"
model_type = "mt5"
# 配置类为 MT5Config
config_class = MT5Config
# 在加载时忽略的意外键列表
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
# 共享权重键的列表
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# 从 transformers.models.t5.modeling_t5.T5Model.__init__ 复制并修改为 MT5Model
def __init__(self, config: MT5Config):
super().__init__(config)
# 创建一个共享的嵌入层
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制并修改编码器配置
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建编码器实例
self.encoder = MT5Stack(encoder_config, self.shared)
# 复制并修改解码器配置
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
# 创建解码器实例
self.decoder = MT5Stack(decoder_config, self.shared)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设置
self.model_parallel = False
self.device_map = None
@add_start_docstrings(PARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5Model.parallelize 复制
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
# 使用给定的 DEPARALLELIZE_DOCSTRING 添加文档字符串,这是从 transformers.models.t5.modeling_t5.T5Model.deparallelize 复制过来的
def deparallelize(self):
# 发出警告,说明此方法即将在 Transformers 的 v5 版本中删除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 调用编码器的 deparallelize 方法
self.encoder.deparallelize()
# 调用解码器的 deparallelize 方法
self.decoder.deparallelize()
# 将编码器移动到 CPU
self.encoder = self.encoder.to("cpu")
# 将解码器移动到 CPU
self.decoder = self.decoder.to("cpu")
# 将 model_parallel 标志设置为 False
self.model_parallel = False
# 将 device_map 设置为 None
self.device_map = None
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
# 从 transformers.models.t5.modeling_t5.T5Model.get_input_embeddings 复制而来
def get_input_embeddings(self):
# 返回共享的输入嵌入层
return self.shared
# Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
# 从 transformers.models.t5.modeling_t5.T5Model.set_input_embeddings 复制而来
def set_input_embeddings(self, new_embeddings):
# 设置共享的输入嵌入层为新的嵌入
self.shared = new_embeddings
# 调用编码器的 set_input_embeddings 方法设置新的嵌入
self.encoder.set_input_embeddings(new_embeddings)
# 调用解码器的 set_input_embeddings 方法设置新的嵌入
self.decoder.set_input_embeddings(new_embeddings)
# Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
# 从 transformers.models.t5.modeling_t5.T5Model.get_encoder 复制而来
def get_encoder(self):
# 返回编码器
return self.encoder
# Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
# 从 transformers.models.t5.modeling_t5.T5Model.get_decoder 复制而来
def get_decoder(self):
# 返回解码器
return self.decoder
# Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
# 从 transformers.models.t5.modeling_t5.T5Model._prune_heads 复制而来
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# 遍历需要修剪的层和头部的字典
for layer, heads in heads_to_prune.items():
# 在编码器的特定层的注意力头部上执行修剪操作
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
# 从 transformers.models.t5.modeling_t5.T5Model.forward 复制过来,但将 T5->MT5, t5->mt5
# 添加开始的文档字符串和替换返回文档字符串的注解
# 定义一个方法 `forward`,用于模型的前向传播
def forward(
self,
# 输入序列的标识符,可以是一个长整型张量,可选参数
input_ids: Optional[torch.LongTensor] = None,
# 注意力掩码,可以是一个浮点数张量,可选参数
attention_mask: Optional[torch.FloatTensor] = None,
# 解码器的输入序列的标识符,可以是一个长整型张量,可选参数
decoder_input_ids: Optional[torch.LongTensor] = None,
# 解码器的注意力掩码,可以是一个布尔张量,可选参数
decoder_attention_mask: Optional[torch.BoolTensor] = None,
# 头部掩码,可以是一个浮点数张量,可选参数
head_mask: Optional[torch.FloatTensor] = None,
# 解码器的头部掩码,可以是一个浮点数张量,可选参数
decoder_head_mask: Optional[torch.FloatTensor] = None,
# 跨注意力头部掩码,可以是一个张量,可选参数
cross_attn_head_mask: Optional[torch.Tensor] = None,
# 编码器的输出,可以是一系列浮点数张量的元组,可选参数
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
# 过去键值对,可以是一系列浮点数张量的元组,可选参数
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
# 输入嵌入,可以是一个张量,可选参数
inputs_embeds: Optional[torch.Tensor] = None,
# 解码器的输入嵌入,可以是一个张量,可选参数
decoder_inputs_embeds: Optional[torch.Tensor] = None,
# 是否使用缓存,布尔值,可选参数
use_cache: Optional[bool] = None,
# 是否输出注意力,布尔值,可选参数
output_attentions: Optional[bool] = None,
# 是否输出隐藏状态,布尔值,可选参数
output_hidden_states: Optional[bool] = None,
# 是否返回字典,布尔值,可选参数
return_dict: Optional[bool] = None,
# 使用装饰器为类添加文档字符串,描述其作为基于 MT5 模型的带有语言建模头部的条件生成模型的特性
@add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
class MT5ForConditionalGeneration(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5ForConditionalGeneration, AutoTokenizer
>>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```"""
# 模型类型设定为 "mt5"
model_type = "mt5"
# 配置类设定为 MT5Config
config_class = MT5Config
# 加载时忽略的键列表,用于处理未预期的键
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
# 共享权重的键列表
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ 复制并替换 T5 为 MT5
def __init__(self, config: MT5Config):
super().__init__(config)
# 设置模型维度为 config.d_model
self.model_dim = config.d_model
# 创建共享的嵌入层,用于词汇表大小和模型维度
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制编码器配置,将其设定为非解码器
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建 MT5 编码器堆栈
self.encoder = MT5Stack(encoder_config, self.shared)
# 复制解码器配置,将其设定为解码器
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
# 创建 MT5 解码器堆栈
self.decoder = MT5Stack(decoder_config, self.shared)
# 创建线性层用于语言建模头部,输入维度为 config.d_model,输出维度为 config.vocab_size,无偏置
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设定为 False
self.model_parallel = False
# 设备映射设定为 None
self.device_map = None
# 使用装饰器添加并行化文档字符串
@add_start_docstrings(PARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize 复制
def parallelize(self, device_map=None):
# 发出警告,提醒 `T5ForConditionalGeneration.parallelize` 方法将在 Transformers v5 中移除
warnings.warn(
"`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
" should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
" provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
" {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
FutureWarning,
)
# 根据 encoder.block 的数量和当前 CUDA 设备数量生成设备映射,如果未提供 device_map 则使用生成的映射
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
# 检查设备映射的有效性
assert_device_map(self.device_map, len(self.encoder.block))
# 并行化编码器
self.encoder.parallelize(self.device_map)
# 并行化解码器
self.decoder.parallelize(self.device_map)
# 将语言模型头部移动到解码器的第一个设备上
self.lm_head = self.lm_head.to(self.decoder.first_device)
# 设置模型并行化标志为 True
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize 复制而来
def deparallelize(self):
# 发出警告,提醒 `deparallelize` 方法将在 Transformers v5 中移除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 反并行化编码器
self.encoder.deparallelize()
# 反并行化解码器
self.decoder.deparallelize()
# 将编码器移动到 CPU
self.encoder = self.encoder.to("cpu")
# 将解码器移动到 CPU
self.decoder = self.decoder.to("cpu")
# 将语言模型头部移动到 CPU
self.lm_head = self.lm_head.to("cpu")
# 设置模型并行化标志为 False
self.model_parallel = False
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings 复制而来
def get_input_embeddings(self):
# 返回共享的输入嵌入
return self.shared
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings 复制而来
def set_input_embeddings(self, new_embeddings):
# 设置共享的输入嵌入
self.shared = new_embeddings
# 设置编码器的输入嵌入
self.encoder.set_input_embeddings(new_embeddings)
# 设置解码器的输入嵌入
self.decoder.set_input_embeddings(new_embeddings)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings 复制而来
def set_output_embeddings(self, new_embeddings):
# 设置语言模型头部的输出嵌入
self.lm_head = new_embeddings
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings 复制而来
def get_output_embeddings(self):
# 返回语言模型头部的输出嵌入
return self.lm_head
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder 复制而来
def get_encoder(self):
# 返回编码器
return self.encoder
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder 复制而来
def get_decoder(self):
# 返回解码器
return self.decoder
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward 复制而来,定义了 MT5 模型的前向传播方法
def forward(
self,
input_ids: Optional[torch.LongTensor] = None, # 输入的 token IDs,类型为可选的长整型张量
attention_mask: Optional[torch.FloatTensor] = None, # 注意力掩码,类型为可选的浮点数张量
decoder_input_ids: Optional[torch.LongTensor] = None, # 解码器的输入 token IDs,类型为可选的长整型张量
decoder_attention_mask: Optional[torch.BoolTensor] = None, # 解码器的注意力掩码,类型为可选的布尔张量
head_mask: Optional[torch.FloatTensor] = None, # 头部掩码,类型为可选的浮点数张量
decoder_head_mask: Optional[torch.FloatTensor] = None, # 解码器头部掩码,类型为可选的浮点数张量
cross_attn_head_mask: Optional[torch.Tensor] = None, # 跨注意力头部掩码,类型为可选的张量
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 编码器的输出,类型为可选的张量元组
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 过去的键值对,类型为可选的张量元组
inputs_embeds: Optional[torch.FloatTensor] = None, # 输入的嵌入,类型为可选的浮点数张量
decoder_inputs_embeds: Optional[torch.FloatTensor] = None, # 解码器输入的嵌入,类型为可选的浮点数张量
labels: Optional[torch.LongTensor] = None, # 标签,类型为可选的长整型张量
use_cache: Optional[bool] = None, # 是否使用缓存,类型为可选的布尔值
output_attentions: Optional[bool] = None, # 是否输出注意力,类型为可选的布尔值
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,类型为可选的布尔值
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,类型为可选的布尔值
):
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation 复制而来,准备生成过程中的输入
def prepare_inputs_for_generation(
self,
input_ids, # 输入的 token IDs
past_key_values=None, # 过去的键值对,默认为 None
attention_mask=None, # 注意力掩码,默认为 None
head_mask=None, # 头部掩码,默认为 None
decoder_head_mask=None, # 解码器头部掩码,默认为 None
decoder_attention_mask=None, # 解码器的注意力掩码,默认为 None
cross_attn_head_mask=None, # 跨注意力头部掩码,默认为 None
use_cache=None, # 是否使用缓存,默认为 None
encoder_outputs=None, # 编码器的输出,默认为 None
**kwargs, # 其他关键字参数
):
# 如果使用了过去的键值对
if past_key_values is not None:
# 获取过去键值对的长度
past_length = past_key_values[0][0].shape[2]
# 如果输入的 token IDs 的长度大于过去键值对的长度
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length # 移除前缀的长度设为过去键值对的长度
else:
# 否则,默认采用旧的行为:只保留最后一个输入 ID
remove_prefix_length = input_ids.shape[1] - 1
# 将输入的 token IDs 裁剪为移除前缀长度后的部分
input_ids = input_ids[:, remove_prefix_length:]
# 返回准备好的输入字典
return {
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"decoder_attention_mask": decoder_attention_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
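# 补充示意(非 modeling_mt5.py 源码):增量解码的关键裁剪逻辑 —— 当缓存 past_key_values 存在时,
# 只把尚未被缓存覆盖的(通常是最新一个)token 作为 decoder_input_ids 传入;张量与长度均为假设值。
import torch

input_ids = torch.tensor([[0, 13, 7, 42]])   # 当前已生成的完整 decoder 序列
past_length = 3                              # 假设缓存中已有前 3 个位置的 key/value

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length
else:
    remove_prefix_length = input_ids.shape[1] - 1   # 旧行为:只保留最后一个 token
print(input_ids[:, remove_prefix_length:])   # tensor([[42]]),只把新 token 交给解码器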
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels 复制而来,准备从标签生成解码器输入 token IDs
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache 复制而来,重新排序缓存
# 重新排列缓存中的过去键值,以便与beam索引对应
def _reorder_cache(self, past_key_values, beam_idx):
# 如果输出中没有包含过去的键值(未启用缓存)
# 则无法进行快速解码,也无需重新排序
if past_key_values is None:
# 提示用户可能需要设置`use_cache=True`来加快解码速度
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
# 重新排序后的解码器过去状态
reordered_decoder_past = ()
for layer_past_states in past_key_values:
# 从层过去状态中获取正确的批次索引,批次维度在第二个位置
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
# 需要为四个键/值状态中的每一个设置正确的 `past`
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
# 检查重新排序后的第一个层过去状态的形状与原始的是否匹配
if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
raise ValueError(
f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
)
# 检查重新排序后的过去状态列表长度与原始列表是否匹配
if len(reordered_layer_past_states) != len(layer_past_states):
raise ValueError(
f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
)
# 将重新排序后的层过去状态添加到重新排序后的解码器过去状态中
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
# 返回重新排序后的解码器过去状态
return reordered_decoder_past
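# 补充示意(非 modeling_mt5.py 源码):beam search 回溯路径时,按 beam_idx 在第 0 维上重排每层缓存的 key/value。
# 下面用随机张量演示 index_select 的作用,形状与索引均为假设值。
import torch

num_beams, n_heads, seq_len, head_dim = 3, 2, 4, 8
layer_past_state = torch.randn(num_beams, n_heads, seq_len, head_dim)   # 某层缓存中的一个张量
beam_idx = torch.tensor([2, 0, 0])   # 新的每个 beam 分别继承自旧的哪个 beam

reordered = layer_past_state.index_select(0, beam_idx)
print(torch.equal(reordered[1], layer_past_state[0]))   # True:新 beam 1 复制了旧 beam 0 的缓存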
# 使用装饰器为类添加文档字符串,描述了该类的基本信息和使用示例
@add_start_docstrings(
"The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
MT5_START_DOCSTRING,
)
class MT5EncoderModel(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5EncoderModel, AutoTokenizer
>>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="pt").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```"""
# 设置模型类型为 "mt5"
model_type = "mt5"
# 指定配置类为 MT5Config
config_class = MT5Config
# 定义了需要绑定权重的键列表
_tied_weights_keys = ["encoder.embed_tokens.weight"]
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.__init__ 复制并修改为 MT5EncoderModel
def __init__(self, config: MT5Config):
super().__init__(config)
# 创建共享的嵌入层,使用配置中的词汇表大小和模型维度
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制配置以便修改而不影响原始配置,设置不使用缓存和不是编码器-解码器模型
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建 MT5 堆栈编码器
self.encoder = MT5Stack(encoder_config, self.shared)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设置
self.model_parallel = False
self.device_map = None
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.parallelize 复制而来
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
# 发出警告,说明方法已弃用,将在 Transformers v5 版本中删除
warnings.warn(
"`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
" your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
" 'block.1': 1, ...}",
FutureWarning,
)
# 根据传入的 device_map 参数设置设备映射
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
# 断言设备映射的有效性
assert_device_map(self.device_map, len(self.encoder.block))
# 将编码器对象分布到多个设备上
self.encoder.parallelize(self.device_map)
self.model_parallel = True
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.deparallelize 复制而来
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
# 发出警告,说明方法已弃用,将在 Transformers v5 版本中删除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 取消编码器对象的并行化
self.encoder.deparallelize()
# 将编码器对象移回 CPU
self.encoder = self.encoder.to("cpu")
self.model_parallel = False
self.device_map = None
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.get_input_embeddings 复制而来
# 返回当前模型共享的输入嵌入向量
def get_input_embeddings(self):
return self.shared
# 从给定的新嵌入向量设置模型共享的输入嵌入向量,并更新编码器的输入嵌入
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
# 返回当前模型的编码器
def get_encoder(self):
return self.encoder
# 剪枝模型中编码器的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
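# 补充示意(非 modeling_mt5.py 源码):剪枝接口通常经由基类的 prune_heads({层号: [头编号]}) 调用到这里。
# 下面用一个缩小的假设配置直接调用 _prune_heads,观察第 0 层注意力头数的变化;假设本文件中的类已可用。
from transformers import MT5Config

config = MT5Config(d_model=16, d_ff=32, d_kv=4, num_heads=4, num_layers=2, vocab_size=100)
model = MT5EncoderModel(config)
model._prune_heads({0: [0, 1]})   # 剪掉编码器第 0 层的头 0 和头 1
print(model.encoder.block[0].layer[0].SelfAttention.n_heads)   # 2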
@add_start_docstrings_to_model_forward(MT5_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
# 重写的前向传播函数,用于MT5模型,接受多种输入并返回编码器的输出
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
r"""
Returns:
Example:
```
>>> from transformers import AutoTokenizer, MT5EncoderModel
>>> tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
>>> model = MT5EncoderModel.from_pretrained("google-mt5/mt5-small")
>>> input_ids = tokenizer(
... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
# 如果return_dict未指定,则根据配置确定是否使用返回字典
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用编码器的前向传播,传递输入参数并返回编码器的输出
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return encoder_outputs
"""
MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
"""
@add_start_docstrings(
"""
MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
MT5_START_DOCSTRING,
)
class MT5ForSequenceClassification(MT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForSequenceClassification.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.transformer = MT5Model(config) # 初始化MT5模型
self.classification_head = MT5ClassificationHead(config) # 初始化分类头部
# Initialize weights and apply final processing
self.post_init() # 初始化后处理步骤
self.model_parallel = False # 设置模型并行为False
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
# Copied from transformers.models.t5.modeling_t5.T5ForSequenceClassification.forward
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the MT5 model for sequence classification.
"""
# Forward pass through MT5 model and classification head
# 正向传播通过MT5模型和分类头部
# 详细参数说明参见MT5_INPUTS_DOCSTRING
pass
"""
MT5 Encoder Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
MT5 Encoder Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
""",
MT5_START_DOCSTRING,
)
class MT5ForTokenClassification(MT5PreTrainedModel):
_tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForTokenClassification.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.num_labels = config.num_labels # 设置标签数量
self.transformer = MT5EncoderModel(config) # 初始化MT5编码器模型
self.dropout = nn.Dropout(config.classifier_dropout) # 初始化Dropout层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 初始化线性分类器
# Initialize weights and apply final processing
self.post_init() # 初始化后处理步骤
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the MT5 model for token classification.
"""
# Forward pass through MT5 model and token classification head
# 正向传播通过MT5模型和标记分类头部
# 详细参数说明参见MT5_INPUTS_DOCSTRING
pass
# 从transformers.models.mt5.modeling_mt5.MT5ForTokenClassification.forward中复制而来
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
计算标记分类损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
Returns:
返回一个元组或者TokenClassifierOutput对象。
"""
# 确定是否返回字典格式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用transformer模型处理输入
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的隐藏状态
hidden_states = outputs[0]
# 对隐藏状态应用dropout层
hidden_states = self.dropout(hidden_states)
# 将处理后的隐藏状态传入分类器得到logits
logits = self.classifier(hidden_states)
# 初始化损失值为None
loss = None
# 如果有标签,则计算损失值
if labels is not None:
loss_fct = CrossEntropyLoss()
# 计算交叉熵损失
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不要求返回字典格式的输出
if not return_dict:
# 构建输出元组
output = (logits, outputs[2:-1])
# 如果损失不为None,则将损失值加入输出元组中
return ((loss,) + output) if loss is not None else output
# 返回TokenClassifierOutput对象,包含损失、logits、隐藏状态和注意力值
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
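# 补充示意(非 modeling_mt5.py 源码):用缩小的假设配置跑一次 token 分类前向,确认 loss 与 logits 的形状;
# 假设本文件中的 MT5ForTokenClassification 已可用。
import torch
from transformers import MT5Config

config = MT5Config(
    d_model=16, d_ff=32, d_kv=4, num_heads=4, num_layers=2, vocab_size=100, num_labels=5
)
model = MT5ForTokenClassification(config)
input_ids = torch.randint(0, 100, (2, 7))
labels = torch.randint(0, 5, (2, 7))
out = model(input_ids=input_ids, labels=labels)
print(out.loss is not None, out.logits.shape)   # True torch.Size([2, 7, 5])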
@add_start_docstrings(
"""
MT5 Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MT5_START_DOCSTRING,
)
class MT5ForQuestionAnswering(MT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.model_dim = config.d_model
# Embedding layer shared between encoder and decoder
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# Initialize encoder with MT5Stack
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = MT5Stack(encoder_config, self.shared)
# Initialize decoder with MT5Stack
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = MT5Stack(decoder_config, self.shared)
# Output layer for question answering logits
self.num_labels = config.num_labels
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
self.model_parallel = False
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_input_embeddings
def get_input_embeddings(self):
return self.shared
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.set_input_embeddings
def set_input_embeddings(self, new_embeddings):
# Set new embeddings for shared layer and update encoder and decoder embeddings
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_encoder
def get_encoder(self):
return self.encoder
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_decoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.forward
# 定义模型的前向传播方法,接受多个可选的输入参数
def forward(
self,
input_ids: Optional[torch.LongTensor] = None, # 输入序列的token IDs,可选的长整型张量
attention_mask: Optional[torch.FloatTensor] = None, # 输入序列的注意力掩码,可选的浮点数张量
decoder_input_ids: Optional[torch.LongTensor] = None, # 解码器输入序列的token IDs,可选的长整型张量
decoder_attention_mask: Optional[torch.BoolTensor] = None, # 解码器输入序列的注意力掩码,可选的布尔张量
head_mask: Optional[torch.FloatTensor] = None, # 多头注意力机制的头掩码,可选的浮点数张量
decoder_head_mask: Optional[torch.FloatTensor] = None, # 解码器的多头注意力机制的头掩码,可选的浮点数张量
cross_attn_head_mask: Optional[torch.Tensor] = None, # 交叉注意力机制的头掩码,可选的张量
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 编码器输出的元组,可选的张量元组
start_positions: Optional[torch.LongTensor] = None, # 开始位置的token IDs,可选的长整型张量
end_positions: Optional[torch.LongTensor] = None, # 结束位置的token IDs,可选的长整型张量
inputs_embeds: Optional[torch.FloatTensor] = None, # 输入嵌入的张量,可选的浮点数张量
decoder_inputs_embeds: Optional[torch.FloatTensor] = None, # 解码器输入嵌入的张量,可选的浮点数张量
use_cache: Optional[bool] = None, # 是否使用缓存,可选的布尔值
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选的布尔值
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选的布尔值
return_dict: Optional[bool] = None, # 是否返回字典格式的结果,可选的布尔值