Transformers Source Code Analysis (Part 8)
.\modeling_tf_outputs.py
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple
import tensorflow as tf
from .utils import ModelOutput
@dataclass
class TFBaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
last_hidden_state: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
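These output classes inherit dict- and tuple-like behaviour from `ModelOutput`. A minimal usage sketch, not part of the file (tensor shapes are illustrative):

```python
import tensorflow as tf
from transformers.modeling_tf_outputs import TFBaseModelOutput

out = TFBaseModelOutput(last_hidden_state=tf.zeros((2, 5, 8)))

print(out.last_hidden_state.shape)     # attribute access: (2, 5, 8)
print(out["last_hidden_state"].shape)  # dict-style access also works
print(out.to_tuple())                  # tuple containing only the non-None fields
print(out[0].shape)                    # integer indexing skips fields left as None
```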
@dataclass
class TFBaseModelOutputWithNoAttention(ModelOutput):
"""
Base class for model's outputs, with potential hidden states, but no attentions.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state after a pooling operation on the spatial dimensions.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, num_channels, height, width)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithCrossAttentions(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
"""
"""
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的输出隐藏状态序列。
hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组的形式,包含每层模型的隐藏状态,以及初始嵌入输出。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组的形式,包含每层注意力权重,用于计算自注意力中加权平均值。
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组的形式,包含解码器跨注意力层的注意力权重,用于计算跨注意力中加权平均值。
"""
last_hidden_state: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
"""
Model output class for Transformer-based models that includes past key/values and cross-attentions.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Final layer hidden-states of the model.
If `past_key_values` is used, only the last hidden-state of shape `(batch_size, 1, hidden_size)` is output.
past_key_values (`List[tf.Tensor]`, *optional*):
List of tensors, each of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Pre-computed hidden-states (key and values) for sequential decoding speed-up.
hidden_states (`Tuple[tf.Tensor]`, *optional*):
Tuple of tensors, one for embeddings and one for each layer's hidden-states, each of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`Tuple[tf.Tensor]`, *optional*):
Tuple of tensors, one for each layer, each of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after softmax, used for self-attention heads.
cross_attentions (`Tuple[tf.Tensor]`, *optional*):
Tuple of tensors, one for each layer, each of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the decoder's cross-attention layer after softmax.
"""
last_hidden_state: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
"""
Model output class for Seq2Seq Transformer-based models.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
past_key_values (`List[tf.Tensor]`, *optional*):
List of tensors, each of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Pre-computed hidden-states (key and values) for decoder's sequential decoding speed-up.
decoder_hidden_states (`Tuple[tf.Tensor]`, *optional*):
Tuple of tensors for decoder's hidden-states, each of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer for the decoder.
decoder_attentions (`Tuple[tf.Tensor]`, *optional*):
Tuple of tensors for decoder's attentions, each of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after softmax for the decoder.
"""
last_hidden_state: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
decoder_hidden_states: Tuple[tf.Tensor] | None = None
decoder_attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: Tuple[tf.Tensor] | None = None
encoder_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFCausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
"""
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
语言建模损失(用于下一个标记预测)。
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
语言建模头的预测分数(在 SoftMax 之前每个词汇标记的分数)。
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
包含预先计算的隐藏状态(注意力块中的键和值)的列表,可用于加速顺序解码。
长度为 `config.n_layers` 的 `tf.Tensor` 列表,每个张量的形状为 `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层输出的隐藏状态加上初始嵌入输出的元组。
包含 `tf.Tensor`(嵌入输出的一个 + 每层输出的一个),形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
self-attention 头部中的加权平均计算所使用的注意力 softmax 后的注意力权重。
包含每一层的 `tf.Tensor`,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
@dataclass
class TFCausalLMOutputWithCrossAttentions(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFMaskedLMOutput(ModelOutput):
"""
Base class for masked language models outputs.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
decoder_hidden_states: Tuple[tf.Tensor] | None = None
decoder_attentions: Tuple[tf.Tensor] | None = None
cross_attentions: Tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: Tuple[tf.Tensor] | None = None
encoder_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
"""
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
分类(或回归,如果 `config.num_labels==1`)的损失。
当提供 `labels` 参数时返回。
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
分类(或回归,如果 `config.num_labels==1`)的分数(SoftMax 之前的值)。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
由 `tf.Tensor` 组成的元组(当传递 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回)。
形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每个层输出的隐藏状态,以及初始嵌入输出。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
由 `tf.Tensor` 组成的元组(当传递 `output_attentions=True` 或 `config.output_attentions=True` 时返回)。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力权重经过注意力 SoftMax 后的结果,用于计算自注意力头部的加权平均值。
"""
loss: tf.Tensor | None = None  # defaults to None: loss not set yet
logits: tf.Tensor = None  # defaults to None: logits not set yet
hidden_states: Tuple[tf.Tensor] | None = None  # defaults to None: hidden states not set yet
attentions: Tuple[tf.Tensor] | None = None  # defaults to None: attentions not set yet
# A dataclass representing the output of a sequence-to-sequence sentence classification model.
@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
"""
# Loss tensor; may be None
loss: tf.Tensor | None = None
# Classification logits tensor
logits: tf.Tensor = None
# List of past key/values; may be None
past_key_values: List[tf.Tensor] | None = None
# Tuple of decoder hidden-states; may be None
decoder_hidden_states: Tuple[tf.Tensor] | None = None
# Tuple of decoder attentions; may be None
decoder_attentions: Tuple[tf.Tensor] | None = None
# Tuple of cross-attentions; may be None
cross_attentions: Tuple[tf.Tensor] | None = None
# Encoder last hidden-state tensor; may be None
encoder_last_hidden_state: tf.Tensor | None = None
# Tuple of encoder hidden-states; may be None
encoder_hidden_states: Tuple[tf.Tensor] | None = None
# Tuple of encoder attentions; may be None
encoder_attentions: Tuple[tf.Tensor] | None = None
# A dataclass representing the output of a semantic segmentation model.
@dataclass
class TFSemanticSegmenterOutput(ModelOutput):
"""
Base class for outputs of semantic segmentation models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is to avoid doing two interpolations and losing some quality when resizing the logits back to the original image size. You should always check your logits shape and resize as needed.
</Tip>
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
# Loss tensor; may be None
loss: tf.Tensor | None = None
# Per-pixel classification logits
logits: tf.Tensor = None
# Tuple of hidden-states; may be None
hidden_states: Tuple[tf.Tensor] | None = None
# Tuple of attentions; may be None
attentions: Tuple[tf.Tensor] | None = None
# A dataclass representing the output of a semantic segmentation model that does not return attention scores.
@dataclass
class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
"""
Base class for outputs of semantic segmentation models that do not output attention scores.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is to avoid doing two interpolations and losing some quality when resizing the logits back to the original image size. You should always check your logits shape and resize as needed.
</Tip>
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` of shape `(batch_size, patch_size, hidden_size)` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer).
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
# Defines the TFImageClassifierOutput class, representing the outputs of image classification models.
@dataclass
class TFImageClassifierOutput(ModelOutput):
"""
Base class for outputs of image classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states (also called feature maps) of the model at the output of each stage.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
# Defines the TFMultipleChoiceModelOutput class, representing the outputs of multiple choice models.
@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided):
Classification loss.
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
loss: tf.Tensor | None = None
# Span-start prediction logits; defaults to None
start_logits: tf.Tensor = None
# Span-end prediction logits; defaults to None
end_logits: tf.Tensor = None
# Tuple of hidden-states; defaults to None
hidden_states: Tuple[tf.Tensor] | None = None
# Tuple of attentions; defaults to None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: tf.Tensor | None = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
decoder_hidden_states: Tuple[tf.Tensor] | None = None
decoder_attentions: Tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: Tuple[tf.Tensor] | None = None
encoder_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFImageClassifierOutputWithNoAttention(ModelOutput):
"""
Base class for outputs of image classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` of shape `(batch_size, num_channels, height, width)`.
Hidden-states (also called feature maps) of the model at the output of each stage.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFMaskedImageModelingOutput(ModelOutput):
"""
Base class for outputs of masked image completion / in-painting models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Reconstruction loss.
reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed / completed images.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
`config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
feature maps) of the model at the output of each stage.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
`config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
reconstruction: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@property
def logits(self):
warnings.warn(
"logits attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the reconstruction attribute to retrieve the final output instead.",
FutureWarning,
)
return self.reconstruction
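A small sketch of the deprecation shim above, not part of the file (shapes illustrative): reading `logits` warns and forwards to `reconstruction`.

```python
import tensorflow as tf
from transformers.modeling_tf_outputs import TFMaskedImageModelingOutput

out = TFMaskedImageModelingOutput(reconstruction=tf.zeros((1, 3, 224, 224)))
images = out.logits  # emits a FutureWarning and returns out.reconstruction
assert images is out.reconstruction
```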
.\modeling_tf_pytorch_utils.py
""" PyTorch - TF 2.0 通用实用工具 """
import os
import re
import numpy
from .utils import ExplicitEnum, expand_dims, is_numpy_array, is_torch_tensor, logging, reshape, squeeze, tensor_size
from .utils import transpose as transpose_func
logger = logging.get_logger(__name__)
class TransposeType(ExplicitEnum):
"""
Possible ways a weight may need to be transposed when converting between TF 2.0 and PyTorch.
"""
NO = "no"
SIMPLE = "simple"
CONV1D = "conv1d"
CONV2D = "conv2d"
def convert_tf_weight_name_to_pt_weight_name(
tf_name, start_prefix_to_remove="", tf_weight_shape=None, name_scope=None
):
"""
Convert a TF 2.0 model variable name to a PyTorch model weight name.
Conventions for TF 2.0 scopes -> PyTorch attribute name conversions:
- '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF 2.0 vs PyTorch)
- '_._' is replaced by a new level separation (can be used to convert TF 2.0 lists in PyTorch nn.ModuleLists)
Returns a tuple with:
- the PyTorch model weight name
- transpose: a `TransposeType` member indicating whether and how the TF 2.0 and PyTorch weight matrices should be transposed with regards to each other
"""
if name_scope is not None:
if not tf_name.startswith(name_scope) and "final_logits_bias" not in tf_name:
raise ValueError(
f"Weight name {tf_name} does not start with name_scope {name_scope}. This is an internal error "
"in Transformers, so (unless you were doing something really evil) please open an issue to report it!"
)
tf_name = tf_name[len(name_scope) :]
tf_name = tf_name.lstrip("/")
tf_name = tf_name.replace(":0", "")
tf_name = re.sub(
r"/[^/]*___([^/]*)/", r"/\1/", tf_name
)
tf_name = tf_name.replace(
"_._", "/"
)
tf_name = re.sub(r"//+", "/", tf_name)
tf_name = tf_name.split("/")
if len(tf_name) > 1:
tf_name = tf_name[1:]
tf_weight_shape = list(tf_weight_shape)
if tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 4:
transpose = TransposeType.CONV2D
elif tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 3:
transpose = TransposeType.CONV1D
elif bool(
tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"]
or "emb_projs" in tf_name
or "out_projs" in tf_name
):
transpose = TransposeType.SIMPLE
else:
transpose = TransposeType.NO
if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
tf_name[-1] = "weight"
if tf_name[-1] == "beta":
tf_name[-1] = "bias"
if tf_name[-1] == "pointwise_kernel" or tf_name[-1] == "depthwise_kernel":
tf_name[-1] = tf_name[-1].replace("_kernel", ".weight")
tf_name = ".".join(tf_name)
if start_prefix_to_remove:
tf_name = tf_name.replace(start_prefix_to_remove, "", 1)
return tf_name, transpose
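An illustrative call (the TF variable name below is a typical BERT weight name, used here only as an example):

```python
from transformers.modeling_tf_pytorch_utils import convert_tf_weight_name_to_pt_weight_name

pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
    "tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0",
    tf_weight_shape=[768, 768],
)
print(pt_name)    # bert.encoder.layer.0.attention.self.query.weight
print(transpose)  # TransposeType.SIMPLE (2D dense kernels are simply transposed)
```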
def apply_transpose(transpose: TransposeType, weight, match_shape=None, pt_to_tf=True):
"""
Apply a transpose operation to a weight tensor and optionally reshape it to match a target shape, in a framework-agnostic manner.
"""
if transpose is TransposeType.CONV2D:
axes = (2, 3, 1, 0) if pt_to_tf else (3, 2, 0, 1)
weight = transpose_func(weight, axes=axes)
elif transpose is TransposeType.CONV1D:
weight = transpose_func(weight, axes=(2, 1, 0))
elif transpose is TransposeType.SIMPLE:
weight = transpose_func(weight)
if match_shape is None:
return weight
if len(match_shape) < len(weight.shape):
weight = squeeze(weight)
elif len(match_shape) > len(weight.shape):
weight = expand_dims(weight, axis=0)
if list(match_shape) != list(weight.shape):
try:
weight = reshape(weight, match_shape)
except AssertionError as e:
e.args += (match_shape, match_shape)
raise e
return weight
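For example, a PyTorch `nn.Linear` stores its weight as `(out_features, in_features)` while a TF `Dense` kernel is `(in_features, out_features)`; a `SIMPLE` transpose converts one layout to the other. A sketch with illustrative arrays:

```python
import numpy as np
from transformers.modeling_tf_pytorch_utils import TransposeType, apply_transpose

pt_weight = np.zeros((1024, 768))  # PyTorch Linear layout: (out_features, in_features)
tf_kernel = apply_transpose(TransposeType.SIMPLE, pt_weight, match_shape=(768, 1024), pt_to_tf=True)
print(tf_kernel.shape)  # (768, 1024): TF Dense kernel layout
```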
def load_pytorch_checkpoint_in_tf2_model(
tf_model,
pytorch_checkpoint_path,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
):
"""Load pytorch checkpoints into a TF 2.0 model"""
try:
import tensorflow as tf
import torch
from safetensors.torch import load_file as safe_load_file
from .pytorch_utils import is_torch_greater_or_equal_than_1_13
except ImportError:
logger.error(
"Loading a PyTorch model in TensorFlow requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
if isinstance(pytorch_checkpoint_path, str):
pytorch_checkpoint_path = [pytorch_checkpoint_path]
pt_state_dict = {}
for path in pytorch_checkpoint_path:
pt_path = os.path.abspath(path)
logger.info(f"Loading PyTorch weights from {pt_path}")
if pt_path.endswith(".safetensors"):
state_dict = safe_load_file(pt_path)
else:
weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {}
state_dict = torch.load(pt_path, map_location="cpu", **weights_only_kwarg)
pt_state_dict.update(state_dict)
logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters")
return load_pytorch_weights_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=tf_inputs,
allow_missing_keys=allow_missing_keys,
output_loading_info=output_loading_info,
_prefix=_prefix,
tf_to_pt_weight_rename=tf_to_pt_weight_rename,
)
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
"""Load pytorch checkpoints in a TF 2.0 model"""
pt_state_dict = pt_model.state_dict()
return load_pytorch_weights_in_tf2_model(
tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
)
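A usage sketch for the PyTorch -> TF direction (the model classes and config values are illustrative; both frameworks must be installed):

```python
from transformers import BertConfig, BertModel, TFBertModel
from transformers.modeling_tf_pytorch_utils import load_pytorch_model_in_tf2_model

config = BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=2, intermediate_size=256)
pt_model = BertModel(config)
tf_model = TFBertModel(config)
# Builds the TF model on its dummy inputs, then copies the PyTorch weights over.
tf_model = load_pytorch_model_in_tf2_model(tf_model, pt_model)
```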
def load_pytorch_weights_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
):
"""Load pytorch state_dict in a TF 2.0 model."""
try:
import tensorflow as tf
import torch
except ImportError:
logger.error(
"Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
return load_pytorch_state_dict_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=tf_inputs,
allow_missing_keys=allow_missing_keys,
output_loading_info=output_loading_info,
_prefix=_prefix,
tf_to_pt_weight_rename=tf_to_pt_weight_rename,
)
def load_pytorch_state_dict_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
ignore_mismatched_sizes=False,
):
"""Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
safetensors archive created with the safe_open() function."""
import tensorflow as tf
if tf_inputs is None:
tf_inputs = tf_model.dummy_inputs
if _prefix is None:
_prefix = ""
if tf_inputs:
with tf.name_scope(_prefix):
tf_model(tf_inputs, training=False)
tf_keys_to_pt_keys = {}
for key in pt_state_dict.keys():
new_key = None
if "gamma" in key:
new_key = key.replace("gamma", "weight")
if "beta" in key:
new_key = key.replace("beta", "bias")
if "running_var" in key:
new_key = key.replace("running_var", "moving_variance")
if "running_mean" in key:
new_key = key.replace("running_mean", "moving_mean")
key_components = key.split(".")
name = None
if key_components[-3::2] == ["parametrizations", "original0"]:
name = key_components[-2] + "_g"
elif key_components[-3::2] == ["parametrizations", "original1"]:
name = key_components[-2] + "_v"
if name is not None:
key_components = key_components[:-3] + [name]
new_key = ".".join(key_components)
if new_key is None:
new_key = key
tf_keys_to_pt_keys[new_key] = key
start_prefix_to_remove = ""
if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys.keys()):
start_prefix_to_remove = tf_model.base_model_prefix + "."
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
tf_loaded_numel = 0
all_pytorch_weights = set(tf_keys_to_pt_keys.keys())
missing_keys = []
mismatched_keys = []
is_safetensor_archive = hasattr(pt_state_dict, "get_tensor")
for symbolic_weight in symbolic_weights:
sw_name = symbolic_weight.name
name, transpose = convert_tf_weight_name_to_pt_weight_name(
sw_name,
start_prefix_to_remove=start_prefix_to_remove,
tf_weight_shape=symbolic_weight.shape,
name_scope=_prefix,
)
if tf_to_pt_weight_rename is not None:
aliases = tf_to_pt_weight_rename(name)
for alias in aliases:
if alias in tf_keys_to_pt_keys:
name = alias
break
else:
name = aliases[0]
if name not in tf_keys_to_pt_keys:
if allow_missing_keys:
missing_keys.append(name)
continue
elif tf_model._keys_to_ignore_on_load_missing is not None:
if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
continue
raise AttributeError(f"{name} not found in PyTorch model")
state_dict_name = tf_keys_to_pt_keys[name]
if is_safetensor_archive:
array = pt_state_dict.get_tensor(state_dict_name)
else:
array = pt_state_dict[state_dict_name]
try:
array = apply_transpose(transpose, array, symbolic_weight.shape)
except tf.errors.InvalidArgumentError as e:
if not ignore_mismatched_sizes:
error_msg = str(e)
error_msg += (
"\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
)
raise tf.errors.InvalidArgumentError(error_msg)
else:
mismatched_keys.append((name, array.shape, symbolic_weight.shape))
continue
tf_loaded_numel += tensor_size(array)
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
del array
all_pytorch_weights.discard(name)
logger.info(f"Loaded {tf_loaded_numel:,} parameters in the TF 2.0 model.")
unexpected_keys = list(all_pytorch_weights)
if tf_model._keys_to_ignore_on_load_missing is not None:
for pat in tf_model._keys_to_ignore_on_load_missing:
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if tf_model._keys_to_ignore_on_load_unexpected is not None:
for pat in tf_model._keys_to_ignore_on_load_unexpected:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if len(unexpected_keys) > 0:
logger.warning(
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model"
f" {tf_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
f" {tf_model.__class__.__name__} from a PyTorch model trained on another task or with another architecture"
" (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n- This IS"
f" NOT expected if you are initializing {tf_model.__class__.__name__} from a PyTorch model that you expect"
" to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a"
" BertForSequenceClassification model)."
)
else:
logger.warning(f"All PyTorch model weights were used when initializing {tf_model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights or buffers of the TF 2.0 model {tf_model.__class__.__name__} were not initialized from the"
f" PyTorch model and are newly initialized: {missing_keys}\nYou should probably TRAIN this model on a"
" down-stream task to be able to use it for predictions and inference."
)
else:
logger.warning(
f"All the weights of {tf_model.__class__.__name__} were initialized from the PyTorch model.\n"
"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {tf_model.__class__.__name__} for predictions without further training."
)
if len(mismatched_keys) > 0:
mismatched_warning = "\n".join(
[
f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
for key, shape1, shape2 in mismatched_keys
]
)
logger.warning(
f"Some weights of {tf_model.__class__.__name__} were not initialized from the model checkpoint"
f" are newly initialized because the shapes did not"
f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able"
" to use it for predictions and inference."
)
if output_loading_info:
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"mismatched_keys": mismatched_keys,
}
return tf_model, loading_info
return tf_model
def load_tf2_checkpoint_in_pytorch_model(
pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False, output_loading_info=False
):
"""
Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see
https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
"""
try:
import tensorflow as tf
import torch
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
import transformers
from .modeling_tf_utils import load_tf_weights
logger.info(f"Loading TensorFlow weights from {tf_checkpoint_path}")
tf_model_class_name = "TF" + pt_model.__class__.__name__
tf_model_class = getattr(transformers, tf_model_class_name)
tf_model = tf_model_class(pt_model.config)
if tf_inputs is None:
tf_inputs = tf_model.dummy_inputs
if tf_inputs is not None:
tf_model(tf_inputs, training=False)
load_tf_weights(tf_model, tf_checkpoint_path)
return load_tf2_model_in_pytorch_model(
pt_model, tf_model, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False, output_loading_info=False):
"""Load TF 2.0 model in a pytorch model"""
weights = tf_model.weights
return load_tf2_weights_in_pytorch_model(
pt_model, weights, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False, output_loading_info=False):
"""Load TF2.0 symbolic weights in a PyTorch model"""
try:
import tensorflow as tf
import torch
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_state_dict = {tf_weight.name: tf_weight.numpy() for tf_weight in tf_weights}
return load_tf2_state_dict_in_pytorch_model(
pt_model, tf_state_dict, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_keys=False, output_loading_info=False):
import torch
new_pt_params_dict = {}
current_pt_params_dict = dict(pt_model.named_parameters())
start_prefix_to_remove = ""
if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
start_prefix_to_remove = pt_model.base_model_prefix + "."
tf_weights_map = {}
for name, tf_weight in tf_state_dict.items():
pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
name, start_prefix_to_remove=start_prefix_to_remove, tf_weight_shape=tf_weight.shape
)
tf_weights_map[pt_name] = (tf_weight, transpose)
all_tf_weights = set(tf_weights_map.keys())
loaded_pt_weights_data_ptr = {}
missing_keys_pt = []
for pt_weight_name, pt_weight in current_pt_params_dict.items():
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
continue
pt_weight_name_to_check = pt_weight_name
key_components = pt_weight_name.split(".")
name = None
if key_components[-3::2] == ["parametrizations", "original0"]:
name = key_components[-2] + "_g"
elif key_components[-3::2] == ["parametrizations", "original1"]:
name = key_components[-2] + "_v"
if name is not None:
key_components = key_components[:-3] + [name]
pt_weight_name_to_check = ".".join(key_components)
if pt_weight_name_to_check not in tf_weights_map:
if allow_missing_keys:
missing_keys_pt.append(pt_weight_name)
continue
raise AttributeError(f"{pt_weight_name} not found in TF 2.0 model")
array, transpose = tf_weights_map[pt_weight_name_to_check]
array = apply_transpose(transpose, array, pt_weight.shape, pt_to_tf=False)
if numpy.isscalar(array):
array = numpy.array(array)
if not is_torch_tensor(array) and not is_numpy_array(array):
array = array.numpy()
if is_numpy_array(array):
array = torch.from_numpy(array)
new_pt_params_dict[pt_weight_name] = array
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = array
all_tf_weights.discard(pt_weight_name)
missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
missing_keys += missing_keys_pt
if pt_model._keys_to_ignore_on_load_missing is not None:
for pat in pt_model._keys_to_ignore_on_load_missing:
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if pt_model._keys_to_ignore_on_load_unexpected is not None:
for pat in pt_model._keys_to_ignore_on_load_unexpected:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if len(unexpected_keys) > 0:
logger.warning(
"Some weights of the TF 2.0 model were not used when initializing the PyTorch model"
f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
f" {pt_model.__class__.__name__} from a TF 2.0 model trained on another task or with another architecture"
" (e.g. initializing a BertForSequenceClassification model from a TFBertForPreTraining model).\n- This IS"
f" NOT expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model that you expect"
" to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
" TFBertForSequenceClassification model)."
)
else:
logger.warning(f"All TF 2.0 model weights were used when initializing {pt_model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights of {pt_model.__class__.__name__} were not initialized from the TF 2.0 model and are newly"
f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
" use it for predictions and inference."
)
else:
logger.warning(
f"All the weights of {pt_model.__class__.__name__} were initialized from the TF 2.0 model.\n"
"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {pt_model.__class__.__name__} for predictions without further training."
)
logger.info(f"Weights or buffers not loaded from TF 2.0 model: {all_tf_weights}")
if output_loading_info:
loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys}
return pt_model, loading_info
return pt_model
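And the reverse direction, TF -> PyTorch, mirroring the sketch above (names and config values illustrative):

```python
from transformers import BertConfig, BertModel, TFBertModel
from transformers.modeling_tf_pytorch_utils import load_tf2_model_in_pytorch_model

config = BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=2, intermediate_size=256)
tf_model = TFBertModel(config)
tf_model(tf_model.dummy_inputs)  # build the TF variables before reading tf_model.weights
pt_model = load_tf2_model_in_pytorch_model(BertModel(config), tf_model)
```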
.\modeling_tf_utils.py
"""TF general model utils."""
from __future__ import annotations
import functools
import gc
import inspect
import json
import os
import pickle
import re
import warnings
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
import h5py
import numpy as np
import tensorflow as tf
from packaging.version import parse
from . import DataCollatorWithPadding, DefaultDataCollator
from .activations_tf import get_tf_activation
from .configuration_utils import PretrainedConfig
from .dynamic_module_utils import custom_object_save
from .generation import GenerationConfig, TFGenerationMixin
from .tf_utils import (
convert_batch_encoding,
expand_1d,
load_attributes_from_hdf5_group,
save_attributes_to_hdf5_group,
shape_list,
)
from .utils import (
SAFE_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_NAME,
TF2_WEIGHTS_INDEX_NAME,
TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
ModelOutput,
PushToHubMixin,
cached_file,
download_url,
find_labels,
has_file,
is_offline_mode,
is_remote_url,
is_safetensors_available,
is_tf_symbolic_tensor,
logging,
requires_backends,
working_or_temp_dir,
)
from .utils.hub import convert_file_size_to_int, get_checkpoint_shard_files
if is_safetensors_available():
from safetensors import safe_open
from safetensors.tensorflow import save_file as safe_save_file
if TYPE_CHECKING:
from . import PreTrainedTokenizerBase
logger = logging.get_logger(__name__)
if "TF_USE_LEGACY_KERAS" not in os.environ:
os.environ["TF_USE_LEGACY_KERAS"] = "1"
elif os.environ["TF_USE_LEGACY_KERAS"] != "1":
logger.warning(
"Transformers is only compatible with Keras 2, but you have explicitly set `TF_USE_LEGACY_KERAS` to `0`. "
"This may result in unexpected behaviour or errors if Keras 3 objects are passed to Transformers models."
)
try:
import tf_keras as keras
from tf_keras import backend as K
except (ModuleNotFoundError, ImportError):
import keras
from keras import backend as K
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
tf_logger = tf.get_logger()
TFModelInputType = Union[
List[tf.Tensor],
List[np.ndarray],
Dict[str, tf.Tensor],
Dict[str, np.ndarray],
tf.Tensor,
np.ndarray,
]
def dummy_loss(y_true, y_pred):
if y_pred.shape.rank <= 1:
return y_pred
else:
reduction_axes = list(range(1, y_pred.shape.rank))
return tf.reduce_mean(y_pred, axis=reduction_axes)
class TFModelUtilsMixin:
"""
`keras.Model` 的几个实用工具方法,作为 Mixin 使用。
"""
def num_parameters(self, only_trainable: bool = False) -> int:
"""
获取模型中参数的数量(可选只计算可训练的参数)。
Args:
only_trainable (`bool`, *optional*, 默认为 `False`):
是否只返回可训练参数的数量。
Returns:
`int`: 参数的数量。
"""
if only_trainable:
return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables))
else:
return self.count_params()
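For instance (model name illustrative):

```python
from transformers import TFBertModel

model = TFBertModel.from_pretrained("bert-base-uncased")
print(model.num_parameters())                     # all parameters
print(model.num_parameters(only_trainable=True))  # trainable parameters only
```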
def keras_serializable(cls):
"""
Decorate a Keras Layer class to support Keras serialization.
This is done by:
1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at serialization time).
2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and converting it to a config object for the actual layer initializer.
3. Registering the class as a custom object in Keras (if the TensorFlow version supports this), so that it does not need to be supplied in `custom_objects` in the call to `keras.models.load_model`.
Args:
cls (a `keras.layers.Layers` subclass):
Typically a `TF.MainLayer` class in this project, which in general must accept a `config` argument to its initializer.
Returns:
The same class object, with modifications for Keras deserialization.
"""
initializer = cls.__init__
config_class = getattr(cls, "config_class", None)
if config_class is None:
raise AttributeError("Must set `config_class` to use @keras_serializable")
@functools.wraps(initializer)
def wrapped_init(self, *args, **kwargs):
config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.pop("config", None)
if isinstance(config, dict):
config = config_class.from_dict(config)
initializer(self, config, *args, **kwargs)
elif isinstance(config, PretrainedConfig):
if len(args) > 0:
initializer(self, *args, **kwargs)
else:
initializer(self, config, *args, **kwargs)
else:
raise ValueError("Must pass either `config` (PretrainedConfig) or `config` (dict)")
self._config = config
self._kwargs = kwargs
cls.__init__ = wrapped_init
if not hasattr(cls, "get_config"):
raise TypeError("Only use @keras_serializable on keras.layers.Layer subclasses")
if hasattr(cls.get_config, "_is_default"):
def get_config(self):
cfg = super(cls, self).get_config()
cfg["config"] = self._config.to_dict()
cfg.update(self._kwargs)
return cfg
cls.get_config = get_config
cls._keras_serializable = True
if hasattr(keras.utils, "register_keras_serializable"):
cls = keras.utils.register_keras_serializable()(cls)
return cls
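A minimal sketch of decorating a main layer with `@keras_serializable` (the `ToyConfig`/`TFToyMainLayer` names are made up for illustration, not from the library):

```python
from transformers import PretrainedConfig
from transformers.modeling_tf_utils import keras, keras_serializable

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=32, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size

@keras_serializable
class TFToyMainLayer(keras.layers.Layer):
    config_class = ToyConfig  # required by @keras_serializable

    def __init__(self, config: ToyConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.dense = keras.layers.Dense(config.hidden_size, name="dense")

    def call(self, inputs):
        return self.dense(inputs)

# get_config() now serializes the config dict and the wrapped __init__ rebuilds it,
# so models containing this layer can round-trip through keras.models.load_model.
```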
class TFCausalLanguageModelingLoss:
"""
Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token.
<Tip>
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
</Tip>
"""
def hf_compute_loss(self, labels, logits):
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
return loss_fn(labels, reduced_logits)
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype)
masked_loss = unmasked_loss * loss_mask
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
return tf.reshape(reduced_masked_loss, (1,))
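# Worked illustration of the -100 masking in the non-legacy branch above (made-up tensors):
labels = tf.constant([[5, -100, 7]])                     # -100 marks positions to ignore
logits = tf.random.normal((1, 3, 10))                    # (batch, seq_len, vocab_size)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
unmasked = loss_fn(tf.nn.relu(labels), logits)           # relu turns -100 into a valid (dummy) label 0
mask = tf.cast(labels != -100, dtype=unmasked.dtype)
loss = tf.reduce_sum(unmasked * mask) / tf.reduce_sum(mask)  # averaged over the two real tokens only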
class TFQuestionAnsweringLoss:
"""
Loss function suitable for question answering.
"""
def hf_compute_loss(self, labels, logits):
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
start_loss = loss_fn(labels["start_position"], logits[0])
end_loss = loss_fn(labels["end_position"], logits[1])
return (start_loss + end_loss) / 2.0
class TFTokenClassificationLoss:
"""
Loss function suitable for token classification.
<Tip>
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
</Tip>
"""
def hf_compute_loss(self, labels, logits):
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if tf.executing_eagerly():
if tf.math.reduce_any(labels == -1):
tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
if self.config.tf_legacy_loss:
if tf.math.reduce_any(labels == -1):
tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
active_loss = tf.reshape(labels, (-1,)) != -1
else:
active_loss = tf.reshape(labels, (-1,)) != -100
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
return loss_fn(labels, reduced_logits)
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype)
masked_loss = unmasked_loss * loss_mask
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
return tf.reshape(reduced_masked_loss, (1,))
class TFSequenceClassificationLoss:
"""
Loss function suitable for sequence classification.
"""
def hf_compute_loss(self, labels, logits):
if logits.shape.rank == 1 or logits.shape[1] == 1:
loss_fn = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.NONE)
if labels.shape.rank == 1:
labels = tf.expand_dims(labels, axis=-1)
else:
loss_fn = keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=keras.losses.Reduction.NONE
)
return loss_fn(labels, logits)
class TFMultipleChoiceLoss:
"""Loss function suitable for multiple choice tasks."""
def hf_compute_loss(self, labels, logits):
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
return loss_fn(labels, logits)
class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
"""
Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
<Tip>
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
</Tip>
"""
class TFNextSentencePredictionLoss:
"""
Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence.
<Tip>
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
</Tip>
"""
def hf_compute_loss(self, labels, logits):
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
next_sentence_active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
next_sentence_reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, 2)), next_sentence_active_loss)
next_sentence_label = tf.boolean_mask(tf.reshape(labels, (-1,)), next_sentence_active_loss)
return loss_fn(next_sentence_label, next_sentence_reduced_logits)
unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels), y_pred=logits)
ns_loss_mask = tf.cast(labels != -100, dtype=unmasked_ns_loss.dtype)
masked_ns_loss = unmasked_ns_loss * ns_loss_mask
return masked_ns_loss
def booleans_processing(config, **kwargs):
"""
Process the input booleans of each model.
"""
final_booleans = {}
if "output_attentions" in kwargs:
final_booleans["output_attentions"] = (
kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions
)
final_booleans["output_hidden_states"] = (
kwargs["output_hidden_states"] if kwargs["output_hidden_states"] is not None else config.output_hidden_states
)
final_booleans["return_dict"] = kwargs["return_dict"] if kwargs["return_dict"] is not None else config.return_dict
if "use_cache" in kwargs:
final_booleans["use_cache"] = (
kwargs["use_cache"] if kwargs["use_cache"] is not None else getattr(config, "use_cache", None)
)
return final_booleans
def unpack_inputs(func):
original_signature = inspect.signature(func)
@functools.wraps(func)
def run_call_with_unpacked_inputs(self, *args, **kwargs):
kwargs_call = {key: val for key, val in kwargs.items() if key not in dict(original_signature.parameters)}
fn_args_and_kwargs = {key: val for key, val in kwargs.items() if key not in kwargs_call}
fn_args_and_kwargs.update({"kwargs_call": kwargs_call})
fn_args_and_kwargs.update(dict(zip(func.__code__.co_varnames[1:], args)))
if "EncoderDecoder" in self.__class__.__name__:
config = None
else:
config = self.config
unpacked_inputs = input_processing(func, config, **fn_args_and_kwargs)
return func(self, **unpacked_inputs)
run_call_with_unpacked_inputs.__signature__ = original_signature
return run_call_with_unpacked_inputs
def input_processing(func, config, **kwargs):
# (the signature inspection that builds `output`, `parameter_names`, `main_input`, `has_kwargs`
# and `signature` is elided in this excerpt)
allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray)
if "inputs" in kwargs["kwargs_call"]:
warnings.warn(
"The `inputs` argument is deprecated and will be removed in a future version, use `input_ids` instead.",
FutureWarning,
)
output["input_ids"] = kwargs["kwargs_call"].pop("inputs")
if "decoder_cached_states" in kwargs["kwargs_call"]:
warnings.warn(
"The `decoder_cached_states` argument is deprecated and will be removed in a future version, use"
" `past_key_values` instead.",
FutureWarning,
)
output["past_key_values"] = kwargs["kwargs_call"].pop("decoder_cached_states")
if "past" in kwargs["kwargs_call"] and "past_key_values" in parameter_names:
warnings.warn(
"The `past` argument is deprecated and will be removed in a future version, use `past_key_values`"
" instead.",
FutureWarning,
)
kwargs["past_key_values"] = kwargs["kwargs_call"].pop("past")
elif "past_key_values" in kwargs["kwargs_call"] and "past" in parameter_names:
kwargs["past"] = kwargs["kwargs_call"].pop("past_key_values")
if has_kwargs:
output["kwargs"] = kwargs.pop("kwargs_call", {})
else:
if len(kwargs["kwargs_call"]) > 0:
raise ValueError(
"The following keyword arguments are not supported by this model:"
f" {list(kwargs['kwargs_call'].keys())}."
)
kwargs.pop("kwargs_call")
for k, v in kwargs.items():
if isinstance(v, allowed_types) or tf.is_tensor(v) or v is None:
output[k] = v
else:
raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.")
if isinstance(main_input, (tuple, list)):
for i, input in enumerate(main_input):
if is_tf_symbolic_tensor(input):
tensor_name = input.name.split(":")[0]
if tensor_name in parameter_names:
output[tensor_name] = input
else:
output[parameter_names[i]] = input
elif isinstance(input, allowed_types) or input is None:
output[parameter_names[i]] = input
else:
raise ValueError(
f"Data of type {type(input)} is not allowed only {allowed_types} is accepted for"
f" {parameter_names[i]}."
)
elif isinstance(main_input, Mapping):
if "inputs" in main_input:
warnings.warn(
"The `inputs` argument is deprecated and will be removed in a future version, use `input_ids`"
" instead.",
FutureWarning,
)
output["input_ids"] = main_input.pop("inputs")
if "decoder_cached_states" in main_input:
warnings.warn(
"The `decoder_cached_states` argument is deprecated and will be removed in a future version, use"
" `past_key_values` instead.",
FutureWarning,
)
output["past_key_values"] = main_input.pop("decoder_cached_states")
for k, v in dict(main_input).items():
if isinstance(v, allowed_types) or v is None:
output[k] = v
elif k not in parameter_names and "args" not in parameter_names:
logger.warning(
f"The parameter {k} does not belongs to the parameter list {parameter_names} and will be ignored."
)
continue
else:
raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.")
else:
if tf.is_tensor(main_input) or main_input is None:
output[main_input_name] = main_input
else:
raise ValueError(
f"Data of type {type(main_input)} is not allowed only {allowed_types} is accepted for {main_input_name}."
)
for name in parameter_names:
if name not in list(output.keys()) and name != "args":
output[name] = kwargs.pop(name, signature[name].default)
if "args" in output:
if output["args"] is not None and is_tf_symbolic_tensor(output["args"]):
tensor_name = output["args"].name.split(":")[0]
output[tensor_name] = output["args"]
else:
output["input_ids"] = output["args"]
del output["args"]
if "kwargs" in output:
del output["kwargs"]
cast_output = {}
for key, val in output.items():
if isinstance(val, tf.Tensor) and val.dtype == tf.int64:
cast_output[key] = tf.cast(val, tf.int32)
elif isinstance(val, np.ndarray) and val.dtype == np.int64:
cast_output[key] = val.astype(np.int32)
else:
cast_output[key] = val
output = cast_output
del cast_output
boolean_dict = {
k: v
for k, v in output.items()
if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"]
}
output.update(
booleans_processing(
config=config,
**boolean_dict,
)
)
return output
def tf_shard_checkpoint(weights, max_shard_size="10GB"):
"""
Splits a model state dictionary into sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
given size.
The sub-checkpoints are determined by iterating through the `weights` in order, ensuring that each
sub-checkpoint does not exceed `max_shard_size`.
Args:
weights (`List[tf.ResourceVariable]`): The list of tf.ResourceVariable objects representing the weights of
a model.
max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
Returns:
Tuple[Dict[str, List[tf.ResourceVariable]], Optional[Dict[str, List[int]]]]:
A tuple containing:
- A dictionary mapping from a checkpoint name (e.g., `"TF2_WEIGHTS_NAME"`) to a list of tf.ResourceVariable objects,
representing each sub-checkpoint.
- Optionally, a dictionary mapping from each checkpoint name to a list of sizes (in bytes) of the corresponding
sub-checkpoints.
"""
max_shard_size = convert_file_size_to_int(max_shard_size)
sharded_state_dicts = []
current_block = []
current_block_size = 0
total_size = 0
for item in weights:
weight_size = item.numpy().size * dtype_byte_size(item.dtype)
if current_block_size + weight_size > max_shard_size:
sharded_state_dicts.append(current_block)
current_block = []
current_block_size = 0
current_block.append(item)
current_block_size += weight_size
total_size += weight_size
sharded_state_dicts.append(current_block)
if len(sharded_state_dicts) == 1:
return {TF2_WEIGHTS_NAME: sharded_state_dicts[0]}, None
weight_map = {}
shards = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = TF2_WEIGHTS_NAME.replace(".h5", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.h5")
shards[shard_file] = shard
for weight in shard:
weight_name = weight.name
weight_map[weight_name] = shard_file
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
return shards, index
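# Hedged usage sketch: splitting a model's weights into ~200MB shards (the size is illustrative).
shards, index = tf_shard_checkpoint(model.weights, max_shard_size="200MB")
# With a single shard this returns ({TF2_WEIGHTS_NAME: [...]}, None); otherwise the keys look like
# "tf_model-00001-of-00003.h5" and index["weight_map"] maps every weight name to its shard file.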
def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, strict=False, _prefix=None):
"""
This is the same as `load_tf_weights` but for a sharded checkpoint. Detect missing and unexpected layers and load
the TF weights from the shard file accordingly to their names and shapes.
This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
loaded in the model.
Args:
model (`keras.models.Model`): The model in which to load the checkpoint.
shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names.
ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to ignore the mismatch between the sizes.
strict (`bool`, *optional*, defaults to `False`):
Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
Returns:
Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
mismatched layers.
"""
unexpected_keys = set()
saved_keys = set()
mismatched_keys = set()
model_keys = set()
model_layer_map = {}
for i, k in enumerate(model.weights):
layer_name = k.name
if _prefix is not None and layer_name.startswith(_prefix):
layer_name = layer_name[len(_prefix):]
layer_name = layer_name.lstrip("/")
if not ("model." in layer_name or len(layer_name.split("/")) == 1):
layer_name = "/".join(layer_name.split("/")[1:])
model_keys.add(layer_name)
model_layer_map[layer_name] = i
for shard_file in shard_files:
saved_weight_names_set, unexpected_keys_set, mismatched_keys_set = load_tf_shard(
model,
model_layer_map,
shard_file,
ignore_mismatched_sizes=ignore_mismatched_sizes,
_prefix=_prefix,
)
saved_keys.update(saved_weight_names_set)
unexpected_keys.update(unexpected_keys_set)
mismatched_keys.update(mismatched_keys_set)
gc.collect()
missing_keys = model_keys - saved_keys
if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0):
error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}"
if len(missing_keys) > 0:
str_missing_keys = ",".join([f'"{k}"' for k in missing_keys])
error_message += f"\nMissing key(s): {str_missing_keys}."
if len(unexpected_keys) > 0:
str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys])
error_message += f"\nUnexpected key(s): {str_unexpected_keys}."
raise RuntimeError(error_message)
return missing_keys, unexpected_keys, mismatched_keys
def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
"""
Loads a shard from a sharded checkpoint file. Handles the missing keys and unexpected keys.
Args:
model (`keras.models.Model`): Model in which the weights are loaded
model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model.
resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded
ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys
Returns:
Three sets: one for the saved weight names that were found and successfully restored from the shard file,
one for the unexpected keys, and one for the mismatched keys.
"""
saved_weight_names_set = set()
saved_weights = {}
mismatched_keys = set()
unexpected_keys = set()
try:
with h5py.File(resolved_archive_file, "r") as sharded_checkpoint_file:
saved_h5_model_layers_name = set(load_attributes_from_hdf5_group(sharded_checkpoint_file, "layer_names"))
weight_value_tuples = []
for layer_name in saved_h5_model_layers_name:
h5_layer_object = sharded_checkpoint_file[layer_name]
saved_weights[layer_name] = np.asarray(h5_layer_object)
saved_weight_names_set.add(layer_name)
if layer_name not in model_layer_map:
unexpected_keys.add(layer_name)
else:
symbolic_weight = model.weights[model_layer_map[layer_name]]
saved_weight_value = saved_weights[layer_name]
if saved_weight_value is not None:
if K.int_shape(symbolic_weight) != saved_weight_value.shape:
try:
array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight))
except ValueError as e:
if ignore_mismatched_sizes:
mismatched_keys.add(
(layer_name, saved_weight_value.shape, K.int_shape(symbolic_weight))
)
continue
else:
raise e
else:
array = saved_weight_value
weight_value_tuples.append((symbolic_weight, array))
K.batch_set_value(weight_value_tuples)
return saved_weight_names_set, unexpected_keys, mismatched_keys
except Exception as e:
try:
with open(resolved_archive_file) as f:
if f.read().startswith("version"):
raise OSError(
"You seem to have cloned a repository without having git-lfs installed. Please install "
"git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
"you cloned."
)
else:
raise ValueError(
f"Unable to locate the file {resolved_archive_file} which is necessary to load this pretrained"
" model. Make sure you have saved the model properly."
) from e
except (UnicodeDecodeError, ValueError):
raise OSError(
f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
f"at '{resolved_archive_file}'. "
"If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
"by loading it in pytorch and saving it localy. A convertion script should be realeased soon."
)
def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
if resolved_archive_file.endswith(".safetensors"):
load_function = load_tf_weights_from_safetensors
else:
load_function = load_tf_weights_from_h5
return load_function(
model, resolved_archive_file, ignore_mismatched_sizes=ignore_mismatched_sizes, _prefix=_prefix
)
def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
mismatched_layers = []
# (the HDF5 reading loop that builds `weight_value_tuples`, `symbolic_weights_names` and
# `saved_weight_names_set` is elided in this excerpt)
K.batch_set_value(weight_value_tuples)
missing_layers.extend(list(symbolic_weights_names - saved_weight_names_set))
unexpected_layers.extend(list(saved_weight_names_set - symbolic_weights_names))
return missing_layers, unexpected_layers, mismatched_layers
def load_tf_weights_from_safetensors(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
with safe_open(resolved_archive_file, framework="tf") as safetensors_archive:
mismatched_layers = []
weight_names = [strip_model_name_and_prefix(w.name, _prefix=_prefix) for w in model.weights]
loaded_weight_names = list(safetensors_archive.keys())
missing_layers = list(set(weight_names) - set(loaded_weight_names))
unexpected_layers = list(set(loaded_weight_names) - set(weight_names))
for weight in model.weights:
weight_name = strip_model_name_and_prefix(weight.name, _prefix=_prefix)
if weight_name in loaded_weight_names:
weight_value = safetensors_archive.get_tensor(weight_name)
if K.int_shape(weight) != weight_value.shape:
try:
weight_value = tf.reshape(weight_value, K.int_shape(weight))
except (ValueError, tf.errors.InvalidArgumentError) as e:
if ignore_mismatched_sizes:
mismatched_layers.append((weight_name, weight_value.shape, K.int_shape(weight)))
continue
else:
raise e
K.set_value(weight, weight_value)
return missing_layers, unexpected_layers, mismatched_layers
def init_copy_embeddings(old_embeddings, new_num_tokens):
r"""
This function aims to reduce the embeddings in case new_num_tokens < old_num_tokens or to pad with -1 in case
new_num_tokens > old_num_tokens. A mask is also computed in order to know which weight in the embeddings should be
kept or not. Example:
- if new_num_tokens=5 and old_num_tokens=4 and old_embeddings=[w1,w2,w3,w4]
- mask=[True,True,True,True,False] and current_weights=[w1,w2,w3,w4,-1]
- if new_num_tokens=4 and old_num_tokens=5 and old_embeddings=[w1,w2,w3,w4,w5]
- mask=[True,True,True,True] and current_weights=[w1,w2,w3,w4]
"""
old_num_tokens, old_embedding_dim = shape_list(old_embeddings)
size_diff = new_num_tokens - old_num_tokens
if tf.math.greater(size_diff, 0):
current_weights = tf.pad(
old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1
)
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True)
mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False)
else:
current_weights = tf.slice(
old_embeddings.value(),
tf.convert_to_tensor([0, 0]),
tf.convert_to_tensor([new_num_tokens, old_embedding_dim]),
)
mask = tf.fill(tf.convert_to_tensor([new_num_tokens, 1]), True)
return mask, current_weights
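# Concrete illustration of both branches of `init_copy_embeddings` on a toy (4, 3) embedding matrix:
old = tf.Variable(tf.ones((4, 3)))
mask, weights = init_copy_embeddings(old, new_num_tokens=6)
# weights: shape (6, 3), rows 0-3 copied, rows 4-5 filled with -1; mask: (6, 1) = [True]*4 + [False]*2
mask, weights = init_copy_embeddings(old, new_num_tokens=3)
# weights: shape (3, 3), truncated copy; mask: (3, 1), all True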
"""
Class attributes (overridden by derived classes):
- **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
for this model architecture.
- **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
classes of the same architecture adding modules on top of the base model.
- **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
models, `pixel_values` for vision models and `input_values` for speech models).
"""
config_class = None
base_model_prefix = ""
main_input_name = "input_ids"
_auto_class = None
_using_dummy_loss = None
_label_to_output_map = None
_keys_to_ignore_on_load_missing = None
_keys_to_ignore_on_load_unexpected = None
_requires_load_weight_prefix = False
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
"""
Dummy inputs to build the network.
Returns:
`Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummies = {}
for key, spec in self.input_signature.items():
dummy_shape = [dim if dim is not None else 2 for dim in spec.shape]
if spec.shape[0] is None:
dummy_shape[0] = 1
dummies[key] = tf.ones(shape=dummy_shape, dtype=spec.dtype)
if key == "token_type_ids":
dummies[key] = tf.zeros_like(dummies[key])
if self.config.add_cross_attention and "encoder_hidden_states" in inspect.signature(self.call).parameters:
if "encoder_hidden_states" not in dummies:
if self.main_input_name == "input_ids":
dummies["encoder_hidden_states"] = tf.ones(
shape=(1, 2, self.config.hidden_size), dtype=tf.float32, name="encoder_hidden_states"
)
else:
raise NotImplementedError(
"Model has cross-attention but we couldn't infer the shape for the encoder hidden states. Please manually override dummy_inputs!"
)
return dummies
def build_in_name_scope(self):
with tf.name_scope(self.name):
self.build(input_shape=None)
@property
def framework(self) -> str:
"""
:str: Identifies that this is a TensorFlow model.
"""
return "tf"
def build(self, input_shape=None):
pass
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
if not isinstance(config, PretrainedConfig):
raise ValueError(
f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class "
"`PretrainedConfig`. To create a model from a pretrained model use "
f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.config = config
self.name_or_path = config.name_or_path
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
self._set_save_spec(self.input_signature)
def get_config(self):
return self.config.to_dict()
@functools.wraps(keras.Model.fit)
def fit(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().fit(*args, **kwargs)
@functools.wraps(keras.Model.train_on_batch)
def train_on_batch(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().train_on_batch(*args, **kwargs)
@functools.wraps(keras.Model.test_on_batch)
def test_on_batch(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().test_on_batch(*args, **kwargs)
@functools.wraps(keras.Model.predict_on_batch)
def predict_on_batch(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().predict_on_batch(*args, **kwargs)
@functools.wraps(keras.Model.predict)
def predict(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().predict(*args, **kwargs)
@functools.wraps(keras.Model.evaluate)
def evaluate(self, *args, **kwargs):
args, kwargs = convert_batch_encoding(*args, **kwargs)
return super().evaluate(*args, **kwargs)
@classmethod
def from_config(cls, config, **kwargs):
if isinstance(config, PretrainedConfig):
return cls._from_config(config, **kwargs)
return cls._from_config(cls.config_class.from_dict(config, **kwargs))
@classmethod
def _from_config(cls, config, **kwargs):
"""
所有模型初始化时应置于其下的上下文管理器都在这里。
"""
return cls(config, **kwargs)
def get_head_mask(self, head_mask: tf.Tensor | None, num_hidden_layers: int) -> tf.Tensor:
"""
Prepare the head mask if needed.
Args:
head_mask (`tf.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (`int`):
The number of hidden layers in the model.
Returns:
`tf.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
`[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
else:
head_mask = [None] * num_hidden_layers
return head_mask
def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
"""-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]"""
if head_mask.shape.rank == 1:
head_mask = head_mask[None, None, :, None, None]
head_mask = tf.repeat(head_mask, repeats=num_hidden_layers, axis=0)
elif head_mask.shape.rank == 2:
head_mask = head_mask[:, None, :, None, None]
assert head_mask.shape.rank == 5, f"head_mask rank must be 5, but is {head_mask.shape.rank}"
head_mask = tf.cast(head_mask, tf.float32)
return head_mask
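# Shape sketch for `get_head_mask` (assumes `model` is a TFPreTrainedModel instance; numbers are illustrative):
per_head_mask = tf.ones((12,))                              # one value per attention head
mask_5d = model.get_head_mask(per_head_mask, num_hidden_layers=6)
# -> shape [6, 1, 12, 1, 1]: one slice per layer, broadcastable over batch and sequence dims
no_mask = model.get_head_mask(None, num_hidden_layers=6)    # -> [None, None, None, None, None, None]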
@tf.function
def serving(self, inputs):
"""
Method used for serving the model. Does not have a specific signature, but will be specialized as concrete
functions when saving with `save_pretrained`.
Args:
inputs (`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)
return self.serving_output(output)
@property
def input_signature(self) -> Dict[str, tf.TensorSpec]:
"""
This property should return a dict mapping input names to tf.TensorSpec objects, representing the expected
shape and dtype for model inputs. It is used for both serving and for generating dummy inputs.
"""
model_inputs = list(inspect.signature(self.call).parameters)
sig = {}
if "input_ids" in model_inputs:
if self.__class__.__name__.endswith("ForMultipleChoice"):
text_dims = 3
else:
text_dims = 2
for input_name in (
"input_ids",
"attention_mask",
"token_type_ids",
"decoder_input_ids",
"decoder_attention_mask",
):
if input_name in model_inputs:
sig[input_name] = tf.TensorSpec([None] * text_dims, tf.int32, name=input_name)
if "pixel_values" in model_inputs:
pixel_values_shape = [None, None, None, None]
if hasattr(self.config, "vision_config"):
vision_config = self.config.vision_config
else:
vision_config = self.config
if hasattr(vision_config, "num_channels"):
pixel_values_shape[1] = vision_config.num_channels
else:
raise NotImplementedError(
"Could not infer number of channels from config, please override input_signature to specify input shapes."
)
if hasattr(vision_config, "image_size"):
pixel_values_shape[2] = pixel_values_shape[3] = vision_config.image_size
elif hasattr(vision_config, "input_size"):
pixel_values_shape[2] = pixel_values_shape[3] = vision_config.input_size
else:
raise NotImplementedError(
"Could not infer input image shape from config, please override input_signature to specify input shapes."
)
sig["pixel_values"] = tf.TensorSpec(pixel_values_shape, tf.float32, name="pixel_values")
if "input_features" in model_inputs:
raise NotImplementedError("Audio models need a manually defined input_signature")
return sig
def serving_output(self, output):
"""
Prepare the output of the saved model. Can be overridden if specific serving modifications are required.
"""
if not isinstance(output, ModelOutput):
return output
for key in output:
if key.endswith("hidden_states") and not getattr(self.config, "output_hidden_states", False):
output[key] = None
elif key.endswith("attentions") and not getattr(self.config, "output_attentions", False):
output[key] = None
elif key == "past_key_values" and not getattr(self.config, "use_cache", False):
output[key] = None
elif key == "cross_attentions" and not (
getattr(self.config, "output_attentions", False) and getattr(self.config, "add_cross_attention", False)
):
output[key] = None
if isinstance(output[key], (tuple, list)):
try:
output[key] = tf.convert_to_tensor(output[key])
except (ValueError, tf.errors.InvalidArgumentError):
pass
return output
@classmethod
def can_generate(cls) -> bool:
"""
Returns whether this model can generate sequences with `.generate()`.
Returns:
`bool`: Whether this model can generate sequences with `.generate()`.
"""
if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
return False
return True
def get_input_embeddings(self) -> keras.layers.Layer:
"""
Returns the model's input embeddings layer.
Returns:
`tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
"""
main_layer = getattr(self, self.base_model_prefix, self)
if main_layer is not self:
return main_layer.get_input_embeddings()
else:
raise NotImplementedError
def _save_checkpoint(self, checkpoint_dir, epoch):
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
weights_path = os.path.join(checkpoint_dir, "weights.h5")
self.save_weights(weights_path)
extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
with open(extra_data_path, "wb") as f:
pickle.dump(extra_data, f)
def prepare_tf_dataset(
self,
dataset: "datasets.Dataset",
batch_size: int = 8,
shuffle: bool = True,
tokenizer: Optional["PreTrainedTokenizerBase"] = None,
collate_fn: Optional[Callable] = None,
collate_fn_args: Optional[Dict[str, Any]] = None,
drop_remainder: Optional[bool] = None,
prefetch: bool = True,
):
def compile(
self,
optimizer="rmsprop",
loss="auto_with_warning",
metrics=None,
loss_weights=None,
weighted_metrics=None,
run_eagerly=None,
steps_per_execution=None,
**kwargs,
):
"""
This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss
function themselves.
"""
if loss in ("auto_with_warning", "passthrough"):
logger.info(
"No loss specified in compile() - the model's internal loss computation will be used as the "
"loss. Don't panic - this is a common way to train TensorFlow models in Transformers! "
"To disable this behaviour please pass a loss argument, or explicitly pass "
"`loss=None` if you do not want your model to compute a loss. You can also specify `loss='auto'` to "
"get the internal loss without printing this info string."
)
loss = "auto"
if loss == "auto":
loss = dummy_loss
self._using_dummy_loss = True
else:
self._using_dummy_loss = False
parent_args = list(inspect.signature(keras.Model.compile).parameters.keys())
if "steps_per_execution" in parent_args:
super().compile(
optimizer=optimizer,
loss=loss,
metrics=metrics,
loss_weights=loss_weights,
weighted_metrics=weighted_metrics,
run_eagerly=run_eagerly,
steps_per_execution=steps_per_execution,
**kwargs,
)
else:
super().compile(
optimizer=optimizer,
loss=loss,
metrics=metrics,
loss_weights=loss_weights,
weighted_metrics=weighted_metrics,
run_eagerly=run_eagerly,
experimental_steps_per_execution=steps_per_execution,
**kwargs,
)
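# Usage sketch: compiling without an explicit loss so the model's internal loss is routed through
# `dummy_loss` (the optimizer and `tf_train_dataset` below are assumptions, not from this file).
model.compile(optimizer=keras.optimizers.Adam(learning_rate=3e-5))  # loss defaults to "auto_with_warning"
model.fit(tf_train_dataset, epochs=3)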
def compute_loss(self, *args, **kwargs):
if hasattr(keras.Model, "compute_loss"):
return super().compute_loss(*args, **kwargs)
else:
warnings.warn(
"The old compute_loss method is deprecated as it conflicts with the Keras compute_loss "
"method added in TF 2.8. If you want the original HF compute_loss, please call "
"hf_compute_loss() instead. From TF versions >= 2.8, or Transformers versions >= 5, "
"calling compute_loss() will get the Keras method instead.",
FutureWarning,
)
return self.hf_compute_loss(*args, **kwargs)
def get_label_to_output_name_mapping(self):
arg_names = list(inspect.signature(self.call).parameters)
if self._label_to_output_map is not None:
return self._label_to_output_map
elif "start_positions" in arg_names:
return {"start_positions": "start_logits", "end_positions": "end_logits"}
elif "sentence_order_label" in arg_names:
return {"labels": "prediction_logits", "sentence_order_label": "sop_logits"}
elif "next_sentence_label" in arg_names:
return {"labels": "prediction_logits", "next_sentence_label": "seq_relationship_logits"}
elif "mc_labels" in arg_names:
return {"labels": "logits", "mc_labels": "mc_logits"}
else:
return {}
def create_model_card(
self,
output_dir,
model_name: str,
language: Optional[str] = None,
license: Optional[str] = None,
tags: Optional[str] = None,
finetuned_from: Optional[str] = None,
tasks: Optional[str] = None,
dataset_tags: Optional[Union[str, List[str]]] = None,
dataset: Optional[Union[str, List[str]]] = None,
dataset_args: Optional[Union[str, List[str]]] = None,
):
"""
Creates a draft of a model card using the information available to the `Trainer`.
Args:
output_dir (`str` or `os.PathLike`):
The folder in which to create the model card.
model_name (`str`, *optional*):
The name of the model.
language (`str`, *optional*):
The language of the model (if applicable)
license (`str`, *optional*):
The license of the model. Will default to the license of the pretrained model used, if the original
model given to the `Trainer` comes from a repo on the Hub.
tags (`str` or `List[str]`, *optional*):
Some tags to be included in the metadata of the model card.
finetuned_from (`str`, *optional*):
The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
of the original model given to the `Trainer` (if it comes from the Hub).
tasks (`str` or `List[str]`, *optional*):
One or several task identifiers, to be included in the metadata of the model card.
dataset_tags (`str` or `List[str]`, *optional*):
One or several dataset tags, to be included in the metadata of the model card.
dataset (`str` or `List[str]`, *optional*):
One or several dataset identifiers, to be included in the metadata of the model card.
dataset_args (`str` or `List[str]`, *optional*):
One or several dataset arguments, to be included in the metadata of the model card.
"""
from .modelcard import TrainingSummary
training_summary = TrainingSummary.from_keras(
self,
keras_history=self.history,
language=language,
license=license,
tags=tags,
model_name=model_name,
finetuned_from=finetuned_from,
tasks=tasks,
dataset_tags=dataset_tags,
dataset=dataset,
dataset_args=dataset_args,
)
model_card = training_summary.to_model_card()
with open(os.path.join(output_dir, "README.md"), "w") as f:
f.write(model_card)
def set_input_embeddings(self, value):
"""
Set model's input embeddings
Args:
value (`tf.Variable`):
The new weights mapping hidden states to vocabulary.
"""
main_layer = getattr(self, self.base_model_prefix)
if main_layer is None:
raise NotImplementedError("The model does not implement the base_model_prefix attribute.")
try:
main_layer.set_input_embeddings(value)
except AttributeError:
logger.info("Building the model")
self.build_in_name_scope()
main_layer.set_input_embeddings(value)
def get_output_embeddings(self) -> Union[None, keras.layers.Layer]:
"""
Returns the model's output embeddings
Returns:
`tf.Variable`: The new weights mapping vocabulary to hidden states.
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
try:
return lm_head.get_output_embeddings()
except AttributeError:
logger.info("Building the model")
self.build_in_name_scope()
return lm_head.get_output_embeddings()
return None
def set_output_embeddings(self, value):
"""
Set model's output embeddings
Args:
value (`tf.Variable`):
The new weights mapping hidden states to vocabulary.
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
try:
lm_head.set_output_embeddings(value)
except AttributeError:
logger.info("Building the model")
self.build_in_name_scope()
lm_head.set_output_embeddings(value)
def get_output_layer_with_bias(self) -> Union[None, keras.layers.Layer]:
"""
Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the
embeddings
Return:
`keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
"""
warnings.warn(
"The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
)
return self.get_lm_head()
def get_prefix_bias_name(self) -> Union[None, str]:
"""
Get the concatenated _prefix name of the bias from the model name to the parent layer
Return:
`str`: The _prefix name of the bias.
"""
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return None
def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
"""
获取 LM 头部的偏置字典。键表示偏置属性的名称。
Return:
`tf.Variable`: 表示偏置的权重,如果不是 LM 模型则返回 None。
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
try:
return lm_head.get_bias()
except AttributeError:
self.build_in_name_scope()
return lm_head.get_bias()
return None
def set_bias(self, value):
"""
设置 LM 头部所有的偏置。
Args:
value (`Dict[tf.Variable]`):
LM 头部新的偏置字典。
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
try:
lm_head.set_bias(value)
except AttributeError:
self.build_in_name_scope()
lm_head.set_bias(value)
def get_lm_head(self) -> keras.layers.Layer:
"""
LM 头部层。所有包含 LM 头部的模型必须重写此方法。
Return:
`keras.layers.Layer`: 如果模型有 LM 头部则返回该层,否则返回 None。
"""
return None
def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None
) -> Union[keras.layers.Embedding, tf.Variable]:
"""
调整模型输入标记嵌入矩阵的大小,如果 `new_num_tokens != config.vocab_size`。
在之后处理权重嵌入时要注意是否模型类有 `tie_weights()` 方法。
Arguments:
new_num_tokens (`int`, *optional*):
嵌入矩阵中的新标记数量。增加大小将在末尾添加新初始化的向量,减小大小将从末尾删除向量。如果未提供或为 `None`,则仅返回输入标记的指针而不执行任何操作。
Return:
`tf.Variable` 或 `keras.layers.Embedding`: 模型输入标记的指针。
"""
if isinstance(self.get_input_embeddings(), keras.layers.Embedding):
return self._v2_resized_token_embeddings(new_num_tokens)
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
return self._get_word_embedding_weight(self.get_input_embeddings())
model_embeds = self._resize_token_embeddings(new_num_tokens)
self.config.vocab_size = new_num_tokens
return model_embeds
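# Typical usage sketch: grow the embedding matrix after adding new tokens
# (`tokenizer` and the added token are assumptions for illustration).
tokenizer.add_tokens(["<new_token>"])
model.resize_token_embeddings(len(tokenizer))  # also updates model.config.vocab_size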
def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> keras.layers.Embedding:
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
return self.get_input_embeddings()
model_embeds = self._v2_resize_token_embeddings(new_num_tokens)
self.config.vocab_size = new_num_tokens
return model_embeds
def _get_word_embedding_weight(model, embedding_layer):
if isinstance(embedding_layer, tf.Tensor):
return embedding_layer
embeds = getattr(embedding_layer, "weight", None)
if embeds is not None:
return embeds
embeds = getattr(embedding_layer, "decoder", None)
if embeds is not None:
return embeds
model.build_in_name_scope()
embeds = getattr(embedding_layer, "weight", None)
if embeds is not None:
return embeds
embeds = getattr(embedding_layer, "decoder", None)
if embeds is not None:
return embeds
return None
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self._get_word_embedding_weight(self.get_input_embeddings())
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
if self.get_bias() is not None:
old_lm_head_bias = self.get_bias()
new_lm_head_bias = self._get_resized_lm_head_bias(old_lm_head_bias, new_num_tokens)
self.set_bias(new_lm_head_bias)
if self.get_output_embeddings() is not None:
old_lm_head_decoder = self._get_word_embedding_weight(self.get_output_embeddings())
new_lm_head_decoder = self._get_resized_lm_head_decoder(old_lm_head_decoder, new_num_tokens)
self.set_output_embeddings(new_lm_head_decoder)
self.set_input_embeddings(new_embeddings)
return self.get_input_embeddings()
def _v2_resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._v2_get_resized_embeddings(old_embeddings, new_num_tokens)
self.set_input_embeddings(new_embeddings)
if self.get_bias() is not None:
old_lm_head_bias = self.get_bias()
new_lm_head_bias = self._v2_get_resized_lm_head_bias(old_lm_head_bias, new_num_tokens)
self.set_bias(new_lm_head_bias)
tied_weights = self.get_input_embeddings() == self.get_output_embeddings()
if self.get_output_embeddings() is not None and not tied_weights:
old_lm_head_decoder = self._get_word_embedding_weight(self.get_output_embeddings())
new_lm_head_decoder = self._get_resized_lm_head_decoder(old_lm_head_decoder, new_num_tokens)
self.set_output_embeddings(new_lm_head_decoder)
return self.get_input_embeddings()
def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
"""
Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end.
Reducing the size will remove vectors from the end
Args:
old_lm_head_bias (`tf.Variable`):
Old lm head bias to be resized.
new_num_tokens (`int`, *optional*):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
vectors from the end. If not provided or `None`, just returns None
Return:
`tf.Variable`: Pointer to the resized bias.
"""
new_lm_head_bias = {}
for attr, weight in old_lm_head_bias.items():
first_dim, old_num_tokens = (None, shape_list(weight)[0]) if tf.rank(weight) == 1 else shape_list(weight)
size_diff = new_num_tokens - old_num_tokens
final_shape = [new_num_tokens] if first_dim is None else [first_dim, new_num_tokens]
if tf.math.greater(size_diff, 0):
padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy]
bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True)
bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False)
else:
slice_from = [0] if first_dim is None else [0, 0]
current_bias = tf.slice(
weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
)
bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)
new_bias = self.add_weight(
shape=final_shape,
initializer="zeros",
trainable=True,
name=weight.name.split(":")[0],
)
init_bias = tf.where(bias_mask, current_bias, new_bias.value())
new_bias.assign(init_bias)
new_lm_head_bias[attr] = new_bias
return new_lm_head_bias
def _v2_get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens) -> Dict[str, tf.Tensor]:
"""
Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end.
Reducing the size will remove vectors from the end
Args:
old_lm_head_bias (`Dict[str, tf.Variable]`):
Old lm head bias to be resized.
new_num_tokens (`int`):
New number of tokens in the linear matrix. Increasing the size will add newly initialized vectors at
the end. Reducing the size will remove vectors from the end.
Return:
`tf.Tensor`: Values for the resized bias.
"""
new_lm_head_bias = {}
for attr, weight in old_lm_head_bias.items():
first_dim, old_num_tokens = (None, shape_list(weight)[0]) if tf.rank(weight) == 1 else shape_list(weight)
size_diff = new_num_tokens - old_num_tokens
if old_num_tokens > new_num_tokens:
new_bias = weight.value()[..., :new_num_tokens]
else:
padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]]
new_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape))
new_lm_head_bias[attr] = new_bias
return new_lm_head_bias
def _get_resized_lm_head_decoder(self, old_lm_head_decoder, new_num_tokens):
"""
Build a resized decoder from the old ones. Increasing the size will add newly initialized vectors at the end.
Reducing the size will remove vectors from the end
Args:
old_lm_head_decoder (`tf.Variable`):
Old lm head decoder to be resized.
new_num_tokens (`int`, *optional*):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
vectors from the end. If not provided or `None`, just returns None
Return:
`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the input
ones.
"""
new_lm_head_decoder = old_lm_head_decoder
is_input_output_equals = tf.reduce_any(
self._get_word_embedding_weight(self.get_input_embeddings()) == old_lm_head_decoder
)
if old_lm_head_decoder is not None and not is_input_output_equals:
old_embedding_dim = shape_list(old_lm_head_decoder)[1]
decoder_mask, current_decoder = init_copy_embeddings(old_lm_head_decoder, new_num_tokens)
new_lm_head_decoder = self.add_weight(
shape=(new_num_tokens, old_embedding_dim),
initializer="zeros",
trainable=True,
name=old_lm_head_decoder.name.split(":")[0],
)
init_decoder = tf.where(decoder_mask, current_decoder, new_lm_head_decoder.value())
new_lm_head_decoder.assign(init_decoder)
return new_lm_head_decoder
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None) -> tf.Variable:
"""
Build a resized Embedding weights from a provided token Embedding weights. Increasing the size will add newly
initialized vectors at the end. Reducing the size will remove vectors from the end
Args:
old_embeddings (`tf.Variable`):
Old embeddings to be resized.
new_num_tokens (`int`, *optional*):
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
`tf.Variable` module of the model without doing anything.
Return:
`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if `new_num_tokens` is
`None`
"""
old_embedding_dim = shape_list(old_embeddings)[1]
init_range = getattr(self.config, "initializer_range", 0.02)
embeddings_mask, current_embeddings = init_copy_embeddings(old_embeddings, new_num_tokens)
new_embeddings = self.add_weight(
name=old_embeddings.name.split(":")[0],
shape=[new_num_tokens, old_embedding_dim],
initializer=get_initializer(init_range),
dtype=tf.float32,
)
init_embeddings = tf.where(embeddings_mask, current_embeddings, new_embeddings.value())
new_embeddings.assign(init_embeddings)
return new_embeddings
def _v2_get_resized_embeddings(self, old_embeddings, new_num_tokens) -> keras.layers.Embedding:
"""
Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end.
Args:
old_embeddings (`keras.layers.Embedding`):
Old embeddings to be resized.
new_num_tokens (`int`, *optional*):
New number of tokens in the embedding matrix.
Return:
`keras.layers.Embedding`: Resized Embedding layer.
"""
init_range = 0.02
potential_initialization_variable_names = [
"initializer_range",
"initializer_factor",
"init_std",
]
for var_name in potential_initialization_variable_names:
if hasattr(self.config, var_name):
init_range = getattr(self.config, var_name)
new_embeddings = keras.layers.Embedding(
input_dim=new_num_tokens,
output_dim=old_embeddings.output_dim,
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=init_range),
name=old_embeddings.embeddings.name[:-13],
)
new_embeddings(tf.constant([[0]]))
if old_embeddings.input_dim >= new_num_tokens:
init_embeddings = old_embeddings.embeddings[:new_num_tokens]
else:
init_embeddings = tf.concat(
[old_embeddings.embeddings, new_embeddings.embeddings[old_embeddings.input_dim :]], axis=0
)
new_embeddings.embeddings.assign(init_embeddings)
return new_embeddings
def prune_heads(self, heads_to_prune):
"""
Prunes heads of the base model.
Arguments:
heads_to_prune (`Dict[int, List[int]]`):
Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
layer 1 and heads 2 and 3 on layer 2.
"""
raise NotImplementedError
def save_pretrained(
self,
save_directory,
saved_model=False,
version=1,
push_to_hub=False,
signatures=None,
max_shard_size: Union[int, str] = "10GB",
create_pr: bool = False,
safe_serialization: bool = False,
token: Optional[Union[str, bool]] = None,
**kwargs,
):
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
*model_args,
config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
ignore_mismatched_sizes: bool = False,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
use_safetensors: bool = None,
**kwargs,
):
"""
Load a model (or a model configuration) from a pretrained model.
Args:
pretrained_model_name_or_path (str or os.PathLike):
Name of, or path to, the pretrained model.
*model_args:
Additional model-specific positional arguments.
config (PretrainedConfig, str, os.PathLike, optional):
Optional model configuration.
cache_dir (str or os.PathLike, optional):
Optional cache directory.
ignore_mismatched_sizes (bool):
Whether to ignore size mismatches when loading weights, defaults to False.
force_download (bool):
Whether to force (re-)downloading the model, defaults to False.
local_files_only (bool):
Whether to only use local files, defaults to False.
token (str or bool, optional):
Optional authentication token.
revision (str):
The model revision to load, defaults to "main".
use_safetensors (bool, optional):
Whether to use safetensors, defaults to None.
**kwargs:
Any other unspecified keyword arguments.
"""
def push_to_hub(
self,
repo_id: str,
use_temp_dir: Optional[bool] = None,
commit_message: Optional[str] = None,
private: Optional[bool] = None,
max_shard_size: Optional[Union[int, str]] = "10GB",
token: Optional[Union[bool, str]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
create_pr: bool = False,
**base_model_card_args,
):
"""
Push the model to the specified repository on the Hub.
Args:
repo_id (str):
The identifier of the repository.
use_temp_dir (bool, optional):
Whether to use a temporary directory, defaults to None.
commit_message (str, optional):
The commit message used for the push.
private (bool, optional):
Whether the repository should be private, defaults to None.
max_shard_size (int or str, optional):
The maximum size of each checkpoint shard, defaults to "10GB".
token (bool or str, optional):
The authentication token.
use_auth_token (bool or str, optional):
(Deprecated) authentication token, kept for backwards compatibility.
create_pr (bool):
Whether to create a Pull Request, defaults to False.
**base_model_card_args:
Additional model card arguments.
"""
@classmethod
def register_for_auto_class(cls, auto_class="TFAutoModel"):
"""
Register this class with a given auto class. This should only be used for custom models, as the models in the
library are already mapped to an auto class.
<Tip warning={true}>
This API is experimental and may have some slight breaking changes in the next releases.
</Tip>
Args:
auto_class (str or type, optional, defaults to "TFAutoModel"):
The auto class to register this new model with.
"""
if not isinstance(auto_class, str):
auto_class = auto_class.__name__
import transformers.models.auto as auto_module
if not hasattr(auto_module, auto_class):
raise ValueError(f"{auto_class} is not a valid auto class.")
cls._auto_class = auto_class
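# Hedged sketch of registering a custom model class (class and config names are hypothetical):
class TFMyCustomModel(TFPreTrainedModel):
    config_class = MyCustomConfig

TFMyCustomModel.register_for_auto_class("TFAutoModel")
# TFAutoModel can now resolve to TFMyCustomModel for repositories that ship this custom code.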
# A custom 1D-convolutional layer, as defined by Radford et al. for OpenAI GPT (also used in GPT-2).
class TFConv1D(keras.layers.Layer):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
Basically works like a linear layer but the weights are transposed.
Args:
nf (`int`):
The number of output features.
nx (`int`):
The number of input features.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
super().__init__(**kwargs)
self.nf = nf # number of output features
self.nx = nx # number of input features
self.initializer_range = initializer_range # stddev used to initialize the weights
def build(self, input_shape):
if self.built:
return
self.built = True
# Add the weight variable: shape [nx, nf], initialized with the configured stddev
self.weight = self.add_weight(
"weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
)
# Add the bias variable: shape [1, nf], zero-initialized
self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
def call(self, x):
bz, sl = shape_list(x)[:2] # batch size and sequence length of the input
x = tf.reshape(x, [-1, self.nx]) # flatten the input to a 2D tensor
x = tf.matmul(x, self.weight) + self.bias # matrix multiplication plus bias
x = tf.reshape(x, [bz, sl, self.nf]) # reshape back to (batch, seq, nf)
return x
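# Shape sketch: TFConv1D acts like a Dense layer whose weight matrix is stored transposed.
conv = TFConv1D(nf=3072, nx=768, initializer_range=0.02, name="c_fc")
hidden = tf.random.normal((2, 10, 768))   # (batch, seq_len, nx)
projected = conv(hidden)                  # -> (2, 10, 3072), i.e. (batch, seq_len, nf)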
class TFSharedEmbeddings(keras.layers.Layer):
r"""
Construct shared token embeddings.
The weights of the embedding layer is usually shared with the weights of the linear decoder when doing language
modeling.
Args:
vocab_size (`int`):
The size of the vocabulary, e.g., the number of unique tokens.
hidden_size (`int`):
The size of the embedding vectors.
initializer_range (`float`, *optional*):
The standard deviation to use when initializing the weights. If no value is provided, it will default to
\\(1/\sqrt{hidden\_size}\\).
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
# TODO (joao): flagged for deletion due to embeddings refactor
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size # size of the vocabulary, i.e. number of unique tokens
self.hidden_size = hidden_size # size of the embedding vectors
self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
# if no initializer stddev is provided, default to 1/sqrt(hidden_size)
warnings.warn(
"`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.",
DeprecationWarning,
)
def build(self, input_shape):
"""
Build shared token embedding layer.
This method initializes the layer's weight matrix based on the specified vocabulary size and hidden size.
The weight matrix is initialized using a custom initializer within the specified range.
Args:
input_shape (tuple): Shape tuple describing the input shape.
Returns:
None
"""
self.weight = self.add_weight(
"weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
)
super().build(input_shape)
def get_config(self):
"""
Returns the configuration of the layer.
Returns:
dict: Configuration dictionary containing vocab_size, hidden_size, and initializer_range.
"""
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs: tf.Tensor, mode: str = "embedding") -> tf.Tensor:
"""
Get token embeddings of inputs or decode final hidden state.
Args:
inputs (`tf.Tensor`):
In embedding mode, should be an int64 tensor with shape `[batch_size, length]`.
In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
mode (`str`, defaults to `"embedding"`):
A valid value is either `"embedding"` or `"linear"`, indicating the layer's usage mode.
Returns:
`tf.Tensor`: Depending on mode,
- In embedding mode: float32 embedding tensor, shape `[batch_size, length, embedding_size]`.
- In linear mode: float32 tensor, shape `[batch_size, length, vocab_size]`.
Raises:
ValueError: if `mode` is not valid.
"""
if mode == "embedding":
return self._embedding(inputs)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError(f"mode {mode} is not valid.")
def _embedding(self, input_ids):
"""
Applies embedding based on input_ids tensor.
Args:
input_ids: Tensor containing token indices.
Returns:
`tf.Tensor`: Float32 embedding tensor.
"""
return tf.gather(self.weight, input_ids)
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [..., hidden_size]
Returns:
`tf.Tensor`: Float32 tensor with shape [..., vocab_size].
"""
first_dims = shape_list(inputs)[:-1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True)
return tf.reshape(logits, first_dims + [self.vocab_size])
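# Sketch of the two modes of TFSharedEmbeddings (the layer itself is deprecated, see the warning above):
shared = TFSharedEmbeddings(vocab_size=100, hidden_size=16, name="shared")
token_ids = tf.constant([[1, 2, 3]])
vectors = shared(token_ids, mode="embedding")   # (1, 3, 16): look up token embeddings
logits = shared(vectors, mode="linear")         # (1, 3, 100): project hidden states back onto the vocabulary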
class TFSequenceSummary(keras.layers.Layer):
"""
Compute a single vector summary of a sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
- `"last"` -- Take the last token hidden state (like XLNet)
- `"first"` -- Take the first token hidden state (like Bert)
- `"mean"` -- Take the mean of all tokens hidden states
- `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- `"attn"` -- Not implemented now, use multi-head attention
- **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
(otherwise to `config.hidden_size`).
- **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
another string or `None` will add no activation.
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
    # Constructor: takes a pretrained config plus optional extra keyword arguments
    def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
        # Call the parent class initializer
        super().__init__(**kwargs)
        # Pick the summary type from the config, defaulting to "last"
        self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
        # "attn" is not implemented; use a standard multi-head attention module instead
        if self.summary_type == "attn":
            raise NotImplementedError
        # A projection is added after the vector extraction when config.summary_use_proj is True
        self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj
        if self.has_summary:
            # Project to config.num_labels classes when summary_proj_to_labels is True and num_labels > 0
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            # Dense layer used for the summary projection, with num_classes output units
            self.summary = keras.layers.Dense(
                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
            )
        # Optional activation applied to the output (taken from config.summary_activation)
        self.has_activation = False
        activation_string = getattr(config, "summary_activation", None)
        if activation_string is not None:
            self.has_activation = True
            self.activation = get_tf_activation(activation_string)
        # Optional dropout applied before the projection (config.summary_first_dropout > 0)
        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
        if self.has_first_dropout:
            self.first_dropout = keras.layers.Dropout(config.summary_first_dropout)
        # Optional dropout applied after the projection and activation (config.summary_last_dropout > 0)
        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
        if self.has_last_dropout:
            self.last_dropout = keras.layers.Dropout(config.summary_last_dropout)
        # Keep the hidden size from the config
        self.hidden_size = config.hidden_size
    # Forward pass: summarize a sequence of hidden states into a single vector
    def call(self, inputs, cls_index=None, training=False):
        # Accept a raw tensor, a tuple/list, or a dict as input
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
        elif isinstance(inputs, (tuple, list)):
            # For a tuple/list, the first element is the hidden states, the second (if any) is cls_index
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            # For a dict, read hidden_states and cls_index (defaulting to None)
            hidden_states = inputs.get("hidden_states")
            cls_index = inputs.get("cls_index", None)
        # Summarize the hidden states according to summary_type
        if self.summary_type == "last":
            output = hidden_states[:, -1]  # hidden state of the last token
        elif self.summary_type == "first":
            output = hidden_states[:, 0]  # hidden state of the first token
        elif self.summary_type == "mean":
            output = tf.reduce_mean(hidden_states, axis=1)  # mean over the sequence dimension (axis=1)
        elif self.summary_type == "cls_index":
            # Gather the hidden state at the position given by cls_index
            hidden_shape = shape_list(hidden_states)  # e.g. [batch, seq_len, hidden_size]
            if cls_index is None:
                # Default to the last position of each sequence when cls_index is None
                cls_index = tf.fill(
                    hidden_shape[:-2], hidden_shape[-2] - 1
                )  # shape [batch] or [batch, num choices], filled with the index of the last position
            cls_shape = shape_list(cls_index)
            if len(cls_shape) <= len(hidden_shape) - 2:
                cls_index = tf.expand_dims(cls_index, axis=-1)  # add a trailing dimension to cls_index
            # output shape: (batch, num choices, hidden_size)
            output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
            output = tf.squeeze(
                output, axis=len(hidden_shape) - 2
            )  # shape of output: (batch, num choices, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError  # "attn" summary is not implemented
        # Apply the first dropout, if configured
        if self.has_first_dropout:
            output = self.first_dropout(output, training=training)
        # Apply the summary projection, if configured
        if self.has_summary:
            output = self.summary(output)
        # Apply the activation, if configured
        if self.has_activation:
            output = self.activation(output)
        # Apply the last dropout, if configured
        if self.has_last_dropout:
            output = self.last_dropout(output, training=training)
        return output
    # Build the layer once the input shape is known
    def build(self, input_shape):
        if self.built:
            return  # already built
        self.built = True  # mark the layer as built
        if getattr(self, "summary", None) is not None:
            with tf.name_scope("summary"):
                self.summary.build(self.hidden_size)  # build the projection with hidden_size input features
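# Illustrative aside (not part of the original file): what the main `summary_type` options above
# compute on a dummy batch, assuming only that TensorFlow is installed:
#
#     import tensorflow as tf
#
#     hidden_states = tf.random.normal([2, 7, 16])      # [batch, seq_len, hidden_size]
#     last = hidden_states[:, -1]                        # "last"  -> [2, 16]
#     first = hidden_states[:, 0]                        # "first" -> [2, 16]
#     mean = tf.reduce_mean(hidden_states, axis=1)       # "mean"  -> [2, 16]
#     # "cls_index" instead gathers one position per example, e.g. positions 3 and 5:
#     cls = tf.gather(hidden_states, tf.constant([[3], [5]]), batch_dims=1)  # [2, 1, 16]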
# Create a truncated-normal initializer with the given range (standard deviation)
def get_initializer(initializer_range: float = 0.02) -> keras.initializers.TruncatedNormal:
"""
Creates a `keras.initializers.TruncatedNormal` with the given range.
Args:
initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range.
Returns:
`keras.initializers.TruncatedNormal`: The truncated normal initializer.
"""
    # Return a TruncatedNormal initializer whose standard deviation is initializer_range
return keras.initializers.TruncatedNormal(stddev=initializer_range)
.\modeling_utils.py
import collections
import copy
import functools
import gc
import importlib.metadata
import inspect
import itertools
import json
import os
import re
import shutil
import tempfile
import warnings
from contextlib import contextmanager
from functools import partial, wraps
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from zipfile import is_zipfile
import torch
from packaging import version
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, Identity
from torch.utils.checkpoint import checkpoint
from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .dynamic_module_utils import custom_object_save
from .generation import GenerationConfig, GenerationMixin
from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
from .pytorch_utils import (
Conv1D,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
id_tensor_storage,
is_torch_greater_or_equal_than_1_13,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
)
from .quantizers import AutoHfQuantizer, HfQuantizer
from .quantizers.quantizers_utils import get_module_from_name
from .safetensors_conversion import auto_conversion
from .utils import (
ADAPTER_SAFE_WEIGHTS_NAME,
ADAPTER_WEIGHTS_NAME,
CONFIG_NAME,
DUMMY_INPUTS,
FLAX_WEIGHTS_NAME,
SAFE_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_NAME,
TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
ContextManagers,
ModelOutput,
PushToHubMixin,
cached_file,
copy_func,
download_url,
extract_commit_hash,
has_file,
is_accelerate_available,
is_bitsandbytes_available,
is_flash_attn_2_available,
is_offline_mode,
is_optimum_available,
is_peft_available,
is_remote_url,
is_safetensors_available,
is_torch_sdpa_available,
is_torch_xla_available,
logging,
replace_return_docstrings,
strtobool,
)
from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
ENV_VARS_TRUE_VALUES,
is_sagemaker_mp_enabled,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
)
from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod
XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
if is_accelerate_available():
from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights
from accelerate.hooks import add_hook_to_module
from accelerate.utils import (
check_tied_parameters_on_same_device,
find_tied_parameters,
get_balanced_memory,
get_max_memory,
load_offloaded_weights,
offload_weight,
save_offload_index,
set_module_tensor_to_device,
)
if is_safetensors_available():
from safetensors import safe_open
from safetensors.torch import load_file as safe_load_file
from safetensors.torch import save_file as safe_save_file
logger = logging.get_logger(__name__)
_init_weights = True
def is_fsdp_enabled():
return (
torch.distributed.is_available()
and torch.distributed.is_initialized()
and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1
and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
)
def is_local_dist_rank_0():
return (
torch.distributed.is_available()
and torch.distributed.is_initialized()
and int(os.environ.get("LOCAL_RANK", -1)) == 0
)
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
from smdistributed.modelparallel import __version__ as SMP_VERSION
IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
else:
IS_SAGEMAKER_MP_POST_1_10 = False
if is_peft_available():
from .utils import find_adapter_config_file
TORCH_INIT_FUNCTIONS = {
"uniform_": nn.init.uniform_,
"normal_": nn.init.normal_,
"trunc_normal_": nn.init.trunc_normal_,
"constant_": nn.init.constant_,
"xavier_uniform_": nn.init.xavier_uniform_,
"xavier_normal_": nn.init.xavier_normal_,
"kaiming_uniform_": nn.init.kaiming_uniform_,
"kaiming_normal_": nn.init.kaiming_normal_,
"uniform": nn.init.uniform,
"normal": nn.init.normal,
"xavier_uniform": nn.init.xavier_uniform,
"xavier_normal": nn.init.xavier_normal,
"kaiming_uniform": nn.init.kaiming_uniform,
"kaiming_normal": nn.init.kaiming_normal,
}
@contextmanager
def no_init_weights(_enable=True):
"""
Context manager to globally disable weight initialization to speed up loading large models.
TODO(Patrick): Delete safety argument `_enable=True` at next major version. .
"""
global _init_weights
old_init_weights = _init_weights
if _enable:
_init_weights = False
def _skip_init(*args, **kwargs):
pass
for name, init_func in TORCH_INIT_FUNCTIONS.items():
setattr(torch.nn.init, name, _skip_init)
try:
yield
finally:
_init_weights = old_init_weights
if _enable:
for name, init_func in TORCH_INIT_FUNCTIONS.items():
setattr(torch.nn.init, name, init_func)
def get_parameter_device(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
except StopIteration:
def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
return tuples
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
first_tuple = next(gen)
return first_tuple[1].device
def get_first_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
"""
Returns the first parameter dtype (can be non-floating) or asserts if none were found.
"""
try:
return next(parameter.parameters()).dtype
except StopIteration:
def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
return tuples
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
first_tuple = next(gen)
return first_tuple[1].dtype
def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
"""
Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
"""
last_dtype = None
for t in parameter.parameters():
last_dtype = t.dtype
if t.is_floating_point():
if XLA_USE_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available():
return torch.bfloat16
if XLA_DOWNCAST_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available():
if t.dtype == torch.float:
return torch.bfloat16
if t.dtype == torch.double:
return torch.float32
return t.dtype
if last_dtype is not None:
return last_dtype
def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
return tuples
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
last_tuple = None
for tuple in gen:
last_tuple = tuple
if tuple[1].is_floating_point():
return tuple[1].dtype
if last_tuple is not None:
return last_tuple[1].dtype
for t in parameter.buffers():
last_dtype = t.dtype
if t.is_floating_point():
return t.dtype
return last_dtype
def get_state_dict_float_dtype(state_dict):
for t in state_dict.values():
if t.is_floating_point():
return t.dtype
raise ValueError("couldn't find any floating point dtypes in state_dict")
def get_state_dict_dtype(state_dict):
for t in state_dict.values():
if t.is_floating_point():
return t.dtype
else:
        return next(iter(state_dict.values())).dtype
def dtype_byte_size(dtype):
if dtype == torch.bool:
return 1 / 8
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
if bit_search is None:
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
bit_size = int(bit_search.groups()[0])
return bit_size // 8
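# For example (illustrative, the values follow directly from the regex above):
#   dtype_byte_size(torch.float32) -> 4
#   dtype_byte_size(torch.float16) -> 2
#   dtype_byte_size(torch.int8)    -> 1
#   dtype_byte_size(torch.bool)    -> 0.125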
def shard_checkpoint(
state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
):
max_shard_size = convert_file_size_to_int(max_shard_size)
sharded_state_dicts = [{}]
last_block_size = 0
total_size = 0
storage_id_to_block = {}
for key, weight in state_dict.items():
if isinstance(weight, str):
continue
else:
storage_id = id_tensor_storage(weight)
if storage_id in storage_id_to_block:
block_id = storage_id_to_block[storage_id]
sharded_state_dicts[block_id][key] = weight
continue
weight_size = weight.numel() * dtype_byte_size(weight.dtype)
if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
sharded_state_dicts.append({})
last_block_size = 0
sharded_state_dicts[-1][key] = weight
        last_block_size += weight_size
        total_size += weight_size
        storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
if len(sharded_state_dicts) == 1:
return {weights_name: sharded_state_dicts[0]}, None
weight_map = {}
shards = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
shard_file = shard_file.replace(
".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
)
shards[shard_file] = shard
for key in shard.keys():
weight_map[key] = shard_file
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
return shards, index
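# Illustrative aside (not part of the original file): sharding a tiny state dict with an
# artificially small `max_shard_size` so that each 400-byte float32 tensor lands in its own shard.
# A minimal sketch, assuming only that torch is importable:
#
#     state_dict = {"a.weight": torch.zeros(10, 10), "b.weight": torch.zeros(10, 10)}
#     shards, index = shard_checkpoint(state_dict, max_shard_size=500)
#     # shards -> {"pytorch_model-00001-of-00002.bin": {...}, "pytorch_model-00002-of-00002.bin": {...}}
#     # index["weight_map"] maps each parameter name to the shard file that stores it.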
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
"""
This is the same as
[`torch.nn.Module.load_state_dict`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict)
but for a sharded checkpoint.
This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
loaded in the model.
Args:
model (`torch.nn.Module`): The model in which to load the checkpoint.
folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint.
        strict (`bool`, *optional*, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
        prefer_safe (`bool`, *optional*, defaults to `True`):
If both safetensors and PyTorch save files are present in checkpoint and `prefer_safe` is True, the
safetensors files will be loaded. Otherwise, PyTorch files are always loaded when possible.
Returns:
`NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields
- `missing_keys` is a list of str containing the missing keys
- `unexpected_keys` is a list of str containing the unexpected keys
"""
index_file = os.path.join(folder, WEIGHTS_INDEX_NAME)
safe_index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME)
index_present = os.path.isfile(index_file)
safe_index_present = os.path.isfile(safe_index_file)
if not index_present and not (safe_index_present and is_safetensors_available()):
filenames = (
(WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME) if is_safetensors_available() else (WEIGHTS_INDEX_NAME,)
)
raise ValueError(f"Can't find a checkpoint index ({' or '.join(filenames)}) in {folder}.")
load_safe = False
if safe_index_present:
if prefer_safe:
if is_safetensors_available():
load_safe = True
else:
logger.warning(
f"Cannot load sharded checkpoint at {folder} safely since safetensors is not installed!"
)
elif not index_present:
load_safe = True
load_index = safe_index_file if load_safe else index_file
with open(load_index, "r", encoding="utf-8") as f:
index = json.load(f)
shard_files = list(set(index["weight_map"].values()))
loaded_keys = index["weight_map"].keys()
model_keys = model.state_dict().keys()
missing_keys = [key for key in model_keys if key not in loaded_keys]
unexpected_keys = [key for key in loaded_keys if key not in model_keys]
if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0):
error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}"
if len(missing_keys) > 0:
str_missing_keys = ",".join([f'"{k}"' for k in missing_keys])
error_message += f"\nMissing key(s): {str_missing_keys}."
if len(unexpected_keys) > 0:
str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys])
            error_message += f"\nUnexpected key(s): {str_unexpected_keys}."
raise RuntimeError(error_message)
weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {}
loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu", **weights_only_kwarg)
for shard_file in shard_files:
state_dict = loader(os.path.join(folder, shard_file))
model.load_state_dict(state_dict, strict=False)
del state_dict
gc.collect()
return torch.nn.modules.module._IncompatibleKeys(missing_keys, unexpected_keys)
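# Illustrative aside (not part of the original file): typical usage with a folder produced by
# `save_pretrained`. The checkpoint path and model class below are placeholders, not values from this file:
#
#     model = MyModel(config)                                            # freshly initialized weights
#     result = load_sharded_checkpoint(model, "/path/to/checkpoint_dir", strict=False)
#     print(result.missing_keys, result.unexpected_keys)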
def load_state_dict(checkpoint_file: Union[str, os.PathLike], is_quantized: bool = False):
"""
Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
"""
if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
with safe_open(checkpoint_file, framework="pt") as f:
metadata = f.metadata()
if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
raise OSError(
f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
"you save your model with the `save_pretrained` method."
)
return safe_load_file(checkpoint_file)
try:
if (
(is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0)
or (is_fsdp_enabled() and not is_local_dist_rank_0())
) and not is_quantized:
map_location = "meta"
else:
map_location = "cpu"
extra_args = {}
if (
isinstance(checkpoint_file, str)
and map_location != "meta"
and version.parse(torch.__version__) >= version.parse("2.1.0")
and is_zipfile(checkpoint_file)
):
extra_args = {"mmap": True}
weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {}
return torch.load(
checkpoint_file,
map_location=map_location,
**weights_only_kwarg,
**extra_args,
)
except Exception as e:
try:
with open(checkpoint_file) as f:
if f.read(7) == "version":
raise OSError(
"You seem to have cloned a repository without having git-lfs installed. Please install "
"git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
"you cloned."
)
else:
raise ValueError(
f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
"model. Make sure you have saved the model properly."
) from e
except (UnicodeDecodeError, ValueError):
raise OSError(
f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
f"at '{checkpoint_file}'. "
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
)
def set_initialized_submodules(model, state_dict_keys):
"""
Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
dict.
"""
not_initialized_submodules = {}
for module_name, module in model.named_modules():
loaded_keys = {k.replace(f"{module_name}.", "") for k in state_dict_keys if k.startswith(f"{module_name}.")}
if loaded_keys.issuperset(module.state_dict()):
module._is_hf_initialized = True
else:
not_initialized_submodules[module_name] = module
return not_initialized_submodules
def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if "gamma" in key:
new_key = key.replace("gamma", "weight")
if "beta" in key:
new_key = key.replace("beta", "bias")
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
error_msgs = []
def load(module: nn.Module, state_dict, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
if len([key for key in state_dict if key.startswith(prefix)]) > 0:
if is_deepspeed_zero3_enabled():
import deepspeed
named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
if len(params_to_gather) > 0:
with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
if torch.distributed.get_rank() == 0:
module._load_from_state_dict(*args)
else:
module._load_from_state_dict(*args)
for name, child in module._modules.items():
if child is not None:
load(child, state_dict, prefix + name + ".")
load(model_to_load, state_dict, prefix=start_prefix)
del state_dict
return error_msgs
def find_submodule_and_param_name(model, long_key, start_prefix):
    """
    A helper util to find the last sub-module and the param/buffer name. If `start_prefix` is supplied, it is
    removed from the start of the key.
    """
    if len(start_prefix) > 0 and long_key.startswith(start_prefix):
long_key = ".".join(long_key.split(".")[1:])
split_key = long_key.split(".")
submodule = model
while len(split_key) > 1:
if hasattr(submodule, split_key[0]):
submodule = getattr(submodule, split_key[0])
del split_key[0]
else:
submodule = None
break
if submodule == model:
submodule = None
return submodule, split_key[0]
error_msgs = []
old_keys = []
new_keys = []
is_quantized = hf_quantizer is not None
for key in state_dict.keys():
new_key = None
if "gamma" in key:
new_key = key.replace("gamma", "weight")
if "beta" in key:
new_key = key.replace("beta", "bias")
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
return error_msgs, offload_index, state_dict_index
def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
if variant is not None:
splits = weights_name.split(".")
splits = splits[:-1] + [variant] + splits[-1:]
weights_name = ".".join(splits)
return weights_name
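# For example (illustrative): the variant is inserted right before the file extension:
#   _add_variant("pytorch_model.bin", "fp16")  -> "pytorch_model.fp16.bin"
#   _add_variant("model.safetensors", "fp16")  -> "model.fp16.safetensors"
#   _add_variant("model.safetensors", None)    -> "model.safetensors"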
class ModuleUtilsMixin:
"""
A few utilities for `torch.nn.Modules`, to be used as a mixin.
"""
@staticmethod
def _hook_rss_memory_pre_forward(module, *args, **kwargs):
try:
import psutil
except ImportError:
raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
process = psutil.Process(os.getpid())
mem = process.memory_info()
module.mem_rss_pre_forward = mem.rss
return None
@staticmethod
def _hook_rss_memory_post_forward(module, *args, **kwargs):
try:
import psutil
except ImportError:
raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
process = psutil.Process(os.getpid())
mem = process.memory_info()
module.mem_rss_post_forward = mem.rss
mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward
module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0)
return None
def add_memory_hooks(self):
"""
Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero
with `model.reset_memory_hooks_state()`.
"""
for module in self.modules():
module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
module.register_forward_hook(self._hook_rss_memory_post_forward)
self.reset_memory_hooks_state()
def reset_memory_hooks_state(self):
"""
Reset the `mem_rss_diff` attribute of each module (see [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
"""
for module in self.modules():
module.mem_rss_diff = 0
module.mem_rss_post_forward = 0
module.mem_rss_pre_forward = 0
@property
def device(self) -> torch.device:
"""
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
return get_parameter_device(self)
@property
def dtype(self) -> torch.dtype:
"""
`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
"""
return get_parameter_dtype(self)
def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor:
"""
Invert an attention mask (e.g., switches 0. and 1.).
Args:
encoder_attention_mask (`torch.Tensor`): An attention mask.
Returns:
`torch.Tensor`: The inverted attention mask.
"""
if encoder_attention_mask.dim() == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype)
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min
return encoder_extended_attention_mask
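    # Illustrative aside (not part of the original file): with a float32 module, a 2-D padding mask such as
    #     mask = torch.tensor([[1, 1, 0]])        # 1 = attend, 0 = ignore
    #     model.invert_attention_mask(mask)
    # returns a tensor of shape [1, 1, 1, 3] whose kept positions are 0.0 and whose masked position is
    # torch.finfo(torch.float32).min, i.e. an additive bias for the attention scores.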
@staticmethod
def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None):
if device is not None:
warnings.warn(
"The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
)
else:
device = attention_mask.device
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
causal_mask = causal_mask.to(attention_mask.dtype)
if causal_mask.shape[1] < attention_mask.shape[1]:
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
causal_mask = torch.cat(
[
torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
causal_mask,
],
axis=-1,
)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
return extended_attention_mask
def get_extended_attention_mask(
self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
    ) -> Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
Returns:
`torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
if dtype is None:
dtype = self.dtype
if not (attention_mask.dim() == 2 and self.config.is_decoder):
if device is not None:
warnings.warn(
"The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
)
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
if self.config.is_decoder:
extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
input_shape, attention_mask, device
)
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
)
extended_attention_mask = extended_attention_mask.to(dtype=dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
return extended_attention_mask
def prepare_head_mask(self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False) -> Tensor:
"""
Prepare the head mask if needed.
Args:
head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (`int`):
The number of hidden layers in the model.
is_attention_chunked (`bool`, *optional*, defaults to `False`):
Whether or not the attention scores are computed by chunks or not.
Returns:
`torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
`[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
if is_attention_chunked is True:
head_mask = head_mask.unsqueeze(-1)
else:
head_mask = [None] * num_hidden_layers
return head_mask
def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
"""
Convert `head_mask` to a 5-dimensional tensor `[num_hidden_layers x batch x num_heads x seq_length x seq_length]`.
Args:
head_mask (`torch.Tensor`):
The input head_mask tensor with shape `[num_heads]` or `[num_hidden_layers x num_heads]`.
num_hidden_layers (`int`):
The number of hidden layers in the model.
Returns:
`torch.Tensor`:
The converted head_mask tensor with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]`.
"""
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}"
head_mask = head_mask.to(dtype=self.dtype)
return head_mask
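    # Illustrative aside (not part of the original file): the resulting shapes are
    #   [num_heads]                     -> [num_hidden_layers, 1, num_heads, 1, 1]   (same mask for every layer)
    #   [num_hidden_layers, num_heads]  -> [num_hidden_layers, 1, num_heads, 1, 1]   (one mask per layer)
    # so each layer's slice broadcasts against attention scores of shape [batch, num_heads, seq_len, seq_len].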
def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
"""
Get number of (optionally, trainable or non-embeddings) parameters in the module.
Args:
only_trainable (`bool`, *optional*, defaults to `False`):
Whether or not to return only the number of trainable parameters
exclude_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to return only the number of non-embeddings parameters
Returns:
`int`: The number of parameters.
"""
if exclude_embeddings:
embedding_param_names = [
f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding)
]
total_parameters = [
parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
]
else:
total_parameters = list(self.parameters())
total_numel = []
is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False)
if is_loaded_in_4bit:
if is_bitsandbytes_available():
import bitsandbytes as bnb
else:
raise ValueError(
"bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong"
" make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. "
)
for param in total_parameters:
if param.requires_grad or not only_trainable:
if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit):
total_numel.append(
param.numel() * 2 * self.hf_quantizer.quantization_config.bnb_4bit_quant_storage.itemsize
)
else:
total_numel.append(param.numel())
return sum(total_numel)
def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]) -> int:
"""
Helper function to estimate the total number of tokens from the model inputs.
Args:
inputs (`dict`): The model inputs.
Returns:
`int`: The total number of tokens.
"""
if not hasattr(self, "warnings_issued"):
self.warnings_issued = {}
if self.main_input_name in input_dict:
return input_dict[self.main_input_name].numel()
elif "estimate_tokens" not in self.warnings_issued:
logger.warning(
"Could not estimate the number of tokens of the input, floating-point operations will not be computed"
)
self.warnings_issued["estimate_tokens"] = True
return 0
def floating_point_ops(
self, input_dict: Dict[str, Union[torch.Tensor, Any]], exclude_embeddings: bool = True
) -> int:
"""
Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
tokens (valid if `12 * d_model << sequence_length`) as laid out in [this
paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
Args:
batch_size (`int`):
The batch size for the forward pass.
sequence_length (`int`):
The number of tokens in each line of the batch.
exclude_embeddings (`bool`, *optional*, defaults to `True`):
Whether or not to count embedding and softmax operations.
Returns:
`int`: The number of floating-point operations.
"""
return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
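    # Illustrative aside (not part of the original file): with the default 6 * tokens * parameters
    # estimate, a batch of 8 sequences of 512 tokens through a 110M-parameter model costs roughly
    #   6 * (8 * 512) * 110_000_000 ≈ 2.7e12 floating-point operations (forward + backward).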
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin, PeftAdapterMixin):
r"""
Base class for all models.
[`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
downloading and saving models as well as a few methods common to all models to:
- resize the input embeddings,
- prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
for this model architecture.
- **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
taking as arguments:
- **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
- **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
- **path** (`str`) -- A path to the TensorFlow checkpoint.
- **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
classes of the same architecture adding modules on top of the base model.
- **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
- **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
models, `pixel_values` for vision models and `input_values` for speech models).
"""
config_class = None
base_model_prefix = ""
main_input_name = "input_ids"
model_tags = None
_auto_class = None
_no_split_modules = None
_skip_keys_device_placement = None
_keep_in_fp32_modules = None
_keys_to_ignore_on_load_missing = None
_keys_to_ignore_on_load_unexpected = None
_keys_to_ignore_on_save = None
_tied_weights_keys = None
is_parallelizable = False
supports_gradient_checkpointing = False
_supports_flash_attn_2 = False
_supports_sdpa = False
_supports_cache_class = False
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
        `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
"""
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
@property
def framework(self) -> str:
"""
        :str: Identifies that this is a PyTorch model.
"""
return "pt"
def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
super().__init__()
if not isinstance(config, PretrainedConfig):
raise ValueError(
f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class "
"`PretrainedConfig`. To create a model from a pretrained model use "
f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
config = self._autoset_attn_implementation(
config, torch_dtype=torch.get_default_dtype(), check_device_map=False
)
self.config = config
self.name_or_path = config.name_or_path
self.warnings_issued = {}
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
self._keep_in_fp32_modules = copy.copy(self.__class__._keep_in_fp32_modules)
def post_init(self):
"""
        A method executed at the end of each Transformer model initialization, to run code that needs the model's
        modules to be properly initialized (such as weight initialization).
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
def _backward_compatibility_gradient_checkpointing(self):
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
delattr(self.config, "gradient_checkpointing")
def add_model_tags(self, tags: Union[List[str], str]) -> None:
r"""
Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
not overwrite existing tags in the model.
Args:
tags (`Union[List[str], str]`):
The desired tags to inject in the model
Examples:
```
from transformers import AutoModel
model = AutoModel.from_pretrained("google-bert/bert-base-cased")
model.add_model_tags(["custom", "custom-bert"])
# Push the model to your namespace with the name "my-custom-bert".
model.push_to_hub("my-custom-bert")
```
"""
if isinstance(tags, str):
tags = [tags]
if self.model_tags is None:
self.model_tags = []
for tag in tags:
if tag not in self.model_tags:
self.model_tags.append(tag)
@classmethod
def _from_config(cls, config, **kwargs):
"""
All context managers that the model should be initialized under go here.
Args:
torch_dtype (`torch.dtype`, *optional*):
Override the default `torch.dtype` and load the model under this dtype.
"""
torch_dtype = kwargs.pop("torch_dtype", None)
use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
dtype_orig = None
if torch_dtype is not None:
dtype_orig = cls._set_default_torch_dtype(torch_dtype)
config = copy.deepcopy(config)
config._attn_implementation = kwargs.pop("attn_implementation", None)
config = cls._autoset_attn_implementation(
config,
use_flash_attention_2=use_flash_attention_2,
check_device_map=False,
torch_dtype=torch_dtype,
)
if is_deepspeed_zero3_enabled():
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()):
model = cls(config, **kwargs)
else:
model = cls(config, **kwargs)
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
return model
@classmethod
def _autoset_attn_implementation(
cls,
config,
use_flash_attention_2: bool = False,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, Dict[str, int]]] = None,
check_device_map: bool = True,
):
"""
Automatically sets the attention implementation in the provided config.
Args:
config: The model configuration to modify.
use_flash_attention_2: Whether to use the Flash Attention 2 implementation.
torch_dtype: Optional, override the default torch.dtype for initialization.
device_map: Optional device mapping.
check_device_map: Whether to check device map validity.
Returns:
The modified config with the attention implementation set.
"""
if use_flash_attention_2:
config.attention_type = "flash_attention_2"
elif config._attn_implementation is not None:
config.attention_type = config._attn_implementation
if device_map is not None and check_device_map:
cls._validate_device_map(device_map)
return config
    @classmethod
    def _set_default_torch_dtype(cls, dtype: torch.dtype) -> torch.dtype:
"""
Change the default dtype and return the previous one. This is needed when wanting to instantiate the model
under specific dtype.
Args:
dtype (`torch.dtype`):
a floating dtype to set to.
Returns:
`torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was
modified. If it wasn't, returns `None`.
Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
`torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
"""
if not dtype.is_floating_point:
raise ValueError(
f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype"
)
logger.info(f"Instantiating {cls.__name__} model under default dtype {dtype}.")
dtype_orig = torch.get_default_dtype()
torch.set_default_dtype(dtype)
return dtype_orig
@property
def base_model(self) -> nn.Module:
"""
`torch.nn.Module`: The main body of the model.
"""
return getattr(self, self.base_model_prefix, self)
@classmethod
def can_generate(cls) -> bool:
"""
Returns whether this model can generate sequences with `.generate()`.
Returns:
`bool`: Whether this model can generate sequences with `.generate()`.
"""
if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
return False
return True
@classmethod
def _check_and_enable_flash_attn_2(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, Dict[str, int]]] = None,
check_device_map: bool = True,
hard_check_only: bool = False,
):
"""
Check and potentially enable the Flash Attention 2 features based on the provided configuration.
Args:
config: The configuration object for the model.
torch_dtype (Optional[torch.dtype]): The desired dtype to set as default.
device_map (Optional[Union[str, Dict[str, int]]]): Device mapping information.
check_device_map (bool): Whether to check device map.
hard_check_only (bool): Whether to perform a hard check only.
This function checks if certain conditions are met in the provided configuration to enable Flash Attention 2.
"""
pass
    @classmethod
    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
if hard_check_only:
if not cls._supports_sdpa:
raise ValueError(
f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet."
" Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe"
' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`'
)
if not is_torch_sdpa_available():
raise ImportError(
"PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1."
)
if not is_torch_sdpa_available() or not cls._supports_sdpa:
return config
_is_bettertransformer = getattr(cls, "use_bettertransformer", False)
if _is_bettertransformer:
return config
if not hard_check_only:
config._attn_implementation = "sdpa"
return config
def enable_input_require_grads(self):
def make_inputs_require_grads(module, input, output):
output.requires_grad_(True)
self._require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
def disable_input_require_grads(self):
self._require_grads_hook.remove()
def get_input_embeddings(self) -> nn.Module:
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
return base_model.get_input_embeddings()
else:
raise NotImplementedError
def set_input_embeddings(self, value: nn.Module):
"""
Set model's input embeddings.
Args:
value (`nn.Module`): A module mapping vocabulary to hidden states.
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
base_model.set_input_embeddings(value)
else:
raise NotImplementedError
def get_output_embeddings(self) -> nn.Module:
"""
Returns the model's output embeddings.
Returns:
`nn.Module`: A torch module mapping hidden states to vocabulary.
"""
return None
def _init_weights(self, module):
"""
Initialize the weights. This method should be overridden by derived class and is
the only initialization method that will be called when loading a checkpoint
using `from_pretrained`. Any attempt to initialize outside of this function
will be useless as the torch.nn.init function are all replaced with skip.
"""
def _initialize_weights(self, module):
"""
Initialize the weights if they are not already initialized.
"""
if getattr(module, "_is_hf_initialized", False):
return
self._init_weights(module)
module._is_hf_initialized = True
def tie_weights(self):
"""
Tie the weights between the input embeddings and the output embeddings.
If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
weights instead.
"""
if getattr(self.config, "tie_word_embeddings", True):
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
if getattr(self.config, "is_encoder_decoder", False) and getattr(self.config, "tie_encoder_decoder", False):
if hasattr(self, self.base_model_prefix):
self = getattr(self, self.base_model_prefix)
self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
for module in self.modules():
if hasattr(module, "_tie_weights"):
module._tie_weights()
def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
"""根据是否使用 TorchScript 来共享或克隆模块的权重"""
if self.config.torchscript:
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
output_embeddings.weight = input_embeddings.weight
if getattr(output_embeddings, "bias", None) is not None:
output_embeddings.bias.data = nn.functional.pad(
output_embeddings.bias.data,
(
0,
output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
),
"constant",
0,
)
if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
output_embeddings.out_features = input_embeddings.num_embeddings
def _get_no_split_modules(self, device_map: str):
"""
        Get the modules of the model that should not be split when using `device_map`. We iterate through the modules
        to get the underlying `_no_split_modules`.
        Args:
            device_map (`str`):
                The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
        Returns:
            `List[str]`: List of modules that should not be split
_no_split_modules = set()
modules_to_check = [self]
while len(modules_to_check) > 0:
module = modules_to_check.pop(-1)
if module.__class__.__name__ not in _no_split_modules:
if isinstance(module, PreTrainedModel):
if module._no_split_modules is None:
raise ValueError(
f"{module.__class__.__name__} 不支持 `device_map='{device_map}'`。要实现支持,模型类需要实现 `_no_split_modules` 属性。"
)
else:
_no_split_modules = _no_split_modules | set(module._no_split_modules)
modules_to_check += list(module.children())
return list(_no_split_modules)
def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
) -> nn.Embedding:
"""
        Resizes the input token embedding matrix of the model if `new_num_tokens != config.vocab_size`.
        Takes care of tying the weight embeddings afterwards if the model class has a `tie_weights()` method.
        Arguments:
            new_num_tokens (`int`, *optional*):
                The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`,
                just returns a pointer to the input token `torch.nn.Embedding` module of the model without doing
                anything.
            pad_to_multiple_of (`int`, *optional*):
                If set, will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set
                to `None`, will just pad the embedding to a multiple of `pad_to_multiple_of`.
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs, which benefit from dimensions that are multiples of 128. For more
                details, or help choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
        Returns:
            `torch.nn.Embedding`: Pointer to the input token embeddings module of the model.
model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
if new_num_tokens is None and pad_to_multiple_of is None:
return model_embeds
self.config.vocab_size = model_embeds.weight.shape[0]
self.vocab_size = model_embeds.weight.shape[0]
self.tie_weights()
return model_embeds
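    # Illustrative aside (not part of the original file): typical usage after adding tokens to a
    # tokenizer; the checkpoint name below is only an example:
    #
    #     tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #     model = AutoModelForCausalLM.from_pretrained("gpt2")
    #     tokenizer.add_tokens(["<my_new_token>"])
    #     model.resize_token_embeddings(len(tokenizer))
    #     # or pad the vocabulary up to a multiple of 64 for better tensor-core utilization:
    #     model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)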
def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
if hasattr(old_embeddings, "_hf_hook"):
hook = old_embeddings._hf_hook
add_hook_to_module(new_embeddings, hook)
old_embeddings_requires_grad = old_embeddings.weight.requires_grad
new_embeddings.requires_grad_(old_embeddings_requires_grad)
self.set_input_embeddings(new_embeddings)
is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None
if pad_to_multiple_of is not None:
if is_deepspeed_zero3_enabled() and not is_quantized:
import deepspeed
with deepspeed.zero.GatheredParameters(new_embeddings.weight, modifier_rank=None):
new_num_tokens = new_embeddings.weight.shape[0]
else:
new_num_tokens = new_embeddings.weight.shape[0]
if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
old_lm_head = self.get_output_embeddings()
new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens)
if hasattr(old_lm_head, "_hf_hook"):
hook = old_lm_head._hf_hook
add_hook_to_module(new_lm_head, hook)
old_lm_head_requires_grad = old_lm_head.weight.requires_grad
new_lm_head.requires_grad_(old_lm_head_requires_grad)
self.set_output_embeddings(new_lm_head)
return self.get_input_embeddings()
def _get_resized_embeddings(
self,
old_embeddings: nn.Embedding,
new_num_tokens: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
):
...
def _get_resized_lm_head(
self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False
):
...
def _copy_lm_head_original_to_resized(
self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias
):
if not transposed:
new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :]
else:
new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy]
if has_new_lm_head_bias:
new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
def resize_position_embeddings(self, new_num_position_embeddings: int):
raise NotImplementedError(
f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`"
)
def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
raise NotImplementedError(
f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`"
)
def init_weights(self):
"""
If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
initialization logic in `_init_weights`.
"""
if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads)
if _init_weights:
self.apply(self._initialize_weights)
self.tie_weights()
def prune_heads(self, heads_to_prune: Dict[int, List[int]]):
"""
Prunes heads of the base model.
Arguments:
heads_to_prune (`Dict[int, List[int]]`):
Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
layer 1 and heads 2 and 3 on layer 2.
"""
for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads)
self.base_model._prune_heads(heads_to_prune)
def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
"""
Activates gradient checkpointing for the current model.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
Args:
gradient_checkpointing_kwargs (dict, *optional*):
Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
"""
if not self.supports_gradient_checkpointing:
raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
if gradient_checkpointing_kwargs is None:
gradient_checkpointing_kwargs = {"use_reentrant": True}
gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
_is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
if not _is_using_old_format:
self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
else:
self.apply(partial(self._set_gradient_checkpointing, value=True))
logger.warn(
"You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)."
"Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model."
)
if getattr(self, "_hf_peft_config_loaded", False):
self.enable_input_require_grads()
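    # Illustrative aside (not part of the original file): typical usage on a model whose modules
    # support checkpointing:
    #
    #     model.gradient_checkpointing_enable()                                                    # reentrant variant
    #     model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    #     model.gradient_checkpointing_disable()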
def _set_gradient_checkpointing(self, enable: bool = True, gradient_checkpointing_func: Callable = checkpoint):
is_gradient_checkpointing_set = False
if hasattr(self, "gradient_checkpointing"):
self._gradient_checkpointing_func = gradient_checkpointing_func
self.gradient_checkpointing = enable
is_gradient_checkpointing_set = True
for module in self.modules():
if hasattr(module, "gradient_checkpointing"):
module._gradient_checkpointing_func = gradient_checkpointing_func
module.gradient_checkpointing = enable
is_gradient_checkpointing_set = True
if not is_gradient_checkpointing_set:
raise ValueError(
f"{self.__class__.__name__} is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute"
" `gradient_checkpointing` to modules of the model that uses checkpointing."
)
def gradient_checkpointing_disable(self):
"""
Deactivates gradient checkpointing for the current model.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
"""
if self.supports_gradient_checkpointing:
_is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
if not _is_using_old_format:
self._set_gradient_checkpointing(enable=False)
else:
logger.warn(
"You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)."
"Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model."
)
self.apply(partial(self._set_gradient_checkpointing, value=False))
if getattr(self, "_hf_peft_config_loaded", False):
self.disable_input_require_grads()
@property
def is_gradient_checkpointing(self) -> bool:
"""
Whether gradient checkpointing is activated for this model or not.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
"""
return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
is_main_process: bool = True,
state_dict: Optional[dict] = None,
save_function: Callable = torch.save,
push_to_hub: bool = False,
max_shard_size: Union[int, str] = "5GB",
safe_serialization: bool = True,
variant: Optional[str] = None,
token: Optional[Union[str, bool]] = None,
save_peft_format: bool = True,
**kwargs,
):
"""
Save the model to the specified directory.
Arguments:
save_directory (`Union[str, os.PathLike]`):
Directory where the model should be saved.
is_main_process (`bool`, *optional*, defaults to `True`):
Flag indicating if the current process is the main one.
state_dict (`Optional[dict]`, *optional*):
Optional dictionary containing the state of the model.
save_function (`Callable`, *optional*):
Function used for saving the model (default is `torch.save`).
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether to push the saved model to a model hub (if supported).
max_shard_size (`Union[int, str]`, *optional*, defaults to `"5GB"`):
Maximum size of each shard when saving large models.
safe_serialization (`bool`, *optional*, defaults to `True`):
Whether to ensure safe serialization of the model.
variant (`Optional[str]`, *optional*):
Variant of the model being saved (if applicable).
token (`Optional[Union[str, bool]]`, *optional*):
Token used for authentication or authorization.
save_peft_format (`bool`, *optional*, defaults to `True`):
Whether to save the model in PEFT format.
**kwargs:
Additional keyword arguments for customizing the saving process.
"""
@wraps(PushToHubMixin.push_to_hub)
def push_to_hub(self, *args, **kwargs):
"""
Push the model to a model hub with specified tags.
Arguments:
*args:
Positional arguments for the push operation.
**kwargs:
Keyword arguments for customizing the push operation.
Returns:
Result of the super class's `push_to_hub` method.
"""
tags = self.model_tags if self.model_tags is not None else []
tags_kwargs = kwargs.get("tags", [])
if isinstance(tags_kwargs, str):
tags_kwargs = [tags_kwargs]
for tag in tags_kwargs:
if tag not in tags:
tags.append(tag)
if tags:
kwargs["tags"] = tags
return super().push_to_hub(*args, **kwargs)
def get_memory_footprint(self, return_buffers=True):
"""
Get the memory footprint of the model.
Arguments:
return_buffers (`bool`, *optional*, defaults to `True`):
Whether to include buffer tensors in the memory footprint calculation.
Returns:
Memory footprint of the model in bytes.
"""
mem = sum([param.nelement() * param.element_size() for param in self.parameters()])
if return_buffers:
mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()])
mem = mem + mem_bufs
return mem
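For example, to report the footprint in gibibytes:

footprint = model.get_memory_footprint()            # parameters + buffers, in bytes
print(f"{footprint / 1024**3:.2f} GiB")
params_only = model.get_memory_footprint(return_buffers=False)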
@wraps(torch.nn.Module.cuda)
def cuda(self, *args, **kwargs):
"""
Move the model to CUDA device, if not quantized.
Arguments:
*args:
Positional arguments for the CUDA operation.
**kwargs:
Keyword arguments for customizing the CUDA operation.
Returns:
Result of the super class's `cuda` method.
Raises:
ValueError: If the model is 4-bit or 8-bit quantized.
"""
if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
raise ValueError(
"Calling `cuda()` is not supported for `4-bit` or `8-bit` quantized models. "
"Please use the model as it is, since the model has already been set to the "
"correct devices and casted to the correct `dtype`."
)
else:
return super().cuda(*args, **kwargs)
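A hedged illustration of the guard (requires `bitsandbytes` and a CUDA GPU; the checkpoint name is only an example):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantized = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", quantization_config=BitsAndBytesConfig(load_in_8bit=True)
)
try:
    quantized.cuda()
except ValueError as err:
    print(err)  # the model is already placed on the correct devices with the correct dtype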
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
*model_args,
config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
ignore_mismatched_sizes: bool = False,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
use_safetensors: bool = None,
**kwargs,
):
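A typical call, with keyword arguments mirroring the signature above (the checkpoint name is only an example):

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    revision="main",
    use_safetensors=True,
)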
def _load_pretrained_model(
cls,
model,
state_dict,
loaded_keys,
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=False,
sharded_metadata=None,
_fast_init=True,
low_cpu_mem_usage=False,
device_map=None,
offload_folder=None,
offload_state_dict=None,
dtype=None,
hf_quantizer=None,
keep_in_fp32_modules=None,
):
"""
Load a pretrained model using the provided state_dict and configuration.
Args:
model: The model to load the pretrained weights into.
state_dict: The pretrained weights as a state dictionary.
loaded_keys: Keys of the loaded state_dict.
resolved_archive_file: Path to the resolved archive file.
pretrained_model_name_or_path: Name or path of the pretrained model.
ignore_mismatched_sizes: If True, ignore mismatched tensor sizes.
sharded_metadata: Metadata related to sharding.
_fast_init: Whether to perform fast initialization.
low_cpu_mem_usage: If True, use low CPU memory mode.
device_map: Mapping of devices.
offload_folder: Folder for offloading.
offload_state_dict: State dictionary for offloading.
dtype: Data type of the model weights.
hf_quantizer: Quantizer for Hugging Face models.
keep_in_fp32_modules: Modules to keep in FP32 format.
Returns:
The list of error messages produced while loading the state dict (empty when loading succeeds).
"""
_move_model_to_meta(model, loaded_keys, "")
state_dict = load_state_dict(resolved_archive_file)
expected_keys = loaded_keys
error_msgs = _load_state_dict_into_meta_model(
model,
state_dict,
loaded_keys,
"",
expected_keys=expected_keys,
hf_quantizer=hf_quantizer,
)
return error_msgs
def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False):
"""
Retrieve modules from the model based on provided module names.
Args:
names: List of module names to retrieve.
add_prefix: Whether to add a prefix to retrieved module names.
remove_prefix: Whether to remove a prefix from retrieved module names.
Returns:
List: Retrieved modules based on the provided names.
"""
module_keys = {".".join(key.split(".")[:-1]) for key in names}
module_keys = module_keys.union(
{".".join(key.split(".")[:-2]) for key in names if len(key) > 0 and key[-1].isdigit()}
)
retrieved_modules = []
for name, module in self.named_modules():
if remove_prefix:
_prefix = f"{self.base_model_prefix}."
name = name[len(_prefix) :] if name.startswith(_prefix) else name
elif add_prefix:
name = ".".join([self.base_model_prefix, name]) if len(name) > 0 else self.base_model_prefix
if name in module_keys:
retrieved_modules.append(module)
return retrieved_modules
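The two set comprehensions turn parameter names into candidate module keys; a plain-Python illustration with made-up names:

names = ["encoder.layer.0.output.dense.weight", "decoder.block.0"]
module_keys = {".".join(key.split(".")[:-1]) for key in names}
# {'encoder.layer.0.output.dense', 'decoder.block'}
module_keys |= {".".join(key.split(".")[:-2]) for key in names if len(key) > 0 and key[-1].isdigit()}
# the second comprehension additionally yields 'decoder' for the name ending in a numeric index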
@staticmethod
def _load_pretrained_model_low_mem(
model, loaded_state_dict_keys, resolved_archive_file, start_prefix="", hf_quantizer=None
):
"""
This is an experimental function that loads the model using ~1.x model size CPU memory
Before you call it do:
1. save which state_dict keys are available
2. drop state_dict before model is created, since the latter takes 1x model size memory
Here then we continue:
3. switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict
4. load state_dict 2nd time
5. replace the params/buffers from the state_dict
Currently, it doesn't handle missing_keys, unexpected_keys, mismatched_keys. It can't handle deepspeed. To
handle bitsandbytes, needs non-empty hf_quantizer argument.
"""
_move_model_to_meta(model, loaded_state_dict_keys, start_prefix)
state_dict = load_state_dict(resolved_archive_file)
expected_keys = loaded_state_dict_keys
error_msgs = _load_state_dict_into_meta_model(
model,
state_dict,
loaded_state_dict_keys,
start_prefix,
expected_keys=expected_keys,
hf_quantizer=hf_quantizer,
)
return error_msgs
@classmethod
def register_for_auto_class(cls, auto_class="AutoModel"):
"""
Register this class with a given auto class. This should only be used for custom models as the ones in the
library are already mapped with an auto class.
<Tip warning={true}>
This API is experimental and may have some slight breaking changes in the next releases.
</Tip>
Args:
auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
The auto class to register this new model with.
"""
if not isinstance(auto_class, str):
auto_class = auto_class.__name__
import transformers.models.auto as auto_module
if not hasattr(auto_module, auto_class):
raise ValueError(f"{auto_class} is not a valid auto class.")
cls._auto_class = auto_class
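A hedged example with hypothetical custom classes (`MyCustomConfig` / `MyCustomModel` are made up); after saving and pushing the custom code, the model becomes loadable through the registered auto class with `trust_remote_code=True`:

from transformers import PretrainedConfig, PreTrainedModel

class MyCustomConfig(PretrainedConfig):          # hypothetical custom config
    model_type = "my-custom-model"

class MyCustomModel(PreTrainedModel):            # hypothetical custom model
    config_class = MyCustomConfig

MyCustomModel.register_for_auto_class("AutoModelForSequenceClassification")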
def to_bettertransformer(self) -> "PreTrainedModel":
"""
Converts the model to use [PyTorch's native attention
implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated to
Transformers through [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a
subset of all Transformers models are supported.
PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested
tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog
post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).
Returns:
[`PreTrainedModel`]: The model converted to BetterTransformer.
"""
if not is_optimum_available():
raise ImportError("The package `optimum` is required to use Better Transformer.")
from optimum.version import __version__ as optimum_version
if version.parse(optimum_version) < version.parse("1.7.0"):
raise ImportError(
f"Please install optimum>=1.7.0 to use Better Transformer. The version {optimum_version} was found."
)
from optimum.bettertransformer import BetterTransformer
return BetterTransformer.transform(self)
def reverse_bettertransformer(self):
"""
Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the original modeling is
used, for example in order to save the model.
Returns:
[`PreTrainedModel`]: The model converted back to the original modeling.
"""
if not is_optimum_available():
raise ImportError("The package `optimum` is required to use Better Transformer.")
from optimum.version import __version__ as optimum_version
if version.parse(optimum_version) < version.parse("1.7.0"):
raise ImportError(
f"Please install optimum>=1.7.0 to use Better Transformer. The version {optimum_version} was found."
)
from optimum.bettertransformer import BetterTransformer
return BetterTransformer.reverse(self)
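The two conversions are typically used as a round trip: convert for inference, revert before saving. A minimal sketch (requires `optimum>=1.7.0`; only a subset of architectures is supported, and the checkpoint name is only an example):

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to_bettertransformer()        # swap in PyTorch's fastpath attention kernels
# ... run inference ...
model = model.reverse_bettertransformer()   # restore the original modules
model.save_pretrained("./restored-checkpoint")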
def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask):
"""
Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
"""
if is_torch_fx_proxy(input_ids) or torch.jit.is_tracing() or is_torchdynamo_compiling():
return
if (attention_mask is not None) or (self.config.pad_token_id is None):
return
if self.config.pad_token_id in input_ids[:, [-1, 0]]:
warn_string = (
"We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See "
"https://huggingface.co/docs/transformers/troubleshooting"
"#incorrect-output-when-padding-tokens-arent-masked."
)
if (
(self.config.bos_token_id is not None and self.config.bos_token_id == self.config.pad_token_id)
or (self.config.eos_token_id is not None and self.config.eos_token_id == self.config.pad_token_id)
or (self.config.sep_token_id is not None and self.config.sep_token_id == self.config.pad_token_id)
):
warn_string += (
f"\nYou may ignore this warning if your `pad_token_id` ({self.config.pad_token_id}) is identical "
f"to the `bos_token_id` ({self.config.bos_token_id}), `eos_token_id` ({self.config.eos_token_id}), "
f"or the `sep_token_id` ({self.config.sep_token_id}), and your input is not padded."
)
logger.warning_once(warn_string)
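The heuristic only inspects the first and last token of every sequence in the batch, which keeps the check cheap. A standalone illustration:

import torch

pad_token_id = 0
input_ids = torch.tensor([[101, 7592, 102, 0, 0]])   # a right-padded sequence
print(pad_token_id in input_ids[:, [-1, 0]])          # True -> the warning above would be emitted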
@property
def _is_quantized_training_enabled(self):
warnings.warn(
"`_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead",
FutureWarning,
)
if not hasattr(self, "hf_quantizer"):
return False
return self.hf_quantizer.is_trainable
PreTrainedModel.push_to_hub = copy_func(PreTrainedModel.push_to_hub)
if PreTrainedModel.push_to_hub.__doc__ is not None:
PreTrainedModel.push_to_hub.__doc__ = PreTrainedModel.push_to_hub.__doc__.format(
object="model", object_class="AutoModel", object_files="model file"
)
class PoolerStartLogits(nn.Module):
"""
Compute SQuAD start logits from sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, 1)
def forward(
self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid positions, such as query tokens and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
Returns:
`torch.FloatTensor`: The start logits for SQuAD.
"""
x = self.dense(hidden_states).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
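The `p_mask` handling pushes masked positions to a very large negative value (a smaller constant in float16 to avoid overflow), so they vanish after a softmax. A small numeric illustration:

import torch

logits = torch.tensor([2.0, 0.5, -1.0])
p_mask = torch.tensor([0.0, 1.0, 0.0])               # mask out the second position
masked = logits * (1 - p_mask) - 1e30 * p_mask
print(torch.softmax(masked, dim=-1))                  # the masked position gets ~0 probability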
class PoolerEndLogits(nn.Module):
"""
Compute SQuAD end logits from sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
to use.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dense_1 = nn.Linear(config.hidden_size, 1)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid positions, such as query tokens and special symbols (PAD, SEP, CLS). 1.0 means the token should be masked.
<Tip>
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides `start_states`.
</Tip>
Returns:
`torch.FloatTensor`: The end logits for SQuAD.
"""
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None:
slen, hsz = hidden_states.shape[-2:]
start_positions = start_positions[:, None, None].expand(-1, -1, hsz)
start_states = hidden_states.gather(-2, start_positions)
start_states = start_states.expand(-1, slen, -1)
x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
x = self.activation(x)
x = self.LayerNorm(x)
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
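The `gather`/`expand` steps above look up the hidden state at each `start_position` and broadcast it along the sequence so it can be concatenated with every token's hidden state. A standalone shape check:

import torch

batch, slen, hsz = 2, 5, 4
hidden_states = torch.randn(batch, slen, hsz)
start_positions = torch.tensor([1, 3])
idx = start_positions[:, None, None].expand(-1, -1, hsz)   # (batch, 1, hsz)
start_states = hidden_states.gather(-2, idx)                # hidden state at the start position
start_states = start_states.expand(-1, slen, -1)            # broadcast along the sequence
print(start_states.shape)                                   # torch.Size([2, 5, 4])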
class PoolerAnswerClass(nn.Module):
"""
Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
<Tip>
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
`start_states`.
</Tip>
Returns:
`torch.FloatTensor`: The SQuAD 2.0 answer class.
"""
hsz = hidden_states.shape[-1]
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None:
start_positions = start_positions[:, None, None].expand(-1, -1, hsz)
start_states = hidden_states.gather(-2, start_positions).squeeze(-2)
if cls_index is not None:
cls_index = cls_index[:, None, None].expand(-1, -1, hsz)
cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)
else:
cls_token_state = hidden_states[:, -1, :]
x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
x = self.activation(x)
x = self.dense_1(x).squeeze(-1)
return x
@dataclass
class SquadHeadOutput(ModelOutput):
"""
Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
(beam-search).
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the `is_impossible` label of the answers.
"""
# Optional fields: the attributes below store the different model outputs; depending on whether `start_positions` or `end_positions` were provided, some of them may be None
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
class SQuADHead(nn.Module):
r"""
A SQuAD head inspired by XLNet.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
to use.
"""
def __init__(self, config):
super().__init__()
# Initialize the SQuAD head, reading the top-k values for start and end positions from the config
self.start_n_top = config.start_n_top
self.end_n_top = config.end_n_top
# Pooling layer producing the start-position logits
self.start_logits = PoolerStartLogits(config)
# Pooling layer producing the end-position logits
self.end_logits = PoolerEndLogits(config)
# Pooling layer producing the answer-class logits
self.answer_class = PoolerAnswerClass(config)
@replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
def forward(
self,
hidden_states: torch.FloatTensor,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
return_dict: bool = False,
):
"""
Perform forward pass of the SQuAD head module.
Args:
hidden_states (torch.FloatTensor): Sequence of hidden states.
start_positions (Optional[torch.LongTensor]): Tensor of start positions for the answer spans.
end_positions (Optional[torch.LongTensor]): Tensor of end positions for the answer spans.
cls_index (Optional[torch.LongTensor]): Index of the classification token if used.
is_impossible (Optional[torch.LongTensor]): Tensor indicating if the question is unanswerable.
p_mask (Optional[torch.FloatTensor]): Mask indicating which elements in the input sequence should not be attended to.
return_dict (bool): Whether to return a dictionary.
Returns:
SquadHeadOutput: Output of the SQuAD head module.
"""
# Forward-pass logic of the SQuAD head
# The concrete model logic that computes the outputs from the input arguments is elided here
pass
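As a rough, hedged sketch of what such a head computes during training: start and end logits are produced per token, and the loss is the averaged cross-entropy over the two (plus a binary loss on the answer class when `is_impossible` is given). A standalone illustration with dummy tensors, not the library implementation:

import torch
from torch import nn

batch, seq_len = 2, 8
start_logits = torch.randn(batch, seq_len)
end_logits = torch.randn(batch, seq_len)
start_positions = torch.tensor([1, 3])
end_positions = torch.tensor([2, 5])

loss_fct = nn.CrossEntropyLoss()
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss)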
class SequenceSummary(nn.Module):
r"""
Compute a single vector summary of a sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
- `"last"` -- Take the last token hidden state (like XLNet)
- `"first"` -- Take the first token hidden state (like Bert)
- `"mean"` -- Take the mean of all tokens hidden states
- `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- `"attn"` -- Not implemented now, use multi-head attention
- **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
(otherwise to `config.hidden_size`).
- **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
another string or `None` will add no activation.
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
"""
# Constructor takes a pretrained configuration object
def __init__(self, config: PretrainedConfig):
# Call the parent constructor
super().__init__()
# Read the summary type from the config, defaulting to "last" when unspecified
self.summary_type = getattr(config, "summary_type", "last")
# The "attn" summary type is not implemented; a standard multi-head attention module should be used instead
if self.summary_type == "attn":
raise NotImplementedError
# The summary layer defaults to an Identity, which is a no-op in the forward pass
self.summary = Identity()
# If the config asks for a projection after extracting the summary vector
if hasattr(config, "summary_use_proj") and config.summary_use_proj:
# Project to the number of labels when requested and when num_labels > 0
if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
num_classes = config.num_labels
# Otherwise project back to the hidden size
else:
num_classes = config.hidden_size
# Linear layer mapping the hidden state to num_classes dimensions
self.summary = nn.Linear(config.hidden_size, num_classes)
# Resolve the activation from the configured string, falling back to Identity
activation_string = getattr(config, "summary_activation", None)
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
# First dropout defaults to Identity; use nn.Dropout when a positive probability is configured
self.first_dropout = Identity()
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
self.first_dropout = nn.Dropout(config.summary_first_dropout)
# Last dropout defaults to Identity; use nn.Dropout when a positive probability is configured
self.last_dropout = Identity()
if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
self.last_dropout = nn.Dropout(config.summary_last_dropout)
def forward(
self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
) -> torch.FloatTensor:
"""
Compute a single vector summary of a sequence hidden states.
Args:
hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
The hidden states of the last layer.
cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
Returns:
`torch.FloatTensor`: The summary of the sequence hidden states.
"""
# Compute the summary according to the configured summary type
if self.summary_type == "last":
# Take the last hidden state of every sequence
output = hidden_states[:, -1]
elif self.summary_type == "first":
# Take the first hidden state of every sequence
output = hidden_states[:, 0]
elif self.summary_type == "mean":
# Average the hidden states over the whole sequence
output = hidden_states.mean(dim=1)
elif self.summary_type == "cls_index":
if cls_index is None:
# Without an explicit cls_index, default to the last token of each sequence as the classification token
cls_index = torch.full_like(
hidden_states[..., :1, :],
hidden_states.shape[-2] - 1,
dtype=torch.long,
)
else:
# Expand cls_index so it can index into hidden_states along the hidden dimension
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
# Gather the hidden states pointed to by cls_index
output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
elif self.summary_type == "attn":
# The attention-based summary is not implemented yet
raise NotImplementedError
# Apply the first dropout
output = self.first_dropout(output)
# Project the summary vector through the summary layer
output = self.summary(output)
# Apply the activation function
output = self.activation(output)
# Apply the last dropout
output = self.last_dropout(output)
return output
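A hedged usage sketch: since the constructor only reads attributes off the config, a bare `PretrainedConfig` carrying the relevant fields is enough to exercise the module (assuming `SequenceSummary` is importable from `transformers.modeling_utils` in the version being read here):

import torch
from transformers import PretrainedConfig
from transformers.modeling_utils import SequenceSummary

config = PretrainedConfig(hidden_size=16, summary_type="mean", summary_use_proj=False)
summary = SequenceSummary(config)
hidden_states = torch.randn(2, 10, 16)        # (batch, seq_len, hidden_size)
print(summary(hidden_states).shape)           # torch.Size([2, 16])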
# Recursively unwrap a model from potential containers (as used in distributed training).
def unwrap_model(model: nn.Module) -> nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).
Args:
model (`torch.nn.Module`): The model to unwrap.
"""
# If the model has a `module` attribute it is wrapped; recurse into it
if hasattr(model, "module"):
return unwrap_model(model.module)
else:
return model
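For example, with a `DataParallel` wrapper (which exposes the wrapped model as `.module`):

import torch
from torch import nn

inner = nn.Linear(4, 2)
wrapped = nn.DataParallel(inner)          # adds a `.module` attribute
print(unwrap_model(wrapped) is inner)     # True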
# Expand a device map into a mapping from parameter names to devices.
def expand_device_map(device_map, param_names, start_prefix):
"""
Expand a device map to return the correspondance parameter name to device.
"""
# Build the expanded, parameter-level device map
new_device_map = {}
# Keep only parameter names that start with the given prefix, and strip that prefix
param_names = [p[len(start_prefix) :] for p in param_names if p.startswith(start_prefix)]
# Walk the module-level device map and expand each entry to the matching parameters
for module, device in device_map.items():
new_device_map.update(
# A parameter matches when it equals the module name, starts with "<module>.", or the module name is empty (whole model)
{p: device for p in param_names if p == module or p.startswith(f"{module}.") or module == ""}
)
return new_device_map
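A plain-Python illustration with made-up names; note that an empty-string module key (meaning "the whole model") would map every remaining parameter to its device:

device_map = {"encoder": 0, "lm_head": "disk"}
param_names = ["model.encoder.layer.0.weight", "model.lm_head.weight", "model.final_norm.weight"]
print(expand_device_map(device_map, param_names, start_prefix="model."))
# {'encoder.layer.0.weight': 0, 'lm_head.weight': 'disk'}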
# Return the list of shard files that contain only weights offloaded to disk.
def get_disk_only_shard_files(device_map, sharded_metadata, start_prefix):
"""
Returns the list of shard files containing only weights offloaded to disk.
"""
# Extract the weight names matching the prefix (with the prefix stripped) together with their shard filenames
weight_map = {
p[len(start_prefix) :]: v for p, v in sharded_metadata["weight_map"].items() if p.startswith(start_prefix)
}
# For every shard file, collect the devices its weights are mapped to
files_content = collections.defaultdict(list)
# Walk up the module hierarchy until a key present in device_map is found, then record that device for the shard
for weight_name, filename in weight_map.items():
while len(weight_name) > 0 and weight_name not in device_map:
weight_name = ".".join(weight_name.split(".")[:-1])
files_content[filename].append(device_map[weight_name])
# Keep only the shard files whose weights all live on disk
return [fname for fname, devices in files_content.items() if set(devices) == {"disk"}]
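A small illustration with a made-up two-shard checkpoint: the second shard only holds the disk-offloaded `lm_head`, so it is the only file returned:

device_map = {"encoder": 0, "lm_head": "disk"}
sharded_metadata = {
    "weight_map": {
        "model.encoder.layer.0.weight": "model-00001-of-00002.safetensors",
        "model.lm_head.weight": "model-00002-of-00002.safetensors",
    }
}
print(get_disk_only_shard_files(device_map, sharded_metadata, start_prefix="model."))
# ['model-00002-of-00002.safetensors']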