Transformers Source Code Walkthrough (84)
.\models\openai\modeling_tf_openai.py
class TFAttention(keras.layers.Layer):
def __init__(self, nx, config, scale=False, **kwargs):
super().__init__(**kwargs)
n_state = nx
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = config.output_attentions
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
self.n_state = n_state
self.pruned_heads = set()
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns):
"""
1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]),
-1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return m
def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)
w = w / tf.math.sqrt(dk)
_, _, nd, ns = shape_list(w)
b = tf.cast(self.causal_attention_mask(nd, ns), dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=w.dtype)
w = w + attention_mask
w = stable_softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3))
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a] + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, self.n_state * 3])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.n_state])
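As a quick sanity check, here is a minimal standalone sketch (assuming TensorFlow is installed) of what `causal_attention_mask` produces and how `_attn` applies it additively before the softmax; the `causal_mask_demo` helper below simply repeats the static method's logic on toy values.

```python
import tensorflow as tf

def causal_mask_demo(nd, ns):
    # Same logic as TFAttention.causal_attention_mask: 1s in the lower triangle,
    # counted from the lower-right corner.
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    return i >= j - ns + nd

nd = ns = 4
b = tf.cast(causal_mask_demo(nd, ns), tf.float32)
print(b.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]

# _attn keeps the allowed scores and pushes masked ones to roughly -1e4 before the softmax
w = tf.random.normal((1, 1, nd, ns))            # toy attention scores
w = w * b - 1e4 * (1.0 - b)
print(tf.nn.softmax(w, axis=-1).numpy()[0, 0])  # masked (future) positions get ~0 probability
```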
class TFMLP(keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation("gelu")
self.dropout = keras.layers.Dropout(config.resid_pdrop)
self.nx = nx
self.n_state = n_state
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.n_state])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.nx])
class TFBlock(keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, config, scale, name="attn")
self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.nx = nx
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
output_attn = self.attn(x, attention_mask, head_mask, output_attentions, training=training)
a = output_attn[0]
n = self.ln_1(x + a)
m = self.mlp(n, training=training)
h = self.ln_2(n + m)
outputs = [h] + output_attn[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.nx])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.nx])
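A stripped-down sketch of the residual/LayerNorm ordering inside `TFBlock` (post-LN: normalization happens after each residual add). The identity `attn_fn`/`mlp_fn` lambdas are placeholders for the real sub-layers, so the values are meaningless; only the data flow mirrors `call` above.

```python
import tensorflow as tf
from tensorflow import keras

ln_1 = keras.layers.LayerNormalization(epsilon=1e-5)
ln_2 = keras.layers.LayerNormalization(epsilon=1e-5)
attn_fn = lambda t: t   # placeholder for the attention sub-layer
mlp_fn = lambda t: t    # placeholder for the feed-forward sub-layer

x = tf.random.normal((2, 5, 8))  # (batch, seq_len, n_embd)
a = attn_fn(x)
n = ln_1(x + a)                  # residual add, then LayerNorm
m = mlp_fn(n)
h = ln_2(n + m)                  # second residual add, then LayerNorm
print(h.shape)                   # (2, 5, 8)
```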
@keras_serializable
class TFOpenAIGPTMainLayer(keras.layers.Layer):
config_class = OpenAIGPTConfig
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer
self.n_embd = config.n_embd
self.n_positions = config.n_positions
self.initializer_range = config.initializer_range
self.tokens_embed = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
)
self.drop = keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
def build(self, input_shape=None):
with tf.name_scope("positions_embed"):
self.positions_embed = self.add_weight(
name="embeddings",
shape=[self.n_positions, self.n_embd],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "tokens_embed", None) is not None:
with tf.name_scope(self.tokens_embed.name):
self.tokens_embed.build(None)
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.tokens_embed
def set_input_embeddings(self, value):
self.tokens_embed.weight = value
self.tokens_embed.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
...  # (method body omitted from this excerpt)
OPENAI_GPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
"""
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = OpenAIGPTConfig
base_model_prefix = "transformer"
@dataclass
class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
mc_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
Defines a constant string providing an introductory documentation string for the OpenAI GPT model implementation.
This docstring outlines the inheritance structure, general usage, and compatibility with TensorFlow 2.0,
emphasizing the support for multiple input formats. It also offers a tip regarding the input format preference
in TensorFlow's `transformers` library, ensuring seamless integration with Keras methods like `model.fit()`.
"""
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`OpenAIGPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
定义了一个文档字符串,用于描述 OpenAI GPT 相关的输入参数说明。
"""
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
)
"""
使用装饰器添加了文档字符串,描述了一个裸的 OpenAI GPT 变压器模型,输出原始的隐藏状态,没有特定的输出头。
"""
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
"""
定义了 TFOpenAIGPTModel 类,继承自 TFOpenAIGPTPreTrainedModel。
"""
def __init__(self, config, *inputs, **kwargs):
"""
初始化方法,接受配置和其他参数,并调用父类初始化方法。
"""
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
"""
创建了 TFOpenAIGPTMainLayer 对象,作为 transformer 属性。
"""
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
"""
使用装饰器添加了文档字符串,描述了模型的前向传播函数,扩展了输入参数的文档说明。
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
"""
使用装饰器添加了代码示例的文档字符串,指定了用于文档化的检查点、输出类型和配置类。
"""
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
"""
模型的调用方法,接受多个输入参数,并返回模型输出。
"""
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
"""
构建方法,用于构建模型的层次结构。
"""
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
"""
在 transformer 属性上建立命名作用域,并调用其 build 方法。
"""
@add_start_docstrings(
"""
OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
OPENAI_GPT_START_DOCSTRING,
)
"""
使用装饰器添加了文档字符串,描述了带有语言建模头部的 OpenAI GPT 模型变压器。
"""
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss):
"""
定义了 TFOpenAIGPTLMHeadModel 类,继承自 TFOpenAIGPTPreTrainedModel 和 TFCausalLanguageModelingLoss。
"""
def __init__(self, config, *inputs, **kwargs):
"""
初始化方法,接受配置和其他参数,并调用父类初始化方法。
"""
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
"""
创建了 TFOpenAIGPTMainLayer 对象,作为 transformer 属性。
OpenAIGPT 模型不支持过去的缓存特性。
"""
self.supports_xla_generation = False
def get_output_embeddings(self):
"""
获取输出嵌入的方法,返回输入嵌入。
"""
return self.get_input_embeddings()
def set_output_embeddings(self, value):
"""
设置输出嵌入的方法,设置输入嵌入的值。
"""
self.set_input_embeddings(value)
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
"""
使用装饰器添加了文档字符串,扩展了模型的前向传播函数的输入参数文档说明。
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
# The decorator above adds a code-sample docstring with the documentation checkpoint, output type, and config class.
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFCausalLMOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Forward pass: run the transformer, project the hidden states to vocabulary logits, and optionally compute the loss.
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# The first element of the transformer output is the final hidden states.
hidden_states = transformer_outputs[0]
# Project the hidden states to vocabulary logits via the tied token embedding (linear mode).
logits = self.transformer.tokens_embed(hidden_states, mode="linear")
# The loss defaults to None.
loss = None
# If labels are provided, compute the loss.
if labels is not None:
# Drop the last logit and the first label so that position t is scored against token t+1.
shifted_logits = logits[:, :-1]
labels = labels[:, 1:]
# Compute the loss from the shifted logits and labels.
loss = self.hf_compute_loss(labels, shifted_logits)
# If return_dict is False, return a plain tuple.
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFCausalLMOutput object.
return TFCausalLMOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
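A small numeric sketch of the label shifting above: position t's logits are scored against the token at position t+1, so the last logit and the first label are dropped. Plain Keras cross-entropy stands in for `hf_compute_loss` here; shapes and values are illustrative only.

```python
import tensorflow as tf

batch, seq_len, vocab = 1, 5, 10
logits = tf.random.normal((batch, seq_len, vocab))
labels = tf.constant([[3, 7, 1, 4, 9]])

shifted_logits = logits[:, :-1]  # predictions for positions 0..3
shifted_labels = labels[:, 1:]   # targets are the *next* tokens: 7, 1, 4, 9

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(
    shifted_labels, shifted_logits
)
print(float(loss))
```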
# Prepare the inputs expected by generate().
def prepare_inputs_for_generation(self, inputs, **kwargs):
return {"input_ids": inputs}
# Build the model's sublayers.
def build(self, input_shape=None):
# If the model has already been built, return immediately.
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
# Build the transformer inside its own name scope.
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
mc_token_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
"""
Perform the forward pass of the OpenAI GPT model with two heads.
Args:
input_ids: Optional[tf.TensorSpec((None, None, None), tf.int32, name="input_ids")],
The input token ids tensor of shape [batch_size, num_choices, sequence_length].
attention_mask: Optional[tf.TensorSpec((None, None, None), tf.int32, name="attention_mask")],
The attention mask tensor of shape [batch_size, sequence_length].
token_type_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="token_type_ids")],
The token type ids tensor of shape [batch_size, sequence_length].
position_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="position_ids")],
The position ids tensor of shape [batch_size, sequence_length].
head_mask: Optional[tf.TensorSpec((None, None), tf.float32, name="head_mask")],
The head mask tensor of shape [num_heads, sequence_length].
inputs_embeds: Optional[tf.TensorSpec((None, None, None), tf.float32, name="inputs_embeds")],
The input embeddings tensor of shape [batch_size, sequence_length, hidden_size].
mc_token_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="mc_token_ids")],
The multiple choice token ids tensor of shape [batch_size, num_choices].
output_attentions: Optional[bool],
Whether to return attentions weights.
output_hidden_states: Optional[bool],
Whether to return hidden states.
return_dict: Optional[bool],
Whether to return a dictionary instead of a tuple.
training: Optional[bool],
Whether in training mode or not.
Returns:
TFOpenAIGPTDoubleHeadsModelOutput or tuple,
The model output, either as a dataclass (when `return_dict=True`) or as a plain tuple.
"""
pass
@property
def input_signature(self):
"""
Return the input signature for the TensorFlow model.
"""
return {
"input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
def build(self, input_shape=None):
"""
Build the OpenAI GPT model.
Args:
input_shape: Optional, The shape of the input tensor.
"""
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
@add_start_docstrings(
"""
The OpenAI GPT Model transformer with a sequence classification head on top (linear layer).
[`TFOpenAIGPTForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
OPENAI_GPT_START_DOCSTRING,
)
# Sequence classification model: inherits from TFOpenAIGPTPreTrainedModel and TFSequenceClassificationLoss.
class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenceClassificationLoss):
# The initializer accepts the config plus extra *inputs and **kwargs.
def __init__(self, config, *inputs, **kwargs):
# Call the parent initializer.
super().__init__(config, *inputs, **kwargs)
# Number of classes, taken from the config.
self.num_labels = config.num_labels
# Dense layer that produces the classification scores.
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="score",
use_bias=False,
)
# The OpenAI GPT main layer that processes the inputs.
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
# Keep the config around as an attribute.
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass: accepts the various inputs and returns the model output.
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFSequenceClassifierOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Run the transformer and collect its outputs.
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Take the hidden states from the transformer output.
hidden_states = transformer_outputs[0]
# Feed the hidden states through the score layer to get per-position logits.
logits = self.score(hidden_states)
# in_logits will hold the logits at the last non-padding position of each row.
in_logits = None
# Without a pad_token_id, fall back to the last position (-1).
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
# When input_ids are available, locate the last non-padding token in each row.
if input_ids is not None:
# Index of the first pad token minus one.
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
# Rows without padding fall back to the final position instead of a negative index.
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
# Gather the logits at those positions, one vector per row.
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
sequence_lengths = -1
# Without input_ids, padding cannot be detected; warn the user.
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
# The loss defaults to None.
loss = None
# If labels are provided, compute the loss.
if labels is not None:
# Derive batch_size and sequence_length from input_ids or inputs_embeds.
if input_ids is not None:
batch_size, sequence_length = shape_list(input_ids)[:2]
else:
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
# Either a pad_token_id must be defined or the batch size must be 1.
assert (
self.config.pad_token_id is not None or batch_size == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
# If sequence_lengths is not a tensor, index the logits with the plain Python value.
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0:batch_size, sequence_lengths]
# Compute the classification loss.
loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
# Use in_logits as the pooled logits when available, otherwise fall back to the full logits.
pooled_logits = in_logits if in_logits is not None else logits
# If return_dict is False, return a plain tuple.
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFSequenceClassifierOutput object.
return TFSequenceClassifierOutput(
loss=loss,
logits=pooled_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
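A sketch of how the classification head above picks one logit vector per row: the index of the last non-padding token is derived from `pad_token_id`, and `tf.gather(..., batch_dims=1)` selects that position. The toy ids below assume `pad_token_id = 0`.

```python
import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],   # last real token at index 2
                         [8, 9, 1, 2, 3]])  # no padding -> use the last index
logits = tf.random.normal((2, 5, 3))        # (batch, seq_len, num_labels)

# index of the first pad token minus one; rows without padding yield -1
sequence_lengths = (
    tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), input_ids.dtype), axis=-1) - 1
)
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
print(sequence_lengths.numpy())   # [2 4]

pooled_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
print(pooled_logits.shape)        # (2, 3)
```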
# Build the model's sublayers.
def build(self, input_shape=None):
# If the model has already been built, return immediately.
if self.built:
return
# Mark the model as built.
self.built = True
# If the `score` layer exists, build it.
if getattr(self, "score", None) is not None:
# Build `score` inside its own name scope.
with tf.name_scope(self.score.name):
# Its input shape is [None, None, self.config.n_embd].
self.score.build([None, None, self.config.n_embd])
# If the `transformer` layer exists, build it.
if getattr(self, "transformer", None) is not None:
# Build `transformer` inside its own name scope.
with tf.name_scope(self.transformer.name):
# Its input shape is left unspecified (None).
self.transformer.build(None)
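To close out this file, a hedged end-to-end usage sketch: it assumes `transformers` and TensorFlow are installed and downloads the `openai-community/openai-gpt` checkpoint on first use.

```python
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-community/openai-gpt")
model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

inputs = tokenizer("the quick brown fox", return_tensors="tf")
outputs = model(**inputs)                        # TFCausalLMOutput with .logits
next_token_id = int(tf.argmax(outputs.logits[0, -1]))
print(tokenizer.decode([next_token_id]))         # greedy guess for the next token
```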
.\models\openai\tokenization_openai.py
import json
import os
import re
import unicodedata
from typing import Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
},
"merges_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openai-community/openai-gpt": 512,
}
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
"""
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
"""
Initialize the Tokenizer with specified parameters.
Args:
do_lower_case (bool, optional, default=True): Whether or not to convert tokens to lowercase.
never_split (Iterable, optional): Collection of tokens that should never be split during tokenization.
Defaults to an empty list.
tokenize_chinese_chars (bool, optional, default=True): Whether or not to tokenize Chinese characters.
strip_accents (bool, optional): Whether or not to remove accents. If None, determined by `lowercase`.
do_split_on_punc (bool, optional, default=True): Whether or not to split on punctuation marks.
"""
# Initialize with default values or provided arguments
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# Merge the `never_split` argument with self.never_split so the full list of protected tokens is used.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the text: drop invalid characters and normalize whitespace.
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
# If tokenize_chinese_chars is enabled, surround CJK characters with whitespace first.
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# Normalize the text to Unicode NFC so different encodings of the same character compare equal.
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Split on whitespace to get the initial tokens.
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Process each original token.
for token in orig_tokens:
# Tokens in never_split are passed through untouched.
if token not in never_split:
if self.do_lower_case:
# Lowercase the token.
token = token.lower()
# Unless accent stripping is explicitly disabled, strip accents.
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
# Strip accents when explicitly requested even without lowercasing.
token = self._run_strip_accents(token)
# Split the token on punctuation and collect the pieces.
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Re-join and re-split on whitespace to produce the final token list.
output_tokens = whitespace_tokenize(" ".join(split_tokens))
# Return the final tokens.
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text to NFD so accents become separate combining characters.
text = unicodedata.normalize("NFD", text)
output = []
# Walk over every character in the text.
for char in text:
# Look up the character's Unicode category.
cat = unicodedata.category(char)
# Skip combining marks (accents).
if cat == "Mn":
continue
# Keep everything else.
output.append(char)
# Join the remaining characters back into a string.
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# If punctuation splitting is disabled, or the text is protected by never_split, return it unchanged.
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# Work on the text as a list of characters.
chars = list(text)
i = 0
start_new_word = True
output = []
# Walk over the characters.
while i < len(chars):
char = chars[i]
# Each punctuation character becomes a token of its own.
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# Non-punctuation characters are appended to the current word.
if start_new_word:
# Start a new (empty) word.
output.append([])
start_new_word = False
# Append the character to the current word.
output[-1].append(char)
i += 1
# Join each character list back into a string and return the list of pieces.
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
# Walk over every character in the text.
for char in text:
cp = ord(char)
# CJK characters get a space on each side.
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
# Other characters are copied through unchanged.
output.append(char)
# Join the characters back into a single string.
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# Check whether the codepoint falls in one of the CJK Unified Ideographs Unicode blocks.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
# Walk over every character in the text.
for char in text:
cp = ord(char)
# Skip invalid characters and control characters.
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# Replace any whitespace character with a single space.
if _is_whitespace(char):
output.append(" ")
else:
# Keep all other characters.
output.append(char)
# Join the characters back into a single string.
return "".join(output)
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
# Initialize an empty set to store symbol pairs
pairs = set()
# Initialize the previous character as the first character in the word
prev_char = word[0]
# Iterate over each character in the word starting from the second character
for char in word[1:]:
# Add the pair of previous character and current character to the set
pairs.add((prev_char, char))
# Update the previous character to the current character for the next iteration
prev_char = char
# Return the set of symbol pairs
return pairs
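A quick illustration of `get_pairs`, using the function defined just above, on a word tuple shaped the way `bpe` builds it (the last symbol carries the `</w>` end-of-word marker):

```python
word = ("l", "o", "w</w>")
print(get_pairs(word))  # {('l', 'o'), ('o', 'w</w>')}
```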
def text_standardize(text):
"""
fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
"""
# Replace em dashes, en dashes, horizontal bars, and ellipses with standard symbols
text = text.replace("—", "-")
text = text.replace("–", "-")
text = text.replace("―", "-")
text = text.replace("…", "...")
text = text.replace("´", "'")
# Use regex to standardize certain punctuation marks with surrounding spaces
text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
text = re.sub(r"\s*\n\s*", " \n ", text)
text = re.sub(r"[^\S\n]+", " ", text)
return text.strip()
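An example of what `text_standardize` (defined above) does to dashes, ellipses, and punctuation spacing; the expected output is shown in the comment:

```python
sample = "It was—allegedly—fine… right?"
print(text_standardize(sample))
# It was - allegedly - fine... right ?
```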
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
- lowercases all inputs,
- uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
`BasicTokenizer` if not.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
try:
import ftfy
from spacy.lang.en import English
_nlp = English()
self.nlp = _nlp.tokenizer
self.fix_text = ftfy.fix_text
except ImportError:
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
self.nlp = BasicTokenizer(do_lower_case=True)
self.fix_text = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + "</w>",)
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
def _tokenize(self, text):
split_tokens = []
if self.fix_text is None:
text = self.nlp.tokenize(text)
for token in text:
split_tokens.extend(list(self.bpe(token).split(" ")))
else:
text = self.nlp(text_standardize(self.fix_text(text)))
for token in text:
split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
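A hedged usage sketch for the slow tokenizer: it assumes `transformers` is installed and downloads the vocabulary/merges files for `openai-community/openai-gpt` on first use (without `ftfy`/`spacy` it falls back to the BasicTokenizer path with a warning).

```python
from transformers import OpenAIGPTTokenizer

tok = OpenAIGPTTokenizer.from_pretrained("openai-community/openai-gpt")
ids = tok("Hello world")["input_ids"]
print(ids)
print(tok.convert_ids_to_tokens(ids))  # BPE pieces; word-final pieces end with "</w>"
print(tok.decode(ids))                 # "hello world" -- the tokenizer lowercases input
```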
.\models\openai\tokenization_openai_fast.py
"""Fast Tokenization classes for OpenAI GPT."""
from typing import Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_openai import OpenAIGPTTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
},
"merges_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
},
"tokenizer_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/tokenizer.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openai-community/openai-gpt": 512,
}
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = OpenAIGPTTokenizer
def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\openai\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_openai": ["OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenAIGPTConfig"],
"tokenization_openai": ["OpenAIGPTTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_openai_fast"] = ["OpenAIGPTTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_openai"] = [
"OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"OpenAIGPTDoubleHeadsModel",
"OpenAIGPTForSequenceClassification",
"OpenAIGPTLMHeadModel",
"OpenAIGPTModel",
"OpenAIGPTPreTrainedModel",
"load_tf_weights_in_openai_gpt",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_openai"] = [
"TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFOpenAIGPTDoubleHeadsModel",
"TFOpenAIGPTForSequenceClassification",
"TFOpenAIGPTLMHeadModel",
"TFOpenAIGPTMainLayer",
"TFOpenAIGPTModel",
"TFOpenAIGPTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
from .tokenization_openai import OpenAIGPTTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_openai_fast import OpenAIGPTTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_openai import (
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
OpenAIGPTDoubleHeadsModel,
OpenAIGPTForSequenceClassification,
OpenAIGPTLMHeadModel,
OpenAIGPTModel,
OpenAIGPTPreTrainedModel,
load_tf_weights_in_openai_gpt,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_openai import (
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFOpenAIGPTDoubleHeadsModel,
TFOpenAIGPTForSequenceClassification,
TFOpenAIGPTLMHeadModel,
TFOpenAIGPTMainLayer,
TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
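The guards above only register submodules whose backend is importable. A small sketch of the same pattern from user code (assuming `transformers` is installed; each framework-specific import only succeeds when that backend is available):

```python
from transformers.utils import is_tf_available, is_torch_available

if is_torch_available():
    from transformers import OpenAIGPTLMHeadModel      # PyTorch implementation
if is_tf_available():
    from transformers import TFOpenAIGPTLMHeadModel    # TensorFlow implementation
```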
.\models\opt\configuration_opt.py
""" OPT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
OPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/opt-125m": "https://huggingface.co/facebook/opt-125m/blob/main/config.json",
"facebook/opt-350m": "https://huggingface.co/facebook/opt-350m/blob/main/config.json",
"facebook/opt-1.3b": "https://huggingface.co/facebook/opt-1.3b/blob/main/config.json",
"facebook/opt-2.7b": "https://huggingface.co/facebook/opt-2.7b/blob/main/config.json",
"facebook/opt-6.7b": "https://huggingface.co/facebook/opt-6.7b/blob/main/config.json",
"facebook/opt-13b": "https://huggingface.co/facebook/opt-13b/blob/main/config.json",
}
class OPTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate a OPT model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the OPT
[facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50272):
Vocabulary size of the OPT model; defines the number of different tokens that `inputs_ids` can represent when calling [`OPTModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
ffn_dim (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often called feed-forward) layer in the decoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer decoder.
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler; supported strings are `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length the model might ever be used with; typically set to something large (e.g. 512, 1024 or 2048).
do_layer_norm_before (`bool`, *optional*, defaults to `True`):
Whether to perform layer normalization before the attention block.
word_embed_proj_dim (`int`, *optional*):
Can be set to down-project the word embeddings, e.g. for `opt-350m`; defaults to `hidden_size`.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability; see the LayerDrop paper for more details (https://arxiv.org/abs/1909.11556).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer used for all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether the model should return the last key/value attentions (not used by all models).
enable_bias (`bool`, *optional*, defaults to `True`):
Whether the linear layers in the attention blocks should use a bias term.
layer_norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether the layer norms should have learnable parameters.
Example:
```
>>> from transformers import OPTConfig, OPTModel
>>>
>>> configuration = OPTConfig()
>>> model = OPTModel(configuration)
>>> configuration = model.config
```
"""
.\models\opt\convert_opt_original_pytorch_checkpoint_to_pytorch.py
import argparse
from pathlib import Path
import torch
from transformers import OPTConfig, OPTModel
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def load_checkpoint(checkpoint_path):
"""Checkpoint path should end in model.pt"""
sd = torch.load(checkpoint_path, map_location="cpu")
if "model" in sd.keys():
sd = torch.load(checkpoint_path, map_location="cpu")["model"]
keys_to_delete = [
"decoder.version",
"decoder.output_projection.weight",
]
for key in keys_to_delete:
if key in sd:
sd.pop(key)
keys_to_rename = {
"decoder.project_in_dim.weight": "decoder.project_in.weight",
"decoder.project_out_dim.weight": "decoder.project_out.weight",
"decoder.layer_norm.weight": "decoder.final_layer_norm.weight",
"decoder.layer_norm.bias": "decoder.final_layer_norm.bias",
}
for old_key, new_key in keys_to_rename.items():
if old_key in sd:
sd[new_key] = sd.pop(old_key)
keys = list(sd.keys())
for key in keys:
if ".qkv_proj." in key:
value = sd[key]
q_name = key.replace(".qkv_proj.", ".q_proj.")
k_name = key.replace(".qkv_proj.", ".k_proj.")
v_name = key.replace(".qkv_proj.", ".v_proj.")
depth = value.shape[0]
assert depth % 3 == 0
k, v, q = torch.split(value, depth // 3, dim=0)
sd[q_name] = q
sd[k_name] = k
sd[v_name] = v
del sd[key]
return sd
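A toy illustration of the fused-projection split performed above: fairseq stores `qkv_proj` as one `(3 * hidden, hidden)` matrix, and the converter slices it into three equal blocks that it unpacks as `k, v, q`. The weight tensor here is fabricated purely for demonstration.

```python
import torch

hidden = 4
qkv_weight = torch.arange(3 * hidden * hidden, dtype=torch.float32).reshape(3 * hidden, hidden)

# same split as in load_checkpoint; note the (k, v, q) unpacking order
k, v, q = torch.split(qkv_weight, qkv_weight.shape[0] // 3, dim=0)
print(k.shape, v.shape, q.shape)  # torch.Size([4, 4]) each
```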
@torch.no_grad()
def convert_opt_checkpoint(checkpoint_path, pytorch_dump_folder_path, config=None):
"""
Copy/paste/tweak model's weights to our BERT structure.
"""
state_dict = load_checkpoint(checkpoint_path)
if config is not None:
config = OPTConfig.from_pretrained(config)
else:
config = OPTConfig()
model = OPTModel(config).half().eval()
model.load_state_dict(state_dict)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--fairseq_path",
type=str,
help=(
"path to fairseq checkpoint in correct format. You can find all checkpoints in the correct format here:"
" https://huggingface.co/models?other=opt_metasq"
),
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--hf_config", default=None, type=str, help="Define HF config.")
args = parser.parse_args()
convert_opt_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, config=args.hf_config)
.\models\opt\modeling_flax_opt.py
"""
Flax OPT model.
Flax OPT 模型
"""
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxMaskedLMOutput
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, logging
from .configuration_opt import OPTConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
OPT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
Parameters:
config ([`OPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
OPT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->OPT
class FlaxOPTAttention(nn.Module):
config: OPTConfig
embed_dim: int
num_heads: int
dropout: float = 0.0
causal: bool = False
bias: bool = True
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self) -> None:
# Dimension of each attention head.
self.head_dim = self.embed_dim // self.num_heads
# The embedding dimension must be divisible by the number of heads; otherwise raise an error.
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
# Partially apply nn.Dense so the query/key/value/output projections share the same settings.
dense = partial(
nn.Dense,
self.embed_dim,
use_bias=self.bias,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# Create the query, key, value, and output projection layers.
self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
self.out_proj = dense()
# Dropout applied to the attention weights.
self.dropout_layer = nn.Dropout(rate=self.dropout)
# For causal attention, precompute the causal mask.
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# Reshape hidden states to split out the attention heads.
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# Reshape hidden states to merge the attention heads back together.
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# 使用 Flax 框架的 @nn.compact 装饰器定义一个方法,用于将单个输入令牌的投影键、值状态与前几步骤的缓存状态连接起来
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slighly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# detect whether we are initializing by the absence of existing cache data
is_initialized = self.has_variable("cache", "cached_key")
# fetch or create the cached key and value variables, zero-initialized
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# fetch or create the cache index, initialized to zero
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# unpack the batch dims, max length, number of heads, and depth per head
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# update the key and value caches with the new 1d spatial slices
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# write the updated keys and values back into the cache
cached_key.value = key
cached_value.value = value
# advance the cache index by the number of cache vectors just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# causal mask for cached decoder self-attention: the single query position should only attend to key positions that have already been generated and cached, not to the remaining zero elements
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# combine this pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# return the updated keys, values, and attention mask
return key, value, attention_mask
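The slice update above is the core of the autoregressive KV cache. Below is a minimal sketch, outside of Flax and with made-up shapes, of what `lax.dynamic_update_slice` and the accompanying `pad_mask` do at a single decoding step:

```python
import jax.numpy as jnp
from jax import lax

batch, max_length, num_heads, head_dim = 1, 8, 2, 4
cached_key = jnp.zeros((batch, max_length, num_heads, head_dim))
new_key = jnp.ones((batch, 1, num_heads, head_dim))  # projected key of the current token
cur_index = 3                                        # number of tokens already cached

# write the new key at position cur_index along the length axis
cached_key = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))

# the single query may only attend to positions < cur_index + 1
pad_mask = jnp.arange(max_length) < cur_index + 1
print(pad_mask)  # [ True  True  True  True False False False False]
```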
class FlaxOPTDecoderLayer(nn.Module):
config: OPTConfig # 定义一个类成员变量 config,类型为 OPTConfig
dtype: jnp.dtype = jnp.float32 # 定义一个类成员变量 dtype,默认为 jnp.float32
def setup(self) -> None:
self.embed_dim = self.config.hidden_size # 从 config 中获取 hidden_size 并赋给 embed_dim
self.self_attn = FlaxOPTAttention( # 初始化 self_attn,使用 FlaxOPTAttention 类
config=self.config, # 传入配置参数 config
embed_dim=self.embed_dim, # 传入 embed_dim 参数
num_heads=self.config.num_attention_heads, # 传入注意力头数
dropout=self.config.attention_dropout, # 传入注意力 dropout 率
causal=True, # 是否使用因果注意力
dtype=self.dtype, # 数据类型为类成员变量 dtype
)
self.do_layer_norm_before = self.config.do_layer_norm_before # 是否在前面进行层归一化
self.dropout_layer = nn.Dropout(rate=self.config.dropout) # 初始化 dropout 层
self.activation_fn = ACT2FN[self.config.activation_function] # 根据激活函数名称选择对应的激活函数
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) # 初始化自注意力层的 LayerNorm
self.fc1 = nn.Dense( # 初始化全连接层 fc1
self.config.ffn_dim, # 全连接层的输出维度
dtype=self.dtype, # 数据类型为类成员变量 dtype
kernel_init=jax.nn.initializers.normal(self.config.init_std), # 使用正态分布初始化权重
)
self.fc2 = nn.Dense( # 初始化全连接层 fc2
self.embed_dim, # 全连接层的输出维度为 embed_dim
dtype=self.dtype, # 数据类型为类成员变量 dtype
kernel_init=jax.nn.initializers.normal(self.config.init_std) # 使用正态分布初始化权重
)
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) # 初始化最终输出的 LayerNorm
def __call__(
self,
hidden_states: jnp.ndarray, # 输入的隐藏状态张量
attention_mask: jnp.ndarray, # 注意力掩码张量
init_cache: bool = False, # 是否初始化缓存
output_attentions: bool = True, # 是否输出注意力权重
deterministic: bool = True, # 是否使用确定性计算
) -> Tuple[jnp.ndarray]:
residual = hidden_states # 保存输入的隐藏状态作为残差连接的基础
# 根据 self.do_layer_norm_before 的值判断是否在注意力机制之前应用层归一化
if self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# 自注意力机制
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
init_cache=init_cache,
deterministic=deterministic,
)
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) # 应用 dropout
hidden_states = residual + hidden_states # 添加残差连接
# 根据 self.do_layer_norm_before 的值判断是否在注意力机制之后应用层归一化
if not self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# 全连接层
hidden_states_shape = hidden_states.shape
hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1]) # 将隐藏状态展平
residual = hidden_states # 更新残差连接基础
# 根据 self.do_layer_norm_before 的值判断是否在全连接层之前应用层归一化
if self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.fc1(hidden_states) # 应用第一个全连接层
hidden_states = self.activation_fn(hidden_states) # 应用激活函数
hidden_states = self.fc2(hidden_states) # 应用第二个全连接层
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) # 应用 dropout
hidden_states = (residual + hidden_states).reshape(hidden_states_shape) # 添加残差连接并恢复形状
# 根据 self.do_layer_norm_before 的值判断是否在全连接层之后应用层归一化
if not self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,) # 准备输出结果
if output_attentions:
outputs += (self_attn_weights,) # 如果需要输出注意力权重,则添加到输出中
return outputs # 返回模型的输出
class FlaxOPTDecoderLayerCollection(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
def setup(self):
# 创建多个解码器层,并按顺序存储在列表中
self.layers = [
FlaxOPTDecoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
# 从配置中获取层丢弃率
self.layerdrop = self.config.layerdrop
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
):
# 如果需要输出隐藏状态,则初始化一个空元组
all_hidden_states = () if output_hidden_states else None
# 如果需要输出注意力权重,则初始化一个空元组
all_self_attns = () if output_attentions else None
# 遍历每个解码器层
for decoder_layer in self.layers:
# 如果需要输出隐藏状态,则将当前隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前解码器层,获取其输出
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
deterministic=deterministic,
)
# 更新隐藏状态为当前层的输出
hidden_states = layer_outputs[0]
# 如果需要输出注意力权重,则将当前层的注意力权重加入到列表中
if output_attentions:
all_self_attns += (layer_outputs[1],)
# 组装最终输出,包括最终隐藏状态、所有隐藏状态列表和所有注意力权重列表
outputs = [hidden_states, all_hidden_states, all_self_attns]
return outputs
class FlaxOPTLearnedPositionalEmbedding(nn.Embed):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def setup(self):
# 设置位置偏移量
self.offset = 2
# 初始化位置嵌入矩阵参数
self.embedding = self.param(
"embedding", self.embedding_init, (self.num_embeddings + self.offset, self.features), self.param_dtype
)
def __call__(self, positions):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
# 调用父类的 __call__ 方法,并在输入位置上加上偏移量
return super().__call__(positions + self.offset)
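The offset of 2 mirrors the original fairseq/OPT checkpoints: rows 0 and 1 of the table are effectively reserved (left-padding positions resolve to -1, hence row 1 after the offset), and real positions start at row 2. A tiny sanity check with made-up sizes:

```python
import jax.numpy as jnp

offset = 2
max_position_embeddings, features = 8, 4
# setup() creates the table with num_embeddings + offset rows
table = jnp.zeros((max_position_embeddings + offset, features))

positions = jnp.arange(5)             # positions 0..4 of a 5-token sequence
looked_up = table[positions + offset]
print(looked_up.shape)                # (5, 4) -- rows 2..6 of the table
```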
class FlaxOPTDecoder(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
offset: int = 2
# 设置方法用于初始化模型参数和各种配置
def setup(self):
# 初始化一个dropout层,用于随机失活以防止过拟合
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 从配置中获取隐藏层大小作为嵌入维度
embed_dim = self.config.hidden_size
# 从配置中获取填充 token 的索引
self.padding_idx = self.config.pad_token_id
# 从配置中获取最大目标位置
self.max_target_positions = self.config.max_position_embeddings
# 初始化词嵌入层,使用正态分布初始化方法
self.embed_tokens = nn.Embed(
self.config.vocab_size,
self.config.word_embed_proj_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
dtype=self.dtype,
)
# 初始化学习位置嵌入层,使用正态分布初始化方法
self.embed_positions = FlaxOPTLearnedPositionalEmbedding(
self.config.max_position_embeddings,
embed_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
dtype=self.dtype,
)
# 如果词嵌入投影维度不等于隐藏层大小,则初始化投影层
if self.config.word_embed_proj_dim != self.config.hidden_size:
self.project_in = nn.Dense(self.config.hidden_size, use_bias=False)
self.project_out = nn.Dense(self.config.word_embed_proj_dim, use_bias=False)
else:
# 否则将投影层设置为 None
self.project_in = None
self.project_out = None
# 检查是否需要在最后一层使用 LayerNorm,主要是为了向后兼容
if self.config.do_layer_norm_before and not self.config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
else:
# 如果不需要 LayerNorm 则将其设置为 None
self.final_layer_norm = None
# 初始化解码器层集合
self.layers = FlaxOPTDecoderLayerCollection(self.config, self.dtype)
# 模型调用方法,用于执行模型的前向传播
def __call__(
self,
input_ids,
attention_mask,
position_ids,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
# 其他参数用于控制模型的行为,如是否输出注意力矩阵、隐藏状态等
):
# 获取输入的张量形状
input_shape = input_ids.shape
# 将输入张量展平为二维张量
input_ids = input_ids.reshape(-1, input_shape[-1])
# 使用嵌入标记方法对输入张量进行嵌入
inputs_embeds = self.embed_tokens(input_ids)
# 如果存在输入投影层,则将嵌入结果投影
if self.project_in is not None:
inputs_embeds = self.project_in(inputs_embeds)
# 使用嵌入位置方法生成位置嵌入张量
positions = self.embed_positions(position_ids)
# 将嵌入的输入张量和位置嵌入张量相加以得到隐藏状态张量
hidden_states = inputs_embeds + positions
# 调用多层模型的前向传播方法,获取隐藏状态、所有隐藏状态和注意力张量
hidden_state, all_hidden_states, attentions = self.layers(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
# 如果存在最终层归一化,则对隐藏状态进行归一化
if self.final_layer_norm is not None:
hidden_state = self.final_layer_norm(hidden_state)
# 如果存在输出投影层,则对隐藏状态进行投影
if self.project_out is not None:
hidden_state = self.project_out(hidden_state)
# 如果要求输出所有隐藏状态,则将当前隐藏状态加入到所有隐藏状态列表中
if output_hidden_states:
all_hidden_states += (hidden_state,)
# 根据返回值是否为字典形式,决定返回元组还是命名元组形式的输出
outputs = [hidden_state, all_hidden_states, attentions]
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 返回命名元组形式的输出
return FlaxBaseModelOutput(
last_hidden_state=hidden_state,
hidden_states=all_hidden_states,
attentions=attentions,
)
# 定义一个继承自FlaxPreTrainedModel的类,用于OPT模型的预训练。
class FlaxOPTPreTrainedModel(FlaxPreTrainedModel):
# 指定配置类为OPTConfig
config_class = OPTConfig
# 指定基础模型前缀为"model"
base_model_prefix: str = "model"
# 模块类初始化为None
module_class: nn.Module = None
# 初始化函数,接受配置config、输入形状input_shape、种子seed、数据类型dtype等参数
def __init__(
self,
config: OPTConfig,
input_shape: Tuple[int] = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用module_class创建模块对象module,传入config和其他kwargs参数
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类初始化方法,传入config、module、input_shape、seed、dtype、_do_init等参数
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 初始化权重函数,接受随机数生成器rng、输入形状input_shape、参数params等参数,返回初始化后的参数params
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化input_ids为全零数组,数据类型为"i4"
input_ids = jnp.zeros(input_shape, dtype="i4")
# 初始化attention_mask为与input_ids形状相同的全1数组
attention_mask = jnp.ones_like(input_ids)
# 获取batch_size和sequence_length
batch_size, sequence_length = input_ids.shape
# 初始化position_ids为广播形式的序列长度数组
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 拆分rng生成params_rng和dropout_rng
params_rng, dropout_rng = jax.random.split(rng)
# 构建随机数字典rngs,包含params_rng和dropout_rng
rngs = {"params": params_rng, "dropout": dropout_rng}
# 使用module的init方法初始化模型参数
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
position_ids,
return_dict=False,
)
# 获取随机初始化的模型参数random_params
random_params = module_init_outputs["params"]
# 如果params不为None,则将随机参数和给定参数params进行扁平化处理并填充缺失键
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
# 初始化缓存函数,用于快速自回归解码
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
用于快速自回归解码的批量大小。定义了初始化缓存的批处理大小。
max_length (`int`):
自动回归解码的最大可能长度。定义了初始化缓存的序列长度。
"""
# 初始化input_ids为全1数组,形状为(batch_size, max_length),数据类型为"i4"
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
# 初始化attention_mask为与input_ids形状相同的全1数组,数据类型为"i4"
attention_mask = jnp.ones_like(input_ids, dtype="i4")
# 初始化position_ids为广播形式的input_ids的序列长度数组
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用module的init方法初始化模型变量,设置init_cache为True以初始化缓存
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# 返回解除冻结后的缓存变量
return unfreeze(init_variables["cache"])
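A usage sketch of `init_cache` with a deliberately tiny, hypothetical config: the returned pytree holds zero-initialized `cached_key`/`cached_value` tensors of shape `(batch, max_length, num_heads, head_dim)` plus a scalar `cache_index` per attention layer.

```python
import jax
from transformers import OPTConfig, FlaxOPTForCausalLM

config = OPTConfig(
    vocab_size=100, hidden_size=16, word_embed_proj_dim=16, num_hidden_layers=2,
    num_attention_heads=2, ffn_dim=32, max_position_embeddings=32,
)
model = FlaxOPTForCausalLM(config)
cache = model.init_cache(batch_size=1, max_length=8)
# inspect the shapes without relying on the exact nesting of the pytree
print(jax.tree_util.tree_map(lambda x: x.shape, cache))
```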
def __call__(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
params: dict = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
dropout_rng: PRNGKey = None,
deterministic: bool = True,
):
# 设置输出注意力机制的标志,如果未指定,则使用配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态的标志,如果未指定,则使用配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置返回字典的标志,如果未指定,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果未提供注意力掩码,则创建一个全为1的掩码
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 如果未提供位置编码,则根据注意力掩码累积的结果生成位置编码
if position_ids is None:
position_ids = (attention_mask.cumsum(axis=1) * attention_mask) - 1
# 处理可能需要的任何伪随机数生成器
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
# 准备模型输入字典
inputs = {"params": params or self.params}
# 如果提供了过去的键值对,则将其缓存放入输入中,并标记为可变
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 应用模型的前向传播
outputs = self.module.apply(
inputs,
input_ids=jnp.array(input_ids, dtype="i4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
position_ids=jnp.array(position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
rngs=rngs,
mutable=mutable,
)
# 如果同时传递了过去的键值对和return_dict为True,则将更新后的缓存添加到模型输出中
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# 如果同时传递了过去的键值对和return_dict为False,则将更新后的缓存插入到模型输出的适当位置
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# 返回模型的输出结果
return outputs
class FlaxOPTModule(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化解码器对象,使用给定的配置和数据类型
self.decoder = FlaxOPTDecoder(self.config, dtype=self.dtype)
def _get_decoder_module(self):
return self.decoder
def __call__(
self,
input_ids,
attention_mask,
position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
init_cache=False,
):
# 调用解码器对象进行前向传播
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
init_cache=init_cache,
)
if not return_dict:
return decoder_outputs
# 返回经过模型输出的结果,作为 FlaxBaseModelOutput 对象
return FlaxBaseModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
# 从 transformers.models.bart.modeling_flax_bart.FlaxBartModel 复制而来,将 Bart 换成 OPT
class FlaxOPTModel(FlaxOPTPreTrainedModel):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
module_class = FlaxOPTModule
# 添加函数签名的示例文档到 FlaxOPTModel 类中
append_call_sample_docstring(FlaxOPTModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
@add_start_docstrings(
"The bare OPT Model transformer outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class FlaxOPTForCausalLMModule(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
# 初始化 OPT 模型和语言模型头部
self.model = FlaxOPTModule(config=self.config, dtype=self.dtype)
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# run the base model forward pass
outputs = self.model(
input_ids,
attention_mask,
position_ids,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# take the last hidden states from the model output
hidden_states = outputs[0]
# if the config ties word embeddings, reuse the decoder's embedding matrix as the LM head kernel
if self.config.tie_word_embeddings:
shared_embedding = self.model.variables["params"]["decoder"]["embed_tokens"]["embedding"]
# apply the shared embedding to the hidden states to obtain the language-modeling logits
lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# otherwise compute the logits with the dedicated LM head
lm_logits = self.lm_head(hidden_states)
# if a plain tuple is requested, return the logits together with the remaining outputs
if not return_dict:
return (lm_logits,) + outputs[1:]
# return a FlaxMaskedLMOutput holding the logits, hidden states, and attentions
return FlaxMaskedLMOutput(
logits=lm_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
OPT Model with a language modeling head on top (linear layer with weights tied to the input embeddings) e.g for
autoregressive tasks.
"""
@add_start_docstrings(
"""
OPT Model with a language modeling head on top (linear layer with weights tied to the input embeddings) e.g for
autoregressive tasks.
""",
OPT_START_DOCSTRING,
)
class FlaxOPTForCausalLM(FlaxOPTPreTrainedModel):
# 使用 FlaxOPTForCausalLMModule 作为模块类
module_class = FlaxOPTForCausalLMModule
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# initializing the cache
batch_size, seq_length = input_ids.shape
# initialize the cache that will be used during generation
past_key_values = self.init_cache(batch_size, max_length)
# Since the decoder uses a causal mask, the attention_mask would usually only need 0's for positions beyond
# input_ids.shape[-1] and before cache_length, but those positions are already masked by the causal mask.
# We can therefore create a single static attention_mask here, which compiles more efficiently.
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if attention_mask is not None:
# derive the position ids from the attention mask
position_ids = attention_mask.cumsum(axis=1) - 1
# write the provided attention mask into the static extended mask with a dynamic slice
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# if no attention_mask was passed, simply broadcast the position ids
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# update the generation inputs: carry over past_key_values and advance position_ids
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
# Add a call example docstring to the class
append_call_sample_docstring(
FlaxOPTForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutput,
_CONFIG_FOR_DOC,
)
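Putting the two hooks above together, `generate()` drives the cached decoding loop: it calls `prepare_inputs_for_generation` once and `update_inputs_for_generation` after every step. A usage sketch (checkpoint name taken from `_CHECKPOINT_FOR_DOC`, purely for illustration):

```python
from transformers import AutoTokenizer, FlaxOPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = FlaxOPTForCausalLM.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hey, are you conscious?", return_tensors="np")
# greedy decoding; the cache and position ids are handled by the hooks above
outputs = model.generate(**inputs, max_length=20, do_sample=False)
print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True))
```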
.\models\opt\modeling_opt.py
"""
PyTorch OPT model.
"""
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_opt import OPTConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc"
_SEQ_CLASS_EXPECTED_LOSS = 1.71
_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'"
OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/opt-125m",
"facebook/opt-350m",
"facebook/opt-1.3b",
"facebook/opt-2.7b",
"facebook/opt-6.7b",
"facebook/opt-13b",
"facebook/opt-30b",
]
def _get_unpad_data(attention_mask):
"""
Get indices, cumulative sequence lengths, and maximum sequence length from attention mask.
Args:
attention_mask (torch.Tensor): Attention mask tensor.
Returns:
Tuple: Tuple containing:
- indices (torch.Tensor): Indices of attention mask where True.
- cu_seqlens (torch.Tensor): Cumulative sequence lengths.
- max_seqlen_in_batch (int): Maximum sequence length in the batch.
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return indices, cu_seqlens, max_seqlen_in_batch
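A small worked example of `_get_unpad_data` for a padded batch holding sequences of lengths 2 and 3:

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 0, 0],
                               [1, 1, 1, 0]])
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([2, 3])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # tensor([0, 1, 4, 5, 6])
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(cu_seqlens)                                                             # tensor([0, 2, 5], dtype=torch.int32)
print(seqlens_in_batch.max().item())                                          # 3
```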
class OPTLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
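A quick check of how positions are derived from a left-padded attention mask (values made up): padding slots resolve to -1 and, after the offset of 2, to the reserved row 1 of the embedding table, while real tokens start at row 2.

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])   # two left-padding tokens
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask) - 1
print(positions)        # tensor([[-1, -1,  0,  1,  2]])
print(positions + 2)    # tensor([[1, 1, 2, 3, 4]]) -- rows actually looked up
```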
class OptFlashAttention2(OPTAttention):
"""
OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.
The only required change would be on the forward pass where it needs to correctly call the public API of flash
attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
OPT_ATTENTION_CLASSES = {
"eager": OPTAttention,
"flash_attention_2": OptFlashAttention2,
}
class OPTDecoderLayer(nn.Module):
def __init__(self, config: OPTConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = OPT_ATTENTION_CLASSES[config._attn_implementation](config=config, is_decoder=True)
self.do_layer_norm_before = config.do_layer_norm_before
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.self_attn_layer_norm = nn.LayerNorm(
self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
)
self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
pass
OPT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`OPTConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class OPTPreTrainedModel(PreTrainedModel):
config_class = OPTConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OPTDecoderLayer"]
_supports_flash_attn_2 = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
OPT_INPUTS_DOCSTRING = r"""
"""
class OPTDecoder(OPTPreTrainedModel):
"""
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is an `OPTDecoderLayer`.
Args:
config: OPTConfig
"""
def __init__(self, config: OPTConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
if config.word_embed_proj_dim != config.hidden_size:
self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
else:
self.project_out = None
if config.word_embed_proj_dim != config.hidden_size:
self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
else:
self.project_in = None
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(
config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
)
else:
self.final_layer_norm = None
self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.gradient_checkpointing = False
self.post_init()
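When `word_embed_proj_dim` differs from `hidden_size`, the two linear projections above bracket the decoder stack: embeddings are projected in before the layers and projected back out afterwards. A quick check with a made-up tiny config:

```python
from transformers import OPTConfig, OPTModel

config = OPTConfig(
    vocab_size=100, hidden_size=64, word_embed_proj_dim=32, num_hidden_layers=2,
    num_attention_heads=4, ffn_dim=128, max_position_embeddings=32,
)
model = OPTModel(config)
print(model.decoder.project_in)   # Linear(in_features=32, out_features=64, bias=False)
print(model.decoder.project_out)  # Linear(in_features=64, out_features=32, bias=False)
```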
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class OPTModel(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.decoder = OPTDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs
return BaseModelOutputWithPast(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
class OPTForCausalLM(OPTPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = OPTModel(config)
self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
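A tiny illustration (made-up values) of the prefix trimming above: once the cache already covers `past_length` tokens, only the uncovered suffix is fed to the model.

```python
import torch

input_ids = torch.tensor([[10, 11, 12, 13]])
past_length = 3                          # e.g. past_key_values[0][0].shape[2]
input_ids = input_ids[:, past_length:]
print(input_ids)                         # tensor([[13]])
```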
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
"""
The OPT Model transformer with a sequence classification head on top (linear layer).
[`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(OPT_START_DOCSTRING)
class OPTForSequenceClassification(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.model = OPTModel(config)
self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)
self.post_init()
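A sketch (not the exact library code) of the "last non-padding token" selection described in the docstring above:

```python
import torch

pad_token_id = 1
input_ids = torch.tensor([[5, 6, 7, 1, 1],
                          [8, 9, 1, 1, 1]])
# index of the last non-padding token in each row
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)                  # tensor([2, 1])
# the classification logits are then gathered at those positions, e.g.
# pooled_logits = logits[torch.arange(logits.shape[0]), sequence_lengths]
```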
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the OPTForSequenceClassification model.
"""
def get_input_embeddings(self):
"""
Retrieve the input embeddings from the model's decoder.
"""
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
"""
Set new input embeddings for the model's decoder.
"""
self.model.decoder.embed_tokens = value
"""
The OPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(OPT_START_DOCSTRING)
class OPTForQuestionAnswering(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.model = OPTModel(config)
self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)
self.post_init()
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
.\models\opt\modeling_tf_opt.py
""" TF 2.0 OPT model."""
from __future__ import annotations
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSharedEmbeddings,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_opt import OPTConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
_CAUSAL_LM_EXPECTED_OUTPUT = (
"Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
)
LARGE_NEGATIVE = -1e8
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
Args:
input_ids_shape (tf.TensorShape): Shape of input tensor representing input ids.
past_key_values_length (int): Length of past key values for attention mechanism.
Returns:
tf.Tensor: Causal mask tensor for bi-directional self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.fill((tgt_len, tgt_len), tf.cast(LARGE_NEGATIVE, tf.float32))
mask = tf.linalg.band_part(mask, 0, -1) - tf.linalg.band_part(mask, 0, 0)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
Args:
mask (tf.Tensor): Tensor representing attention mask.
tgt_len (Optional[int]): Target sequence length (default: None).
Returns:
tf.Tensor: Expanded attention mask tensor.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
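A quick check of the two mask helpers for a 3-token target with one cached step (illustrative only; assumes the module-level helpers above are in scope):

```python
import tensorflow as tf

causal = _make_causal_mask((1, 3), past_key_values_length=1)
print(causal.shape)     # (1, 1, 3, 4)

padding = _expand_mask(tf.constant([[1.0, 1.0, 0.0, 0.0]]), tgt_len=3)
print(padding.shape)    # (1, 1, 3, 4)
# blocked positions hold LARGE_NEGATIVE (-1e8) and effectively vanish after the softmax
```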
class TFOPTLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
def call(self, attention_mask, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = tf.cast(attention_mask, tf.int64)
positions = tf.math.cumsum(attention_mask, axis=1) * attention_mask - 1
positions = positions[:, past_key_values_length:]
return super().call(positions + self.offset)
class TFOPTAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFOPTDecoderLayer(keras.layers.Layer):
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
self.do_layer_norm_before = config.do_layer_norm_before
self.embed_dim = config.hidden_size
self.self_attn = TFOPTAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: np.ndarray | tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
training: Optional[bool] = False,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
"""
Args:
hidden_states (`tf.Tensor`): 输入到层的张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`, *可选*): 注意力掩码,形状为 `(batch, 1, tgt_len, src_len)`,
其中填充元素由非常大的负值表示。
layer_head_mask (`tf.Tensor`, *可选*): 给定层中注意力头的掩码,形状为 `(decoder_attention_heads,)`
past_key_value (`Tuple(tf.Tensor)`, *可选*): 缓存的过去键和值投影状态
training (`bool`, *可选*, 默认为 `False`):
是否在训练模式下使用模型(某些模块如 dropout 在训练和评估中的行为不同)。
"""
residual = hidden_states
if self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
if self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
return (hidden_states, self_attn_weights, present_key_value)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
OPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`OPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class TFOPTPreTrainedModel(TFPreTrainedModel):
"""
TFOPT Pretrained Model that inherits from transformers.TFPreTrainedModel
Args:
config: OPTConfig
"""
config_class = OPTConfig
base_model_prefix = "model"
OPT_INPUTS_DOCSTRING = r"""
"""
Args:
input_ids (`tf.Tensor` of shape `({0})`):
输入序列中词汇表中的输入序列标记的索引。
可以使用 [`AutoTokenizer`] 获得这些索引。有关详细信息,请参见 [`PreTrainedTokenizer.encode`] 和 [`PreTrainedTokenizer.__call__`]。
[什么是输入 ID?](../glossary
attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
遮罩,用于在填充标记索引上避免执行注意力操作。遮罩值选择在 `[0, 1]`:
- 1 表示**未遮罩**的标记,
- 0 表示**遮罩**的标记。
[什么是注意力遮罩?](../glossary
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
用于在编码器中将选定的注意力模块头部置零的遮罩。遮罩值选择在 `[0, 1]`:
- 1 表示**未遮罩**的头部,
- 0 表示**遮罩**的头部。
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
包含注意力块预计算的键和值隐藏状态。可用于加速解码过程。
如果使用 `past_key_values`,用户可以选择只输入最后的 `decoder_input_ids`(这些没有给出其过去键值状态的模型)的形状为 `(batch_size, 1)`,而不是所有 `decoder_input_ids` 的形状为 `(batch_size, sequence_length)`。
use_cache (`bool`, *optional*, defaults to `True`):
如果设置为 `True`,则返回 `past_key_values` 键值状态,可用于加速解码(参见 `past_key_values`)。在训练期间设置为 `False`,在生成期间设置为 `True`。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回张量下的 `attentions`。此参数仅在即时模式下可用,在图模式下将使用配置中的值。
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态。有关更多详细信息,请参见返回张量下的 `hidden_states`。此参数仅在即时模式下可用,在图模式下将使用配置中的值。
return_dict (`bool`, *optional*):
是否返回 [`~utils.ModelOutput`] 而不是普通元组。此参数可以在即时模式下使用,在图模式下将始终设置为 True。
training (`bool`, *optional*, defaults to `False`):
是否在训练模式下使用模型(某些模块如 dropout 模块在训练和评估中有不同的行为)。
"""
@keras_serializable
class TFOPTDecoder(keras.layers.Layer):
config_class = OPTConfig
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
self.config = config # 初始化配置对象,包含解码器的各种配置参数
self.padding_idx = config.pad_token_id # 设置填充标记的索引
self.layerdrop = config.layerdrop # 设置层跳跃的概率
num_embeddings = config.max_position_embeddings # 获取最大位置编码的数量
self.embed_tokens = TFSharedEmbeddings(
config.vocab_size, config.word_embed_proj_dim, config.pad_token_id, name="embed_tokens"
) # 初始化共享的词嵌入对象
self.embed_positions = TFOPTLearnedPositionalEmbedding(
num_embeddings,
config.hidden_size,
name="embed_positions",
) # 初始化位置编码对象
# 注意:`config._remove_final_layer_norm` 仅用于保持与旧版本的兼容性,
# 在 transformers v4.20.1 之前微调过的检查点需要使用,详见 https://github.com/facebookresearch/metaseq/pull/164
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
else:
self.final_layer_norm = None # 如果不需要最终的层归一化,则为 None
if config.word_embed_proj_dim != config.hidden_size:
self.project_out = keras.layers.Dense(config.word_embed_proj_dim, name="project_out", use_bias=False)
self.project_in = keras.layers.Dense(config.hidden_size, name="project_in", use_bias=False)
else:
self.project_in = None
self.project_out = None # 如果词嵌入投影维度与隐藏层维度相同,则为 None
self.layers = [TFOPTDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]
self.dropout = keras.layers.Dropout(config.dropout) # 初始化 dropout 层
def get_embed_tokens(self):
return self.embed_tokens # 返回词嵌入对象
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens # 设置新的词嵌入对象
def set_input_embeddings(self, new_embeddings):
self.embed_tokens.vocab_size = new_embeddings.shape[0] # 更新词汇表大小
self.embed_tokens.weight = new_embeddings # 更新词嵌入权重矩阵
def get_input_embeddings(self):
return self.embed_tokens # 返回当前词嵌入对象
def build(self, input_shape=None):
# if the layer is already built, return early to avoid building twice
if self.built:
return
# mark the layer as built
self.built = True
# build the token embedding module if it exists
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
# build the positional embedding module if it exists
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# build the final layer norm if it exists
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
# build the output projection if it exists
if getattr(self, "project_out", None) is not None:
with tf.name_scope(self.project_out.name):
self.project_out.build([None, None, self.config.hidden_size])
# build the input projection if it exists
if getattr(self, "project_in", None) is not None:
with tf.name_scope(self.project_in.name):
self.project_in.build([None, None, self.config.word_embed_proj_dim])
# build every decoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# 使用 keras_serializable 装饰器将类声明为可序列化的 Keras 模型
@keras_serializable
class TFOPTMainLayer(keras.layers.Layer):
# 设置配置类为 OPTConfig
config_class = OPTConfig
# 初始化方法,接受配置对象 config 和其他关键字参数
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
# 将配置对象 config 存储在实例中
self.config = config
# 创建 TFOPTDecoder 对象,并命名为 "decoder"
self.decoder = TFOPTDecoder(config, name="decoder")
# 获取输入嵌入的方法,返回解码器的嵌入标记
def get_input_embeddings(self):
return self.decoder.embed_tokens
# 设置输入嵌入的方法,用新的嵌入替换解码器的嵌入标记
def set_input_embeddings(self, new_embeddings):
self.decoder.set_input_embeddings(new_embeddings)
# 使用 unpack_inputs 装饰器定义的调用方法,接受多个输入参数,返回 TFBaseModelOutputWithPast 或者 Tensor 元组
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]:
# 根据传入的参数或者配置对象设置输出注意力和隐藏状态
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用解码器对象进行处理,返回结果存储在 outputs 变量中
outputs = self.decoder(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 如果 return_dict 为 False,则直接返回 outputs
if not return_dict:
return outputs
# 否则,构造 TFBaseModelOutputWithPast 对象,返回其中的属性作为输出
return TFBaseModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 构建方法,用于构建模型结构,如果已经构建过则直接返回
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果存在解码器对象,则在解码器的名称空间内构建其结构
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
# 使用 add_start_docstrings 装饰器添加模型的文档字符串说明和 OPT_START_DOCSTRING
@add_start_docstrings(
"The bare TF OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
# 使用 keras_serializable 装饰器将类声明为可序列化的 Keras 模型
@keras_serializable
class TFOPTModel(TFOPTPreTrainedModel):
# 设置配置类为 OPTConfig
config_class = OPTConfig
# 初始化方法,接受配置对象 config 和其他关键字参数
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(config, **kwargs)
# 将配置对象 config 存储在实例中
self.config = config
# 创建 TFOPTMainLayer 对象,并命名为 "model"
self.model = TFOPTMainLayer(config, name="model")
# 获取输入嵌入层,即模型解码器的嵌入标记
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
# 设置输入嵌入层,用新的嵌入进行替换
def set_input_embeddings(self, new_embeddings):
self.model.set_input_embeddings(new_embeddings)
# 使用装饰器 unpack_inputs 解包输入参数,并为模型的 call 方法添加文档字符串
# 该方法用于模型调用,接收多个输入参数,返回模型输出或包含过去键值的对象
@unpack_inputs
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]:
# Resolve the attention/hidden-state/cache flags from the arguments, falling back to the config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward everything (input ids, attention mask, head mask, ...) to the main layer
outputs = self.model(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# If a dict output was not requested, return the raw tuple from the main layer
if not return_dict:
return outputs
# Repackage the outputs into a TFBaseModelOutputWithPast (last hidden state, cache, hidden states, attentions)
return TFBaseModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Output post-processing for serving: include the cache, hidden states and attentions only if enabled in the config
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
# Return a TFBaseModelOutputWithPast with the last hidden state plus the optional tensors above
return TFBaseModelOutputWithPast(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
hidden_states=hs,
attentions=attns,
)
# Build the model's layer hierarchy
def build(self, input_shape=None):
# Skip if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# If the main layer exists, build it inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
# Build with an unspecified input shape
self.model.build(None)
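To see the TFOPTModel wiring above end to end, it is enough to load a pretrained checkpoint and run one forward pass. The sketch below is not part of the source; the `facebook/opt-350m` checkpoint name is an assumption, and TF weights may have to be converted from the PyTorch ones on the fly.
```
# Minimal usage sketch (assumption: the facebook/opt-350m checkpoint is available;
# TF weights may be converted from PyTorch if no tf_model.h5 is shipped).
from transformers import AutoTokenizer, TFOPTModel

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = TFOPTModel.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
# last_hidden_state has shape (batch_size, sequence_length, hidden_size)
print(outputs.last_hidden_state.shape)
```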
@add_start_docstrings(
"""
The OPT Model transformer with a language modeling head on top.
""",
OPT_START_DOCSTRING,
)
@keras_serializable
class TFOPTForCausalLM(TFOPTPreTrainedModel, TFCausalLanguageModelingLoss):
# Use OPTConfig as the configuration class
config_class = OPTConfig
def __init__(self, config: OPTConfig, **kwargs):
# Initialize the parent class with the config
super().__init__(config, **kwargs)
self.config = config
# Create the TFOPTMainLayer, named "model"
self.model = TFOPTMainLayer(config, name="model")
def get_output_embeddings(self):
# The output embeddings are tied to the input embeddings, so return those
return self.model.get_input_embeddings()
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
# Pull the attention mask out of kwargs, if provided
attention_mask = kwargs.get("attention_mask", None)
# When past_key_values are cached, only the last input token needs to be fed
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
# Return the prepared inputs as a dict
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@unpack_inputs
@replace_return_docstrings(output_type=TFCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_CAUSAL_LM_EXPECTED_OUTPUT,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
):
# Forward-pass body omitted here; see the docstring decorators above for the full documentation
def serving_output(self, output):
# Include the cache only if use_cache is enabled in the config
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Include hidden states only if enabled in the config
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
# Include attention weights only if enabled in the config
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
# Return the causal LM output, including the past key values
return TFCausalLMOutputWithPast(
past_key_values=pkv,
hidden_states=hs,
attentions=attns,
loss=output.loss,
logits=output.logits,
)
def build(self, input_shape=None):
# Skip if already built
if self.built:
return
# Mark as built
self.built = True
# If the main layer exists, build it inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
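During generation, `generate()` calls `prepare_inputs_for_generation` defined above at every decoding step, so once `past_key_values` are cached only the newest token id is fed back in. A hedged sketch follows; the `facebook/opt-350m` checkpoint name is an assumption, not taken from this file.
```
# Hedged generation sketch using the causal LM head defined above.
from transformers import AutoTokenizer, TFOPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = TFOPTForCausalLM.from_pretrained("facebook/opt-350m")

input_ids = tokenizer("The capital of France is", return_tensors="tf").input_ids
output_ids = model.generate(input_ids, max_new_tokens=10, use_cache=True)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```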
.\models\opt\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_opt": ["OPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OPTConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_opt"] = [
"OPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"OPTForCausalLM",
"OPTModel",
"OPTPreTrainedModel",
"OPTForSequenceClassification",
"OPTForQuestionAnswering",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_opt"] = ["TFOPTForCausalLM", "TFOPTModel", "TFOPTPreTrainedModel"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_opt"] = [
"FlaxOPTForCausalLM",
"FlaxOPTModel",
"FlaxOPTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_opt import OPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPTConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_opt import (
OPT_PRETRAINED_MODEL_ARCHIVE_LIST,
OPTForCausalLM,
OPTForQuestionAnswering,
OPTForSequenceClassification,
OPTModel,
OPTPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_opt import TFOPTForCausalLM, TFOPTModel, TFOPTPreTrainedModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_opt import FlaxOPTForCausalLM, FlaxOPTModel, FlaxOPTPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
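The lazy-module pattern above means that importing the package only loads the configuration eagerly; framework-specific classes are resolved on first attribute access and are simply absent when the corresponding framework is missing. A small illustration (assuming only a base `transformers` install):
```
# Illustration of the lazy import behaviour (assumes a base transformers install).
from transformers.models.opt import OPTConfig  # always importable, framework-free

config = OPTConfig()  # builds a default OPT configuration without touching torch/TF/flax

# Framework-specific classes are resolved lazily and fail with an import error
# if the corresponding framework is not installed:
# from transformers.models.opt import OPTModel      # needs torch
# from transformers.models.opt import TFOPTModel    # needs tensorflow
# from transformers.models.opt import FlaxOPTModel  # needs flax
```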
.\models\owlv2\configuration_owlv2.py
""" OWLv2 model configuration"""
import os
from typing import TYPE_CHECKING, Dict, Union
if TYPE_CHECKING:
pass
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/owlv2-base-patch16": "https://huggingface.co/google/owlv2-base-patch16/resolve/main/config.json",
}
class Owlv2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`Owlv2TextModel`]. It is used to instantiate an
Owlv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Owlv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the OWLv2 text model. Defines the number of different tokens that can be represented
by the `inputs_ids` passed when calling [`Owlv2TextModel`].
hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
max_position_embeddings (`int`, *optional*, defaults to 16):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
pad_token_id (`int`, *optional*, defaults to 0):
The id of the padding token in the input sequences.
bos_token_id (`int`, *optional*, defaults to 49406):
The id of the beginning-of-sequence token in the input sequences.
eos_token_id (`int`, *optional*, defaults to 49407):
The id of the end-of-sequence token in the input sequences.
Example:
```
>>> from transformers import Owlv2TextConfig, Owlv2TextModel
>>> # Initializing a Owlv2TextConfig with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2TextConfig()
>>> # Initializing a Owlv2TextModel (with random weights) from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "owlv2_text_model"
def __init__(
self,
vocab_size=49408,
hidden_size=512,
intermediate_size=2048,
num_hidden_layers=12,
num_attention_heads=8,
max_position_embeddings=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
pad_token_id=0,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "owlv2":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
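The `from_pretrained` override above lets the text sub-config be loaded directly from a full OWLv2 checkpoint: when the fetched config has `model_type == "owlv2"`, only its `text_config` dictionary is kept. A hedged sketch, assuming network access and that the checkpoint named in the archive map above is available:
```
# Sketch of the branch above: pulling the text sub-config out of a full "owlv2" checkpoint.
from transformers import Owlv2TextConfig

text_config = Owlv2TextConfig.from_pretrained("google/owlv2-base-patch16")
print(text_config.hidden_size)  # 512 for the base checkpoint
```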
class Owlv2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`Owlv2VisionModel`]. It is used to instantiate
an OWLv2 image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWLv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 768):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
```
>>> from transformers import Owlv2VisionConfig, Owlv2VisionModel
>>> # Initializing a Owlv2VisionModel with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2VisionConfig()
>>> # Initializing a Owlv2VisionModel model from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2VisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "owlv2_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=768,
patch_size=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs,
):
super().__init__(**kwargs)
# Store the model hyper-parameters
self.hidden_size = hidden_size  # hidden size of the encoder layers
self.intermediate_size = intermediate_size  # feed-forward (intermediate) layer size
self.num_hidden_layers = num_hidden_layers  # number of Transformer layers
self.num_attention_heads = num_attention_heads  # attention heads per layer
self.num_channels = num_channels  # number of input image channels
self.image_size = image_size  # input image resolution
self.patch_size = patch_size  # patch resolution
self.hidden_act = hidden_act  # activation function of the hidden layers
self.layer_norm_eps = layer_norm_eps  # layer-norm epsilon
self.attention_dropout = attention_dropout  # dropout probability on attention weights
self.initializer_range = initializer_range  # std of the truncated-normal initializer
self.initializer_factor = initializer_factor  # scaling factor for weight initialization
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
# Load the config dict and remaining keyword arguments from the pretrained model
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# 如果配置字典中的模型类型是 "owlv2",则使用视觉配置字典
if config_dict.get("model_type") == "owlv2":
config_dict = config_dict["vision_config"]
# Warn if the loaded model type does not match this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the config instance from the dict
return cls.from_dict(config_dict, **kwargs)
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2
class Owlv2Config(PretrainedConfig):
r"""
[`Owlv2Config`] is the configuration class to store the configuration of an [`Owlv2Model`]. It is used to instantiate an OWLv2 model according to the specified arguments, defining the text model and vision model configurations.
Instantiating a configuration with the defaults will yield a configuration similar to that of the OWLv2 [google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Owlv2TextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Owlv2VisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of the text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. The default matches the original OWLv2 implementation.
return_dict (`bool`, *optional*, defaults to `True`):
Whether the model should return a dictionary. If `False`, a tuple is returned.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
model_type = "owlv2"
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=512,
logit_scale_init_value=2.6592,
return_dict=True,
**kwargs,
):
super().__init__(**kwargs)
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the Owlv2TextConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the Owlv2VisionConfig with default values.")
# Initialize the Owlv2TextConfig and Owlv2VisionConfig objects from the given dicts
self.text_config = Owlv2TextConfig(**text_config)
self.vision_config = Owlv2VisionConfig(**vision_config)
# Store the projection dimension, the logit_scale initial value and the return_dict option
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.return_dict = return_dict
self.initializer_factor = 1.0
@classmethod
# Class method: load the configuration from a pretrained model name or path and return a PretrainedConfig
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Set the token in the keyword arguments
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated keyword arguments for the pretrained model
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# 如果配置字典中包含 "model_type" 键且类有 "model_type" 属性,并且它们不一致,发出警告
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build and return the config object from the dict
return cls.from_dict(config_dict, **kwargs)
@classmethod
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
r"""
Instantiate an [`Owlv2Config`] (or a derived class) from an owlv2 text model configuration and an owlv2 vision model configuration.
Returns:
[`Owlv2Config`]: An instance of a configuration object
"""
# Build a config dict holding the text and vision sub-configs
config_dict = {}
config_dict["text_config"] = text_config
config_dict["vision_config"] = vision_config
# Build and return the config object from the dict
return cls.from_dict(config_dict, **kwargs)
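`from_text_vision_configs` simply nests the two dictionaries under `text_config` and `vision_config` and defers to `from_dict`. A minimal sketch of composing a full config from explicit sub-configs (parameter values here are illustrative, not mandated by the source):
```
# Minimal sketch: composing an Owlv2Config from explicit sub-configs.
from transformers import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig

text_config = Owlv2TextConfig(hidden_size=512, num_attention_heads=8)
vision_config = Owlv2VisionConfig(hidden_size=768, image_size=960, patch_size=16)

config = Owlv2Config.from_text_vision_configs(text_config.to_dict(), vision_config.to_dict())
print(config.text_config.hidden_size, config.vision_config.image_size)  # 512 960
```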
.\models\owlv2\convert_owlv2_to_hf.py
import argparse
import collections
import os
import jax
import jax.numpy as jnp
import numpy as np
import torch
from flax.training import checkpoints
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
CLIPTokenizer,
Owlv2Config,
Owlv2ForObjectDetection,
Owlv2ImageProcessor,
Owlv2Processor,
Owlv2TextConfig,
Owlv2VisionConfig,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_owlv2_config(model_name):
if "large" in model_name:
image_size = 1008
patch_size = 14
vision_hidden_size = 1024
vision_intermediate_size = 4096
vision_num_hidden_layers = 24
vision_num_attention_heads = 16
projection_dim = 768
text_hidden_size = 768
text_intermediate_size = 3072
text_num_attention_heads = 12
text_num_hidden_layers = 12
else:
image_size = 960
patch_size = 16
vision_hidden_size = 768
vision_intermediate_size = 3072
vision_num_hidden_layers = 12
vision_num_attention_heads = 12
projection_dim = 512
text_hidden_size = 512
text_intermediate_size = 2048
text_num_attention_heads = 8
text_num_hidden_layers = 12
vision_config = Owlv2VisionConfig(
patch_size=patch_size,
image_size=image_size,
hidden_size=vision_hidden_size,
num_hidden_layers=vision_num_hidden_layers,
intermediate_size=vision_intermediate_size,
num_attention_heads=vision_num_attention_heads,
)
text_config = Owlv2TextConfig(
hidden_size=text_hidden_size,
intermediate_size=text_intermediate_size,
num_attention_heads=text_num_attention_heads,
num_hidden_layers=text_num_hidden_layers,
)
config = Owlv2Config(
text_config=text_config.to_dict(),
vision_config=vision_config.to_dict(),
projection_dim=projection_dim,
)
return config
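The branching in `get_owlv2_config` only keys off whether "large" appears in the model name; every other hyper-parameter is fixed per branch. A quick check of both branches, relying only on the function defined above:
```
# Quick sanity check of the two branches above (relies only on get_owlv2_config).
large = get_owlv2_config("owlv2-large-patch14")
assert large.vision_config.hidden_size == 1024 and large.projection_dim == 768

base = get_owlv2_config("owlv2-base-patch16")
assert base.text_config.hidden_size == 512 and base.vision_config.patch_size == 16
```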
def flatten_nested_dict(params, parent_key="", sep="/"):
items = []
for k, v in params.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.abc.MutableMapping):  # collections.MutableMapping was removed in Python 3.10
items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
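`flatten_nested_dict` turns the nested Flax parameter tree into a flat dict keyed by "/"-joined paths, which is exactly what `create_rename_keys` below matches against. A small self-contained example:
```
# Small example of the flattening behaviour used on the Flax parameter tree.
params = {"backbone": {"clip": {"logit_scale": 4.6}}, "class_head": {"Dense_0": {"bias": [0.0]}}}
flat = flatten_nested_dict(params)
print(flat)  # {'backbone/clip/logit_scale': 4.6, 'class_head/Dense_0/bias': [0.0]}
```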
def create_rename_keys(config, model_name):
rename_keys = []
rename_keys.append(("backbone/clip/visual/class_embedding", "owlv2.vision_model.embeddings.class_embedding"))
rename_keys.append(("backbone/clip/visual/conv1/kernel", "owlv2.vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("backbone/clip/visual/positional_embedding", "owlv2.vision_model.embeddings.position_embedding.weight"))
rename_keys.append(("backbone/clip/visual/ln_pre/scale", "owlv2.vision_model.pre_layernorm.weight"))
rename_keys.append(("backbone/clip/visual/ln_pre/bias", "owlv2.vision_model.pre_layernorm.bias"))
rename_keys.append(("backbone/clip/visual/ln_post/scale", "owlv2.vision_model.post_layernorm.weight"))
rename_keys.append(("backbone/clip/visual/ln_post/bias", "owlv2.vision_model.post_layernorm.bias"))
rename_keys.append(("backbone/clip/text/token_embedding/embedding", "owlv2.text_model.embeddings.token_embedding.weight"))
rename_keys.append(("backbone/clip/text/positional_embedding", "owlv2.text_model.embeddings.position_embedding.weight"))
rename_keys.append(("backbone/clip/text/ln_final/scale", "owlv2.text_model.final_layer_norm.weight"))
rename_keys.append(("backbone/clip/text/ln_final/bias", "owlv2.text_model.final_layer_norm.bias"))
rename_keys.append(("backbone/clip/logit_scale", "owlv2.logit_scale"))
rename_keys.append(("backbone/clip/text/text_projection/kernel", "owlv2.text_projection.weight"))
rename_keys.append(("backbone/merged_class_token/scale", "layer_norm.weight"))
rename_keys.append(("backbone/merged_class_token/bias", "layer_norm.bias"))
rename_keys.append(("class_head/Dense_0/kernel", "class_head.dense0.weight"))
rename_keys.append(("class_head/Dense_0/bias", "class_head.dense0.bias"))
rename_keys.append(("class_head/logit_shift/kernel", "class_head.logit_shift.weight"))
rename_keys.append(("class_head/logit_scale/kernel", "class_head.logit_scale.weight"))
rename_keys.append(("class_head/logit_scale/bias", "class_head.logit_scale.bias"))
rename_keys.append(("class_head/logit_shift/bias", "class_head.logit_shift.bias"))
rename_keys.append(("obj_box_head/Dense_0/kernel", "box_head.dense0.weight"))
rename_keys.append(("obj_box_head/Dense_0/bias", "box_head.dense0.bias"))
rename_keys.append(("obj_box_head/Dense_1/kernel", "box_head.dense1.weight"))
rename_keys.append(("obj_box_head/Dense_1/bias", "box_head.dense1.bias"))
rename_keys.append(("obj_box_head/Dense_2/kernel", "box_head.dense2.weight"))
rename_keys.append(("obj_box_head/Dense_2/bias", "box_head.dense2.bias"))
if "v2" in model_name:
rename_keys.append(("objectness_head/Dense_0/kernel", "objectness_head.dense0.weight"))
rename_keys.append(("objectness_head/Dense_0/bias", "objectness_head.dense0.bias"))
rename_keys.append(("objectness_head/Dense_1/kernel", "objectness_head.dense1.weight"))
rename_keys.append(("objectness_head/Dense_1/bias", "objectness_head.dense1.bias"))
rename_keys.append(("objectness_head/Dense_2/kernel", "objectness_head.dense2.weight"))
rename_keys.append(("objectness_head/Dense_2/bias", "objectness_head.dense2.bias"))
return rename_keys
val = dct.pop(old)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if "patch_embedding" in new:
print("Reshaping patch embedding... for", new)
val = val.transpose(3, 2, 0, 1)
elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
val = val.T
if new.endswith("bias"):
val = val.reshape(-1)
dct[new] = torch.from_numpy(np.array(val))
inputs = processor(text=texts, images=image, return_tensors="pt")
if "large" not in model_name:
assert torch.allclose(inputs.pixel_values, original_pixel_values.float(), atol=1e-6)
assert torch.allclose(inputs.input_ids[:4, :], original_input_ids[:4, :], atol=1e-6)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
pred_boxes = outputs.pred_boxes
objectness_logits = outputs.objectness_logits
else:
print("Model converted without verifying logits")
if pytorch_dump_folder_path is not None:
print("Saving model and processor locally...")
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing {model_name} to the hub...")
model.push_to_hub(f"google/{model_name}")
processor.push_to_hub(f"google/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="owlv2-base-patch16",
choices=[
"owlv2-base-patch16",
"owlv2-base-patch16-finetuned",
"owlv2-base-patch16-ensemble",
"owlv2-large-patch14",
"owlv2-large-patch14-finetuned",
"owlv2-large-patch14-ensemble",
],
type=str,
help="Name of the Owlv2 model you'd like to convert from FLAX to PyTorch."
)
parser.add_argument(
"--checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the original Flax checkpoint."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=False,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
required=False,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Push model and image preprocessor to the hub"
)
args = parser.parse_args()
convert_owlv2_checkpoint(
args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits
)