Transformers 源码解析(七十六)
.\models\mobilebert\modeling_tf_mobilebert.py
""" TF 2.0 MobileBERT model."""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFNextSentencePredictorOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFNextSentencePredictionLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mobilebert import MobileBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/mobilebert-uncased"
_CONFIG_FOR_DOC = "MobileBertConfig"
_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "vumichien/mobilebert-finetuned-ner"
_TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']"
_TOKEN_CLASS_EXPECTED_LOSS = 0.03
_CHECKPOINT_FOR_QA = "vumichien/mobilebert-uncased-squad-v2"
_QA_EXPECTED_OUTPUT = "'a nice puppet'"
_QA_EXPECTED_LOSS = 3.98
_QA_TARGET_START_INDEX = 12
_QA_TARGET_END_INDEX = 13
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "vumichien/emo-mobilebert"
_SEQ_CLASS_EXPECTED_OUTPUT = "'others'"
_SEQ_CLASS_EXPECTED_LOSS = "4.72"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/mobilebert-uncased",
]
class TFMobileBertPreTrainingLoss:
    """
    Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
    NSP + MLM.

    .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
    """

    def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        """
        Compute the summed masked-LM and next-sentence losses.

        Args:
            labels: Indexed with the keys `"labels"` (masked-LM targets) and `"next_sentence_label"`.
            logits: Indexed positionally; `logits[0]` is scored against `labels["labels"]` and `logits[1]` against
                `labels["next_sentence_label"]`.

        Returns:
            A tensor reshaped to `(1,)` holding the sum of the two mean losses.
        """
        # Per-element losses are needed so that ignored positions can be masked out before averaging.
        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)

        # Clip negative labels to zero here to avoid NaNs and errors - those positions get masked below anyway.
        unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
        # Only positions whose label is not -100 contribute to the loss.
        lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
        masked_lm_losses = unmasked_lm_losses * lm_loss_mask
        # NOTE(review): if every label is -100 the denominator is zero - confirm callers never pass such a batch.
        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)

        # Same clip-then-mask treatment for the next-sentence labels.
        unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1])
        ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype)
        masked_ns_loss = unmasked_ns_loss * ns_loss_mask
        reduced_masked_ns_loss = tf.reduce_sum(masked_ns_loss) / tf.reduce_sum(ns_loss_mask)

        return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
class TFMobileBertIntermediate(keras.layers.Layer):
    """Dense projection to `config.intermediate_size` followed by the configured activation."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(config.intermediate_size, name="dense")

        # `hidden_act` given as a string is resolved through `get_tf_activation`; anything else is used as-is.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states):
        """Apply the dense layer, then the activation, to `hidden_states`."""
        return self.intermediate_act_fn(self.dense(hidden_states))

    def build(self, input_shape=None):
        """Build the dense sub-layer once, under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                # The two leading `None` entries leave the first two dimensions unspecified.
                dense.build([None, None, self.config.true_hidden_size])
class TFLayerNorm(keras.layers.LayerNormalization):
    """`LayerNormalization` that remembers its feature size so it can be built without an input shape."""

    def __init__(self, feat_size, *args, **kwargs):
        self.feat_size = feat_size
        super().__init__(*args, **kwargs)

    def build(self, input_shape=None):
        """Build the parent layer with a fixed `[None, None, feat_size]` shape; `input_shape` is ignored."""
        fixed_shape = [None, None, self.feat_size]
        super().build(fixed_shape)
class TFNoNorm(keras.layers.Layer):
    """
    Element-wise affine transform (`inputs * weight + bias`) registered as the `"no_norm"` option of `NORM2FN`.

    Args:
        feat_size: Length of the `weight` and `bias` vectors.
        epsilon: Accepted so the constructor can be called the same way as `TFLayerNorm`; it is not used.
    """

    def __init__(self, feat_size, epsilon=None, **kwargs):
        super().__init__(**kwargs)
        self.feat_size = feat_size

    def build(self, input_shape):
        """Create the `bias` (zeros) and `weight` (ones) vectors, then defer to the parent `build`."""
        # The weight name is passed by keyword: the position of `name` in `add_weight` differs between Keras
        # versions, so a positional string can be misread as the shape.
        self.bias = self.add_weight(name="bias", shape=[self.feat_size], initializer="zeros")
        self.weight = self.add_weight(name="weight", shape=[self.feat_size], initializer="ones")
        super().build(input_shape)

    def call(self, inputs: tf.Tensor):
        """Return `inputs * weight + bias`."""
        return inputs * self.weight + self.bias
# Maps the normalization type name (as used in `config.normalization_type`) to the layer class implementing it.
NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
class TFMobileBertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.trigram_input = config.trigram_input
        self.embedding_size = config.embedding_size
        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
        # Projects the (possibly trigram-concatenated) word embeddings to `hidden_size`.
        self.embedding_transformation = keras.layers.Dense(config.hidden_size, name="embedding_transformation")

        # The attribute is deliberately not snake-cased: it keeps the TensorFlow variable names unchanged so that
        # TensorFlow checkpoint files can be loaded.
        self.LayerNorm = NORM2FN[config.normalization_type](
            config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm"
        )
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # With trigram input three embeddings are concatenated per position, tripling the projection input width.
        self.embedded_input_size = self.embedding_size * (3 if self.trigram_input else 1)

    def build(self, input_shape=None):
        """Create the embedding tables and build the sub-layers; a no-op once the layer is built."""
        # Guard first so that a repeated `build` call cannot create the embedding weights a second time.
        if self.built:
            return
        self.built = True

        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        if getattr(self, "embedding_transformation", None) is not None:
            with tf.name_scope(self.embedding_transformation.name):
                self.embedding_transformation.build([None, None, self.embedded_input_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build(None)

    def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
        """
        Applies embedding based on inputs tensor.

        At least one of `input_ids` and `inputs_embeds` must be given; when `input_ids` is given it takes
        precedence and `inputs_embeds` is looked up from the word-embedding table.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Every dimension except the trailing embedding dimension.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if self.trigram_input:
            # For each position, concatenate along the feature axis: the next position's embedding (zero at the
            # last position), the position's own embedding, and the previous position's embedding (zero at the
            # first position).
            inputs_embeds = tf.concat(
                [
                    tf.pad(inputs_embeds[:, 1:], ((0, 0), (0, 1), (0, 0))),
                    inputs_embeds,
                    tf.pad(inputs_embeds[:, :-1], ((0, 0), (1, 0), (0, 0))),
                ],
                axis=2,
            )

        if self.trigram_input or self.embedding_size != self.hidden_size:
            inputs_embeds = self.embedding_transformation(inputs_embeds)

        if position_ids is None:
            # Default positions 0 .. input_shape[-1] - 1, with a leading axis of size 1 added for broadcasting.
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
class TFMobileBertSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention with separate query, key and value input tensors."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.output_attentions = config.output_attentions
        # The per-head width is derived from `true_hidden_size`, not from `hidden_size`.
        self.attention_head_size = int(config.true_hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )

        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        self.config = config

    def transpose_for_scores(self, x, batch_size):
        """Split the last axis of `x` into heads and move the head axis in front of the sequence axis."""
        # Reshape from [batch_size, seq_length, all_head_size]
        # to [batch_size, seq_length, num_attention_heads, attention_head_size]
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(
        self, query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions, training=False
    ):
        """
        Run self-attention.

        Args:
            query_tensor, key_tensor, value_tensor: Inputs to the query, key and value projections.
            attention_mask: Additive mask added to the raw attention scores, or `None`.
            head_mask: Multiplicative mask applied to the attention probabilities, or `None`.
            output_attentions: When truthy, the attention probabilities are returned as well.
            training: Forwarded to the attention dropout.

        Returns:
            `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is truthy.
        """
        # Read the batch size from the query rather than from the mask, so that the `attention_mask is None`
        # branch below is actually reachable.
        batch_size = shape_list(query_tensor)[0]
        mixed_query_layer = self.query(query_tensor)
        mixed_key_layer = self.key(key_tensor)
        mixed_value_layer = self.value(value_tensor)
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = tf.matmul(
            query_layer, key_layer, transpose_b=True
        )  # (batch size, num_heads, seq_len_q, seq_len_k)
        # Scale by the square root of the per-head width.
        dk = tf.cast(shape_list(key_layer)[-1], dtype=attention_scores.dtype)
        attention_scores = attention_scores / tf.math.sqrt(dk)

        if attention_mask is not None:
            # The mask is additive; cast it so it matches the dtype of the scores.
            attention_mask = tf.cast(attention_mask, dtype=attention_scores.dtype)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = stable_softmax(attention_scores, axis=-1)

        # Dropout is applied to the attention probabilities themselves.
        attention_probs = self.dropout(attention_probs, training=training)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.matmul(attention_probs, value_layer)

        # Undo the head split: move heads back behind the sequence axis and merge them.
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(
            context_layer, (batch_size, -1, self.all_head_size)
        )  # (batch_size, seq_len_q, all_head_size)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs

    def build(self, input_shape=None):
        """Build the three projection layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.true_hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.true_hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                # The value projection's input width depends on `use_bottleneck_attention`.
                self.value.build(
                    [
                        None,
                        None,
                        self.config.true_hidden_size
                        if self.config.use_bottleneck_attention
                        else self.config.hidden_size,
                    ]
                )
# Projection, optional dropout, residual add and normalization applied to the self-attention output.
class TFMobileBertSelfOutput(keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.use_bottleneck = config.use_bottleneck
        self.dense = keras.layers.Dense(
            config.true_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        norm_cls = NORM2FN[config.normalization_type]
        self.LayerNorm = norm_cls(config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm")
        # A dropout layer only exists when the bottleneck is disabled.
        if not self.use_bottleneck:
            self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, residual_tensor, training=False):
        """Project `hidden_states`, optionally apply dropout, add `residual_tensor` and normalize."""
        projected = self.dense(hidden_states)
        if not self.use_bottleneck:
            projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + residual_tensor)

    def build(self, input_shape=None):
        """Build the dense and normalization sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.true_hidden_size])
        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build(None)
# Self-attention followed by its output layer.
class TFMobileBertAttention(keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.self = TFMobileBertSelfAttention(config, name="self")
        self.mobilebert_output = TFMobileBertSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        query_tensor,
        key_tensor,
        value_tensor,
        layer_input,
        attention_mask,
        head_mask,
        output_attentions,
        training=False,
    ):
        """
        Run self-attention, then combine its first output with `layer_input` in the output layer.

        Returns a tuple whose first element is the attention output; any further elements returned by the
        self-attention layer are appended unchanged.
        """
        self_outputs = self.self(
            query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions, training=training
        )
        # `layer_input` serves as the residual tensor of the output layer.
        attention_output = self.mobilebert_output(self_outputs[0], layer_input, training=training)
        return (attention_output, *self_outputs[1:])

    def build(self, input_shape=None):
        """Build both sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        self_attention = getattr(self, "self", None)
        if self_attention is not None:
            with tf.name_scope(self_attention.name):
                self_attention.build(None)
        output_layer = getattr(self, "mobilebert_output", None)
        if output_layer is not None:
            with tf.name_scope(output_layer.name):
                output_layer.build(None)
# Projection to `config.hidden_size` with dropout, residual add and normalization.
class TFOutputBottleneck(keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(config.hidden_size, name="dense")
        norm_cls = NORM2FN[config.normalization_type]
        self.LayerNorm = norm_cls(config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, residual_tensor, training=False):
        """Project `hidden_states`, apply dropout, add `residual_tensor` and normalize."""
        projected = self.dropout(self.dense(hidden_states), training=training)
        return self.LayerNorm(projected + residual_tensor)

    def build(self, input_shape=None):
        """Build the dense and normalization sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                # The dense layer is built for an input width of `true_hidden_size`.
                dense.build([None, None, self.config.true_hidden_size])
        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build(None)
class TFMobileBertOutput(keras.layers.Layer):
    """
    Output stage of the feed-forward block.

    Projects to `config.true_hidden_size`, adds the first residual and normalizes. When `config.use_bottleneck`
    is set, the result is additionally passed through a `TFOutputBottleneck` together with the second residual;
    otherwise dropout is applied before the residual add.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.use_bottleneck = config.use_bottleneck
        self.dense = keras.layers.Dense(
            config.true_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.LayerNorm = NORM2FN[config.normalization_type](
            config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm"
        )
        # Exactly one of `dropout` / `bottleneck` exists, depending on `use_bottleneck`.
        if not self.use_bottleneck:
            self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        else:
            self.bottleneck = TFOutputBottleneck(config, name="bottleneck")
        self.config = config

    def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=False):
        """
        Args:
            hidden_states: Input to the dense projection.
            residual_tensor_1: Added to the projection before normalization.
            residual_tensor_2: Residual passed to the bottleneck; unused when `use_bottleneck` is false.
            training: Forwarded to the dropout layer, or to the bottleneck (which has its own dropout).
        """
        hidden_states = self.dense(hidden_states)
        if not self.use_bottleneck:
            hidden_states = self.dropout(hidden_states, training=training)
            hidden_states = self.LayerNorm(hidden_states + residual_tensor_1)
        else:
            hidden_states = self.LayerNorm(hidden_states + residual_tensor_1)
            # Forward `training` explicitly, as every other dropout-bearing call in this file does, instead of
            # relying on the callee's `training=False` default.
            hidden_states = self.bottleneck(hidden_states, residual_tensor_2, training=training)
        return hidden_states

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.intermediate_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build(None)
        if getattr(self, "bottleneck", None) is not None:
            with tf.name_scope(self.bottleneck.name):
                self.bottleneck.build(None)
class TFBottleneckLayer(keras.layers.Layer):
    """Dense projection to `config.intra_bottleneck_size` followed by the configured normalization."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(config.intra_bottleneck_size, name="dense")
        norm_cls = NORM2FN[config.normalization_type]
        self.LayerNorm = norm_cls(config.intra_bottleneck_size, epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, inputs):
        """Project `inputs` and normalize the result."""
        return self.LayerNorm(self.dense(inputs))

    def build(self, input_shape=None):
        """Build the dense and normalization sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])
        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build(None)
class TFBottleneck(keras.layers.Layer):
    """Produces the query, key, value and residual ("layer input") tensors consumed by the attention layer."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.key_query_shared_bottleneck = config.key_query_shared_bottleneck
        self.use_bottleneck_attention = config.use_bottleneck_attention
        self.bottleneck_input = TFBottleneckLayer(config, name="input")
        # A second bottleneck, shared by query and key, exists only when `key_query_shared_bottleneck` is set.
        if self.key_query_shared_bottleneck:
            self.attention = TFBottleneckLayer(config, name="attention")

    def call(self, hidden_states):
        """
        Return a 4-tuple `(query, key, value, layer_input)`.

        The last element is always `hidden_states` passed through the input bottleneck. The first three depend
        on the configuration:

        - `use_bottleneck_attention`: all four elements are the bottlenecked hidden states.
        - otherwise, with `key_query_shared_bottleneck`: query and key are `hidden_states` passed through the
          separate attention bottleneck, and value is the raw `hidden_states`.
        - otherwise: query, key and value are all the raw `hidden_states`.
        """
        bottlenecked = self.bottleneck_input(hidden_states)

        if self.use_bottleneck_attention:
            return (bottlenecked,) * 4

        if self.key_query_shared_bottleneck:
            shared_qk = self.attention(hidden_states)
            return (shared_qk, shared_qk, hidden_states, bottlenecked)

        return (hidden_states, hidden_states, hidden_states, bottlenecked)

    def build(self, input_shape=None):
        """Build the bottleneck sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        input_bottleneck = getattr(self, "bottleneck_input", None)
        if input_bottleneck is not None:
            with tf.name_scope(input_bottleneck.name):
                input_bottleneck.build(None)
        attention_bottleneck = getattr(self, "attention", None)
        if attention_bottleneck is not None:
            with tf.name_scope(attention_bottleneck.name):
                attention_bottleneck.build(None)
# One encoder layer: optional bottleneck, attention, optional extra feed-forward networks, intermediate, output.
class TFMobileBertLayer(keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.use_bottleneck = config.use_bottleneck
        self.num_feedforward_networks = config.num_feedforward_networks
        self.attention = TFMobileBertAttention(config, name="attention")
        self.intermediate = TFMobileBertIntermediate(config, name="intermediate")
        self.mobilebert_output = TFMobileBertOutput(config, name="output")

        if self.use_bottleneck:
            self.bottleneck = TFBottleneck(config, name="bottleneck")
        # One feed-forward network is already provided by `intermediate` + `mobilebert_output`, so only
        # `num_feedforward_networks - 1` extra FFN layers are created.
        if config.num_feedforward_networks > 1:
            self.ffn = [TFFFNLayer(config, name=f"ffn.{i}") for i in range(config.num_feedforward_networks - 1)]

    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
        """
        Run the layer.

        Returns:
            A tuple starting with the layer output, followed by any extra outputs of the attention layer, then
            `tf.constant(0)`, the query/key/value/layer-input tensors, the attention output after the extra
            FFN layers, the intermediate output, the raw attention output and each extra FFN layer's output.
        """
        if self.use_bottleneck:
            query_tensor, key_tensor, value_tensor, layer_input = self.bottleneck(hidden_states)
        else:
            # Without a bottleneck the hidden states serve as query, key, value and residual alike.
            query_tensor, key_tensor, value_tensor, layer_input = [hidden_states] * 4

        attention_outputs = self.attention(
            query_tensor,
            key_tensor,
            value_tensor,
            layer_input,
            attention_mask,
            head_mask,
            output_attentions,
            training=training,
        )

        attention_output = attention_outputs[0]
        # Collects the attention output before and after each extra FFN layer.
        s = (attention_output,)

        # Same condition as in `__init__`, so `self.ffn` is only read when it was created.
        if self.num_feedforward_networks > 1:
            for ffn_module in self.ffn:
                attention_output = ffn_module(attention_output)
                s += (attention_output,)

        intermediate_output = self.intermediate(attention_output)
        layer_output = self.mobilebert_output(intermediate_output, attention_output, hidden_states, training=training)

        outputs = (
            (layer_output,)
            + attention_outputs[1:]
            + (
                tf.constant(0),
                query_tensor,
                key_tensor,
                value_tensor,
                layer_input,
                attention_output,
                intermediate_output,
            )
            + s
        )  # attention probabilities, when requested, sit right after the layer output

        return outputs

    def build(self, input_shape=None):
        """Build every sub-layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)
        if getattr(self, "mobilebert_output", None) is not None:
            with tf.name_scope(self.mobilebert_output.name):
                self.mobilebert_output.build(None)
        if getattr(self, "bottleneck", None) is not None:
            with tf.name_scope(self.bottleneck.name):
                self.bottleneck.build(None)
        if getattr(self, "ffn", None) is not None:
            for layer in self.ffn:
                with tf.name_scope(layer.name):
                    layer.build(None)
class TFMobileBertEncoder(keras.layers.Layer):
    """Stack of `config.num_hidden_layers` `TFMobileBertLayer` instances applied in sequence."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = [TFMobileBertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        output_attentions,
        output_hidden_states,
        return_dict,
        training=False,
    ):
        """
        Run every layer in order.

        When `output_hidden_states` is truthy, the input of every layer plus the final output are collected;
        when `output_attentions` is truthy, element 1 of each layer's output is collected. Returns a
        `TFBaseModelOutput` if `return_dict` is truthy, otherwise a tuple of the non-`None` results.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for index, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # Each layer receives its own entry of `head_mask`.
            layer_outputs = layer_module(
                hidden_states, attention_mask, head_mask[index], output_attentions, training=training
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # The output of the last layer is recorded too.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
            )
        candidates = [hidden_states, all_hidden_states, all_attentions]
        return tuple(value for value in candidates if value is not None)

    def build(self, input_shape=None):
        """Build every layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        layers = getattr(self, "layer", None)
        if layers is not None:
            for sublayer in layers:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
class TFMobileBertPooler(keras.layers.Layer):
    """Pools a sequence by taking the hidden state at index 0, optionally passed through a tanh dense layer."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.do_activate = config.classifier_activation
        # The dense layer is only created when `classifier_activation` is enabled.
        if self.do_activate:
            self.dense = keras.layers.Dense(
                config.hidden_size,
                kernel_initializer=get_initializer(config.initializer_range),
                activation="tanh",
                name="dense",
            )
        self.config = config

    def call(self, hidden_states):
        """Return the first-token hidden state, run through the dense layer when `do_activate` is set."""
        first_token_tensor = hidden_states[:, 0]
        if self.do_activate:
            return self.dense(first_token_tensor)
        return first_token_tensor

    def build(self, input_shape=None):
        """Build the dense sub-layer, if it exists, under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])
class TFMobileBertPredictionHeadTransform(keras.layers.Layer):
    """Dense layer, activation and layer normalization applied ahead of the LM prediction head."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # `hidden_act` given as a string is resolved through `get_tf_activation`; anything else is used as-is.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        # Always the "layer_norm" variant, regardless of `config.normalization_type`.
        self.LayerNorm = NORM2FN["layer_norm"](config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states):
        """Apply dense, activation and normalization in that order."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)

    def build(self, input_shape=None):
        """Build the dense and normalization sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])
        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build(None)
class TFMobileBertLMPredictionHead(keras.layers.Layer):
    """Language-modeling head: transforms the hidden states and projects them to `config.vocab_size` scores."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.transform = TFMobileBertPredictionHeadTransform(config, name="transform")
        self.config = config

    def build(self, input_shape=None):
        """Create the bias, dense and decoder weights and build the transform; a no-op once built."""
        # Guard first so that a repeated `build` call cannot re-create the weights (which would also discard
        # anything assigned through `set_output_embeddings` / `set_bias`).
        if self.built:
            return
        self.built = True

        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
        # Covers the rows of the projection that the decoder's `embedding_size` rows do not.
        self.dense = self.add_weight(
            shape=(self.config.hidden_size - self.config.embedding_size, self.config.vocab_size),
            initializer="zeros",
            trainable=True,
            name="dense/weight",
        )
        self.decoder = self.add_weight(
            shape=(self.config.vocab_size, self.config.embedding_size),
            initializer="zeros",
            trainable=True,
            name="decoder/weight",
        )

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self):
        """Return this layer itself."""
        return self

    def set_output_embeddings(self, value):
        """Replace the decoder weight and update `config.vocab_size` from its first dimension."""
        self.decoder = value
        self.config.vocab_size = shape_list(value)[0]

    def get_bias(self):
        """Return the bias wrapped in a dict under the key `"bias"`."""
        return {"bias": self.bias}

    def set_bias(self, value):
        """Replace the bias from `value["bias"]` and update `config.vocab_size` from its length."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states):
        """Transform `hidden_states`, then project them to vocabulary scores and add the bias."""
        hidden_states = self.transform(hidden_states)
        # Stack the transposed decoder (embedding_size x vocab) on top of `dense`
        # ((hidden_size - embedding_size) x vocab) to form one hidden_size x vocab projection matrix.
        hidden_states = tf.matmul(hidden_states, tf.concat([tf.transpose(self.decoder), self.dense], axis=0))
        hidden_states = hidden_states + self.bias
        return hidden_states
class TFMobileBertMLMHead(keras.layers.Layer):
    """Thin wrapper that holds a `TFMobileBertLMPredictionHead` under the name `predictions`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")

    def call(self, sequence_output):
        """Return the prediction scores computed by `self.predictions` for `sequence_output`."""
        return self.predictions(sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped prediction head once; later calls return immediately."""
        if self.built:
            return
        self.built = True
        head = getattr(self, "predictions", None)
        if head is not None:
            # The head is built with no input shape, inside its own name scope.
            with tf.name_scope(head.name):
                head.build(None)
@keras_serializable
class TFMobileBertMainLayer(keras.layers.Layer):
    """Main MobileBERT layer: embeddings, encoder and an optional pooler."""

    config_class = MobileBertConfig

    def __init__(self, config, add_pooling_layer=True, **kwargs):
        super().__init__(**kwargs)

        # Keep the config and copy the output-related flags onto the layer.
        self.config = config
        self.num_hidden_layers = config.num_hidden_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        # Sub-layers; the pooler is only created when requested.
        self.embeddings = TFMobileBertEmbeddings(config, name="embeddings")
        self.encoder = TFMobileBertEncoder(config, name="encoder")
        if add_pooling_layer:
            self.pooler = TFMobileBertPooler(config, name="pooler")
        else:
            self.pooler = None

    def get_input_embeddings(self):
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Set the embeddings weight and update the stored vocabulary size from its first dimension."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel

        Not implemented for this layer: always raises `NotImplementedError`.
        """
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        # Forward pass of the main layer; the body is elided (`...`) in this listing.
        ...

    def build(self, input_shape=None):
        """Build every sub-layer that exists, each inside its own name scope; runs once."""
        if self.built:
            return
        self.built = True
        # Same order as the sub-layers are created in `__init__`; the pooler may be None.
        for attr_name in ("embeddings", "encoder", "pooler"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
class TFMobileBertPreTrainedModel(TFPreTrainedModel):
    """
    Abstract base class for the MobileBERT models: handles weights initialization and provides a simple interface
    for downloading and loading pretrained models.
    """

    # Configuration class used by the MobileBERT models.
    config_class = MobileBertConfig
    # Same name as the attribute (`self.mobilebert`) under which the model classes keep the main layer.
    base_model_prefix = "mobilebert"
@dataclass
class TFMobileBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`TFMobileBertForPreTraining`].

    Args:
        loss (`tf.Tensor`, *optional*):
            Loss value; `None` unless one is supplied.
        prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`tf.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False
            continuation before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: tf.Tensor | None = None
    prediction_logits: tf.Tensor = None
    seq_relationship_logits: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
"""
This model inherits from `TFPreTrainedModel`. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a `keras.Model` subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0
documentation for all matters related to general usage and behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config (`MobileBertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the `PreTrainedModel.from_pretrained` method to load the model weights.
"""
"""
The bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.
This model inherits the documentation from `MOBILEBERT_START_DOCSTRING`, which provides detailed information about
its usage with TensorFlow 2.0, input formats, and integration with Keras.
Parameters:
*inputs: Variable length input arguments to allow flexible input formats as described in the `MOBILEBERT_START_DOCSTRING`.
**kwargs: Additional keyword arguments passed to the superclass constructor.
"""
@add_start_docstrings(
    "The bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.",
    MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertModel(TFMobileBertPreTrainedModel):
    # Headless model: everything is delegated to the main layer stored as `self.mobilebert`.

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFBaseModelOutputWithPooling]:
        # Every argument is forwarded unchanged; the main layer's result is returned as is.
        return self.mobilebert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        # Build the main layer once, inside its own name scope.
        if self.built:
            return
        self.built = True
        main_layer = getattr(self, "mobilebert", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)
"""
MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`next sentence prediction (classification)` head.
"""
class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel, TFMobileBertPreTrainingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
self.seq_relationship = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
def get_lm_head(self):
return self.predictions.predictions
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.predictions.name + "/" + self.predictions.predictions.name
@unpack_inputs
@add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
next_sentence_label: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFMobileBertForPreTrainingOutput]:
r"""
返回类型注释,此函数返回一个元组或者 TFMobileBertForPreTrainingOutput 对象。
示例:
```
>>> import tensorflow as tf
>>> from transformers import AutoTokenizer, TFMobileBertForPreTraining
>>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = TFMobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
```
执行模型的前向传播,生成预测分数和序列关系分数。
Parameters:
- input_ids (tf.Tensor): 输入的 token IDs
- attention_mask (Optional[tf.Tensor]): 注意力掩码
- token_type_ids (Optional[tf.Tensor]): token 类型 IDs
- position_ids (Optional[tf.Tensor]): 位置 IDs
- head_mask (Optional[tf.Tensor]): 头部掩码
- inputs_embeds (Optional[tf.Tensor]): 输入嵌入
- output_attentions (Optional[bool]): 是否输出注意力
- output_hidden_states (Optional[bool]): 是否输出隐藏状态
- return_dict (Optional[bool]): 是否以字典形式返回结果
- training (Optional[bool]): 是否处于训练模式
Returns:
- 如果 return_dict=False,则返回一个元组 (total_loss, prediction_scores, seq_relationship_scores, hidden_states, attentions) 或者 (prediction_scores, seq_relationship_scores, hidden_states, attentions)。
- 如果 return_dict=True,则返回一个 TFMobileBertForPreTrainingOutput 对象,包含 loss, prediction_logits, seq_relationship_logits, hidden_states, attentions 字段。
Raises:
- 无异常抛出。
"""
outputs = self.mobilebert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output, pooled_output = outputs[:2]
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
total_loss = None
if labels is not None and next_sentence_label is not None:
d_labels = {"labels": labels}
d_labels["next_sentence_label"] = next_sentence_label
total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, seq_relationship_score))
if not return_dict:
output = (prediction_scores, seq_relationship_score) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return TFMobileBertForPreTrainingOutput(
loss=total_loss,
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mobilebert", None) is not None:
with tf.name_scope(self.mobilebert.name):
self.mobilebert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
if getattr(self, "seq_relationship", None) is not None:
with tf.name_scope(self.seq_relationship.name):
self.seq_relationship.build(None)
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "cls.predictions.decoder.weight":
return tf_weight, "mobilebert.embeddings.word_embeddings.weight"
else:
return (tf_weight,)
@add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING)
class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
    # Checkpoint keys that are expected to be unused by this head and are ignored when loading.
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"seq_relationship___cls",
        r"cls.seq_relationship",
    ]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # No pooler here: only the sequence output is used by `call`.
        self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
        self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")

    def get_lm_head(self):
        # The LM prediction head lives inside the MLM head wrapper.
        return self.predictions.predictions

    def get_prefix_bias_name(self):
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        # The MLM head is stored as `self.predictions`; this class has no `mlm` attribute, so reading
        # `self.mlm` raised AttributeError.
        return self.name + "/" + self.predictions.name + "/" + self.predictions.predictions.name

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'paris'",
        expected_loss=0.57,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFMaskedLMOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels
        """
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        prediction_scores = self.predictions(sequence_output, training=training)

        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)

        if not return_dict:
            # Tuple form: the loss (when present) goes first, then the scores, then the remaining outputs.
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build the main layer and the MLM head once, each inside its own name scope.
        if self.built:
            return
        self.built = True
        if getattr(self, "mobilebert", None) is not None:
            with tf.name_scope(self.mobilebert.name):
                self.mobilebert.build(None)
        if getattr(self, "predictions", None) is not None:
            with tf.name_scope(self.predictions.name):
                self.predictions.build(None)

    def tf_to_pt_weight_rename(self, tf_weight):
        # The decoder weight is returned together with the word-embeddings weight name; any other name is
        # returned alone in a 1-tuple.
        if tf_weight == "cls.predictions.decoder.weight":
            return tf_weight, "mobilebert.embeddings.word_embeddings.weight"
        else:
            return (tf_weight,)
class TFMobileBertOnlyNSPHead(keras.layers.Layer):
    """Next-sentence-prediction head: a single dense layer with two outputs."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.seq_relationship = keras.layers.Dense(2, name="seq_relationship")
        self.config = config

    def call(self, pooled_output):
        """Return the two-way scores produced by the dense layer for `pooled_output`."""
        return self.seq_relationship(pooled_output)

    def build(self, input_shape=None):
        """Build the dense layer once, with `config.hidden_size` as its last input dimension."""
        if self.built:
            return
        self.built = True
        dense_layer = getattr(self, "seq_relationship", None)
        if dense_layer is not None:
            with tf.name_scope(dense_layer.name):
                dense_layer.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """MobileBert 模型,顶部带有`下一句预测(分类)`头部。""",
    MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextSentencePredictionLoss):
    # MobileBert model with a next-sentence-prediction (classification) head on top.
    # Checkpoint keys that are expected to be unused by this head and are ignored when loading.
    _keys_to_ignore_on_load_unexpected = [r"predictions___cls", r"cls.predictions"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
        self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        next_sentence_label: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFNextSentencePredictorOutput]:
        r"""
        next_sentence_label (`np.ndarray` or `tf.Tensor`, *optional*):
            Labels for the next sentence prediction head. When given, a loss is computed and returned as well.

        Return:

        Examples:

        ```
        >>> import tensorflow as tf
        >>> from transformers import AutoTokenizer, TFMobileBertForNextSentencePrediction

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf")

        >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0]
        ```"""
        # NOTE: `replace_return_docstrings` needs the bare `Return:` placeholder line above; without it the
        # decorator raises ValueError when the class is defined.
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        pooled_output = outputs[1]
        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = (
            None
            if next_sentence_label is None
            else self.hf_compute_loss(labels=next_sentence_label, logits=seq_relationship_scores)
        )

        if not return_dict:
            # Tuple form: the loss (when present) goes first, then the scores, then the remaining outputs.
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return TFNextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build the main layer and the NSP head once, each inside its own name scope.
        if self.built:
            return
        self.built = True
        if getattr(self, "mobilebert", None) is not None:
            with tf.name_scope(self.mobilebert.name):
                self.mobilebert.build(None)
        if getattr(self, "cls", None) is not None:
            with tf.name_scope(self.cls.name):
                self.cls.build(None)
@add_start_docstrings(
    """
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSequenceClassificationLoss):
    # Checkpoint keys that are expected to be unused / absent for this head and are ignored when loading.
    _keys_to_ignore_on_load_unexpected = [
        r"predictions___cls",
        r"seq_relationship___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")

        # Use the classifier-specific dropout rate when configured, else the generic hidden dropout rate.
        if config.classifier_dropout is not None:
            dropout_rate = config.classifier_dropout
        else:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFSequenceClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        encoder_outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Classify from the pooled output (index 1), after dropout.
        pooled = self.dropout(encoder_outputs[1], training=training)
        logits = self.classifier(pooled)

        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: the loss (when present) goes first, then the logits, then the remaining outputs.
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def build(self, input_shape=None):
        # Build the main layer and the classifier once, each inside its own name scope.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "mobilebert", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)

        classifier = getattr(self, "classifier", None)
        if classifier is not None:
            with tf.name_scope(classifier.name):
                classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    MobileBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAnsweringLoss):
    # Checkpoint keys that are expected to be unused by this head and are ignored when loading.
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"predictions___cls",
        r"seq_relationship___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # No pooler here: only the sequence output is used by `call`.
        self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_QA,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=_QA_TARGET_START_INDEX,
        qa_target_end_index=_QA_TARGET_END_INDEX,
        expected_output=_QA_EXPECTED_OUTPUT,
        expected_loss=_QA_EXPECTED_LOSS,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFQuestionAnsweringModelOutput]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        encoder_outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Split the last axis of the head output into two halves and drop that axis from each.
        span_logits = self.qa_outputs(encoder_outputs[0])
        start_half, end_half = tf.split(span_logits, 2, axis=-1)
        start_logits = tf.squeeze(start_half, axis=-1)
        end_logits = tf.squeeze(end_half, axis=-1)

        # A loss is computed only when both span boundaries are provided.
        if start_positions is None or end_positions is None:
            loss = None
        else:
            span_labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(span_labels, (start_logits, end_logits))

        if return_dict:
            return TFQuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: the loss (when present) goes first, then both logits, then the remaining outputs.
        tuple_output = (start_logits, end_logits) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def build(self, input_shape=None):
        # Build the main layer and the QA head once, each inside its own name scope.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "mobilebert", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)

        qa_head = getattr(self, "qa_outputs", None)
        if qa_head is not None:
            with tf.name_scope(qa_head.name):
                qa_head.build([None, None, self.config.hidden_size])
"""
MobileBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
"""
class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoiceLoss):
_keys_to_ignore_on_load_unexpected = [
r"predictions___cls",
r"seq_relationship___cls",
r"cls.predictions",
r"cls.seq_relationship",
]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFMultipleChoiceModelOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.mobilebert(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mobilebert", None) is not None:
with tf.name_scope(self.mobilebert.name):
self.mobilebert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
"""
MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
    """
    MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenClassificationLoss):
    # The user-facing class documentation is attached by `add_start_docstrings` above.
    # Structure: MobileBERT main layer (built without pooling layer) -> dropout -> dense layer producing
    # `config.num_labels` logits for every token.

    # Name patterns of checkpoint weights that are ignored when they are unexpected / missing at load time.
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"predictions___cls",
        r"seq_relationship___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        """
        Create the MobileBERT main layer, the dropout layer and the token classifier.

        Args:
            config (MobileBertConfig): Model configuration; `num_labels`, `classifier_dropout`,
                `hidden_dropout_prob` and `initializer_range` are read here.
            *inputs: Positional arguments forwarded to the parent constructor.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
        # Fall back to the hidden dropout probability when no classifier-specific rate is configured.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(classifier_dropout)
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
        expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple, TFTokenClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # `**kwargs` is accepted to keep the signature stable; it is not forwarded to the encoder, and
        # `labels` is only used for the loss below (the main layer is never given labels).
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # First output is the per-token hidden state sequence.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)

        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            # Tuple output: (loss,) if any, then logits, then whatever follows index 1 of the encoder outputs.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `mobilebert` and `classifier` sub-layers under their own name scopes (at most once)."""
        if self.built:
            return
        self.built = True
        if getattr(self, "mobilebert", None) is not None:
            with tf.name_scope(self.mobilebert.name):
                self.mobilebert.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
.\models\mobilebert\tokenization_mobilebert.py
"""Tokenization classes for MobileBERT."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
# File name used for the vocabulary resource (also the default name written by `save_vocabulary`).
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Per resource key: checkpoint shortcut name -> URL of the hosted vocabulary file.
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"}
}
# Checkpoint shortcut name -> size value exposed as `max_model_input_sizes` on the tokenizer class.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
# No checkpoint-specific init overrides are declared.
PRETRAINED_INIT_CONFIGURATION = {}
def load_vocab(vocab_file):
    """
    Read a vocabulary file (one token per line, UTF-8) into an ordered mapping.

    Each token, with its trailing newline removed, is mapped to its zero-based line index.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))
def whitespace_tokenize(text):
    """Strip surrounding whitespace and split on whitespace runs; blank input yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class MobileBertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a MobileBERT tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file (one token per line). Must exist.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Forwarded to [`BasicTokenizer`]: whether to lowercase the input.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether to run [`BasicTokenizer`] before WordPiece.
        never_split (`Iterable`, *optional*):
            Tokens that the basic tokenizer never splits.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            Token emitted by WordPiece for words it cannot split, and used for unknown tokens/ids.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            Separator token appended after each sequence.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            Padding token.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            Token placed at the start of the built input.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Mask token.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Forwarded to [`BasicTokenizer`].
        strip_accents (`bool`, *optional*):
            Forwarded to [`BasicTokenizer`].
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = MobileBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup (id -> token), in vocabulary order.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # The vocabulary and sub-tokenizers are set up before the parent constructor runs.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Lowercasing flag of the basic tokenizer."""
        # NOTE(review): `basic_tokenizer` only exists when `do_basic_tokenize=True` - confirm callers never
        # read this property otherwise.
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary (added tokens excluded)."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the vocabulary merged with the added tokens as a plain dict."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, split_special_tokens=False):
        """Split `text` into WordPiece tokens, optionally running the basic tokenizer first."""
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # If the token is part of the never_split set, keep it whole
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Joining with spaces and dropping " ##" glues word pieces back onto the preceding piece.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A MobileBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A MobileBERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of token IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of token IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Alias kept so that callers using this name keep working. Despite the name, it returns token type IDs.
    create_mobilebert_attention_mask = create_token_type_ids_from_sequences

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary, one token per line ordered by index, and return the written path in a 1-tuple.

        When `save_directory` is an existing directory the file is created inside it under the default vocabulary
        file name; otherwise `save_directory` itself is used as the file path. `filename_prefix`, when given, is
        prepended followed by `-`. A warning is logged when the vocabulary indices are not consecutive.
        """
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    # Resynchronise with the actual index so the warning is emitted once per gap.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This may need to be deactivated for Japanese.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        self.never_split = set(never_split) if never_split is not None else set()
        self.do_lower_case = do_lower_case
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # Tokens given per call are merged with the ones given at construction time.
        protected = self.never_split.union(set(never_split)) if never_split else self.never_split

        cleaned = self._clean_text(text)
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)
        # Compose characters (NFC) before splitting on whitespace.
        normalized = unicodedata.normalize("NFC", cleaned)

        pieces = []
        for word in whitespace_tokenize(normalized):
            if word not in protected:
                if self.do_lower_case:
                    word = word.lower()
                    # When lowercasing, accents are stripped unless explicitly disabled.
                    should_strip = self.strip_accents is not False
                else:
                    should_strip = self.strip_accents
                if should_strip:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        decomposed = unicodedata.normalize("NFD", text)
        # After NFD decomposition, characters of category "Mn" are dropped.
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc:
            return [text]
        if never_split is not None and text in never_split:
            return [text]

        groups = []
        in_word = False
        for ch in text:
            if _is_punctuation(ch):
                # Every punctuation character becomes its own piece and closes the current word.
                groups.append([ch])
                in_word = False
            else:
                if not in_word:
                    groups.append([])
                    in_word = True
                groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Inclusive codepoint ranges treated as CJK by this tokenizer.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        kept = []
        for ch in text:
            # Drop NUL, the replacement character and control characters.
            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
                continue
            # Every whitespace character is normalised to a plain space.
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        result = []
        for word in whitespace_tokenize(text):
            letters = list(word)
            # Overly long words are mapped to the unknown token without attempting a split.
            if len(letters) > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue

            fully_matched = True
            cursor = 0
            word_pieces = []
            while cursor < len(letters):
                match = None
                # Try the longest remaining substring first, shrinking from the right.
                for stop in range(len(letters), cursor, -1):
                    candidate = "".join(letters[cursor:stop])
                    if cursor > 0:
                        # Pieces that do not start the word carry the continuation prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                if match is None:
                    fully_matched = False
                    break
                word_pieces.append(match)
                cursor = stop

            if fully_matched:
                result.extend(word_pieces)
            else:
                # A word with any unmatched remainder becomes a single unknown token.
                result.append(self.unk_token)
        return result
.\models\mobilebert\tokenization_mobilebert_fast.py
"""Tokenization classes for MobileBERT."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_mobilebert import MobileBertTokenizer
logger = logging.get_logger(__name__)
# File names of the two resources declared for the fast tokenizer: the plain vocabulary and the tokenizer JSON file.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
# Per resource key: checkpoint shortcut name -> URL of the hosted file.
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"},
"tokenizer_file": {
"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/tokenizer.json"
},
}
# Checkpoint shortcut name -> size value exposed as `max_model_input_sizes` on the tokenizer class.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
# No checkpoint-specific init overrides are declared.
PRETRAINED_INIT_CONFIGURATION = {}
class MobileBertTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        clean_text (`bool`, *optional*, defaults to `True`):
            Whether or not to clean the text before tokenization by removing any control characters and replacing all
            whitespaces by the classic one.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
            issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original MobileBERT).
        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
            The prefix for subwords.
    """

    # File names expected for the vocabulary resources.
    vocab_files_names = VOCAB_FILES_NAMES
    # Hosted vocabulary files for the known pretrained checkpoints.
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Initial configuration overrides for pretrained checkpoints.
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Maximum input sizes for pretrained checkpoints that use positional embeddings.
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Slow (pure Python) tokenizer class associated with this fast tokenizer.
    slow_tokenizer_class = MobileBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Let the parent class load the vocabulary / tokenizer file and register the special tokens.
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # Read the current state of the backend normalizer.
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # Rebuild the normalizer only when its stored options disagree with the arguments given here.
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            # The "type" entry names the normalizer class; the remaining entries are its constructor arguments.
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A MobileBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # A second sequence, when given, is appended followed by its own separator.
        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A MobileBERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        # Single sequence: zeros covering [CLS] + tokens + [SEP].
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        # Pair: zeros for the first segment, ones for the second segment and its [SEP].
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Alias kept so that callers using this name keep working; it returns token type IDs.
    create_mobilebert_sequence_classification_mask = create_token_type_ids_from_sequences

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the tokenizer model's vocabulary to a specified directory.

        Args:
            save_directory (str):
                Directory path where the vocabulary files will be saved.
            filename_prefix (Optional[str]):
                Optional prefix for the saved files.

        Returns:
            Tuple[str]: Tuple containing the filenames where the vocabulary is saved.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
.\models\mobilebert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Maps each submodule to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_mobilebert": [
        "MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "MobileBertConfig",
        "MobileBertOnnxConfig",
    ],
    "tokenization_mobilebert": ["MobileBertTokenizer"],
}

# The fast tokenizer is only registered when the `tokenizers` library is available.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_mobilebert_fast"] = ["MobileBertTokenizerFast"]

# PyTorch models are only registered when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_mobilebert"] = [
        "MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MobileBertForMaskedLM",
        "MobileBertForMultipleChoice",
        "MobileBertForNextSentencePrediction",
        "MobileBertForPreTraining",
        "MobileBertForQuestionAnswering",
        "MobileBertForSequenceClassification",
        "MobileBertForTokenClassification",
        "MobileBertLayer",
        "MobileBertModel",
        "MobileBertPreTrainedModel",
        "load_tf_weights_in_mobilebert",
    ]

# TensorFlow models are only registered when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_mobilebert"] = [
        "TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFMobileBertForMaskedLM",
        "TFMobileBertForMultipleChoice",
        "TFMobileBertForNextSentencePrediction",
        "TFMobileBertForPreTraining",
        "TFMobileBertForQuestionAnswering",
        "TFMobileBertForSequenceClassification",
        "TFMobileBertForTokenClassification",
        "TFMobileBertMainLayer",
        "TFMobileBertModel",
        "TFMobileBertPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Under static type checking the names are imported directly, mirroring `_import_structure`.
    from .configuration_mobilebert import (
        MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        MobileBertConfig,
        MobileBertOnnxConfig,
    )
    from .tokenization_mobilebert import MobileBertTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_mobilebert_fast import MobileBertTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_mobilebert import (
            MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            MobileBertForMaskedLM,
            MobileBertForMultipleChoice,
            MobileBertForNextSentencePrediction,
            MobileBertForPreTraining,
            MobileBertForQuestionAnswering,
            MobileBertForSequenceClassification,
            MobileBertForTokenClassification,
            MobileBertLayer,
            MobileBertModel,
            MobileBertPreTrainedModel,
            load_tf_weights_in_mobilebert,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_mobilebert import (
            TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFMobileBertForMaskedLM,
            TFMobileBertForMultipleChoice,
            TFMobileBertForNextSentencePrediction,
            TFMobileBertForPreTraining,
            TFMobileBertForQuestionAnswering,
            TFMobileBertForSequenceClassification,
            TFMobileBertForTokenClassification,
            TFMobileBertMainLayer,
            TFMobileBertModel,
            TFMobileBertPreTrainedModel,
        )

else:
    import sys

    # At runtime the module object is replaced by a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mobilenet_v1\configuration_mobilenet_v1.py
""" MobileNetV1 model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
# Checkpoint identifier -> URL of its hosted `config.json`.
MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/mobilenet_v1_1.0_224": "https://huggingface.co/google/mobilenet_v1_1.0_224/resolve/main/config.json",
"google/mobilenet_v1_0.75_192": "https://huggingface.co/google/mobilenet_v1_0.75_192/resolve/main/config.json",
}
class MobileNetV1Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MobileNetV1Model`]. It is used to instantiate a
    MobileNetV1 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MobileNetV1
    [google/mobilenet_v1_1.0_224](https://huggingface.co/google/mobilenet_v1_1.0_224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        depth_multiplier (`float`, *optional*, defaults to 1.0):
            Shrinks or expands the number of channels in each layer. Must be greater than zero.
        min_depth (`int`, *optional*, defaults to 8):
            All layers will have at least this many channels.
        hidden_act (`str`, *optional*, defaults to `"relu6"`):
            Name of the activation function used by the model.
        tf_padding (`bool`, *optional*, defaults to `True`):
            Whether to use TensorFlow padding rules on the convolution layers.
        classifier_dropout_prob (`float`, *optional*, defaults to 0.999):
            The dropout ratio for attached classifiers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Value used when initializing the weights.
        layer_norm_eps (`float`, *optional*, defaults to 0.001):
            The epsilon used by the layer normalization layers.
        **kwargs:
            Forwarded unchanged to [`PretrainedConfig`].

    Raises:
        ValueError: If `depth_multiplier` is not greater than zero.
    """

    model_type = "mobilenet_v1"

    def __init__(
        self,
        num_channels=3,
        image_size=224,
        depth_multiplier=1.0,
        min_depth=8,
        hidden_act="relu6",
        tf_padding=True,
        classifier_dropout_prob=0.999,
        initializer_range=0.02,
        layer_norm_eps=0.001,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Reject non-positive multipliers up front instead of failing later on.
        if depth_multiplier <= 0:
            raise ValueError("depth_multiplier must be greater than zero.")

        self.num_channels = num_channels
        self.image_size = image_size
        self.depth_multiplier = depth_multiplier
        self.min_depth = min_depth
        self.hidden_act = hidden_act
        self.tf_padding = tf_padding
        self.classifier_dropout_prob = classifier_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
class MobileNetV1OnnxConfig(OnnxConfig):
    """ONNX export settings for MobileNetV1: named inputs/outputs with their dynamic axes and the tolerance."""

    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Single `pixel_values` input whose axis 0 is labelled `batch`."""
        return OrderedDict((name, {0: "batch"}) for name in ("pixel_values",))

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Output names for the current task, each with axis 0 labelled `batch`."""
        if self.task == "image-classification":
            names = ("logits",)
        else:
            names = ("last_hidden_state", "pooler_output")
        return OrderedDict((name, {0: "batch"}) for name in names)

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance used when validating the exported model."""
        return 1e-4
.\models\mobilenet_v1\convert_original_tf_checkpoint_to_pytorch.py
"""从 tensorflow/models 库中转换 MobileNetV1 检查点。"""
import argparse
import json
import re
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
MobileNetV1Config,
MobileNetV1ForImageClassification,
MobileNetV1ImageProcessor,
load_tf_weights_in_mobilenet_v1,
)
from transformers.utils import logging
# Switch the transformers logging verbosity to the info level for this script.
logging.set_verbosity_info()
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
def get_mobilenet_v1_config(model_name):
    """Build the `MobileNetV1Config` that corresponds to a checkpoint name.

    Args:
        model_name: Checkpoint name. When it has the form
            `mobilenet_v1_<depth>_<size>`, the depth multiplier and image size
            are parsed from it; otherwise the configuration defaults are kept.

    Returns:
        A `MobileNetV1Config` with 1001 labels, where label 0 is "background"
        and the remaining labels are the downloaded label file shifted by one.

    Raises:
        ValueError: If `model_name` contains "_quant".
    """
    config = MobileNetV1Config(layer_norm_eps=0.001)

    if "_quant" in model_name:
        raise ValueError("Quantized models are not supported.")

    matches = re.match(r"^mobilenet_v1_([^_]*)_([^_]*)$", model_name)
    if matches:
        config.depth_multiplier = float(matches[1])
        config.image_size = int(matches[2])

    config.num_labels = 1001
    filename = "imagenet-1k-id2label.json"
    repo_id = "huggingface/label-files"
    label_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 instead of the platform default encoding.
    with open(label_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)

    # Shift every label by one to make room for the extra "background" class at index 0.
    id2label = {int(k) + 1: v for k, v in id2label.items()}
    id2label[0] = "background"
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
def prepare_img():
    """Download the fixed test image from its hard-coded URL and return it opened with PIL."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    response = requests.get(url, stream=True)
    return Image.open(response.raw)
@torch.no_grad()
def convert_movilevit_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub=False):
    """
    Copy/paste/tweak model's weights to our MobileNetV1 structure.

    The converted model is run once on the test image. The logits shape, and for
    the known checkpoints the first three logits, are asserted before the model
    and image processor are saved (and optionally pushed to the hub).
    """
    config = get_mobilenet_v1_config(model_name)

    # Build the PyTorch model in eval mode and fill it from the TF checkpoint.
    model = MobileNetV1ForImageClassification(config).eval()
    load_tf_weights_in_mobilenet_v1(model, config, checkpoint_path)

    # Resize the shortest edge to image_size + 32, then center-crop to image_size.
    image_processor = MobileNetV1ImageProcessor(
        crop_size={"width": config.image_size, "height": config.image_size},
        size={"shortest_edge": config.image_size + 32},
    )
    encoding = image_processor(images=prepare_img(), return_tensors="pt")
    logits = model(**encoding).logits

    assert logits.shape == (1, 1001)

    # Reference values for the first three logits of the known checkpoints.
    reference_logits = {
        "mobilenet_v1_1.0_224": [-4.1739, -1.1233, 3.1205],
        "mobilenet_v1_0.75_192": [-3.9440, -2.3141, -0.3333],
    }
    if model_name in reference_logits:
        expected_logits = torch.tensor(reference_logits[model_name])
        assert torch.allclose(logits[0, :3], expected_logits, atol=1e-4)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    image_processor.save_pretrained(pytorch_dump_folder_path)

    if not push_to_hub:
        return

    print("Pushing to the hub...")
    repo_id = "google/" + model_name
    image_processor.push_to_hub(repo_id)
    model.push_to_hub(repo_id)
if __name__ == "__main__":
    # Command-line entry point: describe every option in one table, then register them in order.
    cli_options = (
        (
            "--model_name",
            {
                "default": "mobilenet_v1_1.0_224",
                "type": str,
                "help": "Name of the MobileNetV1 model you'd like to convert. Should in the form 'mobilenet_v1_<depth>_<size>'.",
            },
        ),
        (
            "--checkpoint_path",
            {
                "required": True,
                "type": str,
                "help": "Path to the original TensorFlow checkpoint (.ckpt file).",
            },
        ),
        (
            "--pytorch_dump_folder_path",
            {
                "required": True,
                "type": str,
                "help": "Path to the output PyTorch model directory.",
            },
        ),
        (
            "--push_to_hub",
            {
                "action": "store_true",
                "help": "Whether or not to push the converted model to the 🤗 hub.",
            },
        ),
    )

    arg_parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        arg_parser.add_argument(flag, **options)

    cli_args = arg_parser.parse_args()
    convert_movilevit_checkpoint(
        cli_args.model_name,
        cli_args.checkpoint_path,
        cli_args.pytorch_dump_folder_path,
        cli_args.push_to_hub,
    )
.\models\mobilenet_v1\feature_extraction_mobilenet_v1.py
"""Feature extractor class for MobileNetV1."""
import warnings
from ...utils import logging
from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
class MobileNetV1FeatureExtractor(MobileNetV1ImageProcessor):
    """Deprecated subclass of `MobileNetV1ImageProcessor` that emits a `FutureWarning` when constructed."""

    def __init__(self, *args, **kwargs) -> None:
        """Warn about the deprecation, then forward all arguments to `MobileNetV1ImageProcessor`."""
        deprecation_notice = (
            "The class MobileNetV1FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use MobileNetV1ImageProcessor instead."
        )
        warnings.warn(deprecation_notice, FutureWarning)
        super().__init__(*args, **kwargs)
.\models\mobilenet_v1\image_processing_mobilenet_v1.py
"""Image processor class for MobileNetV1."""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
class MobileNetV1ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MobileNetV1 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 256}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
            is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the
            `preprocess` method.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
            Can be overridden by the `crop_size` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_center_crop: bool = True,
        crop_size: Optional[Dict[str, int]] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        # Fill in the documented defaults and pass both size specs through `get_size_dict`.
        size = size if size is not None else {"shortest_edge": 256}
        size = get_size_dict(size, default_to_square=False)
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size)

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # Argument names recorded as valid for this processor.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
        resized to keep the input aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image. Should contain either "shortest_edge" or "height" and "width".
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            The result of the module-level `resize` transform applied to `image`.

        Raises:
            ValueError: If `size` contains neither "shortest_edge" nor both "height" and "width".
        """
        default_to_square = True
        if "shortest_edge" in size:
            # A single target edge: keep the aspect ratio rather than forcing a square.
            size = size["shortest_edge"]
            default_to_square = False
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )
        # Inside this method body the bare name `resize` resolves to the imported transform, not to this method.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
.\models\mobilenet_v1\modeling_mobilenet_v1.py
""" PyTorch MobileNetV1 model."""
from typing import Optional, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_mobilenet_v1 import MobileNetV1Config
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
# Values passed to the docstring decorators of the model classes below.
_CONFIG_FOR_DOC = "MobileNetV1Config"
_CHECKPOINT_FOR_DOC = "google/mobilenet_v1_1.0_224"
_EXPECTED_OUTPUT_SHAPE = [1, 1024, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "google/mobilenet_v1_1.0_224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
# Hub identifiers of the MobileNetV1 checkpoints listed for this model.
MOBILENET_V1_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/mobilenet_v1_1.0_224",
"google/mobilenet_v1_0.75_192",
]
def _build_tf_to_pytorch_map(model, config, tf_weights=None):
    """
    A map of modules from TF to PyTorch.

    Returns a dict whose keys are TensorFlow variable names and whose values are the
    matching PyTorch parameters/buffers of `model`. `config` and `tf_weights` are
    accepted but not read here.
    """
    tf_to_pt_map = {}

    def register_conv(scope, weight_key, conv_layer):
        # One conv layer contributes its kernel plus the four batch-norm tensors.
        tf_to_pt_map[scope + weight_key] = conv_layer.convolution.weight
        tf_to_pt_map[scope + "BatchNorm/beta"] = conv_layer.normalization.bias
        tf_to_pt_map[scope + "BatchNorm/gamma"] = conv_layer.normalization.weight
        tf_to_pt_map[scope + "BatchNorm/moving_mean"] = conv_layer.normalization.running_mean
        tf_to_pt_map[scope + "BatchNorm/moving_variance"] = conv_layer.normalization.running_var

    has_classifier = isinstance(model, MobileNetV1ForImageClassification)
    backbone = model.mobilenet_v1 if has_classifier else model

    register_conv("MobilenetV1/Conv2d_0/", "weights", backbone.conv_stem)

    # TF numbers the 13 separable blocks from 1; PyTorch stores each block as two
    # consecutive entries (depthwise, then pointwise) in `backbone.layer`.
    for block in range(13):
        tf_index = block + 1
        depthwise_position = block * 2
        register_conv(
            f"MobilenetV1/Conv2d_{tf_index}_depthwise/",
            "depthwise_weights",
            backbone.layer[depthwise_position],
        )
        register_conv(
            f"MobilenetV1/Conv2d_{tf_index}_pointwise/",
            "weights",
            backbone.layer[depthwise_position + 1],
        )

    if has_classifier:
        logits_scope = "MobilenetV1/Logits/Conv2d_1c_1x1/"
        tf_to_pt_map[logits_scope + "weights"] = model.classifier.weight
        tf_to_pt_map[logits_scope + "biases"] = model.classifier.bias

    return tf_to_pt_map
def load_tf_weights_in_mobilenet_v1(model, config, tf_checkpoint_path):
    """Load the variables of a TensorFlow checkpoint into a PyTorch MobileNetV1 model and return the model.

    Raises:
        ImportError: Re-raised (after logging) when NumPy or TensorFlow cannot be imported.
        ValueError: If a (transposed) TF array does not have the shape of its PyTorch target.
    """
    try:
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    # Read every checkpoint variable into memory, keyed by its TF name.
    tf_weights = {}
    for name, shape in tf.train.list_variables(tf_checkpoint_path):
        logger.info(f"Loading TF weight {name} with shape {shape}")
        tf_weights[name] = tf.train.load_variable(tf_checkpoint_path, name)

    tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights)

    # The variable itself plus its optimizer/EMA companions are dropped once copied.
    consumed_suffixes = ("", "/RMSProp", "/RMSProp_1", "/ExponentialMovingAverage")

    for name, pointer in tf_to_pt_map.items():
        logger.info(f"Importing {name}")
        if name not in tf_weights:
            logger.info(f"{name} not in tf pre-trained weights, skipping")
            continue

        array = tf_weights[name]

        # Reorder kernel axes to match the target — presumably TF and PyTorch use
        # different kernel layouts; the shape check below guards the result.
        if "depthwise_weights" in name:
            logger.info("Transposing depthwise")
            array = np.transpose(array, (2, 3, 0, 1))
        elif "weights" in name:
            logger.info("Transposing")
            if len(pointer.shape) == 2:
                # 2-D target (the classifier weight): drop the singleton axes first.
                array = array.squeeze().transpose()
            else:
                array = np.transpose(array, (3, 2, 0, 1))

        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")

        logger.info(f"Initialize PyTorch weight {name} {array.shape}")
        pointer.data = torch.from_numpy(array)

        for suffix in consumed_suffixes:
            tf_weights.pop(name + suffix, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
    return model
def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor:
    """
    Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at:
    https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2

    The padding is computed from the last two dimensions of `features` and from the
    stride and kernel size of `conv_layer`. When the total padding along an axis is
    odd, the extra element goes to the right/bottom side. Padded values are zeros.
    """

    def total_padding(extent: int, stride: int, kernel: int) -> int:
        # Amount of padding needed along one axis; never negative.
        remainder = extent % stride
        covered = stride if remainder == 0 else remainder
        return max(kernel - covered, 0)

    height, width = features.shape[-2:]
    stride_h, stride_w = conv_layer.stride
    kernel_h, kernel_w = conv_layer.kernel_size

    vertical = total_padding(height, stride_h, kernel_h)
    horizontal = total_padding(width, stride_w, kernel_w)

    top = vertical // 2
    left = horizontal // 2

    # `pad` expects (left, right, top, bottom) for the last two dimensions.
    return nn.functional.pad(features, (left, horizontal - left, top, vertical - top), "constant", 0.0)
class MobileNetV1ConvLayer(nn.Module):
    """Convolution followed by optional batch normalization and an optional activation.

    When `config.tf_padding` is true the convolution itself uses no padding and the
    input is padded with `apply_tf_padding` in `forward`; otherwise a symmetric
    padding of `(kernel_size - 1) / 2` (truncated to int) is applied by the convolution.
    """

    def __init__(
        self,
        config: MobileNetV1Config,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: Optional[int] = 1,
        groups: Optional[int] = 1,
        bias: bool = False,
        use_normalization: Optional[bool] = True,
        use_activation: Optional[Union[bool, str]] = True,
    ) -> None:
        """
        Args:
            config: Configuration; `tf_padding`, `layer_norm_eps` and `hidden_act` are read from it.
            in_channels: Number of input channels; must be divisible by `groups`.
            out_channels: Number of output channels; must be divisible by `groups`.
            kernel_size: Kernel size of the convolution.
            stride: Stride of the convolution.
            groups: Number of groups of the convolution.
            bias: Whether the convolution has a bias term.
            use_normalization: Whether to add a `BatchNorm2d` after the convolution.
            use_activation: Falsy for no activation, a string to look that name up in `ACT2FN`,
                or any other truthy value to use `config.hidden_act`.

        Raises:
            ValueError: If `in_channels` or `out_channels` is not divisible by `groups`.
        """
        super().__init__()
        self.config = config

        if in_channels % groups != 0:
            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        # With TF-style padding the padding is applied manually in `forward`.
        padding = 0 if config.tf_padding else int((kernel_size - 1) / 2)

        self.convolution = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=bias,
            padding_mode="zeros",
        )

        if use_normalization:
            self.normalization = nn.BatchNorm2d(
                num_features=out_channels,
                eps=config.layer_norm_eps,
                momentum=0.9997,
                affine=True,
                track_running_stats=True,
            )
        else:
            self.normalization = None

        if use_activation:
            if isinstance(use_activation, str):
                # An explicit activation name overrides the configuration.
                self.activation = ACT2FN[use_activation]
            elif isinstance(config.hidden_act, str):
                self.activation = ACT2FN[config.hidden_act]
            else:
                # `hidden_act` is used directly when it is not a string.
                self.activation = config.hidden_act
        else:
            self.activation = None

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply (optional TF padding), convolution, then the optional normalization and activation."""
        if self.config.tf_padding:
            features = apply_tf_padding(features, self.convolution)
        features = self.convolution(features)
        if self.normalization is not None:
            features = self.normalization(features)
        if self.activation is not None:
            features = self.activation(features)
        return features
class MobileNetV1PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = MobileNetV1Config
    load_tf_weights = load_tf_weights_in_mobilenet_v1
    base_model_prefix = "mobilenet_v1"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Normal init scaled by the configured range; biases start at zero.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.BatchNorm2d):
            # Zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
# Shared introduction text passed to `add_start_docstrings` on the model classes below.
MOBILENET_V1_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MobileNetV1Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Shared description of the `forward` inputs, passed to `add_start_docstrings_to_model_forward` below.
MOBILENET_V1_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`MobileNetV1ImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare MobileNetV1 model outputting raw hidden-states without any specific head on top.",
    MOBILENET_V1_START_DOCSTRING,
)
class MobileNetV1Model(MobileNetV1PreTrainedModel):
    # NOTE: documentation for this class and for `forward` is attached by the decorators,
    # so the explanatory notes here are plain comments rather than docstrings.

    def __init__(self, config: MobileNetV1Config, add_pooling_layer: bool = True):
        """Build the stem convolution, the 13 depthwise/pointwise layer pairs and the optional pooler."""
        super().__init__(config)
        self.config = config

        # Base channel count, scaled by the depth multiplier but never below `min_depth`.
        depth = 32
        out_channels = max(int(depth * config.depth_multiplier), config.min_depth)

        self.conv_stem = MobileNetV1ConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=2,
        )

        strides = [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1]

        self.layer = nn.ModuleList()
        for block_index, stride in enumerate(strides):
            in_channels = out_channels

            # The channel count doubles on the first block and on every strided block.
            if block_index == 0 or stride == 2:
                depth *= 2
                out_channels = max(int(depth * config.depth_multiplier), config.min_depth)

            depthwise = MobileNetV1ConvLayer(
                config,
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=stride,
                groups=in_channels,
            )
            self.layer.append(depthwise)

            pointwise = MobileNetV1ConvLayer(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )
            self.layer.append(pointwise)

        self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None

        self.post_init()

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

    @add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
        # Unset flags fall back to the configuration.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.conv_stem(pixel_values)

        # Collect the output of every layer only when requested.
        collected_states = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states)
            if output_hidden_states:
                collected_states.append(hidden_states)
        all_hidden_states = tuple(collected_states) if output_hidden_states else None

        last_hidden_state = hidden_states

        if self.pooler is None:
            pooled_output = None
        else:
            pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1)

        if return_dict:
            return BaseModelOutputWithPoolingAndNoAttention(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=all_hidden_states,
            )

        # Tuple output: entries that are None are dropped.
        candidates = (last_hidden_state, pooled_output, all_hidden_states)
        return tuple(item for item in candidates if item is not None)
@add_start_docstrings(
    """
MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
    MOBILENET_V1_START_DOCSTRING,
)
class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
    def __init__(self, config: MobileNetV1Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilenet_v1 = MobileNetV1Model(config)

        # The classifier consumes the channels produced by the last backbone layer.
        last_hidden_size = self.mobilenet_v1.layer[-1].convolution.out_channels

        # Classifier head
        self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True)
        if config.num_labels > 0:
            self.classifier = nn.Linear(last_hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILENET_V1_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled features are element 1 of the tuple output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            # Infer the problem type once and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return ImageClassifierOutputWithNoAttention(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
            )

        # Tuple output: logits followed by everything after the pooled output.
        output = (logits,) + outputs[2:]
        if loss is None:
            return output
        return (loss,) + output
.\models\mobilenet_v1\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the end.
# The configuration entries are registered unconditionally.
_import_structure = {
"configuration_mobilenet_v1": [
"MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MobileNetV1Config",
"MobileNetV1OnnxConfig",
],
}
# Image-processing entries are registered only when `is_vision_available()` is true.
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_mobilenet_v1"] = ["MobileNetV1FeatureExtractor"]
_import_structure["image_processing_mobilenet_v1"] = ["MobileNetV1ImageProcessor"]
# Modeling entries are registered only when `is_torch_available()` is true.
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mobilenet_v1"] = [
"MOBILENET_V1_PRETRAINED_MODEL_ARCHIVE_LIST",
"MobileNetV1ForImageClassification",
"MobileNetV1Model",
"MobileNetV1PreTrainedModel",
"load_tf_weights_in_mobilenet_v1",
]
# Under TYPE_CHECKING the same names are imported eagerly, behind the same availability checks.
if TYPE_CHECKING:
from .configuration_mobilenet_v1 import (
MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP,
MobileNetV1Config,
MobileNetV1OnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_mobilenet_v1 import MobileNetV1FeatureExtractor
from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mobilenet_v1 import (
MOBILENET_V1_PRETRAINED_MODEL_ARCHIVE_LIST,
MobileNetV1ForImageClassification,
MobileNetV1Model,
MobileNetV1PreTrainedModel,
load_tf_weights_in_mobilenet_v1,
)
# Otherwise this module's entry in `sys.modules` is replaced by a `_LazyModule` built from `_import_structure`.
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mobilenet_v2\configuration_mobilenet_v2.py
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
# Maps each MobileNetV2 checkpoint identifier to the URL of its `config.json`.
MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/mobilenet_v2_1.4_224": "https://huggingface.co/google/mobilenet_v2_1.4_224/resolve/main/config.json",
"google/mobilenet_v2_1.0_224": "https://huggingface.co/google/mobilenet_v2_1.0_224/resolve/main/config.json",
"google/mobilenet_v2_0.75_160": "https://huggingface.co/google/mobilenet_v2_0.75_160/resolve/main/config.json",
"google/mobilenet_v2_0.35_96": "https://huggingface.co/google/mobilenet_v2_0.35_96/resolve/main/config.json",
}
class MobileNetV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MobileNetV2Model`]. It is used to instantiate a
    MobileNetV2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MobileNetV2
    [google/mobilenet_v2_1.0_224](https://huggingface.co/google/mobilenet_v2_1.0_224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        depth_multiplier (`float`, *optional*, defaults to 1.0):
            Shrinks or expands the number of channels in each layer. Default is 1.0, which starts the network with 32
            channels. This is sometimes also called "alpha" or "width multiplier".
        depth_divisible_by (`int`, *optional*, defaults to 8):
            The number of channels in each layer will always be a multiple of this number.
        min_depth (`int`, *optional*, defaults to 8):
            All layers will have at least this many channels.
        expand_ratio (`float`, *optional*, defaults to 6.0):
            The number of output channels of the first layer in each block is input channels times expansion ratio.
        output_stride (`int`, *optional*, defaults to 32):
            The ratio between the spatial resolution of the input and output feature maps. By default the model reduces
            the input dimensions by a factor of 32. If `output_stride` is 8 or 16, the model uses dilated convolutions
            on the depthwise layers, so that the feature maps never become more than 8x or 16x smaller than the input
            image.
        first_layer_is_expansion (`bool`, *optional*, defaults to `True`):
            True if the very first convolution layer is also the expansion layer for the first expansion block.
        finegrained_output (`bool`, *optional*, defaults to `True`):
            If true, the number of output channels in the final convolution layer will stay large (1280) even if
            `depth_multiplier` is less than 1.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`):
            The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
        tf_padding (`bool`, *optional*, defaults to `True`):
            Whether to use TensorFlow padding rules on the convolution layers.
        classifier_dropout_prob (`float`, *optional*, defaults to 0.8):
            The dropout ratio for attached classifiers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 0.001):
            The epsilon used by the layer normalization layers.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            The index that is ignored by the loss function of the semantic segmentation model.

    Example:

    ```python
    >>> from transformers import MobileNetV2Config, MobileNetV2Model
    >>>
    >>> configuration = MobileNetV2Config()
    ```
    """

    model_type = "mobilenet_v2"

    def __init__(
        self,
        num_channels=3,
        image_size=224,
        depth_multiplier=1.0,
        depth_divisible_by=8,
        min_depth=8,
        expand_ratio=6.0,
        output_stride=32,
        first_layer_is_expansion=True,
        finegrained_output=True,
        hidden_act="relu6",
        tf_padding=True,
        classifier_dropout_prob=0.8,
        initializer_range=0.02,
        layer_norm_eps=0.001,
        semantic_loss_ignore_index=255,
        **kwargs,
    ):
        """Store the MobileNetV2 hyper-parameters; extra keyword arguments go to the parent initializer.

        Raises:
            ValueError: If `depth_multiplier` is less than or equal to zero.
        """
        super().__init__(**kwargs)

        if depth_multiplier <= 0:
            raise ValueError("depth_multiplier must be greater than zero.")

        self.num_channels = num_channels
        self.image_size = image_size
        self.depth_multiplier = depth_multiplier
        self.depth_divisible_by = depth_divisible_by
        self.min_depth = min_depth
        self.expand_ratio = expand_ratio
        self.output_stride = output_stride
        self.first_layer_is_expansion = first_layer_is_expansion
        self.finegrained_output = finegrained_output
        self.hidden_act = hidden_act
        self.tf_padding = tf_padding
        self.classifier_dropout_prob = classifier_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
class MobileNetV2OnnxConfig(OnnxConfig):
    """ONNX export description for MobileNetV2: named dynamic axes and validation tolerance."""

    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the dynamic axes of the single `pixel_values` input (axis 0 is named "batch")."""
        input_axes = OrderedDict()
        input_axes["pixel_values"] = {0: "batch"}
        return input_axes

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the dynamic axes of the outputs, which depend on `self.task`."""
        output_axes = OrderedDict()
        if self.task == "image-classification":
            output_axes["logits"] = {0: "batch"}
            return output_axes
        # Any other task exposes the backbone outputs.
        output_axes["last_hidden_state"] = {0: "batch"}
        output_axes["pooler_output"] = {0: "batch"}
        return output_axes

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance used for validation."""
        return 1e-4
.\models\mobilenet_v2\convert_original_tf_checkpoint_to_pytorch.py
import argparse
import json
import re
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
MobileNetV2Config,
MobileNetV2ForImageClassification,
MobileNetV2ForSemanticSegmentation,
MobileNetV2ImageProcessor,
load_tf_weights_in_mobilenet_v2,
)
from transformers.utils import logging
# Switch the transformers logging verbosity to the info level for this script.
logging.set_verbosity_info()
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
def get_mobilenet_v2_config(model_name):
    """Build the `MobileNetV2Config` that corresponds to a checkpoint name.

    Args:
        model_name: Checkpoint name. When it ends in `mobilenet_v2_<depth>_<size>`,
            the depth multiplier and image size are parsed from it. Names starting
            with "deeplabv3_" get an output stride of 8 and the 21 Pascal VOC labels;
            all other names get 1001 labels with "background" at index 0.

    Returns:
        The populated `MobileNetV2Config`, including `id2label` and `label2id`.

    Raises:
        ValueError: If `model_name` contains "quant".
    """
    config = MobileNetV2Config(layer_norm_eps=0.001)

    if "quant" in model_name:
        raise ValueError("Quantized models are not supported.")

    matches = re.match(r"^.*mobilenet_v2_([^_]*)_([^_]*)$", model_name)
    if matches:
        config.depth_multiplier = float(matches[1])
        config.image_size = int(matches[2])

    if model_name.startswith("deeplabv3_"):
        config.output_stride = 8
        config.num_labels = 21
        filename = "pascal-voc-id2label.json"
    else:
        config.num_labels = 1001
        filename = "imagenet-1k-id2label.json"

    repo_id = "huggingface/label-files"
    label_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 instead of the platform default encoding.
    with open(label_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)

    if config.num_labels == 1001:
        # Shift every label by one to make room for the extra "background" class at index 0.
        id2label = {int(k) + 1: v for k, v in id2label.items()}
        id2label[0] = "background"
    else:
        id2label = {int(k): v for k, v in id2label.items()}

    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
def prepare_img():
    """Fetch the test image from its hard-coded images.cocodataset.org URL and return it opened with PIL."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Stream the response so PIL reads directly from the raw body.
    response = requests.get(url, stream=True)
    return Image.open(response.raw)
@torch.no_grad()
def convert_movilevit_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub=False):
    """
    Copy/paste/tweak the original model's weights into our MobileNetV2 structure.

    Builds the config and model for `model_name`, loads the TensorFlow weights from `checkpoint_path`, runs one
    forward pass on a test image to sanity-check the logits against reference values, then saves the model and
    its image processor to `pytorch_dump_folder_path`.

    Args:
        model_name (`str`): Checkpoint name; names starting with "deeplabv3_" are converted to a semantic
            segmentation model, all others to an image classification model.
        checkpoint_path (`str`): Path to the original TensorFlow checkpoint.
        pytorch_dump_folder_path (`str`): Output directory; created (with missing parents) if needed.
        push_to_hub (`bool`, defaults to `False`): Whether to also push the model and image processor to the
            hub repo "google/<model_name>".

    Raises:
        ValueError: If a "deeplabv3_" model has no reference logits to verify against.
        AssertionError: If the logits do not have the expected shape or differ from the reference values.
    """
    config = get_mobilenet_v2_config(model_name)
    is_segmentation = model_name.startswith("deeplabv3_")

    # Load the original TensorFlow weights into the matching model class.
    if is_segmentation:
        model = MobileNetV2ForSemanticSegmentation(config).eval()
    else:
        model = MobileNetV2ForImageClassification(config).eval()

    load_tf_weights_in_mobilenet_v2(model, config, checkpoint_path)

    # Resize the shortest edge to image_size + 32, then crop to image_size x image_size.
    image_processor = MobileNetV2ImageProcessor(
        crop_size={"width": config.image_size, "height": config.image_size},
        size={"shortest_edge": config.image_size + 32},
    )
    encoding = image_processor(images=prepare_img(), return_tensors="pt")
    outputs = model(**encoding)
    logits = outputs.logits

    # Sanity checks on the converted model's output; kept as asserts so failures stay AssertionError.
    if is_segmentation:
        assert logits.shape == (1, 21, 65, 65)

        if model_name != "deeplabv3_mobilenet_v2_1.0_513":
            raise ValueError(f"Unknown model name: {model_name}")

        expected_logits = torch.tensor(
            [
                [[17.5790, 17.7581, 18.3355], [18.3257, 18.4230, 18.8973], [18.6169, 18.8650, 19.2187]],
                [[-2.1595, -2.0977, -2.3741], [-2.4226, -2.3028, -2.6835], [-2.7819, -2.5991, -2.7706]],
                [[4.2058, 4.8317, 4.7638], [4.4136, 5.0361, 4.9383], [4.5028, 4.9644, 4.8734]],
            ]
        )
        assert torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-4)
    else:
        assert logits.shape == (1, 1001)

        # Reference values for the first three logits of the known checkpoints; other names are not verified.
        reference_logits = {
            "mobilenet_v2_1.4_224": [0.0181, -1.0015, 0.4688],
            "mobilenet_v2_1.0_224": [0.2445, -1.1993, 0.1905],
            "mobilenet_v2_0.75_160": [0.2482, 0.4136, 0.6669],
            "mobilenet_v2_0.35_96": [0.1451, -0.4624, 0.7192],
        }
        expected_values = reference_logits.get(model_name)
        if expected_values is not None:
            assert torch.allclose(logits[0, :3], torch.tensor(expected_values), atol=1e-4)

    # parents=True so a nested output path does not fail after all the conversion work is done.
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing to the hub...")
        repo_id = "google/" + model_name
        image_processor.push_to_hub(repo_id)
        model.push_to_hub(repo_id)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # (flag, keyword options for `add_argument`) pairs, registered in this order.
    cli_arguments = [
        (
            "--model_name",
            {
                "default": "mobilenet_v2_1.0_224",
                "type": str,
                "help": "Name of the MobileNetV2 model you'd like to convert. Should be in the form 'mobilenet_v2_<depth>_<size>'.",
            },
        ),
        (
            "--checkpoint_path",
            {
                "required": True,
                "type": str,
                "help": "Path to the original TensorFlow checkpoint (.ckpt file).",
            },
        ),
        (
            "--pytorch_dump_folder_path",
            {
                "required": True,
                "type": str,
                "help": "Path to the output PyTorch model directory.",
            },
        ),
        (
            "--push_to_hub",
            {
                "action": "store_true",
                "help": "Whether or not to push the converted model to the 🤗 hub.",
            },
        ),
    ]
    for flag, options in cli_arguments:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    # Run the conversion with the parsed command-line values.
    convert_movilevit_checkpoint(
        model_name=args.model_name,
        checkpoint_path=args.checkpoint_path,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        push_to_hub=args.push_to_hub,
    )
.\models\mobilenet_v2\feature_extraction_mobilenet_v2.py
"""Feature extractor class for MobileNetV2."""
import warnings
from ...utils import logging
from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessor
logger = logging.get_logger(__name__)
class MobileNetV2FeatureExtractor(MobileNetV2ImageProcessor):
    """
    Deprecated subclass of `MobileNetV2ImageProcessor`.

    Constructing it emits a `FutureWarning` and then forwards all arguments to the parent initializer.
    """

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class MobileNetV2FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use MobileNetV2ImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(*args, **kwargs)
.\models\mobilenet_v2\image_processing_mobilenet_v2.py
"""Image processor class for MobileNetV2."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_torch_available, is_torch_tensor, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class MobileNetV2ImageProcessor(BaseImageProcessor):
r"""
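Constructs a MobileNetV2 image processor.

Args:
    do_resize (`bool`, *optional*, defaults to `True`):
        Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
        the `do_resize` parameter in the `preprocess` method.
    size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 256}`):
        Size of the image after resizing. The shortest edge of the image is resized to `size["shortest_edge"]`,
        keeping the input aspect ratio. Can be overridden by the `size` parameter in the `preprocess` method.
    resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
        Resampling filter to use when resizing the image. Can be overridden by the `resample` parameter in the
        `preprocess` method.
    do_center_crop (`bool`, *optional*, defaults to `True`):
        Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the
        image is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in
        the `preprocess` method.
    crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
        Desired output size when applying center cropping. Only has an effect if `do_center_crop` is set to
        `True`. Can be overridden by the `crop_size` parameter in the `preprocess` method.
    do_rescale (`bool`, *optional*, defaults to `True`):
        Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
        `do_rescale` parameter in the `preprocess` method.
    rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
        Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
        `preprocess` method.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
        method.
    image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
        Mean to use if normalizing the image. This is a float or a list of floats whose length is the number of
        channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
    image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
        Standard deviation to use if normalizing the image. This is a float or a list of floats whose length is
        the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
        method.
"""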
# List of model input names.
model_input_names = ["pixel_values"]
# Initializer: stores the preprocessing options, filling in defaults for the ones left as None.
def __init__(
    self,
    do_resize: bool = True,
    size: Optional[Dict[str, int]] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    do_center_crop: bool = True,
    crop_size: Dict[str, int] = None,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    **kwargs,
) -> None:
    """
    Store the preprocessing configuration on the instance.

    `size` defaults to `{"shortest_edge": 256}` and `crop_size` to `{"height": 224, "width": 224}`; both are
    passed through `get_size_dict`. `image_mean` / `image_std` default to `IMAGENET_STANDARD_MEAN` /
    `IMAGENET_STANDARD_STD`. Remaining keyword arguments are forwarded to the parent initializer.
    """
    super().__init__(**kwargs)

    # Resolve the resize target (non-square by default).
    if size is None:
        size = {"shortest_edge": 256}
    size = get_size_dict(size, default_to_square=False)

    # Resolve the center-crop target.
    if crop_size is None:
        crop_size = {"height": 224, "width": 224}
    crop_size = get_size_dict(crop_size, param_name="crop_size")

    # Fall back to the ImageNet standard statistics when none are given.
    if image_mean is None:
        image_mean = IMAGENET_STANDARD_MEAN
    if image_std is None:
        image_std = IMAGENET_STANDARD_STD

    # Resize options.
    self.do_resize = do_resize
    self.size = size
    self.resample = resample
    # Center-crop options.
    self.do_center_crop = do_center_crop
    self.crop_size = crop_size
    # Rescale options.
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    # Normalization options.
    self.do_normalize = do_normalize
    self.image_mean = image_mean
    self.image_std = image_std

    # Names of the keyword arguments recorded as valid for this processor.
    self._valid_processor_keys = [
        "images",
        "do_resize",
        "size",
        "resample",
        "do_center_crop",
        "crop_size",
        "do_rescale",
        "rescale_factor",
        "do_normalize",
        "image_mean",
        "image_std",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]
# Originally copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
    resized to keep the input aspect ratio.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Size of the output image. Must contain either "shortest_edge" or both "height" and "width".
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format of the image. If not provided, it will be the same as the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred.

    Raises:
        ValueError: If `size` has neither "shortest_edge" nor both "height" and "width".
    """
    if "shortest_edge" in size:
        # A single edge length: do not force a square output.
        requested_size = size["shortest_edge"]
        square_by_default = False
    elif "height" in size and "width" in size:
        # An explicit (height, width) pair.
        requested_size = (size["height"], size["width"])
        square_by_default = True
    else:
        raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

    # Compute the concrete output size for this image.
    output_size = get_resize_output_image_size(
        image,
        size=requested_size,
        default_to_square=square_by_default,
        input_data_format=input_data_format,
    )

    # Delegate to the module-level `resize` transform.
    return resize(
        image,
        size=output_size,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )
def preprocess(
    self,
    images: ImageInput,
    do_resize: Optional[bool] = None,
    size: Dict[str, int] = None,
    resample: PILImageResampling = None,
    do_center_crop: bool = None,
    crop_size: Dict[str, int] = None,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[float] = None,
    do_normalize: Optional[bool] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
):
    """
    Preprocess an image or batch of images.

    Every option left as `None` falls back to the value stored on the instance. The enabled steps run in the
    order resize -> center crop -> rescale -> normalize, and the result is converted to `data_format`.

    Args:
        images (`ImageInput`): Input images to preprocess.
        do_resize (`bool`, *optional*): Whether to resize the images.
        size (`Dict[str, int]`, *optional*): Target size of the images after resizing.
        resample (`PILImageResampling`, *optional*): Resampling filter for resizing.
        do_center_crop (`bool`, *optional*): Whether to perform center cropping.
        crop_size (`Dict[str, int]`, *optional*): Size of the crop.
        do_rescale (`bool`, *optional*): Whether to rescale the images.
        rescale_factor (`float`, *optional*): Scaling factor for rescaling.
        do_normalize (`bool`, *optional*): Whether to normalize the images.
        image_mean (`float` or `List[float]`, *optional*): Mean values for normalization.
        image_std (`float` or `List[float]`, *optional*): Standard deviation values for normalization.
        return_tensors (`str` or `TensorType`, *optional*): Desired tensor type for output.
        data_format (`str` or `ChannelDimension`): Channel dimension format of the output images.
        input_data_format (`str` or `ChannelDimension`, *optional*): Channel dimension format of the input
            images. If unset, it is inferred from the first image.
        **kwargs: Additional keyword arguments, validated against `self._valid_processor_keys`.

    Returns:
        `BatchFeature` whose `pixel_values` entry holds the preprocessed images.

    Raises:
        ValueError: If `images` is not a valid image type.
    """
    # Per-call arguments take precedence over the values stored at construction time.
    do_resize = do_resize if do_resize is not None else self.do_resize
    size = size if size is not None else self.size
    size = get_size_dict(size, default_to_square=False)
    resample = resample if resample is not None else self.resample
    do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
    crop_size = crop_size if crop_size is not None else self.crop_size
    crop_size = get_size_dict(crop_size, param_name="crop_size")
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std

    images = make_list_of_images(images)

    validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)

    if not valid_images(images):
        raise ValueError(
            "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
            "torch.Tensor, tf.Tensor or jax.ndarray."
        )
    validate_preprocess_arguments(
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_center_crop=do_center_crop,
        crop_size=crop_size,
        do_resize=do_resize,
        size=size,
        resample=resample,
    )

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        logger.warning_once(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )

    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    if do_resize:
        images = [
            self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
            for image in images
        ]

    # NOTE(review): `center_crop`, `rescale` and `normalize` are expected to be inherited from
    # `BaseImageProcessor`; they are not defined in this class.
    if do_center_crop:
        images = [
            self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
        ]

    if do_rescale:
        images = [
            self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
            for image in images
        ]

    if do_normalize:
        images = [
            self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
            for image in images
        ]

    images = [
        to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
    ]

    data = {"pixel_values": images}
    return BatchFeature(data=data, tensor_type=return_tensors)
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
    """
    Converts the output of [`MobileNetV2ForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.

    Args:
        outputs ([`MobileNetV2ForSemanticSegmentation`]):
            Raw outputs of the model.
        target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
            List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
            predictions will not be resized.

    Returns:
        semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
        segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
        specified). Each entry of each `torch.Tensor` correspond to a semantic class id.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the number of logits.
    """
    # TODO: add support for other frameworks
    logits = outputs.logits

    # Without target sizes: take the argmax over dim 1 and split the result into one map per sample.
    if target_sizes is None:
        class_id_maps = logits.argmax(dim=1)
        return [class_id_maps[sample] for sample in range(class_id_maps.shape[0])]

    # One target size is required per element of the logits.
    if len(logits) != len(target_sizes):
        raise ValueError(
            "Make sure that you pass in as many target sizes as the batch dimension of the logits"
        )

    # Tensor target sizes are converted to a numpy array before use.
    if is_torch_tensor(target_sizes):
        target_sizes = target_sizes.numpy()

    segmentation_maps = []
    for sample_logits, target_size in zip(logits, target_sizes):
        # Bilinearly resize this sample's logits to the requested size, then pick the best class per pixel.
        upsampled_logits = torch.nn.functional.interpolate(
            sample_logits.unsqueeze(dim=0), size=target_size, mode="bilinear", align_corners=False
        )
        segmentation_maps.append(upsampled_logits[0].argmax(dim=0))

    return segmentation_maps