How to use TFLite models
import tensorflow as tf
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
l0 = Dense(units=1, input_shape=[1])
model = Sequential([l0])
model.compile(optimizer='sgd', loss='mean_squared_error')
xs = np.array([-1.0, 0.0, 1.0, 2.0, 3.0, 4.0], dtype=float)
ys = np.array([-3.0, -1.0, 1.0, 3.0, 5.0, 7.0], dtype=float)
model.fit(xs, ys, epochs=500)
print(model.predict(np.array([[10.0]])))
print("Here is what I learned: {}".format(l0.get_weights()))
export_dir = 'saved_model/1'
tf.saved_model.save(model, export_dir)
This will create a directory containing a number of files and metadata describing your model. To learn more about the SavedModel format, take a little time now to read www.tensorflow.org/guide/saved… , and also check out the Colab describing how SavedModel works at colab.research.google.com/github/tens… ; in particular, explore the 'The SavedModel format on disk' section in that Colab.
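If you want a quick look at what was actually written, you can walk the directory from Python. This is just a convenience sketch (the exact files vary by TensorFlow version, but you should see a saved_model.pb along with variables/ and assets/ subdirectories):
import os
# List every file the SavedModel export wrote to disk
for root, dirs, files in os.walk(export_dir):
    for name in files:
        print(os.path.join(root, name))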
Once you have your saved model, you can use the TensorFlow Lite converter to convert it to the TFLite format:
# Convert the model.
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
tflite_model = converter.convert()
This, in turn, can be written to disk as a single file that fully encapsulates the model and its saved weights:
import pathlib
tflite_model_file = pathlib.Path('model.tflite')
tflite_model_file.write_bytes(tflite_model)
To use a pre-saved .tflite file, you then instantiate a tf.lite.Interpreter and use the model_content parameter to pass in the model bytes:
# Load TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_content=tflite_model)
# Or, if you don't have the model bytes in memory and just have a file on disk,
# you can use the model_path parameter to have the interpreter load the file:
# interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
interpreter.allocate_tensors()
Once you've loaded the model, you can start performing inference with it. Note that to run inference you need the details of the model's input and output tensors. You then set the value of the input tensor, invoke the model, and read the value of the output tensor. Your code will typically look like this:
# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
print(input_details)
print(output_details)
Output:
[{'name': 'serving_default_dense_input:0', 'index': 0, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([-1, 1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
[{'name': 'StatefulPartitionedCall:0', 'index': 3, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([-1, 1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
to_predict = np.array([[10.0]], dtype=np.float32)
print(to_predict)
interpreter.set_tensor(input_details[0]['index'], to_predict)
interpreter.invoke()
tflite_results = interpreter.get_tensor(output_details[0]['index'])
print(tflite_results)
A large part of the skill in running models on embedded systems is being able to format your data to match the needs of the model. For example, you might be grabbing frames from a camera with a particular resolution and encoding, but you need to decode and resize them into 224x224 3-channel images to use them with a common model like MobileNet. Much of the engineering in any ML system goes into performing this kind of conversion.
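As a sketch of that kind of preprocessing (the file name here is purely illustrative), decoding a captured JPEG frame and reshaping it into the (1, 224, 224, 3) float32 tensor a MobileNet-style model expects might look like this:
import tensorflow as tf
# 'frame.jpg' stands in for a hypothetical camera capture
raw_bytes = tf.io.read_file('frame.jpg')
# Decode to 3 channels, resize to 224x224, and normalize pixels to [0, 1]
image = tf.io.decode_jpeg(raw_bytes, channels=3)
image = tf.image.resize(image, (224, 224)) / 255.0
# Add a batch dimension so the shape becomes (1, 224, 224, 3)
image = tf.expand_dims(image, axis=0)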
To learn more about running inference with models using TensorFlow Lite, check out the documentation at: www.tensorflow.org/lite/guide/…
Running Models with TFLite
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
# Workaround: point TFDS at the currently hosted download URL for the Cats vs. Dogs zip
setattr(tfds.image_classification.cats_vs_dogs, '_URL',"https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip")
def format_image(image, label):
    image = tf.image.resize(image, (224, 224)) / 255.0
    return image, label
(raw_train, raw_validation, raw_test), metadata = tfds.load(
'cats_vs_dogs',
split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
with_info=True,
as_supervised=True,
)
num_examples = metadata.splits['train'].num_examples
num_classes = metadata.features['label'].num_classes
print(num_examples)
print(num_classes)
BATCH_SIZE = 32
train_batches = raw_train.shuffle(num_examples // 4).map(format_image).batch(BATCH_SIZE).prefetch(1)
validation_batches = raw_validation.map(format_image).batch(BATCH_SIZE).prefetch(1)
test_batches = raw_test.map(format_image).batch(1)
# Take one batch so we can inspect the data shape
for image_batch, label_batch in train_batches.take(1):
    pass
print(image_batch.shape)
module_selection = ("mobilenet_v2", 224, 1280)
handle_base, pixels, FV_SIZE = module_selection
MODULE_HANDLE ="https://tfhub.dev/google/tf2-preview/{}/feature_vector/4".format(handle_base)
IMAGE_SIZE = (pixels, pixels)
print("Using {} with input size {} and output dimension {}".format(MODULE_HANDLE, IMAGE_SIZE, FV_SIZE))
feature_extractor = hub.KerasLayer(MODULE_HANDLE,
input_shape=IMAGE_SIZE + (3,),
output_shape=[FV_SIZE],
trainable=False)
print("Building model with", MODULE_HANDLE)
model = tf.keras.Sequential([
feature_extractor,
tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
EPOCHS = 5
hist = model.fit(train_batches,
epochs=EPOCHS,
validation_data=validation_batches)
CATS_VS_DOGS_SAVED_MODEL = "exp_saved_model"
tf.saved_model.save(model, CATS_VS_DOGS_SAVED_MODEL)
import pathlib
converter = tf.lite.TFLiteConverter.from_saved_model(CATS_VS_DOGS_SAVED_MODEL)
tflite_model = converter.convert()
tflite_models_dir = pathlib.Path("/tmp/")
tflite_model_file = tflite_models_dir/'model1.tflite'
# write_bytes returns the number of bytes written -- i.e., the model file size
tflite_model_file.write_bytes(tflite_model)
from tqdm import tqdm
# Load TFLite model and allocate tensors.
tflite_model_file = '/tmp/model1.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_model_file)
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]
predictions = []
# tqdm will report how many iterations per second; each iteration
# here is a single prediction, and we run 100 of them
test_labels, test_imgs = [], []
for img, label in tqdm(test_batches.take(100)):
    interpreter.set_tensor(input_index, img)
    interpreter.invoke()
    predictions.append(interpreter.get_tensor(output_index))
    test_labels.append(label.numpy()[0])
    test_imgs.append(img)
# This will tell you how many of the predictions were correct
score = 0
for item in range(0, len(predictions)):
    prediction = np.argmax(predictions[item])
    label = test_labels[item]
    if prediction == label:
        score = score + 1
print("Out of 100 predictions I got " + str(score) + " correct")
#@title Utility functions for plotting
# Utilities for plotting
class_names = ['cat', 'dog']
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = predictions_array[i], true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    img = np.squeeze(img)
    plt.imshow(img, cmap=plt.cm.binary)
    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'green'
    else:
        color = 'red'
    plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                         100 * np.max(predictions_array),
                                         class_names[true_label]), color=color)
#@title Visualize the outputs { run: "auto" }
max_index = 73 #@param {type:"slider", min:1, max:100, step:1}
for index in range(0, max_index):
    plt.figure(figsize=(6, 3))
    plt.subplot(1, 2, 1)
    plot_image(index, predictions, test_labels, test_imgs)
    plt.show()
TFLite Optimizations and Quantization
We start by importing TensorFlow and loading our dataset, then splitting it into batches.
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
# format images to have normalized pixels
def format_image(image, label):
    image = tf.image.resize(image, (224, 224)) / 255.0
    return image, label
# load in our dataset
(raw_train, raw_validation, raw_test), metadata = tfds.load(
'cats_vs_dogs',
split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
with_info=True,
as_supervised=True,
)
# display how much data we have
num_examples = metadata.splits['train'].num_examples
num_classes = metadata.features['label'].num_classes
print(num_examples)
print(num_classes)
# split the data in training, validation, and test datasets
BATCH_SIZE = 32
train_batches = raw_train.shuffle(num_examples // 4).map(format_image).batch(BATCH_SIZE).prefetch(1)
validation_batches = raw_validation.map(format_image).batch(BATCH_SIZE).prefetch(1)
test_batches = raw_test.map(format_image).batch(1)
# display the shape of our data
for image_batch, label_batch in train_batches.take(1):
    pass
print(image_batch.shape)
We next define our (pre-trained) model:
module_selection = ("mobilenet_v2", 224, 1280)
handle_base, pixels, FV_SIZE = module_selection
MODULE_HANDLE ="https://tfhub.dev/google/tf2-preview/{}/feature_vector/4".format(handle_base)
IMAGE_SIZE = (pixels, pixels)
print("Using {} with input size {} and output dimension {}".format(MODULE_HANDLE, IMAGE_SIZE, FV_SIZE))
feature_extractor = hub.KerasLayer(MODULE_HANDLE,
input_shape=IMAGE_SIZE + (3,),
output_shape=[FV_SIZE],
trainable=False)
print("Building model with", MODULE_HANDLE)
model = tf.keras.Sequential([
feature_extractor,
tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
Output:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
keras_layer (KerasLayer)     (None, 1280)              2257984
dense (Dense)                (None, 2)                 2562
=================================================================
Total params: 2,260,546
Trainable params: 2,562
Non-trainable params: 2,257,984
_________________________________________________________________
We then train and save our model. Since we are doing transfer learning to fine-tune a pre-trained model on our dataset, we only need 5 epochs.
EPOCHS = 5
hist = model.fit(train_batches,
epochs=EPOCHS,
validation_data=validation_batches)
CATS_VS_DOGS_SAVED_MODEL = "exp_saved_model"
tf.saved_model.save(model, CATS_VS_DOGS_SAVED_MODEL)
Run this code, and you'll have model1.tflite, with no optimization or quantization.
Then uncomment the converter.optimizations = [tf.lite.Optimize.DEFAULT] line, change the model filename to model2.tflite, and rerun. model2.tflite will now have optimizations applied -- you should see a much smaller file size.
Finally, uncomment the lines that add a representative dataset and set the supported ops as shown, change the model filename to model3.tflite, and rerun. model3.tflite will have optimizations applied, along with quantization from the representative dataset. Note: it might be slightly larger than model2.tflite!
www.tensorflow.org/api_docs/py…
import pathlib
converter = tf.lite.TFLiteConverter.from_saved_model(CATS_VS_DOGS_SAVED_MODEL)
# These options are for converter optimizations.
# Consider trying the converter without them and
# exploring model size and accuracy.
# Then use them, reconvert the model, and explore model
# size and accuracy at that point. What differences do you see?
# converter.optimizations = [tf.lite.Optimize.DEFAULT] # Uncomment this line for Model 2 and Model 3
# def representative_data_gen():  # Uncomment the following 5 lines for Model 3
#     for input_value, _ in test_batches.take(100):
#         yield [input_value]
# converter.representative_dataset = representative_data_gen
# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
tflite_model = converter.convert()
tflite_models_dir = pathlib.Path("/tmp/")
tflite_model_file = tflite_models_dir/'model1.tflite' # Change the filename here for Model2 and Model3!
tflite_model_file.write_bytes(tflite_model)
# Without any optimizations I got
# 8857848 (model1.tflite)
# With the .optimizations property set I got
# 2629648 (model2.tflite)
# With the .optimizations property and representative data set I got
# 2835952 -- Slightly larger! (model3.tflite)
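If you'd like to confirm those sizes yourself rather than reading them off the filesystem, here is a quick sketch (it assumes you kept the /tmp filenames used above):
import os
# Print the size of each converted model that exists on disk
for name in ['model1.tflite', 'model2.tflite', 'model3.tflite']:
    path = os.path.join('/tmp', name)
    if os.path.exists(path):
        print(name, os.path.getsize(path), 'bytes')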
Now we will test the accuracy of the three models! After you run each model you will get the number of correct predictions, and you can then plot which images were classified correctly or incorrectly.
- Run this code
- Change the model file to model2.tflite and run it again
- Change the model file to model3.tflite and run it again
#@title Run this cell each time to test your model's accuracy (make sure to change the filename)
from tqdm import tqdm
# Load TFLite model and allocate tensors.
tflite_model_file = '/tmp/model1.tflite' # Change the filename here for Model 2 and 3
interpreter = tf.lite.Interpreter(model_path=tflite_model_file)
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]
predictions = []
test_labels, test_imgs = [], []
for img, label in tqdm(test_batches.take(100)):
    interpreter.set_tensor(input_index, img)
    interpreter.invoke()
    predictions.append(interpreter.get_tensor(output_index))
    test_labels.append(label.numpy()[0])
    test_imgs.append(img)
# For model 1, I got 32.25 it/s
# For model 2, I got 16.16 it/s
# For model 3, I got 1.19 it/s
# Note: since the it/s will depend on the computer on which your Colab VM
# instance is running -- we would expect it to vary a bit.
score = 0
for item in range(0, 100):
    prediction = np.argmax(predictions[item])
    label = test_labels[item]
    if prediction == label:
        score = score + 1
print("Out of 100 predictions I got " + str(score) + " correct")
# Model 1 - 100 Correct
# Model 2 - 99 Correct
# Model 3 - 99 Correct
# Note: since training starts from a random initialization, it would not be
# surprising if your result is off by 1 or 2 correct.
#@title Define utility functions once for plotting
# Utilities for plotting
class_names = ['cat', 'dog']
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = predictions_array[i], true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    img = np.squeeze(img)
    plt.imshow(img, cmap=plt.cm.binary)
    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'green'
    else:
        color = 'red'
    plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                         100 * np.max(predictions_array),
                                         class_names[true_label]), color=color)
#@title Visualize the outputs each time { run: "auto" }
max_index = 12 #@param {type:"slider", min:1, max:100, step:1}
for index in range(0, max_index):
    plt.figure(figsize=(6, 3))
    plt.subplot(1, 2, 1)
    plot_image(index, predictions, test_labels, test_imgs)
    plt.show()
Quantization Aware Training
There are two primary forms of quantization. The first is post-training quantization, which you saw previously: as part of the conversion process, your model's internal weights and ops are converted to int8 and uint8. It's much easier to use, and it's a good way to get started.
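As a reminder, the minimal post-training quantization path is just the converter plus the optimizations flag -- a sketch, assuming you still have the exp_saved_model directory saved in the earlier section:
converter = tf.lite.TFLiteConverter.from_saved_model('exp_saved_model')
converter.optimizations = [tf.lite.Optimize.DEFAULT]
post_training_quantized_model = converter.convert()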
As you look to further optimize your model, you may want to explore quantization aware training, which gives you much more fine-grained control. To get some exposure to those controls, please work through the Colab below, provided by the TensorFlow team, on Quantization-Aware Training (QAT). After you are done, if you would like to go deeper, including a comprehensive guide to all the APIs available in the toolkit, check out the model optimization site on TensorFlow.org.
In particular, note the results posted by the Google teams comparing the accuracy of models before and after quantizing -- the effects on accuracy are negligible!
| Model | Non-quantized Top-1 Accuracy | 8-bit Quantized Accuracy |
|---|---|---|
| MobilenetV1 224 | 71.03% | 71.06% |
| Resnet v1 50 | 76.3% | 76.1% |
| MobilenetV2 224 | 70.77% | 70.01% |
Colab link for QAT for you to work through: colab.research.google.com/github/tiny…
Quantization aware training in Keras example
Overview
Welcome to an end-to-end example for quantization aware training.
Other pages
For an introduction to what quantization aware training is and to determine if you should use it (including what's supported), see the overview page. To quickly find the APIs you need for your use case (beyond fully-quantizing a model with 8-bits), see the comprehensive guide.
Summary
In this tutorial, you will:
- Train a tf.keras model for MNIST from scratch.
- Fine-tune the model by applying the quantization aware training API, see the accuracy, and export a quantization-aware model.
- Use the model to create an actually quantized model for the TFLite backend.
- See the persistence of accuracy in TFLite and a 4x smaller model. To see the latency benefits on mobile, try out the TFLite examples in the TFLite app repository.
Setup
import tempfile
import os
import tensorflow as tf
from tensorflow import keras
Train a model for MNIST without quantization aware training
# Load MNIST dataset
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Normalize the input images so that each pixel value is between 0 and 1.
train_images = train_images / 255.0
test_images = test_images / 255.0
# Define the model architecture.
model = keras.Sequential([
keras.layers.InputLayer(input_shape=(28, 28)),
keras.layers.Reshape(target_shape=(28, 28, 1)),
keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
keras.layers.MaxPooling2D(pool_size=(2, 2)),
keras.layers.Flatten(),
keras.layers.Dense(10)
])
# Train the digit classification model
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
model.fit(
train_images,
train_labels,
epochs=1,
validation_split=0.1,
)
Clone and fine-tune pre-trained model with quantization aware training
Define the model
You will apply quantization aware training to the whole model and see this in the model summary. All layers are now prefixed by "quant".
Note that the resulting model is quantization aware but not quantized (e.g. the weights are float32 instead of int8). The sections after show how to create a quantized model from the quantization aware one.
In the comprehensive guide, you can see how to quantize some layers for model accuracy improvements.
import tensorflow_model_optimization as tfmot
quantize_model = tfmot.quantization.keras.quantize_model
# q_aware stands for quantization aware.
q_aware_model = quantize_model(model)
# `quantize_model` requires a recompile.
q_aware_model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
q_aware_model.summary()
Train and evaluate the model against baseline
To demonstrate fine-tuning after training the model for just an epoch, fine-tune with quantization aware training on a subset of the training data.
train_images_subset = train_images[0:1000] # out of 60000
train_labels_subset = train_labels[0:1000]
q_aware_model.fit(train_images_subset, train_labels_subset,
batch_size=500, epochs=1, validation_split=0.1)
For this example, there is minimal to no loss in test accuracy after quantization aware training, compared to the baseline.
_, baseline_model_accuracy = model.evaluate(
test_images, test_labels, verbose=0)
_, q_aware_model_accuracy = q_aware_model.evaluate(
test_images, test_labels, verbose=0)
print('Baseline test accuracy:', baseline_model_accuracy)
print('Quant test accuracy:', q_aware_model_accuracy)
Create quantized model for TFLite backend
After this, you have an actually quantized model with int8 weights and uint8 activations.
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quantized_tflite_model = converter.convert()
See persistence of accuracy from TF to TFLite
Define a helper function to evaluate the TF Lite model on the test dataset.
import numpy as np
def evaluate_model(interpreter):
    input_index = interpreter.get_input_details()[0]["index"]
    output_index = interpreter.get_output_details()[0]["index"]

    # Run predictions on every image in the "test" dataset.
    prediction_digits = []
    for i, test_image in enumerate(test_images):
        if i % 1000 == 0:
            print('Evaluated on {n} results so far.'.format(n=i))
        # Pre-processing: add batch dimension and convert to float32 to match with
        # the model's input data format.
        test_image = np.expand_dims(test_image, axis=0).astype(np.float32)
        interpreter.set_tensor(input_index, test_image)

        # Run inference.
        interpreter.invoke()

        # Post-processing: remove batch dimension and find the digit with highest
        # probability.
        output = interpreter.tensor(output_index)
        digit = np.argmax(output()[0])
        prediction_digits.append(digit)

    print('\n')

    # Compare prediction results with ground truth labels to calculate accuracy.
    prediction_digits = np.array(prediction_digits)
    accuracy = (prediction_digits == test_labels).mean()
    return accuracy
You evaluate the quantized model and see that the accuracy from TensorFlow persists to the TFLite backend.
interpreter = tf.lite.Interpreter(model_content=quantized_tflite_model)
interpreter.allocate_tensors()
test_accuracy = evaluate_model(interpreter)
print('Quant TFLite test_accuracy:', test_accuracy)
print('Quant TF test accuracy:', q_aware_model_accuracy)
See 4x smaller model from quantization
You create a float TFLite model and then see that the quantized TFLite model is 4x smaller.
# Create float TFLite model.
float_converter = tf.lite.TFLiteConverter.from_keras_model(model)
float_tflite_model = float_converter.convert()
# Measure sizes of models.
_, float_file = tempfile.mkstemp('.tflite')
_, quant_file = tempfile.mkstemp('.tflite')
with open(quant_file, 'wb') as f:
    f.write(quantized_tflite_model)
with open(float_file, 'wb') as f:
    f.write(float_tflite_model)
print("Float model in Mb:", os.path.getsize(float_file) / float(2**20))
print("Quantized model in Mb:", os.path.getsize(quant_file) / float(2**20))