Tutorials

Accelerate a public Keras SavedModel

First make sure you have TensorFlow 2.6 installed and the latest version of OctoML’s SDK:

! pip install tensorflow==2.6
! pip install octomizer-sdk --upgrade

Import libraries:

from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2

import octomizer.client
import octomizer.models.tensorflow_saved_model as tf_model

Now fetch a public Keras SavedModel from Keras’s official model zoo:

model = MobileNetV2(weights='imagenet')

# Calling `save('my_model')` creates a SavedModel folder `my_model`.
model.save("my_model")

# Create a tarball for the model
! tar -czvf my_model.tgz my_model

Upload the Keras SavedModel to OctoML’s Platform:

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Upload the model to Octomizer.
model = tf_model.TensorFlowSavedModel(client, name="my_model", model="my_model.tgz")

This is a model with dynamic inputs, so you’ll need to specify the input shapes before acceleration. If not, you’ll get an error message saying “Dynamic inputs are not supported.”:

# Check the automatically inferred shapes.
inputs = model.get_uploaded_model_variant().inputs
print(inputs)
input_name = list(inputs[0].keys())[0]

# The command above prints a tuple of two dicts: the first maps input names to shapes,
# e.g. {'input_0:0': [-1, 224, 224, 3]}. The -1 in the first dimension means you need to specify the batch size.

input_shapes = {input_name: [1, 224, 224, 3]} # Notice the -1 has been replaced by 1.
input_dtypes = inputs[1]

Now accelerate the model. By default, the resulting package will be a Python wheel:

wrkflow = model.get_uploaded_model_variant().accelerate(
    platform="broadwell",
    input_shapes=input_shapes,
    input_dtypes=input_dtypes)

# Save the workflow uuid somewhere so you can use it to access benchmark metrics or the resulting package later.
print(wrkflow.uuid)

After you receive an email notification about the completion of the acceleration workflow, you can view performance benchmark metrics on the hardware you chose and download a packaged version of the accelerated model, either by visiting the UI or invoking the following code:

# Look up the workflow you previously launched using the workflow id
wrkflow = client.get_workflow("<INSERT WORKFLOW ID>")

# To view benchmark metrics, either visit the UI or invoke something similar to:
if wrkflow.completed() and wrkflow.has_benchmark_stage():
    engine = wrkflow.proto.benchmark_stage_spec.engine
    metrics = wrkflow.metrics()
    print(engine, metrics)
    # Save the resulting Python wheel to the current directory.
    wrkflow.save_package(".")

Accelerate a custom Keras SavedModel

First make sure you have TensorFlow 2.6 installed and the latest version of OctoML’s SDK:

! pip install tensorflow==2.6
! pip install octomizer-sdk --upgrade

Import libraries:

from tensorflow import keras
import numpy as np
import octomizer.client
import octomizer.models.tensorflow_saved_model as tf_model

Define and save your Keras SavedModel:

def get_model():
    # Create a simple model using Keras.
    inputs = keras.Input(shape=(32,))
    outputs = keras.layers.Dense(1)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

model = get_model()

# Train the model.
test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
model.fit(test_input, test_target)

# Calling `save('my_model')` creates a SavedModel folder `my_model`.
# Calling keras.models.save_model(model, "my_model") also works.
model.save("my_model")

# Create a tarball for the model
! tar -czvf my_model.tgz my_model

The remaining steps to upload the custom Keras SavedModel, disambiguate inputs, accelerate the model, and view results are the same as the steps specified above for public Keras SavedModels.
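
For reference, here is a condensed sketch of those remaining steps applied to the tarball created above. This assumes MY_ACCESS_TOKEN is set as before; the inferred input name is read from the upload, and the [1, 32] shape simply pins the batch dimension of the (None, 32) input defined above:

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Upload the custom SavedModel tarball.
uploaded_model = tf_model.TensorFlowSavedModel(client, name="my_model", model="my_model.tgz")

# Inspect the automatically inferred inputs and pin any dynamic (-1) dimensions.
inputs = uploaded_model.get_uploaded_model_variant().inputs
print(inputs)
input_name = list(inputs[0].keys())[0]
input_shapes = {input_name: [1, 32]}  # Batch size 1 for the 32-feature input defined above.
input_dtypes = inputs[1]

# Accelerate, then keep the workflow uuid to fetch metrics and the package later.
wrkflow = uploaded_model.get_uploaded_model_variant().accelerate(
    platform="broadwell",
    input_shapes=input_shapes,
    input_dtypes=input_dtypes)
print(wrkflow.uuid)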

Accelerate a public TensorFlow GraphDef

First make sure you have TensorFlow 2.6 installed and the latest version of OctoML’s SDK:

! pip install tensorflow==2.6
! pip install octomizer-sdk --upgrade

Import libraries:

import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2

import octomizer.client
import octomizer.models.tensorflow_graph_def_model as tf_model

Now fetch a public GraphDef from Keras’s official model zoo:

model = MobileNetV2(weights='imagenet')

Convert the model to a GraphDef:

full_model = tf.function(lambda x: model(x))
full_model = full_model.get_concrete_function(x=tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype))

# Get frozen ConcreteFunction
frozen_func = convert_variables_to_constants_v2(full_model)
frozen_func.graph.as_graph_def()

# Save frozen graph from frozen ConcreteFunction to hard drive
tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                  logdir=".",
                  name="myGraphDef.pb",
                  as_text=False)

Upload the GraphDef to OctoML’s Platform:

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Upload the model to Octomizer.
model = tf_model.TensorFlowGraphDefModel(client, name="myGraphDef.pb", model="myGraphDef.pb")

This is a model with dynamic inputs, so you’ll need to specify the input shapes before acceleration. If not, you’ll get an error message saying “Dynamic inputs are not supported.”:

# Check the automatically inferred shapes.
inputs = model.get_uploaded_model_variant().inputs
print(inputs)
input_name = list(inputs[0].keys())[0]

# The command above prints a tuple of two dicts: the first maps input names to shapes,
# e.g. {'x:0': [-1, 224, 224, 3]}. The -1 in the first dimension means you need to specify the batch size.

input_shapes = {input_name: [1, 224, 224, 3]} # Notice the -1 has been replaced by 1.
input_dtypes = inputs[1]

The remaining steps to accelerate the model and view results are the same as the steps specified above for public Keras SavedModels.

Accelerate a custom TensorFlow GraphDef

First make sure you have TensorFlow 2.6 installed and the latest version of OctoML’s SDK:

! pip install tensorflow==2.6
! pip install octomizer-sdk --upgrade

Import libraries:

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
import numpy as np

import octomizer.client
import octomizer.models.tensorflow_graph_def_model as tf_model

Define the model:

def get_model():
    # Create a simple model using Keras.
    inputs = keras.Input(shape=(32,))
    outputs = keras.layers.Dense(1)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

model = get_model()

# Train the model.
test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
model.fit(test_input, test_target)

The remaining steps to convert the model to a GraphDef, upload the GraphDef to OctoML, disambiguate inputs, accelerate the model, and view results are the same as the steps specified above for public GraphDef models.

End-to-end PyTorch image classification example using ImageNet

First, make sure you have all necessary libraries installed:

! pip install onnxruntime==1.11.0
! pip install onnx==1.9.0
! pip install torch==1.9.0
! pip install octomizer-sdk --upgrade

Import libraries:

import numpy as np
import octomizer.client
import octomizer.models.torchscript_model as torchscript_model
import torch.onnx
import torchvision
import urllib

Download a public PyTorch model:

model = torchvision.models.squeezenet1_0(pretrained=True)

Export the PyTorch model to Torchscript.

There are two methods for exporting models to Torchscript: scripting and tracing. We strongly recommend tracing to maximize your Torchscript models’ compatibility. Below, we provide an example of tracing a squeezenet model.

For more information on exporting PyTorch models to Torchscript, we recommend this guide https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html:

model_name = "squeezenet_from_pt"
torchscript_file = model_name + ".pt"

# A standard ImageNet input has 3 color channels (RGB) and images of dimension 224x224.
# Image values can be randomized to conduct tracing
rand_input = torch.randn(1, 3, 224, 224)

model.eval()

traced_model = torch.jit.trace(model, rand_input)

traced_model.save(f=torchscript_file)
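
For completeness, the scripting route mentioned above is shown below as a brief sketch; we still recommend tracing for compatibility, and this tutorial continues with the traced model:

# Scripting alternative (not used in the rest of this tutorial): torch.jit.script
# compiles the model's Python code directly instead of recording a trace.
scripted_model = torch.jit.script(model)
scripted_model.save("squeezenet_scripted.pt")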

Next, upload the model to the OctoML platform.

Note: Torchscript models do not include input shape or data type information, so you’ll need to pass those in as dictionaries for each input. Since Squeezenet only has one input (referred to as “input0” below), these dicts will have only one record for both shape and datatype:

# The model name was already set above ("squeezenet_from_pt"); add a short description for the upload.
description = 'squeezenet first test'

# Pass in your shapes and data type for each input
input_shapes = {'input0': [1, 3, 224, 224]}
input_dtype = {'input0': 'float32'}

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Upload the model to the Octomizer.
model = torchscript_model.TorchscriptModel(
    client=client,
    name=model_name,
    model=torchscript_file,
    model_input_shapes=input_shapes,
    model_input_dtypes=input_dtype,
    description=description,
)

Now accelerate the model. By default, the resulting package will be a Python wheel:

wrkflow = model.get_uploaded_model_variant().accelerate(platform="broadwell")

# Dynamically shaped models would require `input_shapes` and `input_dtypes` as additional parameters in the accelerate() call.

# Save the workflow uuid somewhere so you can use it to access benchmark metrics or the resulting package later.
print(wrkflow.uuid)

# Also save the ``model_name`` you used somewhere, because you will need to call `import <model_name>` after you download the resulting package later,
# unless you set a custom package name per https://app.octoml.ai/docs/api/octomizer/octomizer.html?highlight=package#octomizer.model_variant.ModelVariant.accelerate
print(model_name)

After you receive an email notification about the completion of the acceleration workflow, you can view performance benchmark metrics on the hardware you chose and download a packaged version of the accelerated model, either by visiting the UI or invoking the following code:

# Look up the workflow you previously launched using the workflow id
wrkflow = client.get_workflow("<INSERT WORKFLOW ID>")

# To view benchmark metrics, either visit the UI or invoke something similar to:
if wrkflow.completed() and wrkflow.has_benchmark_stage():
    engine = wrkflow.proto.benchmark_stage_spec.engine
    metrics = wrkflow.metrics()
    print(engine, metrics)
    # Save the resulting Python wheel to the current directory.
    wrkflow.save_package(".")

Install the wheel generated by OctoML:

! pip install squeezenet_from_pt-0.1.0-py3-none-any.whl

To test the accelerated model, download a publicly available image from PyTorch:

# Download a picture of a Samoyed dog from PyTorch
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")

try:
    urllib.URLopener().retrieve(url, filename)
except Exception:
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception:
        print("Cannot download image")


# Use boilerplate image processing code from PyTorch-- see https://pytorch.org/hub/pytorch_vision_squeezenet/
from PIL import Image
from torchvision import transforms

input_image = Image.open(filename)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the PyTorch model

Run the accelerated model on the image:

# The module name defaults to the name of the model, if the model has alphanumeric characters only.
# You could also have customized this package name when creating an acceleration workflow.
import squeezenet_from_pt

best_model = squeezenet_from_pt.OctomizedModel()
outputs = best_model.run(input_batch.numpy()) # Run the accelerated model

# The accelerated model outputs a tvm.nd.NDArray object with shape (1,1000),
# with confidence scores over Imagenet's 1000 classes. You can convert the
# output to a numpy array by calling .numpy().

# Find the index of the ImageNet label predicted with highest probability.
pred = np.argmax(outputs[0].numpy())

# Download ImageNet labels
! wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

# Read the labels in ImageNet and print out the label predicted with highest probability
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
    print("Accelerated model detects object: " + categories[pred])

Finally, check that the original, unaccelerated PyTorch model produces the same prediction for the given image:

# Run the original PyTorch model
with torch.no_grad():
    orig_out = model(input_batch)

# The original output has unnormalized scores. To get probabilities, you can run a softmax on it.
orig_probs = torch.nn.functional.softmax(orig_out[0], dim=0)

# Read the labels in ImageNet and print out the label predicted with highest probability
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
    orig_pred = np.argmax(orig_probs.numpy())
    print("Unaccelerated model detects object: " + categories[orig_pred])

End-to-end Transformers question answering example

First install all necessary libraries:

! pip install transformers
! pip install onnxruntime==1.8.0
! pip install octomizer-sdk --upgrade

Next, export a large BERT model finetuned on the SQuAD question answering dataset to ONNX using the exporter in the transformers repo:

! python -m transformers.onnx --model=bert-large-uncased-whole-word-masking-finetuned-squad --feature=question-answering onnx/

Now upload the model to the OctoML platform:

import octomizer.client
import octomizer.models.onnx_model as onnx_model

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token="<INSERT ACCESS TOKEN>")

# Upload the ONNX model
MODEL_NAME = "bert_squad_qa"
ONNX_FILE = "onnx/model.onnx"
accel_model = onnx_model.ONNXModel(client, name=MODEL_NAME, model=ONNX_FILE)

# Check the automatically inferred input shapes.
inputs = accel_model.get_uploaded_model_variant().inputs
print(inputs)

The command above prints ({'input_ids': [-1, -1], 'attention_mask': [-1, -1], 'token_type_ids': [-1, -1]}, {'input_ids': 'int64', 'attention_mask': 'int64', 'token_type_ids': 'int64'}).

Notice the input shapes printed above have negative values, which means they are dynamic and need to be disambiguated. For transformer models, the inputs input_ids, attention_mask, and token_type_ids need to have the same shape: [batch_size, maximum_sequence_length].

In this example, we will specify a batch size of 1 and a maximum sequence length of 128. input_ids holds the IDs of the tokens (words or subwords) in the input sequence. attention_mask is a binary tensor indicating which positions are padding so the model does not attend to them. token_type_ids is a binary mask identifying the two types of sequence in the input, question or context:

input_shapes = {'input_ids': [1, 128], 'attention_mask': [1, 128], 'token_type_ids': [1, 128]}
input_dtypes = inputs[1] # Use the input data types OctoML automatically inferred

On CPU targets, OctoML delivers the best performance on Transformer-based models via optimized use of ONNX-RT and packaging, so we call benchmark() with create_package=True below. For GPU targets, we recommend TVM-based acceleration by calling accelerate() instead of benchmark():

wrkflow = accel_model.get_uploaded_model_variant().benchmark(
    platform="broadwell",
    input_shapes=input_shapes,
    input_dtypes=input_dtypes,
    create_package=True)

# Save the workflow uuid somewhere so you can use it to access benchmark metrics or the resulting package later.
print(wrkflow.uuid)

# Also save the MODEL_NAME you used somewhere, because you will need to call `import <MODEL_NAME>` after you download the resulting package later,
# unless you set a custom package name per https://app.octoml.ai/docs/api/octomizer/octomizer.html?highlight=package#octomizer.model_variant.ModelVariant.accelerate
print(MODEL_NAME)
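
If you are targeting a GPU instead, swap benchmark() for accelerate() as mentioned above. A minimal sketch, assuming you substitute a GPU platform name taken from client.get_hardware_targets() (see Supported Hardware below):

# "<GPU_PLATFORM_NAME>" is a placeholder; pick a real platform name from your account.
wrkflow = accel_model.get_uploaded_model_variant().accelerate(
    platform="<GPU_PLATFORM_NAME>",
    input_shapes=input_shapes,
    input_dtypes=input_dtypes)
print(wrkflow.uuid)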

After you receive an email notification about the completion of the acceleration workflow, you can view performance benchmark metrics on the hardware you chose and download a packaged version of the accelerated model, either by visiting the UI or invoking the following code:

# Look up the workflow you previously launched using the workflow id
wrkflow = client.get_workflow("<INSERT WORKFLOW ID>")

# To view benchmark metrics, either visit the UI or invoke something similar to:
if wrkflow.completed() and wrkflow.has_benchmark_stage():
    engine = wrkflow.proto.benchmark_stage_spec.engine
    metrics = wrkflow.metrics()
    print(engine, metrics)
    # Save the resulting Python wheel to the current directory.
    wrkflow.save_package(".")

Install the wheel generated by OctoML:

! pip install bert_squad_qa-0.1.0-py3-none-any.whl

Import the accelerated model:

import bert_squad_qa

best_model = bert_squad_qa.OctomizedModel()

Now set up a sample input for the accelerated model:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
question, context = "What are some example applications of BERT?", "…BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications."
encoded_input = tokenizer.encode_plus(question, context, return_tensors="np")

Run the accelerated model:

start_scores, end_scores = best_model.run(*encoded_input.values())

Now let’s interpret the results:

import numpy as np
input_ids = encoded_input['input_ids']
tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())

# Find the tokens with the highest `start` and `end` scores.
answer_start = np.argmax(start_scores)
answer_end = np.argmax(end_scores)

# Combine the tokens in the answer and print it out.
answer = ' '.join(tokens[answer_start:answer_end+1])
print(answer)

The question we asked the accelerated model was “What are some example applications of BERT?” The model answered “… bert model can be fine ##tu ##ned with just one additional output layer to create state - of - the - art models for a wide range of tasks , such as question answering and language inference.”

Supported Hardware

To get the supported hardware targets available for acceleration, you can use the following code:

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Get the list of available hardware targets
targets = client.get_hardware_targets()
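
For instance, printing each entry is a quick way to find the platform name to pass to accelerate(). This is a minimal sketch; the exact fields shown for each target depend on your SDK version:

# Print every available hardware target returned by the client.
for target in targets:
    print(target)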

Each hardware target in the list includes information like the display name, platform, vendor name, the number of vCPUs, the name of the architecture, and the supported model runtimes for the target (e.g. TVM, ONNX Runtime, etc.). If we want to accelerate a model on the Intel Cascade Lake target (AWS c5.12xlarge), which has the platform name aws_c5.12xlarge, we can set it as the platform parameter in the accelerate function:

# Assuming the model has already been uploaded and we infer the inputs
wrkflow = model.get_uploaded_model_variant().accelerate(platform="aws_c5.12xlarge")

After your model has been benchmarked, the lscpu information will be included in the benchmark results:

result = wrkflow.result()
print(result.benchmark_result.lscpu_output)

The output will produce something like this:

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              48
On-line CPU(s) list: 0-47
Thread(s) per core:  2
Core(s) per socket:  24
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) Platinum 8275CL CPU @ 3.00GHz
Stepping:            7
CPU MHz:             1941.374
BogoMIPS:            5999.99
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            36608K
NUMA node0 CPU(s):   0-47
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke avx512_vnni

End-to-End: Optimizing PyTorch Model, Packaging it in Triton, and Pushing to AWS ECR

First, make sure you have all necessary libraries installed:

! pip install onnxruntime==1.11.0
! pip install onnx==1.9.0
! pip install torch==1.9.0
! pip install octomizer-sdk --upgrade

Import libraries:

import numpy as np
import octomizer.client
import octomizer.models.torchscript_model as torchscript_model
import torch.onnx
import torchvision
import urllib

Download a public PyTorch model:

model = torchvision.models.squeezenet1_0(pretrained=True)

Export the PyTorch model to Torchscript.

There are two methods for exporting models to Torchscript: scripting and tracing. We strongly recommend tracing to maximize your Torchscript models’ compatibility. Below, we provide an example of tracing a squeezenet model.

For more information on exporting PyTorch models to Torchscript, we recommend this guide https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html:

model_name = "squeezenet_from_pt"
torchscript_file = model_name + ".pt"

# A standard ImageNet input has 3 color channels (RGB) and images of dimension 224x224.
# Image values can be randomized to conduct tracing
rand_input = torch.randn(1, 3, 224, 224)

model.eval()

traced_model = torch.jit.trace(model, rand_input)

traced_model.save(f=torchscript_file)

Next, upload the model to the OctoML platform.

Note: Torchscript models do not include input shape or data type information, so you’ll need to pass those in as dictionaries for each input. Since Squeezenet only has one input (referred to as “input0” below), these dicts will have only one record for both shape and datatype:

# The model name was already set above ("squeezenet_from_pt"); add a short description for the upload.
description = 'squeezenet first test'

# Pass in your shapes and data type for each input
input_shapes = {'input0': [1, 3, 224, 224]}
input_dtype = {'input0': 'float32'}

# Pass your API token below:
client = octomizer.client.OctomizerClient(access_token=MY_ACCESS_TOKEN)

# Upload the model to the Octomizer.
model = torchscript_model.TorchscriptModel(
    client=client,
    name=model_name,
    model=torchscript_file,
    model_input_shapes=input_shapes,
    model_input_dtypes=input_dtype,
    description=description,
)

Now we’ll accelerate the model for an AWS c5n.xlarge instance. This step is likely to take several hours. To test a faster flow, set kernel_trials=0 in the accelerate() call:

wrkflow = model.get_uploaded_model_variant().accelerate(platform="aws_c5n.xlarge")

# Dynamically shaped models would require `input_shapes` and `input_dtypes` as additional parameters in the accelerate() call.

# Save the workflow uuid somewhere so you can use it to access benchmark metrics or the resulting package later.
print(wrkflow.uuid)
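
For a quicker exploratory run, the kernel_trials=0 variant mentioned above looks like the following sketch; expect lower performance than a full acceleration:

# Faster flow: skip kernel tuning trials entirely.
wrkflow = model.get_uploaded_model_variant().accelerate(
    platform="aws_c5n.xlarge",
    kernel_trials=0)
print(wrkflow.uuid)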

After you receive an email notification about the completion of the acceleration workflow, you can view performance benchmark metrics on the hardware you chose.

You can also see the options available for packaging. In this tutorial, we’re focused on the docker_build_triton package:

# Look up the workflow you previously launched using the workflow id
wrkflow = client.get_workflow("<INSERT WORKFLOW ID>")

# To view benchmark metrics, either visit the UI or invoke something similar to:
if wrkflow.completed() and wrkflow.has_benchmark_stage():
    engine = wrkflow.proto.benchmark_stage_spec.engine
    metrics = wrkflow.metrics()
    print(engine, metrics)

    # Here we print the package types that are available. Notice that Triton is among them.
    packages = wrkflow.proto.status.result.package_result
    print(packages)

Once we’ve confirmed we have a docker package available to us, let’s take a moment to prepare the destination for our package: AWS ECR. For more detail on setting up ECR, check out this guide: https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html:

Start by logging into your ECR registry using the AWS CLI:

aws ecr get-login-password --region region | docker login --username AWS --password-stdin aws_account_id.dkr.ecr.region.amazonaws.com

Now that we’re logged in, we need to specify the repository name and tag we’re going to use for the model container image. If you haven’t already created a destination repository in ECR, you’ll need to do so first. Check out the guide here: https://docs.aws.amazon.com/AmazonECR/latest/userguide/repository-create.html:

# Create the repository in ECR (run this in your shell; $MODEL_NAME should match the model name used above):
aws ecr create-repository --repository-name $MODEL_NAME --profile [PROFILE]

# Back in Python, create a local mirror name for that repository, and also specify the tag
mirror_repository_name = '[AWS_ACCOUNT_ID].dkr.ecr.[REGION].amazonaws.com/' + model_name
tag = 'v1'

print(mirror_repository_name + ":" + tag)

Now we are ready to build an image! To do so, we call a special docker_build_triton method on the workflow we just completed. We’ll need to pass in the repository name and tag:

wrkflow.docker_build_triton(mirror_repository_name+":"+tag)

Now that we’ve built the image, let’s confirm it’s in our local registry:

docker images

Finally, let’s push that image to AWS ECR and confirm we’ve succeeded. Note this image is 12GB, so the first push will take a while:

docker push [AWS_ACCOUNT_ID].dkr.ecr.[REGION].amazonaws.com/squeezenet_from_pt:v1

Now that we’ve pushed the image, let’s confirm it is in ECR.

aws ecr list-images --repository-name $MODEL_NAME --profile [PROFILE_NAME]

The optimized model container is now in AWS ECR and ready to be pushed wherever you need it!

Using Docker Tarballs

OctoML’s WebUI provides the option to download Docker tarballs (the “Docker tarballs download” option in the UI).

This tutorial describes how to build Docker images from these tarballs and push these to AWS ECR.

For information about how to run inference against deployed containers, see Running Inference Against the Triton Server.

Get Started

If a “Docker” format is available for your packaged model, do the following:

  • select the package

  • download the package

  • extract the tar.gz package

Example

tar -xzvf your_tarball_name_here.tar.gz

Requirements

To follow this tutorial, you’ll need the following:

  • docker installed on the machine

  • docker daemon running

Script: build.sh

Included with the tarball is a build script: build.sh

This script ensures the docker image is correctly built for the target hardware.

The following sample is provided to ensure a Docker image is built for an x86 hardware target.

#!/bin/sh

if [ $# -lt 1 ]; then
  echo usage: build.sh name:tag, e.g. ecr.io/octoml/model:v1
  exit 1
fi
docker build --tag $1 --platform linux/x86_64 --file docker_build_context/Dockerfile docker_build_context

To use this script, pass a name:tag argument.

Example:

To build the image yolo:latest, replace name:tag with yolo:latest as seen in the command below.

./build.sh yolo:latest

Executing this command will create a docker image specifically for the target hardware the model was accelerated for.

Docker Image Validation

Confirm the docker image was created correctly by issuing the command: docker images.

If this image was built for a different architecture than your local machine, you won’t be able to run it locally, so don’t try to test it locally unless you’re sure the architecture is compatible.

Build your image with a specific registry destination. Below is an example of how to do this with AWS ECR:

  1. push the image to a registry

  2. deploy the image to the preferred hardware

Deploying Built Triton Image to AWS ECR

Login to the ECR registry using the AWS CLI. Enter the AWS_ACCOUNT_ID and REGION:

aws ecr get-login-password --region [REGION] | docker login --username AWS --password-stdin [AWS_ACCOUNT_ID].dkr.ecr.[REGION].amazonaws.com

Login Succeeded displays in the terminal.

Once you are logged in, specify the repository name and tag to use for the model container image.

Note: If you haven’t already created a destination repository in ECR, you’ll need to do so first. For more information, visit the Amazon ECR Guide.

Create the repository in ECR:

aws ecr create-repository --repository-name yolo --profile [PROFILE]

Before pushing the built image to ECR, tag it locally:

docker tag yolo:latest [AWS_ACCOUNT_ID].dkr.ecr.[REGION].amazonaws.com/yolo:latest

Confirm your image is properly tagged in your local registry:

docker images

Finally, push the image to AWS ECR, and confirm success.

At the time of writing, this image is 12GB, so the first push will take some time.

docker push [AWS_ACCOUNT_ID].dkr.ecr.[REGION].amazonaws.com/yolo:latest

With the image pushed, confirm in ECR.

aws ecr list-images --repository-name yolo --profile [PROFILE_NAME]

The optimized model container is now in AWS ECR and ready to be pushed wherever you need it!

Next steps:

Deploying Triton Containers from AWS ECR to AWS EKS

This tutorial provides an example of how to deploy an OctoML container in EKS, assuming you have:

  1. already pushed your OctoML model container to ECR and

  2. already created an EKS cluster for your application.

For information on how to push your OctoML model container to ECR, see Deploying Built Triton Image to AWS ECR.

Make sure the cluster offers the same instance type you accelerated the model for. For example, if you requested a container for an AWS c5n.xlarge instance, you’ll want to make sure your cluster has c5n.xlarge instances available. This will ensure you get the performance you were expecting.

First, make sure you have both the AWS command line interface and kubectl installed on your developer machine. Then, set your kubectl context to your AWS EKS cluster.

aws eks update-kubeconfig --name [AWS_CLUSTER_NAME] --region [REGION] --profile [PROFILE]

Make sure your cluster has nodes available to match the hardware you specified when you created your accelerated model container. To do so, create a managed node group of the appropriate instance type using the guide: Creating a managed node group
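
As a sketch, one way to add such a node group is with eksctl; the node group name below is just an example:

eksctl create nodegroup \
  --cluster [AWS_CLUSTER_NAME] \
  --region [REGION] \
  --profile [PROFILE] \
  --name triton-c5n-xlarge \
  --node-type c5n.xlarge \
  --nodes 1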

Once nodes with the relevant instance type are made available to the EKS cluster, the Triton container can be deployed using the Helm Chart.

The following bash script uses this helm chart to create a minimal deployment of Triton in EKS:

#!/bin/sh
aws_account_id=xxxxxxxxxx
REGION=us-west-2
registry=${aws_account_id}.dkr.ecr.${REGION}.amazonaws.com
DOCKER_IMAGE_NAME=triton-example
DOCKER_IMAGE_TAG=v1

cat << EOF > "values.yaml"
imageName: $registry/${DOCKER_IMAGE_NAME}
imageTag: $DOCKER_IMAGE_TAG
imageCredentials:
  registry: $registry
  username: AWS
  password: $(aws ecr get-login-password --region "$REGION")
nodeSelector:
  node.kubernetes.io/instance-type: c5n.xlarge
EOF

## Install helm chart to the EKS cluster via helm
if ! helm repo list | grep octoml-helm-charts; then
  # NOTE: substitute the OctoML helm chart repository URL for the placeholder below.
  helm repo add octoml-helm-charts <HELM_REPO_URL>
fi

helm repo update octoml-helm-charts

helm install triton-example octoml-helm-charts/octoml-triton -n triton-example --create-namespace --values "values.yaml" --atomic --timeout 7m

As part of the helm chart installation, the following components will be created:

  • A pod, which holds the Triton container itself

  • A deployment, which manages the lifecycle of the Triton pod

  • A service, which exposes the pod to clients outside the cluster

The values.yaml file used in the example above utilizes the following fields:

  • imageName. The name of the image. Since we’re pulling the image from ECR, we need the registry name as a prefix.

  • imageTag. The tag identifying the Triton image to be pulled.

  • imageCredentials. Credentials used by EKS to pull the Triton image into the pod. Note the use of aws ecr get-login-password, which can be invoked from an authenticated command line client to pull down ECR credentials.

  • nodeSelector. A map of node selectors to be used by the Triton pod. Node selectors force a pod to be scheduled on nodes with specific label values. This example leverages the built-in node.kubernetes.io/instance-type label, which is automatically added to all nodes created by EKS, to make sure that the Triton pod lands on a c5n.xlarge instance.

While the above example is strictly functional (to run inference on this and similar deployments, see Remote Inference on Triton Server), there are certain optional components missing:

While the Triton inference server in the above example can be accessed either by neighboring pods in the cluster or authenticated clients using kubectl port-forward, it cannot be accessed by an arbitrary anonymous client. To allow for this, consider adding the following to values.yaml:

service:
  type: LoadBalancer

This will provision the Triton service with type LoadBalancer, which will automatically create an AWS elastic load balancer pointing to the Triton pod. To get the DNS name of this load balancer, you can query the service like so:

kubectl get svc -n triton-example
NAME             TYPE           CLUSTER-IP      EXTERNAL-IP                                                              PORT(S)                                        AGE
triton-example   LoadBalancer   172.20.67.101   aad62a015a3a241ef888db24515c4786-974307675.us-west-2.elb.amazonaws.com   8001:32182/TCP,8000:30248/TCP,8002:31026/TCP   2m

Alternatively, if your cluster has an externally-accessible ingress controller you can make Triton accessible through an ingress resource. To do so, add the following to your helm chart values file:

ingress:
  enabled: true
  grpc:
    host: <GRPC_HOSTNAME>
  http:
    host: <HTTP_HOSTNAME>

Doing so will allow helm to create ingress resources for both HTTP and GRPC access to the Triton pods. Assuming that <GRPC_HOSTNAME> and <HTTP_HOSTNAME> both redirect to the IP address of your ingress controller, you should be able to access the inference server this way.

Important: the NGINX Ingress Controller, which is one of the more prevalent ones in use today, does not allow for plaintext GRPC communications. If you plan to use this helm chart with that particular controller, you will have to add a TLS Secret:

ingress:
  enabled: true
  grpc:
    host: <GRPC_HOSTNAME>
  http:
    host: <HTTP_HOSTNAME>
    tls:
      - hosts:
          - <GRPC_HOSTNAME>
        secretName: <TLS_SECRET>

While using node selectors as in the example above guarantees that the Triton pod will land on the correct hardware, it does not guarantee that other resource-hungry pods won’t land on the same node. This can lead to the busy neighbor problem, wherein the Triton server’s performance is hindered by excess resource contention. This can be prevented through the use of taints and tolerations, which limit which nodes pods are allowed to schedule on. In this example, start by applying the following taint to the c5n.xlarge node pool:

triton-instance-type=c5n.xlarge:NoSchedule
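
One way to apply this taint is with kubectl and the built-in instance-type label, sketched below; EKS managed node groups also let you configure taints on the node group itself:

kubectl taint nodes -l node.kubernetes.io/instance-type=c5n.xlarge \
  triton-instance-type=c5n.xlarge:NoSchedule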

This taint makes it so that all pods which don’t have the corresponding triton-instance-type=c5n.xlarge toleration cannot be scheduled. To allow our triton pods to schedule on these nodes once more, we can add the following to the helm chart, which will configure the relevant toleration:

tolerations:
  - key: triton-instance-type
    value: c5n.xlarge
    operator: Equal
    effect: NoSchedule

This ensures that the Triton pod will perform without significant resource contention.

Next steps:

Remote Inference on Triton Server

This tutorial provides an example of how to run inference against a deployed Triton container in EKS. It is broken into two parts: accessing the Triton service in Kubernetes, and running inferences against the server.

Accessing the Triton Server

For simplicity, we will be working from the example helm chart deployment presented in the latter half of the guide Deploying Triton Containers from AWS ECR to AWS EKS. Three methods will be presented, organized in order of production-readiness:

Method 1: Using kubectl

The first and easiest method to access the deployed Triton container is to use kubectl port-forward, which allows an authenticated Kubernetes client to forward traffic from their local machine to a Kubernetes pod or service. To do so for our example, issue the following command:

kubectl port-forward service/triton-example -n triton-example 8000:8000 8001:8001

This will make the Triton service reachable from both port 8000 (http) and port 8001 (grpc) on localhost. To verify the Triton server is reachable, you can execute the following script from the same machine:

import tritonclient.grpc as grpc_client
import numpy as np
from PIL import Image as PyImage

triton = grpc_client.InferenceServerClient("localhost:8001")

model_name = "<MODEL_NAME>"

config = triton.get_model_config(model_name=model_name).config
print(config)

Where <MODEL_NAME> is the name of the model accelerated by Octomizer. You can find your model name within the tarball.

Under the directory docker_build_context/octoml/models there is a directory containing your model. The name of the directory is the name of your model.

For example, you may find a directory: docker_build_context/octoml/models/rf_iris

In this case, the name of your model is rf_iris.
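
A quick way to list the bundled model directories from the extracted tarball’s base directory:

ls docker_build_context/octoml/models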

Method 2: Accessing through a service of type ``LoadBalancer``

If you deployed your helm chart using a service of type LoadBalancer, you can hit the inference service from anywhere using the service’s external IP. For instance, if the external IP is 1.2.3.4, you can invoke the script above with the following line changed:

triton = grpc_client.InferenceServerClient("1.2.3.4:8001")

Method 3: Accessing through an ingress resource

In the previous section, we covered using the provided helm chart to optionally provision ingress resources to open the Triton service to the outside world. Once these are set up, accessing the service is as easy as making the following modification to the code used in the first method:

triton = grpc_client.InferenceServerClient(<GRPC_HOSTNAME>)

Where <GRPC_HOSTNAME> is the previously-chosen hostname for the GRPC service endpoint.

Running Inference Against the Triton Server

The next step is to perform an inference on your model. The model configuration describes the inputs your model expects and the outputs you can expect from your model, including the names, shapes, and datatypes of each input or output. We use this information to construct an inference request using Triton’s gRPC inference client, or an HTTP client, as we will demonstrate later on.

The following is an example model configuration with one input named X, and two outputs named label and probabilities:

name: "rf_iris"
platform: "onnxruntime_onnx"
input [
  {
    name: "X"
    data_type: TYPE_FP64
    dims: -1
    dims: 4
  }
]
output [
  {
    name: "label"
    data_type: TYPE_INT64
    dims: -1
  },
  {
    name: "probabilities"
    data_type: TYPE_FP32
    dims: -1
    dims: 3
  }
]

To begin performing an inference, first install the following packages:

pip install numpy requests tritonclient[all]

Then, navigate to the model directory relative to your tarball’s base directory, docker_build_context/octoml/models/<model_name>. Here, you’ll find two files, inference_grpc.py and inference_http.py, which demonstrate making inferences over gRPC and over HTTP (using the Python requests library), respectively.

Here’s an example of performing an inference using the gRPC client on a model trained against the Iris Data Set.

The model has one input with four features, each a 64-bit floating point number, and two outputs representing the label as an integer and the probabilities of each label as an array of 32-bit floating point numbers.

import numpy
import tritonclient.grpc as grpc_client

url = "localhost:8001"
client = grpc_client.InferenceServerClient(url)

inputs = []
input_name_0 = "X"
input_shape_0 = [1, 4]
input_datatype_0 = numpy.float64
input_data_0 = numpy.ones(input_shape_0, input_datatype_0)
triton_datatype_0 = "FP64"

tensor = grpc_client.InferInput(
    name=input_name_0,
    shape=input_shape_0,
    datatype=triton_datatype_0
)
tensor.set_data_from_numpy(input_data_0)
inputs.append(tensor)

outputs = []
output_name_0 = "label"
tensor = grpc_client.InferRequestedOutput(output_name_0)
outputs.append(tensor)
output_name_1 = "probabilities"
tensor = grpc_client.InferRequestedOutput(output_name_1)
outputs.append(tensor)

inferences = client.infer(
    model_name="rf_iris",
    model_version="1",
    inputs=inputs,
    outputs=outputs
)

print(inferences.as_numpy(name="label"))

Here is the same example, but using HTTP and the requests library to perform the inference, as found in inference_http.py:

import json

import numpy
import requests

post_data = {'inputs': []}

input_name_0 = "X"
input_shape_0 = [1, 4]
input_datatype_0 = numpy.float64
input_data_0 = numpy.ones(input_shape_0, input_datatype_0)
triton_datatype_0 = "FP64"

post_data['inputs'].append({
    'name': input_name_0,
    'shape': input_shape_0,
    'datatype': triton_datatype_0,
    'data': input_data_0.tolist()
})

model_name = "rf_iris"
result = requests.post(
    "http://localhost:8000/v2/models/" + model_name + "/versions/1/infer",
    data=json.dumps(post_data))
print(result.json())