OctoML Python SDK

OctoML provides a Python SDK that offers an easy-to-use wrapper around the OctoML RPC Interface. The SDK's API is documented in OctoML Python API.
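
The client authenticates with the API token stored in the OCTOMIZER_API_TOKEN environment variable, so getting started requires nothing more than instantiating the client. The minimal sketch below uses only the calls that appear in the full examples that follow:

# Set the token in your shell before starting Python:
#   % export OCTOMIZER_API_TOKEN=<your API token>

import octomizer.client

# The client reads OCTOMIZER_API_TOKEN from the environment.
client = octomizer.client.OctomizerClient()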

Below is a simple example of using the Python SDK to benchmark and optimize an ONNX model with static inputs, adding it to a new Project named My Project.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads an ONNX version of the MNIST model, benchmarks it on ONNX-RT, accelerates it, and
returns both the accelerated model benchmark results and a Python package for the
optimized model.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_example.py

Windows-based users should set their API token as an environment
variable rather than use the export command, which is not supported
in some versions of Windows.

Typical output will look like this:

Uploading model tests/testdata/mnist.onnx...
Benchmarking using ONNX-RT...
ONNX-RT benchmark metrics:
latency_mean_ms: 0.1281123161315918
latency_std_ms: 0.3681384027004242

Waiting for OctoML to complete...
TVM benchmark metrics:
latency_mean_ms: 0.07081685215234756
latency_std_ms: 0.019525768235325813
compile_ms: 1084.7218017578125
full_metrics_dataref_uuid: "7df384b3-1d3d-4ef0-ae25-e57588090876"

Saved packaged model to ./mnist_octomized-0.1.0-py3-none-any.whl
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.onnx_model as onnx_model
import octomizer.project as project
from octomizer.model_variant import AutoschedulerOptions


def main():
    # Specify the model file and input layer parameters.
    onnx_model_file = "tests/testdata/mnist.onnx"

    # Specify the Python package name for the resulting model.
    model_package_name = "mnist_octomized"

    # Specify the platform to target.
    platform = "broadwell"

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_static_example.py",
    )

    # Upload the ONNX model file.
    print(f"Uploading model {onnx_model_file}...")
    model = onnx_model.ONNXModel(
        client,
        name=model_package_name,
        model=onnx_model_file,
        description="Created by octomizer_static_example.py",
        project=my_project,
    )
    model_variant = model.get_uploaded_model_variant()

    # Benchmark the model and get results. MNIST is a small and simple model,
    # so this should return quickly.
    benchmark_workflow = model_variant.benchmark(platform, create_package=False)
    print("Benchmarking using ONNX-RT...")
    benchmark_workflow.wait()
    if not benchmark_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {benchmark_workflow.status()}"
        )
    metrics = benchmark_workflow.metrics()
    print(f"ONNX-RT benchmark metrics:\n{metrics}")

    # Accelerate the model. Since this is a small model and this is only an example,
    # we set the values in `tuning_options` ridiculously low so it does not take too long to
    # complete. Normally these values would be much higher (the defaults are 1000 and 250,
    # respectively).
    #
    # See the python API documentation for the full list of tuning and packaging options.
    octomize_workflow = model_variant.octomize(
        platform,
        tuning_options=AutoschedulerOptions(
            trials_per_kernel=3, early_stopping_threshold=1
        ),
    )

    # Save the workflow uuid somewhere so you can use it later
    print(octomize_workflow.uuid)

    # After you receive an email notification about the completion of the acceleration workflow,
    # you can view performance benchmark metrics on the hardware you chose and download a packaged
    # version of the accelerated model, either by invoking the following code or by visiting the UI:

    # Look up the workflow you previously launched using the workflow uuid
    octomize_workflow = client.get_workflow("<INSERT WORKFLOW ID>")

    if not octomize_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {octomize_workflow.status()}"
        )
    metrics = octomize_workflow.metrics()
    # Don't be surprised if the TVM numbers are slower than ONNX-RT -- we didn't
    # run enough rounds of autotuning to ensure great performance!
    print(f"TVM benchmark metrics:\n{metrics}")

    # Download the package for the optimized model.
    # To save a specific package type (the default is a Python package), you can pass
    # package_type=octomizer.package_type.PackageType.PYTHON_PACKAGE or LINUX_SHARED_OBJECT.
    # Note that not all package types may be available for a given engine.
    output_filename = octomize_workflow.save_package(out_dir=".")
    print(f"Saved packaged model to {output_filename}")


if __name__ == "__main__":
    main()
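
If you prefer not to wait for the email notification, you can also block in the same script until the acceleration workflow finishes. The sketch below is a variant of the steps above: it assumes the `octomize_workflow` object created in the example and reuses only calls already shown in these examples (`wait()`, `completed()`, `status()`, `metrics()`, and `save_package()`), plus the `package_type` argument mentioned in the comments.

import octomizer.package_type

# Block until the acceleration workflow finishes (a sketch; assumes the
# `octomize_workflow` object created in the example above).
print("Waiting for OctoML to complete...")
octomize_workflow.wait()
if not octomize_workflow.completed():
    raise RuntimeError(
        f"Workflow has not completed, status is {octomize_workflow.status()}"
    )
print(f"TVM benchmark metrics:\n{octomize_workflow.metrics()}")

# Download a specific package type instead of the default Python wheel.
output_filename = octomize_workflow.save_package(
    out_dir=".",
    package_type=octomizer.package_type.PackageType.LINUX_SHARED_OBJECT,
)
print(f"Saved packaged model to {output_filename}")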

Below is a simple example of using the Python SDK to benchmark and optimize an ONNX model with dynamic inputs, adding it to a new Project named My Project.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads an ONNX version of the mobilenet model, benchmarks it on ONNX-RT, accelerates it,
and returns both the accelerated model benchmark results and a Python package for the
optimized model.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_example.py

Windows-based users should set their API token as an environment variable rather than use the
export command, which is not supported in some versions of Windows.

Typical output will look like this:

Uploading model tests/testdata/mobilenet.onnx...
Benchmarking using ONNX-RT...
ONNX-RT benchmark metrics:
latency_mean_ms: 0.1281123161315918
latency_std_ms: 0.3681384027004242

Waiting for OctoML to complete...
TVM benchmark metrics:
latency_mean_ms: 0.07081685215234756
latency_std_ms: 0.019525768235325813
compile_ms: 1084.7218017578125
full_metrics_dataref_uuid: "7df384b3-1d3d-4ef0-ae25-e57588090876"

Saved packaged model to ./mobilenet_octomized-0.1.0-py3-none-any.whl
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.onnx_model as onnx_model
import octomizer.project as project
from octomizer.model_variant import AutoschedulerOptions


def main():
    # Specify the model file and input layer parameters.
    onnx_model_file = "tests/testdata/mobilenet.onnx"

    # Specify the Python package name for the resulting model.
    model_package_name = "mobilenet_octomized"

    # Specify the platform to target.
    platform = "broadwell"

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_dynamic_example.py",
    )

    # Upload the ONNX model file.
    print(f"Uploading model {onnx_model_file}...")
    model = onnx_model.ONNXModel(
        client,
        name=model_package_name,
        model=onnx_model_file,
        description="Created by octomizer_dynamic_example.py",
        project=my_project,
    )
    model_variant = model.get_uploaded_model_variant()

    # The inputs are not fully inferrable: the batch dimension is dynamic (-1).
    inferred_input_shapes, inferred_input_dtypes = model_variant.inputs
    assert inferred_input_shapes == {"input:0": [-1, 3, 224, 224]}
    assert inferred_input_dtypes == {"input:0": "float32"}

    # Disambiguate the inputs. Notice the -1 has been replaced by 1.
    input_shapes = {"input:0": [1, 3, 224, 224]}
    input_dtypes = {"input:0": "float32"}

    # Benchmark the model and get results. Mobilenet is a small and simple model,
    # so this should return quickly.
    benchmark_workflow = model_variant.benchmark(
        platform=platform,
        input_shapes=input_shapes,
        input_dtypes=input_dtypes,
        create_package=False,
    )
    print("Benchmarking using ONNX-RT...")
    benchmark_workflow.wait()
    if not benchmark_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {benchmark_workflow.status()}"
        )
    metrics = benchmark_workflow.metrics()
    print(f"ONNX-RT benchmark metrics:\n{metrics}")

    # Accelerate the model. Since this is a small model and this is only an example,
    # we set the values in `tuning_options` ridiculously low so it does not take too long to
    # complete. Normally these values would be much higher (the defaults are 1000 and 250,
    # respectively).
    #
    # See the python API documentation for the full list of tuning and packaging options.
    octomize_workflow = model_variant.octomize(
        platform,
        tuning_options=AutoschedulerOptions(
            trials_per_kernel=3, early_stopping_threshold=1
        ),
        input_shapes=input_shapes,
        input_dtypes=input_dtypes,
    )

    # Save the workflow uuid somewhere so you can use it later
    print(octomize_workflow.uuid)

    # After you receive an email notification about the completion of the acceleration workflow,
    # you can view performance benchmark metrics on the hardware you chose and download a packaged
    # version of the accelerated model, either by invoking the following code or by visiting the UI:

    # Look up the workflow you previously launched using the workflow uuid
    octomize_workflow = client.get_workflow("<INSERT WORKFLOW ID>")

    if not octomize_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {octomize_workflow.status()}"
        )
    metrics = octomize_workflow.metrics()

    # Don't be surprised if the TVM numbers are slower than ONNX-RT -- we didn't
    # run enough rounds of autotuning to ensure great performance!
    print(f"TVM benchmark metrics:\n{metrics}")

    # Download the package for the optimized model.
    # To save a specific package type (the default is a Python package), you can pass
    # package_type=octomizer.package_type.PackageType.PYTHON_PACKAGE or LINUX_SHARED_OBJECT.
    # Note that not all package types may be available for a given engine.
    output_filename = octomize_workflow.save_package(out_dir=".")
    print(f"Saved packaged model to {output_filename}")


if __name__ == "__main__":
    main()

Below is a simple example of using the Python SDK to benchmark and optimize a TensorFlow SavedModel (including Keras SavedModels), adding it to a new Project named My Project.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads a TensorFlow version of the MNIST model, benchmarks it on TensorFlow's runtime,
accelerates it, and returns both the accelerated model benchmark results and a Python
package for the optimized model.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_example.py

Windows-based users should set their API token as an environment
variable rather than use the export command, which is not supported
in some versions of Windows.

Typical output will look like this:

Uploading model tests/testdata/mnist.tgz...
Benchmarking using TensorFlow runtime...
TensorFlow runtime benchmark metrics:
latency_mean_ms: 0.1281123161315918
latency_std_ms: 0.3681384027004242

Waiting for OctoML to complete...
TVM benchmark metrics:
latency_mean_ms: 0.07081685215234756
latency_std_ms: 0.019525768235325813
compile_ms: 1084.7218017578125
full_metrics_dataref_uuid: "7df384b3-1d3d-4ef0-ae25-e57588090876"

Saved packaged model to ./mnist_octomized-0.1.0-py3-none-any.whl
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.tensorflow_saved_model as tf_model
import octomizer.project as project
from octomizer.model_variant import AutoschedulerOptions


def main():
    # Specify the model file and input layer parameters.
    tf_model_file = "tests/testdata/mnist.tgz"

    # Specify the Python package name for the resulting model.
    model_package_name = "mnist_octomized"

    # Specify the platform to target.
    platform = "broadwell"

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_tensorflow_saved_model.py",
    )

    # Upload the TensorFlow model file.
    # Note: TensorFlowSavedModel supports both TF SavedModels and
    # Keras SavedModel formats.
    print(f"Uploading model {tf_model_file}...")
    model = tf_model.TensorFlowSavedModel(
        client,
        name=model_package_name,
        model=tf_model_file,
        description="Created by octomizer_tensorflow_saved_model.py",
        project=my_project,
    )
    model_variant = model.get_uploaded_model_variant()

    # Benchmark the model and get results. MNIST is a small and simple model,
    # so this should return quickly.
    benchmark_workflow = model_variant.benchmark(platform)
    print("Benchmarking using TensorFlow runtime...")
    benchmark_workflow.wait()
    if not benchmark_workflow.completed():
        raise RuntimeError(
            f"Workflow did not complete, status is {benchmark_workflow.status()}"
        )
    metrics = benchmark_workflow.metrics()
    print(f"TensorFlow runtime benchmark metrics:\n{metrics}")

    # Accelerate the model. Since this is a small model and this is only an example,
    # we set the values in `tuning_options` ridiculously low so it does not take too long to
    # complete. Normally these values would be much higher (the defaults are 1000 and 250,
    # respectively).
    #
    # See the python API documentation for the full list of tuning options.
    octomize_workflow = model_variant.octomize(
        platform,
        tuning_options=AutoschedulerOptions(
            trials_per_kernel=3, early_stopping_threshold=1
        ),
    )
    print("Waiting for OctoML to complete...")
    octomize_workflow.wait()
    if not octomize_workflow.completed():
        raise RuntimeError(
            f"Workflow did not complete, status is {octomize_workflow.status()}"
        )
    metrics = octomize_workflow.metrics()
    # Don't be surprised if the TVM numbers are slower than TensorFlow runtime -- we didn't
    # run enough rounds of autotuning to ensure great performance!
    print(f"TVM benchmark metrics:\n{metrics}")

    # Download the package for the optimized model.
    # To save a specific package type (the default is a Python package), you can pass
    # package_type=octomizer.package_type.PackageType.PYTHON_PACKAGE or LINUX_SHARED_OBJECT.
    # Note that not all package types may be available for a given engine.
    output_filename = octomize_workflow.save_package(out_dir=".")
    print(f"Saved packaged model to {output_filename}")


if __name__ == "__main__":
    main()

Below is a simple example of using the Python SDK to benchmark and optimize a TensorFlow GraphDef model, adding it to a new Project named My Project.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads a TensorFlow version of the MNIST model, benchmarks it on TensorFlow's runtime,
accelerates it, and returns both the accelerated model benchmark results and a Python
package for the optimized model.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_example.py

Windows-based users should set their API token as an environment
variable rather than use the export command, which is not supported
in some versions of Windows.

Typical output will look like this:

Uploading model tests/testdata/mnist.pb...
Benchmarking using TensorFlow runtime...
TensorFlow runtime benchmark metrics:
latency_mean_ms: 0.1281123161315918
latency_std_ms: 0.3681384027004242

Waiting for OctoML to complete...
TVM benchmark metrics:
latency_mean_ms: 0.07081685215234756
latency_std_ms: 0.019525768235325813
compile_ms: 1084.7218017578125
full_metrics_dataref_uuid: "7df384b3-1d3d-4ef0-ae25-e57588090876"

Saved packaged model to ./mnist_octomized-0.1.0-py3-none-any.whl
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.tensorflow_graph_def_model as tf_model
import octomizer.project as project
from octomizer.model_variant import AutoschedulerOptions


def main():
    # Specify the model file and input layer parameters.
    tf_model_file = "tests/testdata/mnist.pb"

    # Specify the Python package name for the resulting model.
    model_package_name = "mnist_octomized"

    # Specify the platform to target.
    platform = "broadwell"

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_tensorflow_graph_def.py",
    )

    # Upload the TensorFlow model file.
    print(f"Uploading model {tf_model_file}...")
    model = tf_model.TensorFlowGraphDefModel(
        client,
        name=model_package_name,
        model=tf_model_file,
        description="Created by octomizer_tensorflow_graph_def.py",
        project=my_project,
    )
    model_variant = model.get_uploaded_model_variant()

    # Benchmark the model and get results. MNIST is a small and simple model,
    # so this should return quickly.
    benchmark_workflow = model_variant.benchmark(platform)
    print("Benchmarking using TensorFlow runtime...")
    benchmark_workflow.wait()
    if not benchmark_workflow.completed():
        raise RuntimeError(
            f"Workflow did not complete, status is {benchmark_workflow.status()}"
        )
    metrics = benchmark_workflow.metrics()
    print(f"TensorFlow runtime benchmark metrics:\n{metrics}")

    # Accelerate the model. Since this is a small model and this is only an example,
    # we set the values in `tuning_options` ridiculously low so it does not take too long to
    # complete. Normally these values would be much higher (the defaults are 1000 and 250,
    # respectively).
    #
    # See the python API documentation for the full list of tuning options.
    octomize_workflow = model_variant.octomize(
        platform,
        tuning_options=AutoschedulerOptions(
            trials_per_kernel=3, early_stopping_threshold=1
        ),
    )
    print("Waiting for OctoML to complete...")
    octomize_workflow.wait()
    if not octomize_workflow.completed():
        raise RuntimeError(
            f"Workflow did not complete, status is {octomize_workflow.status()}"
        )
    metrics = octomize_workflow.metrics()
    # Don't be surprised if the TVM numbers are slower than TensorFlow runtime -- we didn't
    # run enough rounds of autotuning to ensure great performance!
    print(f"TVM benchmark metrics:\n{metrics}")

    # Download the package for the optimized model.
    # To save a specific package type (the default is a Python package), you can pass
    # package_type=octomizer.package_type.PackageType.PYTHON_PACKAGE or LINUX_SHARED_OBJECT.
    # Note that not all package types may be available for a given engine.
    output_filename = octomize_workflow.save_package(out_dir=".")
    print(f"Saved packaged model to {output_filename}")


if __name__ == "__main__":
    main()

Below is a simple example of using the Python SDK to benchmark and optimize a TFLite model, adding it to a new Project named My Project.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads a TFLite model, benchmarks it with the TFLite runtime, accelerates it, and
returns both the accelerated model benchmark results and a Python package for
the optimized model.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_example_tflite.py

Windows-based users should set their API token as an environment variable
rather than use the export command, which is not supported in some versions
of Windows.

Typical output will look like this:

Uploading model tests/testdata/mnist.tflite...
Benchmarking using TFLite runtime
TFLite runtime benchmark metrics:
latency_mean_ms: 0.10985320061445236
latency_std_ms: 0.00474481750279665

Waiting for OctoML to complete...
TVM benchmark metrics:
latency_mean_ms: 0.0573916994035244
latency_std_ms: 0.007800986059010029

Saved packaged model to ./mnist_tflite_octomized-0.1.0-py3-none-any.whl
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.tflite_model as tflite_model
import octomizer.project as project
from octomizer.model_variant import AutoschedulerOptions


def main():
    # Specify the model file and input layer parameters.
    tflite_model_file = "tests/testdata/mnist.tflite"

    # Specify the Python package name for the resulting model.
    model_package_name = "mnist_tflite_octomized"

    # Specify the platform to target.
    platform = "rasp4b"

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_example_tflite.py",
    )

    # Upload the TFLite model file.
    print(f"Uploading model {tflite_model_file}...")
    model = tflite_model.TFLiteModel(
        client,
        name=model_package_name,
        model=tflite_model_file,
        description="Created by octomizer_example_tflite.py",
        project=my_project,
    )
    model_variant = model.get_uploaded_model_variant()

    # Benchmark the model and get results. MNIST is a small and simple model,
    # so this should return quickly. By default, TFLite models will be benchmarked
    # using TFLite runtime unless otherwise specified by using the `use_onnx_engine`
    # or `untuned_tvm` flags.
    benchmark_workflow = model_variant.benchmark(platform)
    print("Benchmarking using TFLite runtime")

    # To benchmark using ONNX-RT:
    # benchmark_workflow = model_variant.benchmark(platform, use_onnx_engine=True)

    # To benchmark using TVM (untuned):
    # benchmark_workflow = model_variant.benchmark(platform, untuned_tvm=True)

    benchmark_workflow.wait(timeout=1200)
    if not benchmark_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {benchmark_workflow.status()}"
        )
    metrics = benchmark_workflow.metrics()
    print(f"TFLite runtime benchmark metrics:\n{metrics}")

    # Accelerate the model. Since this is a small model and this is only an example,
    # we set the values in `tuning_options` ridiculously low so it does not take too long to
    # complete. Normally these values would be much higher (the defaults are 1000 and 250,
    # respectively).
    #
    # See the python API documentation for the full list of tuning and packaging options.
    octomize_workflow = model_variant.octomize(
        platform,
        tuning_options=AutoschedulerOptions(
            trials_per_kernel=3, early_stopping_threshold=1
        ),
    )

    # Save the workflow uuid somewhere so you can use it later
    print(octomize_workflow.uuid)

    # After you receive an email notification about the completion of the acceleration workflow,
    # you can view performance benchmark metrics on the hardware you chose and download a packaged
    # version of the accelerated model, either by invoking the following code or by visiting the UI:

    # Look up the workflow you previously launched using the workflow uuid
    octomize_workflow = client.get_workflow(octomize_workflow.uuid)

    print("Waiting for OctoML to complete...")
    octomize_workflow.wait()
    if not octomize_workflow.completed():
        raise RuntimeError(
            f"Workflow has not completed, status is {octomize_workflow.status()}"
        )
    metrics = octomize_workflow.metrics()
    # Don't be surprised if the TVM numbers are slower than TFLite runtime -- we didn't
    # run enough rounds of autotuning to ensure great performance!
    print(f"TVM benchmark metrics:\n{metrics}")

    # Download the package for the optimized model.
    # To save a specific package type (the default is a Python package), you can pass
    # package_type=octomizer.package_type.PackageType.PYTHON_PACKAGE or LINUX_SHARED_OBJECT.
    # Note that not all package types may be available for a given engine.
    output_filename = octomize_workflow.save_package(out_dir=".")
    print(f"Saved packaged model to {output_filename}")


if __name__ == "__main__":
    main()

Below is a simple example of adding Models to a Project and listing all the Models that belong to it.

#!/usr/bin/env python3

"""
This is a simple example of use of the OctoML API. It creates a Project named My Project,
uploads two ONNX models, and adds them to the Project, and finally lists all Models
belonging to the Project.

After installing the OctoML SDK, run as:
  % export OCTOMIZER_API_TOKEN=<your API token>
  % ./octomizer_project_example.py

Windows-based users should set their API token as an environment
variable rather than use the export command, which is not supported
in some versions of Windows.
"""

from __future__ import annotations

import octomizer.client
import octomizer.models.onnx_model as onnx_model
import octomizer.project as project


def main():
    # Specify the model files.
    onnx_model_files = [
        ("tests/testdata/mnist.onnx", "mnist"),
        ("tests/testdata/mobilenet.onnx", "mobilenet"),
    ]

    # Create the OctoML Client instance.
    client = octomizer.client.OctomizerClient()

    # Create a Project named My Project.
    my_project = project.Project(
        client,
        name="My Project",
        description="Created by octomizer_project_example.py",
    )

    # Upload the ONNX model files.
    for onnx_model_file, model_package_name in onnx_model_files:
        print(f"Uploading model {onnx_model_file}...")
        _ = onnx_model.ONNXModel(
            client,
            name=model_package_name,
            model=onnx_model_file,
            description="Created by octomizer_project_example.py",
            project=my_project,
        )

    # List all the Models belonging to the Project, which should be the two uploaded above.
    project_models = my_project.list_models()

    # Print all the Models belonging to the Project.
    print(
        f"Models belonging to the Project: {list(model.uuid for model in project_models)}"
    )

    # It is also possible to create detached Models that do not belong to any Project:
    # simply omit the project parameter or pass None. (This reuses the last model file
    # from the loop above.)
    detached_model = onnx_model.ONNXModel(
        client,
        name=model_package_name,
        model=onnx_model_file,
        description="Created by octomizer_project_example.py",
    )
    assert detached_model.project is None


if __name__ == "__main__":
    main()