Note

Go to the end to download the full example code.

Model deployment and tracking with Embedl Hub#

In this tutorial, we demonstrate an end-to-end workflow for deploying vision models using embedl_deploy for TensorRT and tracking the artifacts with Embedl Hub.

The models selected are from the torchvision library, covering a range of architectures: ConvNeXt Base, ResNet-50, and ViT-B/16. We apply post-training static quantization (PTQ) using the built-in TensorRT pattern set. Embedl Deploy automatically handles special cases such as depthwise convolutions, which are memory-bound and left in FP16 to avoid quantization overhead.

The pipeline after transformation and quantization for deploying a model consists of the following steps:

Export the PyTorch model to ONNX and simplify with onnxsim.
Build a TensorRT engine (FP16 for baseline, --best for QDQ models).
Run TensorRT inference.
Measure latency with the TensorRT Python API.

Note

This tutorial requires an NVIDIA GPU with TensorRT 10.x installed. You will need to create an account at hub.embedl.com and create an API key for logging (in the profile section). You can run pip install embedl-hub to install the hub client and then set up the API keys with: embedl-hub auth --api-key <YOUR_API_KEY>

Constants#

import json
import platform
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin

import tensorrt as trt
import torch
from torch import nn

from embedl_hub.tracking import Client

try:
    from tensorrt_inference import (
        BATCH_SIZE,
        CALIBRATION_BATCHES,
        DATA_DIR,
        IMAGENET_MEAN,
        IMAGENET_STD,
        IMAGENETTE_DIR,
        IMAGENETTE_TO_IMAGENET,
        IMAGENETTE_URL,
        NUM_WORKERS,
        ImageLoader,
        build_trt_engine,
        evaluate_trt,
        make_loaders,
        measure_latency,
    )
except ImportError:
    raise ImportError(
        "This tutorial requires the companion file 'tensorrt_inference.py' "
        "which provides utilities for downloading datasets, compiling models "
        "and measuring latencies. Please download it from the same directory "
        "as this tutorial."
    ) from None

# Name of an environment variable for the Hub API key.
# NOTE(review): not referenced anywhere else in the visible code - confirm
# whether it is still needed.
API_KEY_ENV = "EMBEDL_HUB_API_KEY"
# torchvision identifier passed to ``torchvision.models.get_model``.
MODEL_NAME = "resnet50"
# Hub project selected through ``client.set_project``.
HUB_PROJECT = "Deploy Demo"
# Spatial size of the random example inputs used for export and quantization;
# also forwarded to ``make_loaders``.
CROP_SIZE = 224
# Forwarded to ``make_loaders`` together with ``CROP_SIZE``.
RESIZE_SIZE = 256
# Forwarded to ``measure_latency`` as ``warmup_ms``.
WARMUP_MS = 1000
# Forwarded to ``measure_latency`` as ``duration``; logged to the Hub as
# "benchmark_duration_seconds".
BENCHMARK_DURATION = 30
# Local directory that receives ONNX files, engines, and benchmark reports.
BENCHMARK_DIR = Path(f"artifacts/ptq/{MODEL_NAME}_benchmark")
# When True, ``log_compile_run`` uploads the ONNX / engine / timing-cache files
# as Hub artifacts.
UPLOAD_LARGE_ARTIFACTS = True


@dataclass(frozen=True)
class ExportResult:
    """Files produced by ONNX export.

    Returned by ``export_and_simplify``; instances are immutable.
    """

    # ONNX file written directly by ``torch.onnx.export``.
    raw_path: Path
    # File to deploy: the ``*_sim.onnx`` file when simplification succeeded,
    # otherwise the same path as ``raw_path``.
    model_path: Path
    # Check flag returned by ``onnxsim.simplify``.
    simplified: bool


@dataclass(frozen=True)
class QuantizationResult:
    """Embedl Deploy quantization output and summary metrics.

    Returned by ``quantize_embedl``; instances are immutable.
    """

    # Module returned by ``embedl_deploy.quantize.quantize``.
    model: nn.Module
    # Largest absolute output difference between the original and the fused
    # model, measured on a single random input.
    fusion_max_diff: float
    # Number of ``QuantStub`` modules found in the quantized model.
    activation_quantizers: int
    # Number of ``WeightFakeQuantize`` modules found in the quantized model.
    weight_quantizers: int


@dataclass(frozen=True)
class CompileResult:
    """Files and measurements produced by one compiled candidate.

    Returned by ``compile_variant``; instances are immutable.
    """

    # Variant label; also the stem of the exported ``.onnx`` and ``.engine``
    # file names.
    tag: str
    # Paths of the ONNX files produced for this variant.
    export: ExportResult
    # ``<dest>/<tag>.engine``, the path handed to ``build_trt_engine``.
    engine_path: Path
    # ``<dest>/timing.cache``.
    # NOTE(review): ``compile_variant`` only records this path; whether
    # ``build_trt_engine`` actually writes the file is not visible here.
    timing_cache_path: Path
    # Wall-clock seconds per phase: "quantization_seconds" (only when
    # calibration data was supplied), "export_seconds", "engine_build_seconds".
    phase_seconds: dict[str, float]
    # Empty for non-quantized variants; otherwise "fusion_max_diff",
    # "activation_quantizers", "weight_quantizers", "calibration_batches".
    quantization_metrics: dict[str, float]


@dataclass(frozen=True)
class ProfileResult:
    """Accuracy and latency measurements for one compiled candidate.

    Returned by ``profile_variant``; instances are immutable.
    """

    # Variant label supplied by the caller.
    tag: str
    # Mapping returned by ``evaluate_trt``; the keys "top1" and "top5" are
    # read from it elsewhere in this file.
    accuracy: dict[str, float]
    # Mapping returned by ``measure_latency``; the keys "mean_latency_ms" and
    # "throughput_qps" are read from it elsewhere in this file.
    latency: dict[str, float]
    # Wall-clock seconds per phase: "evaluation_seconds" and
    # "latency_benchmark_seconds".
    phase_seconds: dict[str, float]


@dataclass(frozen=True)
class SetupResult:
    """Data and model objects produced by the custom setup run.

    Returned by ``prepare_demo_setup``; instances are immutable.
    """

    # Loaders returned by ``make_loaders(CROP_SIZE, RESIZE_SIZE)``.
    train_loader: ImageLoader
    val_loader: ImageLoader
    # Image tensors of the first ``CALIBRATION_BATCHES`` training batches
    # (labels are discarded).
    calibration_batches: list[torch.Tensor]
    # torchvision model loaded with ``weights="DEFAULT"`` and put in eval mode.
    pretrained_model: nn.Module
    # Total number of parameter elements in ``pretrained_model``.
    parameter_count: int
    # Subset of ``parameter_count`` whose tensors have ``requires_grad`` set.
    trainable_parameters: int
    # ``len()`` of the train / validation loader datasets.
    train_images: int
    val_images: int
    # True when ``download_imagenette`` had to fetch the archive in this run.
    downloaded: bool
    # Seconds spent inside ``torchvision.models.get_model(...).eval()``.
    model_import_seconds: float
    # Seconds from the start of ``prepare_demo_setup`` until just before the
    # class-mapping JSON file is written.
    setup_seconds: float
    # Location of the ``imagenette_class_mapping.json`` file.
    mapping_path: Path


def _stringify_param(value: object) -> str:
    """Convert values to stable Hub parameter strings."""
    if isinstance(value, (dict, list, tuple)):
        return json.dumps(value, sort_keys=True)
    return str(value)


def _params(items: dict[str, object]) -> list[tuple[str, str]]:
    """Turn a mapping into ``(name, text)`` pairs, keeping insertion order.

    Each value is converted with ``_stringify_param``.
    """
    pairs: list[tuple[str, str]] = []
    for name, raw in items.items():
        pairs.append((name, _stringify_param(raw)))
    return pairs


def _metrics(items: dict[str, float]) -> list[tuple[str, float, int | None]]:
    return [(key, float(value), None) for key, value in items.items()]


def _file_size_mb(path: Path) -> float:
    return path.stat().st_size / 1_000_000 if path.exists() else 0.0


def _write_json(path: Path, payload: object) -> Path:
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n")
    return path


def _project_url(client: Client, project_id: str) -> str:
    """Build the URL of a Hub project from the client's base URL.

    The result is ``urljoin(base_url, "projects/<project_id>")``. Per
    ``urljoin`` semantics, a base URL whose path does not end in ``/`` has its
    last path segment replaced rather than extended.
    """
    base_url = client.api_config.base_url
    relative = f"projects/{project_id}"
    return urljoin(base_url, relative)


def _log_demo_tags(client: Client, *extra_tags: tuple[str, str]) -> None:
    """Log the tags shared by every demo run, followed by *extra_tags*.

    Each tag is a ``(name, value)`` pair and is sent with its own
    ``client.log_tag`` call, common tags first.
    """
    common_tags = (
        ("demo", "deploy"),
        ("workflow", "torchvision-tensorrt-ptq"),
        ("dataset", "imagenette"),
        ("runtime", "tensorrt"),
        ("product_demo", "true"),
    )
    for tag_name, tag_value in [*common_tags, *extra_tags]:
        client.log_tag(tag_name, tag_value)

Pattern selection#

When no explicit pattern list is passed, transform() uses the active backend’s conversion and fusion patterns automatically. These include structural conversions (e.g. decomposing MultiheadAttention), operator fusions (Conv-BN-ReLU, Linear-ReLU, LayerNorm, etc.), and automatic handling of depthwise convolutions. Depthwise convolutions are memory-bound, so quantizing them adds TensorRT reformatting overhead that exceeds the compute gain from INT8 — Embedl Deploy detects these and keeps them in FP16 automatically.

Dataset helpers#

We use ImageNette — a 10-class subset of ImageNet — for fast evaluation. Data loaders come from tensorrt_inference.

import tarfile
import urllib.request

import torchvision


def download_imagenette() -> bool:
    """Download and extract ImageNette if not already present.

    Returns:
        ``False`` when ``IMAGENETTE_DIR`` already exists (nothing is done),
        ``True`` after the archive was downloaded into ``DATA_DIR``,
        extracted there, and the archive file removed.
    """
    if IMAGENETTE_DIR.exists():
        print(f"ImageNette already present at {IMAGENETTE_DIR}")
        return False
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    tgz_path = DATA_DIR / "imagenette2-320.tgz"
    print(f"Downloading ImageNette to {tgz_path} ...")
    urllib.request.urlretrieve(IMAGENETTE_URL, str(tgz_path))
    print("Extracting ...")
    with tarfile.open(tgz_path) as tar:
        # The archive comes from the network: use the "data" extraction filter
        # where the interpreter provides it so members cannot escape DATA_DIR
        # (absolute paths, "..", links pointing outside). Older interpreters
        # without extraction filters fall back to the previous behavior.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(DATA_DIR, filter="data")
        else:
            tar.extractall(DATA_DIR)
    tgz_path.unlink()
    print("Done.")
    return True

ONNX export#

Export the model to ONNX and simplify with onnxsim.

import onnx
import onnxsim


def export_and_simplify(
    model: nn.Module, onnx_path: Path, crop_size: int = 224
) -> ExportResult:
    """Export *model* to ONNX at *onnx_path*, then try to simplify it.

    The model is moved to the CPU and put in eval mode (in place) and traced
    with one random ``(1, 3, crop_size, crop_size)`` input. When the onnxsim
    check passes, the simplified graph is saved next to the raw file as
    ``<stem>_sim.onnx``; otherwise the raw export is used for deployment.

    Returns:
        An ``ExportResult`` with the raw path, the path to deploy, and the
        onnxsim check flag.
    """
    cpu_model = model.cpu().eval()
    dummy_input = torch.randn(1, 3, crop_size, crop_size)
    torch.onnx.export(
        cpu_model,
        (dummy_input,),
        str(onnx_path),
        opset_version=20,
        input_names=["input"],
        output_names=["output"],
        dynamo=False,
    )

    simplified_model, check_ok = onnxsim.simplify(onnx.load(str(onnx_path)))
    if not check_ok:
        # Simplification could not be validated: deploy the raw export.
        print("  onnxsim failed; using raw export.")
        return ExportResult(
            raw_path=onnx_path, model_path=onnx_path, simplified=check_ok
        )

    simplified_path = onnx_path.with_name(onnx_path.stem + "_sim.onnx")
    onnx.save(simplified_model, str(simplified_path))
    print(f"  Simplified ONNX: {simplified_path}")
    return ExportResult(
        raw_path=onnx_path, model_path=simplified_path, simplified=check_ok
    )

Quantization with Embedl Deploy#

Fuse layers using the backend's default pattern set (no explicit pattern list is passed to transform()), insert QDQ stubs, and calibrate on batches drawn from the training data.

from embedl_deploy import transform
from embedl_deploy.quantize import (
    Precision,
    QuantConfig,
    QuantStub,
    TensorQuantConfig,
    WeightFakeQuantize,
    quantize,
)


def quantize_embedl(
    pretrained_model: nn.Module,
    calib_batches: list[torch.Tensor],
    crop_size: int = 224,
) -> QuantizationResult:
    """Fuse + quantize + calibrate using Embedl Deploy.

    The model is moved to the CPU and put in eval mode (in place), transformed
    with the default pattern set, checked against the original on one random
    input, and then quantized to symmetric INT8 (per-channel weights) while
    ``calib_batches`` are run through it for calibration.

    Args:
        pretrained_model: Floating-point model to quantize.
        calib_batches: Input batches fed to the model during calibration.
        crop_size: Height and width of the random example inputs.

    Returns:
        The quantized model together with the fusion error and the number of
        activation / weight quantizer modules found in it.

    Raises:
        AssertionError: if the fused model's output is not within 1e-4 of the
            original model's output.
    """
    print("\n=== Embedl Deploy PTQ ===")

    gm = pretrained_model.cpu().eval()
    example_input = torch.randn(1, 3, crop_size, crop_size)
    fused_model = transform(gm, (example_input,)).model

    # Verify lossless fusion.
    with torch.no_grad():
        x = torch.randn(1, 3, crop_size, crop_size)
        # ``gm`` is the same module object as ``pretrained_model`` (``cpu()``
        # and ``eval()`` return ``self``), so no second ``.cpu()`` is needed.
        max_diff = float((gm(x) - fused_model(x)).abs().max())
    # Raised explicitly instead of using ``assert`` so the check survives
    # ``python -O``; the exception type and message are unchanged. The negated
    # comparison also rejects NaN.
    if not max_diff < 1e-4:
        raise AssertionError(f"Fusion diverged: {max_diff:.2e}")
    print(f"  Fusion check passed (max diff = {max_diff:.2e})")

    print(f"  Calibrating ({len(calib_batches)} batches) ...")

    def forward_loop(model: nn.Module) -> None:
        # Calibration pass: forward every batch without tracking gradients.
        with torch.no_grad():
            for batch in calib_batches:
                model(batch)

    quantized = quantize(
        fused_model,
        (torch.randn(1, 3, crop_size, crop_size),),
        config=QuantConfig(
            activation=TensorQuantConfig(Precision.INT8, symmetric=True),
            weight=TensorQuantConfig(
                Precision.INT8,
                symmetric=True,
                per_channel=True,
            ),
        ),
        forward_loop=forward_loop,
        freeze_weights=True,
    )

    n_act = sum(1 for m in quantized.modules() if isinstance(m, QuantStub))
    n_wt = sum(
        1 for m in quantized.modules() if isinstance(m, WeightFakeQuantize)
    )
    print(f"  QuantStubs: {n_act}, WeightFakeQuantize: {n_wt}")
    print("  Calibration complete.")
    return QuantizationResult(
        model=quantized,
        fusion_max_diff=max_diff,
        activation_quantizers=n_act,
        weight_quantizers=n_wt,
    )

Benchmark runner#

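Helpers that run the export -> build -> evaluate -> latency pipeline for a single model variant: compile_variant exports the model and builds the engine, profile_variant evaluates accuracy and measures latency, and the log_* functions record each stage on Embedl Hub.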

def compile_variant(
    tag: str,
    model: nn.Module,
    *,
    dest: Path,
    calib_data: list[torch.Tensor] | None = None,
    best: bool = False,
    fp16: bool = True,
) -> CompileResult:
    """Export one model variant to ONNX and build its TensorRT engine.

    When *calib_data* is given, the model is first quantized with
    ``quantize_embedl`` and the quantized module is what gets exported.
    Each phase is timed with ``time.perf_counter``.

    Args:
        tag: Variant label; used as the stem of the files written to *dest*.
        model: Model to compile.
        dest: Directory receiving ``<tag>.onnx`` and ``<tag>.engine``.
        calib_data: Calibration batches; ``None`` skips quantization.
        best: Forwarded to ``build_trt_engine``.
        fp16: Forwarded to ``build_trt_engine``.

    Returns:
        Paths, per-phase timings, and (for quantized variants) quantization
        metrics for the compiled candidate.
    """
    separator = "=" * 60
    print(f"\n{separator}\n{tag}\n{separator}")

    timing_cache_file = dest / "timing.cache"
    timings: dict[str, float] = {}
    quant_stats: dict[str, float] = {}
    deploy_model = model

    if calib_data is not None:
        tick = time.perf_counter()
        ptq = quantize_embedl(model, calib_data, CROP_SIZE)
        deploy_model = ptq.model
        timings["quantization_seconds"] = time.perf_counter() - tick
        quant_stats = {
            "fusion_max_diff": ptq.fusion_max_diff,
            "activation_quantizers": float(ptq.activation_quantizers),
            "weight_quantizers": float(ptq.weight_quantizers),
            "calibration_batches": float(len(calib_data or [])),
        }

    tick = time.perf_counter()
    exported = export_and_simplify(
        deploy_model, dest / f"{tag}.onnx", CROP_SIZE
    )
    timings["export_seconds"] = time.perf_counter() - tick

    engine_file = dest / f"{tag}.engine"
    tick = time.perf_counter()
    build_trt_engine(exported.model_path, engine_file, fp16=fp16, best=best)
    timings["engine_build_seconds"] = time.perf_counter() - tick

    print(f"  Engine: {engine_file} ({_file_size_mb(engine_file):.1f} MB)")
    return CompileResult(
        tag=tag,
        export=exported,
        engine_path=engine_file,
        timing_cache_path=timing_cache_file,
        phase_seconds=timings,
        quantization_metrics=quant_stats,
    )


def log_compile_run(  # noqa: PLR0913
    client: Client,
    *,
    result: CompileResult,
    tag: str,
    parameter_count: int,
    trainable_parameters: int,
    best: bool,
    fp16: bool,
    candidate_tag: str,
    precision_tag: str,
    upload_large_artifacts: bool,
) -> None:
    """Log compile parameters, metrics, and artifacts.

    Must be called while the compile run is the client's active run.

    Args:
        client: Hub tracking client.
        result: Output of ``compile_variant`` for this candidate.
        tag: Variant label logged as the "variant" parameter.
        parameter_count: Total parameter elements of the source model.
        trainable_parameters: Parameter elements with ``requires_grad``.
        best: Whether the engine was built with ``best=True``.
        fp16: Whether the engine was built with ``fp16=True``.
        candidate_tag: Value of the "candidate" tag.
        precision_tag: Value of the "precision" tag.
        upload_large_artifacts: When True, upload the ONNX, engine, and (if
            present on disk) timing-cache files.
    """
    # Label derived from the build flags. Neither flag set is reported as FP32
    # rather than mislabelled "FP16".
    # NOTE(review): presumably build_trt_engine builds FP32 in that case -
    # confirm against its implementation.
    if best:
        precision_label = "FP16+INT8"
    elif fp16:
        precision_label = "FP16"
    else:
        precision_label = "FP32"

    # NOTE(review): the "target" tag is hard-coded to "h200" regardless of the
    # GPU actually used - confirm this is intended.
    _log_demo_tags(
        client,
        ("stage", "compile"),
        ("candidate", candidate_tag),
        ("model", MODEL_NAME),
        ("target", "h200"),
        ("precision", precision_tag),
    )
    client.log_batch(
        params=_params(
            {
                "variant": tag,
                "model_name": MODEL_NAME,
                "parameter_count": parameter_count,
                "trainable_parameters": trainable_parameters,
                "crop_size": CROP_SIZE,
                "precision": precision_label,
                "tensorrt_best": best,
                "tensorrt_fp16": fp16 or best,
                "onnx_opset": 20,
                "builder_optimization_level": 5,
                "raw_onnx_path": result.export.raw_path,
                "deployment_onnx_path": result.export.model_path,
                "engine_path": result.engine_path,
                "timing_cache_path": result.timing_cache_path,
            }
        ),
        metrics=_metrics(
            {
                **result.phase_seconds,
                **result.quantization_metrics,
                "raw_onnx_size_mb": _file_size_mb(result.export.raw_path),
                "deployment_onnx_size_mb": _file_size_mb(
                    result.export.model_path
                ),
                "engine_size_mb": _file_size_mb(result.engine_path),
                "timing_cache_size_mb": _file_size_mb(
                    result.timing_cache_path
                ),
                "onnx_simplified": (1.0 if result.export.simplified else 0.0),
            }
        ),
    )

    if upload_large_artifacts:
        client.log_artifact(result.export.raw_path, name="raw_onnx")
        # Skip the second upload when simplification fell back to the raw file.
        if result.export.model_path != result.export.raw_path:
            client.log_artifact(
                result.export.model_path, name="deployment_onnx"
            )
        # NOTE(review): the engine is uploaded under the artifact name "path",
        # unlike the descriptive names used for the other files - confirm
        # whether the Hub expects this name.
        client.log_artifact(result.engine_path, name="path")
        if result.timing_cache_path.exists():
            client.log_artifact(result.timing_cache_path, name="timing_cache")


def profile_variant(
    tag: str,
    compile_result: CompileResult,
    *,
    val_loader: ImageLoader,
) -> ProfileResult:
    """Measure accuracy and latency of one compiled engine.

    Accuracy comes from ``evaluate_trt`` on *val_loader*; latency comes from
    ``measure_latency`` using ``WARMUP_MS`` and ``BENCHMARK_DURATION``. Both
    steps are timed, and a short summary is printed.

    Returns:
        A ``ProfileResult`` carrying both measurement mappings and the
        wall-clock seconds each step took.
    """
    engine_file = compile_result.engine_path

    tick = time.perf_counter()
    accuracy = evaluate_trt(engine_file, val_loader)
    evaluation_seconds = time.perf_counter() - tick

    tick = time.perf_counter()
    latency = measure_latency(
        engine_file,
        warmup_ms=WARMUP_MS,
        duration=BENCHMARK_DURATION,
    )
    latency_seconds = time.perf_counter() - tick

    top1, top5 = accuracy["top1"], accuracy["top5"]
    print(f"  Top-1: {top1:.2f}%  Top-5: {top5:.2f}%")
    mean_ms, qps = latency["mean_latency_ms"], latency["throughput_qps"]
    print(f"  Latency: {mean_ms:.3f} ms  Throughput: {qps:.1f} qps")

    return ProfileResult(
        tag=tag,
        accuracy=accuracy,
        latency=latency,
        phase_seconds={
            "evaluation_seconds": evaluation_seconds,
            "latency_benchmark_seconds": latency_seconds,
        },
    )


def log_profile_run(
    client: Client,
    *,
    result: ProfileResult,
    compile_run_id: str,
    compile_result: CompileResult,
    candidate_tag: str,
    precision_tag: str,
) -> None:
    """Log the tags, parameters, and measurements of one profile run.

    Tags are logged first; parameters and metrics then go out in a single
    ``client.log_batch`` call. The metrics are the accuracy, latency, and
    phase-timing mappings of *result* merged in that order.
    """
    stage_tags = (
        ("stage", "profile"),
        ("candidate", candidate_tag),
        ("model", MODEL_NAME),
        ("target", "h200"),
        ("precision", precision_tag),
    )
    _log_demo_tags(client, *stage_tags)

    run_params = {
        "model_name": MODEL_NAME,
        "compile_run_id": compile_run_id,
        "engine_path": compile_result.engine_path,
        "warmup_ms": WARMUP_MS,
        "benchmark_duration_seconds": BENCHMARK_DURATION,
    }
    measurements = {
        **result.accuracy,
        **result.latency,
        **result.phase_seconds,
    }
    client.log_batch(
        params=_params(run_params),
        metrics=_metrics(measurements),
    )


def create_root_run(
    client: Client,
    *,
    project_name: str,
) -> str:
    """Create the top-level "graph" run of the demo and return its id.

    Inside the run, the shared demo tags plus ``("stage", "root")`` are
    logged, followed by one batch of parameters describing the workflow
    configuration and the software / hardware environment.
    """
    run_name = f"Deploy Demo {MODEL_NAME} TensorRT PTQ workflow"
    with client.start_run("graph", name=run_name) as root_run:
        _log_demo_tags(client, ("stage", "root"))
        root_params = {
            "project": project_name,
            "model_name": MODEL_NAME,
            "dataset": "ImageNette",
            "source_library": "torchvision",
            "deployment_runtime": "TensorRT",
            "optimization": "Embedl Deploy PTQ",
            "hub_base_url": client.api_config.base_url,
            "crop_size": CROP_SIZE,
            "resize_size": RESIZE_SIZE,
            "batch_size": BATCH_SIZE,
            "num_workers": NUM_WORKERS,
            "benchmark_dir": BENCHMARK_DIR,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "python": platform.python_version(),
            "platform": platform.platform(),
            "torch": torch.__version__,
            "torchvision": torchvision.__version__,
            "tensorrt": trt.__version__,
            "cuda_device": torch.cuda.get_device_name(0),
        }
        client.log_batch(params=_params(root_params))
        return root_run.id


def prepare_demo_setup() -> SetupResult:
    """Fetch the dataset, collect calibration batches, and load the model.

    Steps, in order: download ImageNette if needed, build the loaders, keep
    the image tensors of the first ``CALIBRATION_BATCHES`` training batches,
    load ``MODEL_NAME`` from torchvision with default weights, and write the
    class-mapping JSON file into ``BENCHMARK_DIR``.

    Returns:
        A ``SetupResult`` bundling the loaders, calibration batches, model,
        counts, and timings.
    """
    t_start = time.perf_counter()
    fetched = download_imagenette()
    train_dl, val_dl = make_loaders(CROP_SIZE, RESIZE_SIZE)

    n_train = len(train_dl.dataset)  # type: ignore[arg-type]
    n_val = len(val_dl.dataset)  # type: ignore[arg-type]

    # Keep only the images of the leading batches; labels are not needed.
    calibration: list[torch.Tensor] = []
    for images, _ in train_dl:
        if len(calibration) >= CALIBRATION_BATCHES:
            break
        calibration.append(images)
    print(f"Collected {len(calibration)} calibration batches.")

    t_model = time.perf_counter()
    model = torchvision.models.get_model(MODEL_NAME, weights="DEFAULT").eval()
    import_seconds = time.perf_counter() - t_model

    n_params = 0
    n_trainable = 0
    for tensor in model.parameters():
        count = tensor.numel()
        n_params += count
        if tensor.requires_grad:
            n_trainable += count
    print(f"Loaded {MODEL_NAME} ({n_params:,} params)")

    # Setup time is taken before the mapping file is written.
    total_seconds = time.perf_counter() - t_start
    mapping_payload = {
        "imagenette_url": IMAGENETTE_URL,
        "imagenette_dir": str(IMAGENETTE_DIR),
        "imagenette_to_imagenet": IMAGENETTE_TO_IMAGENET,
        "train_images": n_train,
        "val_images": n_val,
    }
    mapping_file = _write_json(
        BENCHMARK_DIR / "imagenette_class_mapping.json", mapping_payload
    )

    return SetupResult(
        train_loader=train_dl,
        val_loader=val_dl,
        calibration_batches=calibration,
        pretrained_model=model,
        parameter_count=n_params,
        trainable_parameters=n_trainable,
        train_images=n_train,
        val_images=n_val,
        downloaded=fetched,
        model_import_seconds=import_seconds,
        setup_seconds=total_seconds,
        mapping_path=mapping_file,
    )


def log_setup_run(
    client: Client,
    *,
    setup: SetupResult,
) -> None:
    """Log setup metadata after data and model preparation.

    Must be called while the setup run is the client's active run. Logs the
    stage tags, one batch of parameters and metrics describing the dataset and
    model, and uploads the class-mapping file as the "class_mapping" artifact.
    """
    # Count the images actually collected instead of assuming every batch
    # holds BATCH_SIZE samples, so a short batch is not over-counted.
    calibration_images = sum(len(batch) for batch in setup.calibration_batches)

    _log_demo_tags(
        client,
        ("stage", "setup"),
        ("model", MODEL_NAME),
        ("custom_type", "demo_setup"),
    )
    client.log_batch(
        params=_params(
            {
                "model_name": MODEL_NAME,
                "source": "torchvision",
                "weights": "DEFAULT",
                "dataset": "ImageNette",
                "dataset_url": IMAGENETTE_URL,
                "dataset_dir": IMAGENETTE_DIR,
                "downloaded_this_run": setup.downloaded,
                "crop_size": CROP_SIZE,
                "resize_size": RESIZE_SIZE,
                "requested_calibration_batches": CALIBRATION_BATCHES,
                "normalization_mean": IMAGENET_MEAN,
                "normalization_std": IMAGENET_STD,
            }
        ),
        metrics=_metrics(
            {
                "train_images": setup.train_images,
                "val_images": setup.val_images,
                "classes": len(IMAGENETTE_TO_IMAGENET),
                "calibration_batches": len(setup.calibration_batches),
                "calibration_images": calibration_images,
                "parameter_count": setup.parameter_count,
                "trainable_parameters": setup.trainable_parameters,
                "model_import_seconds": setup.model_import_seconds,
                "setup_seconds": setup.setup_seconds,
            }
        ),
    )
    client.log_artifact(setup.mapping_path, name="class_mapping")


def log_comparison_run(  # noqa: PLR0913
    client: Client,
    *,
    parent_run_id: str,
    baseline_compile_run_id: str,
    baseline_profile_run_id: str,
    embedl_compile_run_id: str,
    embedl_profile_run_id: str,
    baseline_compile: CompileResult,
    embedl_compile: CompileResult,
    baseline_profile: ProfileResult,
    embedl_profile: ProfileResult,
) -> None:
    """Compare the baseline and Embedl candidates and log the outcome.

    Derives speedup, top-1 / top-5 drop, and engine-size reduction, prints a
    summary table, writes the same table to ``<MODEL_NAME>_results.txt`` and a
    machine-readable ``<MODEL_NAME>_hub_summary.json`` in ``BENCHMARK_DIR``,
    then logs everything inside an "eval" run created under *parent_run_id*.
    """
    base_acc, base_lat = baseline_profile.accuracy, baseline_profile.latency
    ptq_acc, ptq_lat = embedl_profile.accuracy, embedl_profile.latency

    # Denominators are clamped to 1e-6 to avoid division by zero.
    speedup = base_lat["mean_latency_ms"] / max(
        ptq_lat["mean_latency_ms"], 1e-6
    )
    top1_drop = base_acc["top1"] - ptq_acc["top1"]
    top5_drop = base_acc["top5"] - ptq_acc["top5"]
    base_engine_mb = _file_size_mb(baseline_compile.engine_path)
    ptq_engine_mb = _file_size_mb(embedl_compile.engine_path)
    size_reduction_pct = (
        (base_engine_mb - ptq_engine_mb) / max(base_engine_mb, 1e-6) * 100
    )

    def render_row(
        label: str, acc: dict[str, float], lat: dict[str, float]
    ) -> str:
        # One table line; shared by the console output and the text report.
        return (
            f"{label:<25s} {acc['top1']:6.2f}% "
            f"{acc['top5']:6.2f}% "
            f"{lat['mean_latency_ms']:11.3f} "
            f"{lat['throughput_qps']:10.1f} qps"
        )

    rule = "=" * 80
    title = f"BENCHMARK SUMMARY - {MODEL_NAME} PTQ on ImageNette"
    header = (
        f"{'Variant':<25s} {'Top-1':>7s} {'Top-5':>7s} "
        f"{'Latency(ms)':>12s} {'Throughput':>12s}"
    )
    underline = "-" * len(header)
    verdict = (
        f"  Embedl Deploy - Top-1 drop: {top1_drop:+.2f}pp, "
        f"speedup: {speedup:.2f}x"
    )
    candidates = [
        ("Baseline (FP16)", base_acc, base_lat),
        ("Embedl Deploy (best)", ptq_acc, ptq_lat),
    ]

    # Console summary.
    for line in (f"\n{rule}", title, rule, header, underline):
        print(line)
    for candidate in candidates:
        print(render_row(*candidate))
    for line in ("", verdict, rule):
        print(line)

    # Text report: same table, prefixed with the TensorRT version.
    results_path = BENCHMARK_DIR / f"{MODEL_NAME}_results.txt"
    report = [
        f"TRT {trt.__version__}",
        rule,
        title,
        rule,
        header,
        underline,
        *(render_row(*candidate) for candidate in candidates),
        "",
        verdict,
        rule,
    ]
    results_path.write_text("\n".join(report) + "\n")

    run_ids = {
        "baseline_compile_run_id": baseline_compile_run_id,
        "baseline_profile_run_id": baseline_profile_run_id,
        "embedl_compile_run_id": embedl_compile_run_id,
        "embedl_profile_run_id": embedl_profile_run_id,
    }
    summary_payload = {
        "model_name": MODEL_NAME,
        **run_ids,
        "baseline": {
            "accuracy": base_acc,
            "latency": base_lat,
            "engine_path": str(baseline_compile.engine_path),
        },
        "embedl": {
            "accuracy": ptq_acc,
            "latency": ptq_lat,
            "engine_path": str(embedl_compile.engine_path),
        },
        "comparison": {
            "speedup_x": speedup,
            "top1_drop_pp": top1_drop,
            "top5_drop_pp": top5_drop,
            "engine_size_reduction_pct": size_reduction_pct,
            "mean_latency_ms_delta": (
                base_lat["mean_latency_ms"] - ptq_lat["mean_latency_ms"]
            ),
        },
    }
    summary_path = _write_json(
        BENCHMARK_DIR / f"{MODEL_NAME}_hub_summary.json", summary_payload
    )

    with client.start_run(
        "eval",
        name="Compare deployment candidates",
        parent_run_id=parent_run_id,
    ):
        _log_demo_tags(
            client,
            ("stage", "eval"),
            ("candidate", "comparison"),
            ("model", MODEL_NAME),
        )
        comparison_params = {"model_name": MODEL_NAME, **run_ids}
        comparison_metrics = {
            "baseline_top1": base_acc["top1"],
            "baseline_top5": base_acc["top5"],
            "baseline_mean_latency_ms": base_lat["mean_latency_ms"],
            "baseline_throughput_qps": base_lat["throughput_qps"],
            "embedl_top1": ptq_acc["top1"],
            "embedl_top5": ptq_acc["top5"],
            "embedl_mean_latency_ms": ptq_lat["mean_latency_ms"],
            "embedl_throughput_qps": ptq_lat["throughput_qps"],
            "speedup_x": speedup,
            "top1_drop_pp": top1_drop,
            "top5_drop_pp": top5_drop,
            "baseline_engine_size_mb": base_engine_mb,
            "embedl_engine_size_mb": ptq_engine_mb,
            "engine_size_reduction_pct": size_reduction_pct,
        }
        client.log_batch(
            params=_params(comparison_params),
            metrics=_metrics(comparison_metrics),
        )
        client.log_artifact(results_path, name="benchmark_results_text")
        client.log_artifact(summary_path, name="benchmark_summary_json")
    print(f"\nResults saved to {results_path}")

Environment check and Hub client#

Confirm a Hub API key is configured, make sure the local benchmark directory exists, and verify we have a CUDA GPU — the TensorRT compile + INT8 calibration steps below require one. Then create the Client, which is the single entry point for talking to the Embedl Hub, and select (or create) the project that will hold this experiment’s runs.

# Make sure the local output directory exists before anything is written.
BENCHMARK_DIR.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Explicit check instead of ``assert`` so the requirement is still enforced
# when Python runs with ``-O`` (which strips assert statements).
if device.type != "cuda":
    raise RuntimeError("This benchmark requires a CUDA GPU.")

# Single entry point to the Hub; select the project that holds this demo.
client = Client()
project = client.set_project(HUB_PROJECT)

print(f"Model: {MODEL_NAME}  crop={CROP_SIZE}  resize={RESIZE_SIZE}")
print(f"Output: {BENCHMARK_DIR}")
print(f"Hub project: {_project_url(client, project.id)}")

Root run and dataset setup#

Every variant we benchmark is logged as a child of one root run so they show up grouped together in the Hub UI. create_root_run creates that parent. The first child run, demo_setup, downloads and preprocesses ImageNette and builds the calibration / validation loaders that every subsequent run will share.

# Create the top-level run; its id is the parent of the setup and comparison
# runs below.
root_run_id = create_root_run(
    client,
    project_name=project.name,
)
# Data / model preparation happens inside the run so that log_setup_run
# records into it.
with client.start_run(
    "demo_setup",
    name="Prepare ImageNette",
    parent_run_id=root_run_id,
) as setup_run:
    setup = prepare_demo_setup()
    log_setup_run(
        client,
        setup=setup,
    )
# Kept for the compile runs, which are created as children of the setup run.
setup_run_id = setup_run.id

Baseline: compile the unmodified model in FP16#

The first variant is the reference point: the pretrained model exported and compiled with TensorRT in plain FP16 — no Embedl transforms, no INT8. compile_variant exports the model to ONNX, runs trtexec, and returns paths to the engine + metadata. The log_compile_run call attaches engine size, build time, and the precision tag to the Hub run.

# Baseline: no calibration data is passed, so compile_variant skips
# quantization and exports the pretrained model as-is.
with client.start_run(
    "compile",
    name="Compile baseline FP16",
    parent_run_id=setup_run_id,
) as baseline_compile_run:
    baseline_compile = compile_variant(
        "baseline_fp16",
        setup.pretrained_model,
        dest=BENCHMARK_DIR,
    )
    # best / fp16 repeat the flags compile_variant used (its defaults) so the
    # logged parameters describe the engine that was built.
    log_compile_run(
        client,
        result=baseline_compile,
        tag="baseline_fp16",
        parameter_count=setup.parameter_count,
        trainable_parameters=setup.trainable_parameters,
        best=False,
        fp16=True,
        candidate_tag="baseline-fp16",
        precision_tag="fp16",
        upload_large_artifacts=UPLOAD_LARGE_ARTIFACTS,
    )
# Parent id for the baseline profile run.
baseline_compile_run_id = baseline_compile_run.id

Profile the baseline engine#

With the FP16 engine built, the profile run measures accuracy on the ImageNette validation set and end-to-end latency / throughput. Nesting this run under the compile run keeps the relationship explicit on the Hub: each profile points back at the exact engine it measured.

# Profile the baseline engine as a child of the run that compiled it.
with client.start_run(
    "profile",
    name="Profile baseline FP16",
    parent_run_id=baseline_compile_run_id,
) as baseline_profile_run:
    baseline_profile = profile_variant(
        "baseline_fp16",
        baseline_compile,
        val_loader=setup.val_loader,
    )
    log_profile_run(
        client,
        result=baseline_profile,
        compile_run_id=baseline_compile_run_id,
        compile_result=baseline_compile,
        candidate_tag="baseline-fp16",
        precision_tag="fp16",
    )
# Referenced later by the comparison run.
baseline_profile_run_id = baseline_profile_run.id

Embedl variant: PTQ compile in mixed INT8/FP16#

This is the variant the benchmark is actually evaluating. We hand the same pretrained model to compile_variant with calib_data=setup.calibration_batches and best=True, which applies the Embedl TensorRT patterns, runs INT8 post-training quantization using the calibration batches, then compiles. The resulting engine runs in mixed INT8/FP16 precision — INT8 wherever the patterns placed QDQ pairs, FP16 elsewhere.

# Embedl variant: passing calibration batches makes compile_variant quantize
# the model before export; best=True is forwarded to the engine build.
with client.start_run(
    "compile",
    name="Compile Embedl PTQ",
    parent_run_id=setup_run_id,
) as embedl_compile_run:
    embedl_compile = compile_variant(
        "embedl_ptq",
        setup.pretrained_model,
        dest=BENCHMARK_DIR,
        calib_data=setup.calibration_batches,
        best=True,
    )
    # best / fp16 repeat the flags compile_variant used so the logged
    # parameters describe the engine that was built.
    log_compile_run(
        client,
        result=embedl_compile,
        tag="embedl_ptq",
        parameter_count=setup.parameter_count,
        trainable_parameters=setup.trainable_parameters,
        best=True,
        fp16=True,
        candidate_tag="embedl-ptq",
        precision_tag="mixed-int8-fp16",
        upload_large_artifacts=UPLOAD_LARGE_ARTIFACTS,
    )
# Parent id for the Embedl profile run.
embedl_compile_run_id = embedl_compile_run.id

Profile the Embedl engine#

Same accuracy + latency measurements as the baseline profile, but on the INT8/FP16 engine. Running both profiles with identical loaders and the same profile_variant helper is what makes the side-by-side numbers in the next step a fair comparison.

# Same helper and validation loader as the baseline profile, applied to the
# quantized engine.
with client.start_run(
    "profile",
    name="Profile Embedl PTQ",
    parent_run_id=embedl_compile_run_id,
) as embedl_profile_run:
    embedl_profile = profile_variant(
        "embedl_ptq",
        embedl_compile,
        val_loader=setup.val_loader,
    )
    log_profile_run(
        client,
        result=embedl_profile,
        compile_run_id=embedl_compile_run_id,
        compile_result=embedl_compile,
        candidate_tag="embedl-ptq",
        precision_tag="mixed-int8-fp16",
    )
# Referenced later by the comparison run.
embedl_profile_run_id = embedl_profile_run.id

Comparison run#

Finally, log a comparison run as a direct child of the root run. This computes the headline numbers — speedup, top-1 / top-5 drop, engine size reduction — and uploads them along with the rendered benchmark report. Putting it under the root run (not under either variant) is what gives the Hub a single place to surface the baseline-vs-Embedl summary for this experiment.

# The comparison run is attached to the root run and records the ids of all
# four compile / profile runs alongside the derived metrics.
log_comparison_run(
    client,
    parent_run_id=root_run_id,
    baseline_compile_run_id=baseline_compile_run_id,
    baseline_profile_run_id=baseline_profile_run_id,
    embedl_compile_run_id=embedl_compile_run_id,
    embedl_profile_run_id=embedl_profile_run_id,
    baseline_compile=baseline_compile,
    embedl_compile=embedl_compile,
    baseline_profile=baseline_profile,
    embedl_profile=embedl_profile,
)

Gallery generated by Sphinx-Gallery