Skip to main content

Execution Providers

Execution Providers (EPs) enable ONNX Runtime to execute models on different hardware accelerators including CPUs, GPUs, NPUs, and specialized AI chips. Each provider optimizes inference for specific hardware.

Available Providers

Getting Provider Information

import onnxruntime as ort

# Providers compiled into this particular build of ONNX Runtime
built_in = ort.get_all_providers()
print(f"All providers: {built_in}")

# Subset of providers that can actually run on the current machine
usable = ort.get_available_providers()
print(f"Available providers: {usable}")

Common Execution Providers

CPUExecutionProvider
string
Default CPU execution provider. Always available. Optimized for x86 and ARM CPUs.
CUDAExecutionProvider
string
NVIDIA GPU acceleration via CUDA. Requires CUDA installation.
TensorrtExecutionProvider
string
NVIDIA TensorRT for optimized GPU inference. Requires TensorRT installation.
ROCMExecutionProvider
string
AMD GPU acceleration via ROCm.
OpenVINOExecutionProvider
string
Intel hardware acceleration (CPU, GPU, VPU).
CoreMLExecutionProvider
string
Apple Neural Engine and GPU acceleration on macOS/iOS.
DmlExecutionProvider
string
DirectML acceleration for Windows (AMD, Intel, NVIDIA GPUs).
QNNExecutionProvider
string
Qualcomm NPU acceleration for mobile and edge devices.

Configuring Providers

Basic Provider Selection

import onnxruntime as ort

# No providers argument: every available provider is enabled
sess = ort.InferenceSession("model.onnx")

# Explicit priority list; entries are tried front to back,
# so CUDA is preferred and CPU serves as the fallback.
preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
sess = ort.InferenceSession("model.onnx", providers=preferred)

Provider Options

# Method 1: (name, options-dict) tuples placed directly in the providers list
cuda_cfg = {
    "device_id": 0,
    "arena_extend_strategy": "kNextPowerOfTwo",
    "gpu_mem_limit": 2 * 1024 * 1024 * 1024,  # 2GB
    "cudnn_conv_algo_search": "EXHAUSTIVE",
    "do_copy_in_default_stream": True,
}
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", cuda_cfg), "CPUExecutionProvider"],
)

# Method 2: parallel provider_options list — one options dict per provider,
# in the same order as the providers list (empty dict = defaults).
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    provider_options=[
        {"device_id": "0", "gpu_mem_limit": "2147483648"},
        {},
    ],
)

CUDA Provider Options

device_id
int
GPU device ID to use. Default is 0.
gpu_mem_limit
int
Maximum GPU memory in bytes. Default is max available.
arena_extend_strategy
str
Memory allocation strategy: "kNextPowerOfTwo" or "kSameAsRequested".
cudnn_conv_algo_search
str
Convolution algorithm search: "EXHAUSTIVE", "HEURISTIC", or "DEFAULT".
do_copy_in_default_stream
bool
Use default CUDA stream for copies. Default is True.
cudnn_conv_use_max_workspace
bool
Allow cuDNN to use maximum workspace size.
enable_cuda_graph
bool
Enable CUDA graphs for faster execution with fixed input shapes.

CUDA Provider Example

# CUDA provider tuned for a 4GB memory cap with exhaustive conv search
gpu_settings = dict(
    device_id=0,
    arena_extend_strategy="kSameAsRequested",
    gpu_mem_limit=4 * 1024 * 1024 * 1024,  # 4GB limit
    cudnn_conv_algo_search="EXHAUSTIVE",
    do_copy_in_default_stream=True,
    enable_cuda_graph=False,
)

sess = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", gpu_settings), "CPUExecutionProvider"],
)

TensorRT Provider Options

device_id
int
GPU device ID. Default is 0.
trt_max_workspace_size
int
Maximum TensorRT workspace size in bytes. Default is 1GB.
trt_fp16_enable
bool
Enable FP16 precision. Default is False.
trt_int8_enable
bool
Enable INT8 quantization. Default is False.
trt_int8_calibration_table_name
str
Path to INT8 calibration table.
trt_engine_cache_enable
bool
Enable TensorRT engine caching. Default is False.
trt_engine_cache_path
str
Directory to store cached engines.

TensorRT Provider Example

# TensorRT with FP16 enabled and engines cached on disk
trt_cfg = {
    "device_id": 0,
    "trt_max_workspace_size": 2147483648,  # 2GB
    "trt_fp16_enable": True,
    "trt_engine_cache_enable": True,
    "trt_engine_cache_path": "./trt_cache",
}

# TensorRT first, then CUDA, then CPU as the final fallback
provider_chain = [
    ("TensorrtExecutionProvider", trt_cfg),
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]
sess = ort.InferenceSession("model.onnx", providers=provider_chain)

Dynamic Provider Management

Query Current Providers

sess = ort.InferenceSession("model.onnx")

# Providers actually active for this session, in priority order
active = sess.get_providers()
print(f"Active providers: {active}")

# Options each active provider was configured with
cfg = sess.get_provider_options()
print(f"Provider options: {cfg}")

Change Providers at Runtime

# Start with a CPU-only session so the first run really happens on CPU.
# (With no providers argument, ONNX Runtime enables every available
# provider, so the first run could already be on the GPU.)
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Run on CPU
output1 = sess.run(None, inputs)

# Switch to GPU (CPU kept as fallback)
sess.set_providers(["CUDAExecutionProvider", "CPUExecutionProvider"])
output2 = sess.run(None, inputs)

Dynamic Provider Options

# Modify provider behavior at runtime.
# set_ep_dynamic_options takes parallel key and value sequences
# (both strings), not a single dict — see the ORT Python API reference.
sess.set_ep_dynamic_options(
    ["device_id", "gpu_mem_limit"],
    ["1", "3221225472"],  # switch to GPU 1, cap GPU memory at 3GB
)

Provider Fallback

# Automatic fallback if provider fails
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
# Falls back to CPU if CUDA initialization fails

# Disable fallback: InferenceSession has no enable_fallback constructor
# keyword; use the session's disable_fallback() method instead.
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"],
)
sess.disable_fallback()
# Raises exception if CUDA fails

Multi-GPU Inference

# Create one session per GPU, each pinned to its own CUDA device
def _gpu_session(gpu_index):
    # device_id selects which physical GPU this session uses
    return ort.InferenceSession(
        "model.onnx",
        providers=[("CUDAExecutionProvider", {"device_id": gpu_index})],
    )

sess_gpu0 = _gpu_session(0)
sess_gpu1 = _gpu_session(1)

# Each session dispatches its inference to its own GPU
output0 = sess_gpu0.run(None, inputs0)
output1 = sess_gpu1.run(None, inputs1)

Provider-Specific Features

CUDA Graphs

# Enable CUDA graphs for fixed input shapes
graph_opts = {"enable_cuda_graph": True}
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", graph_opts)],
)

# First run captures the graph
output = sess.run(None, inputs)

# Subsequent runs replay the captured graph (much faster);
# the input shape must stay identical between runs.
for _iteration in range(100):
    output = sess.run(None, inputs)

TensorRT Engine Caching

import os

os.makedirs("./trt_cache", exist_ok=True)

# Shared TensorRT configuration: FP16 plus on-disk engine caching
trt_opts = {
    "trt_engine_cache_enable": True,
    "trt_engine_cache_path": "./trt_cache",
    "trt_fp16_enable": True,
}

# First session builds the engines and writes them into the cache dir
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("TensorrtExecutionProvider", trt_opts)],
)

# First inference is slow (building engines)
output = sess.run(None, inputs)

# A second session with the same options loads the cached engines (fast)
sess2 = ort.InferenceSession(
    "model.onnx",
    providers=[("TensorrtExecutionProvider", trt_opts)],
)

Provider Selection Best Practices

# Prefer hardware acceleration when present, otherwise fall back to CPU
available = set(ort.get_available_providers())

if "CUDAExecutionProvider" in available:
    chain = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif "CoreMLExecutionProvider" in available:
    chain = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
else:
    chain = ["CPUExecutionProvider"]

sess = ort.InferenceSession("model.onnx", providers=chain)