Execution Providers
Execution Providers (EPs) enable ONNX Runtime to execute models on different hardware accelerators including CPUs, GPUs, NPUs, and specialized AI chips. Each provider optimizes inference for specific hardware.
Available Providers
import onnxruntime as ort

# Every provider compiled into this ONNX Runtime build
built_in = ort.get_all_providers()
print("All providers:", built_in)

# The subset that can actually run on this machine (depends on
# installed drivers/libraries such as CUDA or TensorRT)
usable = ort.get_available_providers()
print("Available providers:", usable)
Common Execution Providers
CPUExecutionProvider
Default CPU execution provider. Always available. Optimized for x86 and ARM CPUs.
CUDAExecutionProvider
NVIDIA GPU acceleration via CUDA. Requires CUDA installation.
TensorrtExecutionProvider
NVIDIA TensorRT for optimized GPU inference. Requires TensorRT installation.
ROCMExecutionProvider
AMD GPU acceleration via ROCm.
OpenVINOExecutionProvider
Intel hardware acceleration (CPU, GPU, VPU).
CoreMLExecutionProvider
Apple Neural Engine and GPU acceleration on macOS/iOS.
DmlExecutionProvider
DirectML acceleration for Windows (AMD, Intel, NVIDIA GPUs).
QNNExecutionProvider
Qualcomm NPU acceleration for mobile and edge devices.
Configuring Providers
Basic Provider Selection
import onnxruntime as ort

# With no explicit list, every available provider is enabled
sess = ort.InferenceSession("model.onnx")

# Explicit priority order: providers are tried front-to-back,
# so CUDA is preferred and CPU serves as the fallback
preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
sess = ort.InferenceSession("model.onnx", providers=preferred)
Provider Options
# Method 1: each entry in `providers` is either a bare name or a
# (name, options-dict) pair; typed Python values are accepted here
cuda_settings = {
    "device_id": 0,
    "arena_extend_strategy": "kNextPowerOfTwo",
    "gpu_mem_limit": 2 * 1024 * 1024 * 1024,  # 2GB
    "cudnn_conv_algo_search": "EXHAUSTIVE",
    "do_copy_in_default_stream": True,
}
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", cuda_settings), "CPUExecutionProvider"],
)

# Method 2: a separate provider_options list, parallel to `providers`
# (one dict per provider; values are passed as strings here)
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    provider_options=[
        {"device_id": "0", "gpu_mem_limit": "2147483648"},
        {},
    ],
)
CUDA Provider Options
device_id
GPU device ID to use. Default is 0.
gpu_mem_limit
Maximum GPU memory in bytes. Default is max available.
arena_extend_strategy
Memory allocation strategy: "kNextPowerOfTwo" or "kSameAsRequested".
cudnn_conv_algo_search
Convolution algorithm search: "EXHAUSTIVE", "HEURISTIC", or "DEFAULT".
do_copy_in_default_stream
Use default CUDA stream for copies. Default is True.
cudnn_conv_use_max_workspace
Allow cuDNN to use maximum workspace size.
enable_cuda_graph
Enable CUDA graphs for faster execution with fixed input shapes.
CUDA Provider Example
# CUDA provider configured inline: pinned to device 0, capped memory,
# exhaustive conv-algorithm search; CPU remains the fallback provider
sess = ort.InferenceSession(
    "model.onnx",
    providers=[
        (
            "CUDAExecutionProvider",
            {
                "device_id": 0,
                "arena_extend_strategy": "kSameAsRequested",
                "gpu_mem_limit": 4 * 1024 * 1024 * 1024,  # 4GB limit
                "cudnn_conv_algo_search": "EXHAUSTIVE",
                "do_copy_in_default_stream": True,
                "enable_cuda_graph": False,
            },
        ),
        "CPUExecutionProvider",
    ],
)
TensorRT Provider Options
device_id
GPU device ID. Default is 0.
trt_max_workspace_size
Maximum TensorRT workspace size in bytes. Default is 1GB.
trt_fp16_enable
Enable FP16 precision. Default is False.
trt_int8_enable
Enable INT8 quantization. Default is False.
trt_int8_calibration_table_name
Path to INT8 calibration table.
trt_engine_cache_enable
Enable TensorRT engine caching. Default is False.
trt_engine_cache_path
Directory to store cached engines.
TensorRT Provider Example
# TensorRT tuned for FP16 with on-disk engine caching; CUDA and CPU
# remain available as fallbacks, in priority order
trt_settings = {
    "device_id": 0,
    "trt_max_workspace_size": 2147483648,  # 2GB
    "trt_fp16_enable": True,
    "trt_engine_cache_enable": True,
    "trt_engine_cache_path": "./trt_cache",
}
sess = ort.InferenceSession(
    "model.onnx",
    providers=[
        ("TensorrtExecutionProvider", trt_settings),
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)
Dynamic Provider Management
Query Current Providers
sess = ort.InferenceSession("model.onnx")

# Providers actually attached to this session, in priority order
print(f"Active providers: {sess.get_providers()}")

# Per-provider option dicts, keyed by provider name
print(f"Provider options: {sess.get_provider_options()}")
Change Providers at Runtime
sess = ort.InferenceSession("model.onnx")

# First pass with whatever providers the session started with
output1 = sess.run(None, inputs)

# Re-prioritize providers on the live session, then run again
sess.set_providers(["CUDAExecutionProvider", "CPUExecutionProvider"])
output2 = sess.run(None, inputs)
Dynamic Provider Options
# Modify provider behavior at runtime.
# set_ep_dynamic_options takes two parallel sequences of strings
# (keys and values), not a dict — passing a dict raises a TypeError.
sess.set_ep_dynamic_options(
    ["device_id", "gpu_mem_limit"],
    ["1", "3221225472"],  # switch to GPU 1, cap memory at 3GB
)
Provider Fallback
# Automatic fallback: if CUDA fails, ONNX Runtime falls back to the
# next provider in the list (CPU here)
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# Falls back to CPU if CUDA initialization fails

# Disable fallback via the session method. (The InferenceSession
# constructor has no `enable_fallback` parameter — an
# `enable_fallback=0` kwarg is silently ignored.)
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"],
)
sess.disable_fallback()
# Raises exception if CUDA fails
Multi-GPU Inference
# One session per device: each pins the CUDA provider to a device_id
session_for = {
    0: ort.InferenceSession(
        "model.onnx",
        providers=[("CUDAExecutionProvider", {"device_id": 0})],
    ),
    1: ort.InferenceSession(
        "model.onnx",
        providers=[("CUDAExecutionProvider", {"device_id": 1})],
    ),
}

# Dispatch separate workloads to each GPU through its own session
output0 = session_for[0].run(None, inputs0)
output1 = session_for[1].run(None, inputs1)
Provider-Specific Features
CUDA Graphs
# CUDA graph capture/replay — only valid when input shapes stay fixed
graph_opts = {"enable_cuda_graph": True}
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", graph_opts)],
)

# Run 1 records the graph; every later run replays it (much faster)
output = sess.run(None, inputs)
for _ in range(100):
    output = sess.run(None, inputs)  # same input shape required
TensorRT Engine Caching
import os

os.makedirs("./trt_cache", exist_ok=True)

# Shared TensorRT EP configuration: cache engines on disk, allow FP16
trt_cfg = {
    "trt_engine_cache_enable": True,
    "trt_engine_cache_path": "./trt_cache",
    "trt_fp16_enable": True,
}

# The first session pays the engine-build cost and writes the cache
sess = ort.InferenceSession(
    "model.onnx",
    providers=[("TensorrtExecutionProvider", trt_cfg)],
)
output = sess.run(None, inputs)  # slow: engines are being built

# A second session finds the cached engines and starts up quickly
sess2 = ort.InferenceSession(
    "model.onnx",
    providers=[("TensorrtExecutionProvider", trt_cfg)],
)
Provider Selection Best Practices
available = ort.get_available_providers()

# Walk a preference list and take the first accelerator that is
# present on this machine; CPU is always kept as the final fallback
providers = ["CPUExecutionProvider"]
for accelerator in ("CUDAExecutionProvider", "CoreMLExecutionProvider"):
    if accelerator in available:
        providers = [accelerator, "CPUExecutionProvider"]
        break

sess = ort.InferenceSession("model.onnx", providers=providers)