Skip to main content

C/C++ Inference API

The ONNX Runtime C++ API provides high-performance inference with RAII-based resource management and exception safety. This guide covers the complete C++ API with real examples from the codebase.

Installation

Using NuGet (Windows)

nuget install Microsoft.ML.OnnxRuntime

Using vcpkg

vcpkg install onnxruntime

Manual Installation

Download pre-built binaries from the ONNX Runtime releases page.

Quick Start

Here’s a minimal C++ example:
#include <onnxruntime_cxx_api.h>
#include <vector>
#include <iostream>

int main() {
    // Initialize environment
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
    
    // Create session
    Ort::SessionOptions session_options;
    Ort::Session session(env, "model.onnx", session_options);
    
    // Prepare input
    std::vector<float> input_data(1 * 3 * 224 * 224, 0.5f);
    std::vector<int64_t> input_shape = {1, 3, 224, 224};
    
    auto memory_info = Ort::MemoryInfo::CreateCpu(
        OrtArenaAllocator, OrtMemTypeDefault);
    
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        memory_info, input_data.data(), input_data.size(),
        input_shape.data(), input_shape.size());
    
    // Get input/output names
    Ort::AllocatorWithDefaultOptions allocator;
    const char* input_name = session.GetInputNameAllocated(0, allocator).get();
    const char* output_name = session.GetOutputNameAllocated(0, allocator).get();
    
    // Run inference
    std::vector<const char*> input_names = {input_name};
    std::vector<const char*> output_names = {output_name};
    
    auto output_tensors = session.Run(
        Ort::RunOptions{nullptr},
        input_names.data(), &input_tensor, 1,
        output_names.data(), 1);
    
    // Get output
    float* output_data = output_tensors[0].GetTensorMutableData<float>();
    std::cout << "First output value: " << output_data[0] << std::endl;
    
    return 0;
}

Core Classes

Ort::Env

The environment manages global state. Create one per application.
// Basic initialization
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "MyApp");

// With custom thread pool
Ort::ThreadingOptions threading_options;
threading_options.SetGlobalIntraOpNumThreads(4);
threading_options.SetGlobalInterOpNumThreads(2);

Ort::Env env(threading_options, ORT_LOGGING_LEVEL_WARNING, "MyApp");
Logging levels:
  • ORT_LOGGING_LEVEL_VERBOSE (0)
  • ORT_LOGGING_LEVEL_INFO (1)
  • ORT_LOGGING_LEVEL_WARNING (2)
  • ORT_LOGGING_LEVEL_ERROR (3)
  • ORT_LOGGING_LEVEL_FATAL (4)

Ort::SessionOptions

Configure session creation and optimization.
Ort::SessionOptions session_options;

// Graph optimization
session_options.SetGraphOptimizationLevel(
    GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

// Threading
session_options.SetIntraOpNumThreads(4);
session_options.SetInterOpNumThreads(2);

// Execution mode
session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

// Memory optimization
session_options.EnableCpuMemArena();
session_options.EnableMemPattern();

// Profiling
session_options.EnableProfiling("profile.json");

// Log settings
session_options.SetLogId("MySession");
session_options.SetLogSeverityLevel(2);

// Save optimized model
session_options.SetOptimizedModelFilePath("optimized_model.onnx");
Graph optimization levels:
GraphOptimizationLevel::ORT_DISABLE_ALL     // No optimizations
GraphOptimizationLevel::ORT_ENABLE_BASIC    // Constant folding, redundant node elimination
GraphOptimizationLevel::ORT_ENABLE_EXTENDED // Operator fusion, node reordering
GraphOptimizationLevel::ORT_ENABLE_ALL      // Layout transformations, NCHWc format

Ort::Session

The main inference session class. Create from file:
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
Ort::SessionOptions session_options;
Ort::Session session(env, "model.onnx", session_options);

// On Windows, use wide strings
Ort::Session session(env, L"model.onnx", session_options);
Create from memory:
// Load model into memory
std::vector<uint8_t> model_data = load_model_file("model.onnx");

Ort::Session session(env, model_data.data(), model_data.size(), session_options);
Query model metadata:
Ort::AllocatorWithDefaultOptions allocator;

// Get input/output counts
size_t num_inputs = session.GetInputCount();
size_t num_outputs = session.GetOutputCount();

// Get input names and types
for (size_t i = 0; i < num_inputs; i++) {
    auto input_name = session.GetInputNameAllocated(i, allocator);
    std::cout << "Input " << i << ": " << input_name.get() << std::endl;
    
    auto type_info = session.GetInputTypeInfo(i);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    
    std::cout << "  Type: " << tensor_info.GetElementType() << std::endl;
    std::cout << "  Shape: [";
    auto shape = tensor_info.GetShape();
    for (size_t j = 0; j < shape.size(); j++) {
        std::cout << (j > 0 ? ", " : "") << shape[j];
    }
    std::cout << "]" << std::endl;
}

// Get output names and types
for (size_t i = 0; i < num_outputs; i++) {
    auto output_name = session.GetOutputNameAllocated(i, allocator);
    std::cout << "Output " << i << ": " << output_name.get() << std::endl;
}
Get model metadata:
Ort::ModelMetadata metadata = session.GetModelMetadata();
Ort::AllocatorWithDefaultOptions allocator;

auto producer_name = metadata.GetProducerNameAllocated(allocator);
auto graph_name = metadata.GetGraphNameAllocated(allocator);
auto version = metadata.GetVersion();

std::cout << "Producer: " << producer_name.get() << std::endl;
std::cout << "Graph: " << graph_name.get() << std::endl;
std::cout << "Version: " << version << std::endl;

Running Inference

Basic inference:
// Prepare input tensor
std::vector<float> input_data(batch_size * channels * height * width);
std::vector<int64_t> input_shape = {batch_size, channels, height, width};

auto memory_info = Ort::MemoryInfo::CreateCpu(
    OrtArenaAllocator, OrtMemTypeDefault);

Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
    memory_info, input_data.data(), input_data.size(),
    input_shape.data(), input_shape.size());

// Get input/output names
const char* input_names[] = {"input"};
const char* output_names[] = {"output"};

// Run inference
Ort::RunOptions run_options;
auto output_tensors = session.Run(
    run_options,
    input_names, &input_tensor, 1,
    output_names, 1);

// Access output
float* output_data = output_tensors[0].GetTensorMutableData<float>();
auto output_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();

Ort::Value

Represents tensors and other values. Create tensor from existing data:
std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
std::vector<int64_t> shape = {2, 2};

auto memory_info = Ort::MemoryInfo::CreateCpu(
    OrtArenaAllocator, OrtMemTypeDefault);

Ort::Value tensor = Ort::Value::CreateTensor<float>(
    memory_info, data.data(), data.size(),
    shape.data(), shape.size());
Create tensor with allocator:
Ort::AllocatorWithDefaultOptions allocator;
std::vector<int64_t> shape = {1, 3, 224, 224};

Ort::Value tensor = Ort::Value::CreateTensor<float>(
    allocator, shape.data(), shape.size());

// Get mutable data pointer
float* data = tensor.GetTensorMutableData<float>();
// Fill data...
Query tensor properties:
bool is_tensor = tensor.IsTensor();

auto type_info = tensor.GetTensorTypeAndShapeInfo();
ONNXTensorElementDataType element_type = type_info.GetElementType();
size_t element_count = type_info.GetElementCount();
std::vector<int64_t> shape = type_info.GetShape();

const float* data = tensor.GetTensorData<float>();

Execution Providers

Add Execution Providers

CUDA:
Ort::SessionOptions session_options;

OrtCUDAProviderOptions cuda_options;
cuda_options.device_id = 0;
cuda_options.arena_extend_strategy = 0; // 0 = kNextPowerOfTwo (field is a plain int in the C API)
cuda_options.gpu_mem_limit = 2ULL * 1024 * 1024 * 1024; // 2GB
cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;

session_options.AppendExecutionProvider_CUDA(cuda_options);
TensorRT:
OrtTensorRTProviderOptions trt_options;
trt_options.device_id = 0;
trt_options.trt_max_workspace_size = 1ULL << 30; // 1GB
trt_options.trt_fp16_enable = 1;

session_options.AppendExecutionProvider_TensorRT(trt_options);
DirectML (Windows):
session_options.AppendExecutionProvider_DML(0); // Device ID
CoreML (macOS/iOS):
uint32_t coreml_flags = 0;
session_options.AppendExecutionProvider_CoreML(coreml_flags);

Complete Example: Image Classification

#include <onnxruntime_cxx_api.h>

#include <algorithm>
#include <array>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <vector>

// Wraps an ONNX Runtime session for single-input image classification.
// Not thread-safe: classify() must not be called concurrently on the
// same instance without external synchronization (TODO confirm against
// Ort::Session::Run thread-safety guarantees for your ORT version).
class ImageClassifier {
private:
    Ort::Env env_;                          // global ORT state, must outlive session_
    Ort::Session session_;
    Ort::AllocatorWithDefaultOptions allocator_;
    std::vector<std::string> input_names_;  // owned copies of the model's input names
    std::vector<std::string> output_names_; // owned copies of the model's output names
    std::vector<int64_t> input_shape_;      // shape of the (last) model input, dynamic dims mapped to 1
    
public:
    // Loads the model at model_path, enables full graph optimization and
    // the CUDA execution provider, and caches input/output metadata.
    // Throws Ort::Exception on load/configuration failure.
    ImageClassifier(const char* model_path)
        : env_(ORT_LOGGING_LEVEL_WARNING, "ImageClassifier"),
          session_(nullptr) {
        
        // Configure session options
        Ort::SessionOptions session_options;
        session_options.SetIntraOpNumThreads(4);
        session_options.SetGraphOptimizationLevel(
            GraphOptimizationLevel::ORT_ENABLE_ALL);
        
        // Add CUDA provider
        OrtCUDAProviderOptions cuda_options;
        cuda_options.device_id = 0;
        session_options.AppendExecutionProvider_CUDA(cuda_options);
        
        // Create session
        session_ = Ort::Session(env_, model_path, session_options);
        
        // Get model metadata
        size_t num_inputs = session_.GetInputCount();
        size_t num_outputs = session_.GetOutputCount();
        
        std::cout << "Model has " << num_inputs << " inputs and "
                  << num_outputs << " outputs" << std::endl;
        
        // Get input names and shapes
        for (size_t i = 0; i < num_inputs; i++) {
            auto name = session_.GetInputNameAllocated(i, allocator_);
            input_names_.push_back(name.get());
            
            auto type_info = session_.GetInputTypeInfo(i);
            auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
            input_shape_ = tensor_info.GetShape();
            
            // Dynamic dimensions (e.g. a free batch axis) are reported as
            // -1 by GetShape(). Left as-is they would make the size_t
            // element-count computation wrap and CreateTensor fail, so we
            // pin every dynamic dimension to 1.
            for (auto& dim : input_shape_) {
                if (dim < 0) {
                    dim = 1;
                }
            }
            
            std::cout << "Input " << i << ": " << input_names_[i]
                      << " [";
            for (size_t j = 0; j < input_shape_.size(); j++) {
                std::cout << (j > 0 ? ", " : "") << input_shape_[j];
            }
            std::cout << "]" << std::endl;
        }
        
        // Get output names
        for (size_t i = 0; i < num_outputs; i++) {
            auto name = session_.GetOutputNameAllocated(i, allocator_);
            output_names_.push_back(name.get());
        }
    }
    
    // Runs inference on one image. image_data must contain exactly the
    // number of floats implied by the model's input shape (dynamic dims
    // counted as 1). Returns the first output tensor flattened to a
    // vector. Throws std::runtime_error on a size mismatch and
    // Ort::Exception on inference failure.
    std::vector<float> classify(const std::vector<float>& image_data) {
        // Validate input size against the cached (normalized) input shape.
        size_t expected_size = 1;
        for (auto dim : input_shape_) {
            expected_size *= static_cast<size_t>(dim);
        }
        if (image_data.size() != expected_size) {
            throw std::runtime_error("Input size mismatch");
        }
        
        // Create input tensor. CreateTensor does not copy; image_data
        // must stay alive until Run() returns (guaranteed here since the
        // caller's reference outlives this call).
        auto memory_info = Ort::MemoryInfo::CreateCpu(
            OrtArenaAllocator, OrtMemTypeDefault);
        
        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
            memory_info,
            const_cast<float*>(image_data.data()),
            image_data.size(),
            input_shape_.data(),
            input_shape_.size());
        
        // Prepare input/output names as C strings (Run takes const char*
        // arrays; the std::string members own the storage).
        std::vector<const char*> input_names_cstr;
        std::vector<const char*> output_names_cstr;
        for (auto& name : input_names_) {
            input_names_cstr.push_back(name.c_str());
        }
        for (auto& name : output_names_) {
            output_names_cstr.push_back(name.c_str());
        }
        
        // Run inference
        Ort::RunOptions run_options;
        auto output_tensors = session_.Run(
            run_options,
            input_names_cstr.data(), &input_tensor, 1,
            output_names_cstr.data(), output_names_cstr.size());
        
        // Copy the first output tensor into a self-owned vector.
        float* output_data = output_tensors[0].GetTensorMutableData<float>();
        auto output_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
        
        size_t output_size = 1;
        for (auto dim : output_shape) {
            output_size *= static_cast<size_t>(dim);
        }
        
        return std::vector<float>(output_data, output_data + output_size);
    }
};

int main() {
    try {
        ImageClassifier classifier("resnet50.onnx");
        
        // Create dummy input (1, 3, 224, 224)
        std::vector<float> image_data(1 * 3 * 224 * 224, 0.5f);
        
        // Run inference
        auto predictions = classifier.classify(image_data);
        
        // Find top prediction
        auto max_it = std::max_element(predictions.begin(), predictions.end());
        int max_idx = std::distance(predictions.begin(), max_it);
        
        std::cout << "Top prediction: class " << max_idx
                  << " with score " << *max_it << std::endl;
        
    } catch (const Ort::Exception& e) {
        std::cerr << "ONNX Runtime error: " << e.what() << std::endl;
        return 1;
    }
    
    return 0;
}

Memory Management

The C++ API uses RAII (Resource Acquisition Is Initialization) for automatic resource management:
{
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
    Ort::SessionOptions options;
    Ort::Session session(env, "model.onnx", options);
    
    // Resources automatically cleaned up when going out of scope
}

Error Handling

try {
    Ort::Session session(env, "model.onnx", session_options);
    auto outputs = session.Run(/*...*/);
} catch (const Ort::Exception& e) {
    std::cerr << "ONNX Runtime error: " << e.what() << std::endl;
    std::cerr << "Error code: " << e.GetOrtErrorCode() << std::endl;
}

Next Steps

Model Optimization

Optimize models for production deployment

Execution Providers

Configure hardware acceleration