Skip to main content
This guide covers advanced features of the ONNX Runtime GenAI C++ API, including multi-turn conversations, custom generation parameters, and multimodal input processing.

Multi-Turn Chat

The chat example demonstrates continuous conversation with context preservation:
#include <iostream>
#include <string>
#include "ort_genai.h"
#include "common.h"

// Runs an interactive multi-turn chat loop with context preserved across
// turns. The system prompt is encoded and appended once; each user turn is
// appended to the generator's token sequence so the model always sees the
// whole conversation.
//
// Parameters:
//   generator_params_args - sampling/search options forwarded to the params.
//   model_path            - path to the model directory.
//   ep                    - execution provider name (e.g. "cuda").
//   system_prompt         - system message sent once at session start.
//   verbose               - verbosity flag forwarded to SetSearchOptions.
//   interactive           - unused in this body; kept for interface parity.
//   rewind                - when true, clear chat history after every turn by
//                           rewinding to the end of the system prompt.
//   ep_path               - optional path to the EP library. New trailing
//                           parameter with a default value (the original body
//                           referenced an undeclared `ep_path` identifier),
//                           so existing callers are unaffected.
void CXX_API(
    GeneratorParamsArgs& generator_params_args,
    const std::string& model_path,
    const std::string& ep,
    const std::string& system_prompt,
    bool verbose,
    bool interactive,
    bool rewind,
    const std::string& ep_path = "") {

  RegisterEP(ep, ep_path);

  // Create model and tokenizer
  std::unordered_map<std::string, std::string> ep_options;
  auto config = GetConfig(model_path, ep, ep_options, generator_params_args);
  auto model = OgaModel::Create(*config);
  auto tokenizer = OgaTokenizer::Create(*model);
  auto stream = OgaTokenizerStream::Create(*tokenizer);

  // Set search options for generator params
  auto params = OgaGeneratorParams::Create(*model);
  SetSearchOptions(*params, generator_params_args, verbose);

  // Create system message
  nlohmann::ordered_json message = nlohmann::ordered_json::array();
  message.push_back({{"role", "system"}, {"content", system_prompt}});

  // Create generator
  auto generator = OgaGenerator::Create(*model, *params);

  // Apply chat template and encode the system prompt. `false` here means no
  // generation prompt is appended for the system turn.
  std::string prompt = ApplyChatTemplate(
    model_path, *tokenizer, message.dump(), false
  );
  auto sequences = OgaSequences::Create();
  tokenizer->Encode(prompt.c_str(), *sequences);
  generator->AppendTokenSequences(*sequences);
  // Position of the end of the system prompt; rewinding to it clears history.
  const int prompt_tokens_length = generator->TokenCount();

  // Interactive conversation loop
  while (true) {
    // Get user input
    std::string text;
    std::cout << "Prompt (Use quit() to exit):" << std::endl;
    std::getline(std::cin, text);

    if (text == "quit()") {
      break;
    }

    // Wrap the raw input as a chat "user" message.
    message = nlohmann::ordered_json::array();
    message.push_back({{"role", "user"}, {"content", text}});

    // Apply chat template and append this turn's tokens to the context.
    prompt = ApplyChatTemplate(model_path, *tokenizer, message.dump(), true);
    sequences = OgaSequences::Create();
    tokenizer->Encode(prompt.c_str(), *sequences);
    generator->AppendTokenSequences(*sequences);

    // Stream the response token by token.
    std::cout << "Output: ";
    // Snapshot so we can rewind if generation is interrupted mid-turn.
    const int current_token_count = generator->TokenCount();

    try {
      while (!generator->IsDone()) {
        generator->GenerateNextToken();
        const auto new_token = generator->GetNextTokens()[0];
        std::cout << stream->Decode(new_token) << std::flush;
      }
    } catch (const std::exception& e) {
      std::cout << "\nTerminating generation: " << e.what() << std::endl;
      // Rewind to the last valid state (the start of this turn's response).
      generator->RewindTo(current_token_count);
    }

    std::cout << "\n\n" << std::endl;

    // Optionally rewind to system prompt (clears chat history)
    if (rewind) {
      generator->RewindTo(prompt_tokens_length);
    }
  }
}

Key Features

Context Preservation

The chat example maintains conversation context across turns:
// Encode and append system prompt once
auto sequences = OgaSequences::Create();
tokenizer->Encode(prompt.c_str(), *sequences);
generator->AppendTokenSequences(*sequences);
// Remember where the system prompt ends so later rewinds can return here.
const int prompt_tokens_length = generator->TokenCount();

// For each user turn, append new tokens
// The generator maintains all previous context
generator->AppendTokenSequences(*new_sequences);

Rewind Functionality

Control conversation history with the rewind feature:
// Store the position after system prompt
const int prompt_tokens_length = generator->TokenCount();

// After each turn, optionally rewind to clear history
// (keeps the system prompt, drops everything appended after it).
if (rewind) {
  generator->RewindTo(prompt_tokens_length);
}

// Or rewind on error to last valid state
try {
  while (!generator->IsDone()) {
    generator->GenerateNextToken();
  }
} catch (const std::exception& e) {
  // Discard any partially generated tokens from the failed turn.
  generator->RewindTo(current_token_count);
}

Signal Handling

Gracefully terminate generation with signal handlers:
#include <csignal>

// Non-owning pointer to the active generator; set while generation is in
// flight so the signal handler can reach it, cleared afterwards.
OgaGenerator* g_generator = nullptr;

// SIGINT handler: requests cooperative termination of the current session.
// NOTE(review): calling into a library from a signal handler is not
// async-signal-safe in general — confirm SetRuntimeOption is safe to invoke
// from handler context.
void TerminateGeneration(int signum) {
  if (g_generator == nullptr) {
    return;
  }
  g_generator->SetRuntimeOption("terminate_session", "1");
}

// Register signal handler
signal(SIGINT, TerminateGeneration);
g_generator = generator.get();

// Clear after use
g_generator = nullptr;

Custom Generation Parameters

Fine-tune generation behavior with custom parameters:
// Generation options. std::optional fields are only applied when set,
// leaving the model's own defaults in place otherwise.
struct GeneratorParamsArgs {
  int batch_size = 1;        // sequences processed per batch
  int chunk_size = 0;        // prompt chunk size; 0 presumably disables chunking — verify
  std::optional<bool> do_sample;             // sampling vs. greedy decoding
  std::optional<int> min_length;             // minimum tokens to generate
  std::optional<int> max_length;             // maximum tokens to generate
  int num_beams = 1;         // beam-search width (1 = no beam search)
  int num_return_sequences = 1;              // sequences returned per prompt
  std::optional<double> repetition_penalty;  // > 1.0 penalizes repeated tokens
  std::optional<double> temperature;         // higher = more random output
  std::optional<int> top_k;                  // top-k sampling cutoff
  std::optional<double> top_p;               // nucleus sampling threshold
};

// Configure parameters
GeneratorParamsArgs args;
args.max_length = 512;           // cap response length
args.temperature = 0.7;          // mildly creative sampling
args.top_p = 0.9;
args.repetition_penalty = 1.1;   // discourage repetitive loops

// Apply to generator
auto params = OgaGeneratorParams::Create(*model);
SetSearchOptions(*params, args, verbose);

Parameter Descriptions

  • max_length: Maximum number of tokens to generate
  • temperature: Controls randomness (higher = more random)
  • top_p: Nucleus sampling threshold
  • top_k: Number of highest probability tokens to consider
  • repetition_penalty: Penalizes repeated tokens
  • num_beams: Number of beams for beam search

Multimodal Processing

Process images and audio alongside text input:
void CXX_API(
    const std::string& model_path,
    const std::vector<std::string>& image_paths,
    const std::vector<std::string>& audio_paths,
    const std::string& user_prompt,
    bool interactive) {
  
  auto model = OgaModel::Create(*config);
  auto tokenizer = OgaTokenizer::Create(*model);
  auto stream = OgaTokenizerStream::Create(*tokenizer);
  
  // Create multimodal processor
  auto processor = OgaMultiModalProcessor::Create(*model);

  // Get user images and audios
  std::unique_ptr<OgaImages> images;
  int num_images;
  std::tie(images, num_images) = GetUserImages(image_paths, interactive);

  std::unique_ptr<OgaAudios> audios;
  int num_audios;
  std::tie(audios, num_audios) = GetUserAudios(audio_paths, interactive);

  // Construct user content based on inputs
  auto type = model->GetType();
  nlohmann::ordered_json user_content = GetUserContent(
    std::string(type), num_images, num_audios, user_prompt
  );

  // Create user message with multimodal content
  nlohmann::ordered_json user_message = {
    {"role", "user"}, 
    {"content", user_content}
  };
  std::string messages = user_message.dump();

  // Apply chat template
  std::string prompt = ApplyChatTemplate(
    model_path, *tokenizer, messages, true
  );

  // Process images and audios together with text
  auto input_tensors = processor->ProcessImagesAndAudios(
    prompt.c_str(), images.get(), audios.get()
  );
  
  // Create generator and set inputs
  auto generator = OgaGenerator::Create(*model, *params);
  generator->SetInputs(*input_tensors);

  // Generate response
  while (!generator->IsDone()) {
    generator->GenerateNextToken();
    const auto new_token = generator->GetNextTokens()[0];
    std::cout << stream->Decode(new_token) << std::flush;
  }
}

Constrained Decoding with Guidance

Use JSON schema or LARK grammar for structured output:
// Options controlling constrained (guided) decoding.
struct GuidanceArgs {
  std::string response_format = "";  // "json_schema" or "lark_grammar"
  std::string tools_file = "";       // Path to tools definition file
  bool text_output = false;          // allow plain-text output
  bool tool_output = false;          // emit structured tool-call output
  std::string tool_call_start = "";  // marker opening a tool call
  std::string tool_call_end = "";    // marker closing a tool call
};

// Get and set guidance info
std::string guidance_type, guidance_data, tools;
if (!guidance_args.response_format.empty()) {
  // GetUserGuidance-style helper: resolves the response format and tools
  // file into the (type, data) pair the runtime expects.
  std::tie(guidance_type, guidance_data, tools) = GetGuidance(
    guidance_args.response_format,
    guidance_args.tools_file,
    "",       // tools_str
    nullptr,  // tools
    guidance_args.text_output,
    guidance_args.tool_output,
    guidance_args.tool_call_start,
    guidance_args.tool_call_end
  );

  // Set guidance on generator params
  params->SetGuidance(guidance_type.c_str(), guidance_data.c_str());
}

Performance Monitoring

Track generation performance with timing utilities:
// Wall-clock instrumentation for a single generation run: records start,
// time-to-first-token, and end timestamps, then logs derived figures.
class Timing {
public:
  void RecordStartTimestamp();       // call immediately before generation
  void RecordFirstTokenTimestamp();  // call once, right after the first token
  void RecordEndTimestamp();         // call after generation completes
  // Logs metrics derived from the recorded timestamps and the prompt/new
  // token counts.
  void Log(const int prompt_tokens_length, const int new_tokens_length);
};

// Usage
Timing timing;
bool is_first_token = true;
timing.RecordStartTimestamp();

while (!generator->IsDone()) {
  generator->GenerateNextToken();
  
  // Time-to-first-token is recorded exactly once.
  if (is_first_token) {
    timing.RecordFirstTokenTimestamp();
    is_first_token = false;
  }
  
  const auto new_token = generator->GetNextTokens()[0];
  std::cout << stream->Decode(new_token) << std::flush;
}

timing.RecordEndTimestamp();
// Tokens produced this run = total tokens in context minus the prompt's share.
const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

Building Advanced Examples

Build multiple examples at once:
cd examples/c
cmake -G "Visual Studio 17 2022" -S . -B build \
  -DMODEL_CHAT=ON -DMODEL_QA=ON -DMODEL_MM=ON
cmake --build build --parallel --config Debug

Running Advanced Examples

Multi-Turn Chat

.\model_chat.exe -m {model path} -e cuda --interactive --rewind

Multimodal Processing

.\model_mm.exe -m {model path} -e cuda \
  --image_paths image1.jpg,image2.jpg \
  --user_prompt "Describe these images"

Tool Calling with JSON Schema

./model_qa -m {model path} -e cuda \
  --response_format json_schema \
  --tools_file tools.json \
  --tool_output \
  --tool_call_start "<tool_call>" \
  --tool_call_end "</tool_call>"

Next Steps

Continue with the ONNX Runtime GenAI API reference and the remaining C/C++ examples in the repository to explore batching, LoRA adapters, and additional execution providers.