OgaGenerator
The OgaGenerator class is responsible for generating text using a loaded model. It provides methods for token-by-token generation, managing generation state, and accessing generated sequences.

Class Definition

struct OgaGenerator : OgaAbstract {
  static std::unique_ptr<OgaGenerator> Create(const OgaModel& model, OgaGeneratorParams& params);
  
  // Generation control
  void AppendTokenSequences(const OgaSequences& sequences);
  void AppendTokens(const int32_t* input_ids, size_t input_ids_count);
  void GenerateNextToken();
  bool IsDone();
  
  // Sequence access
  const int32_t* GetSequenceData(size_t index) const;
  size_t GetSequenceCount(size_t index) const;
  std::span<const int32_t> GetSequence(size_t index) const;  // C++20
  std::span<const int32_t> GetNextTokens();
  size_t TokenCount() const;
  
  // Advanced features
  void RewindTo(size_t new_length);
  void SetRuntimeOption(const char* key, const char* value);
  void SetModelInput(const char* name, OgaTensor& tensor);
  void SetInputs(OgaNamedTensors& named_tensors);
  
  // Tensor access
  std::unique_ptr<OgaTensor> GetInput(const char* name);
  std::unique_ptr<OgaTensor> GetOutput(const char* name);
  std::unique_ptr<OgaTensor> GetLogits();
  void SetLogits(OgaTensor& tensor);
  
  bool IsSessionTerminated() const;
  void SetActiveAdapter(OgaAdapters& adapters, const char* adapter_name);
};
Defined in: ~/workspace/source/src/ort_genai.h:446

Methods

Create()

Create a generator instance from a model and parameters.
static std::unique_ptr<OgaGenerator> Create(const OgaModel& model, OgaGeneratorParams& params)
model
const OgaModel&
required
The model to use for generation
params
OgaGeneratorParams&
required
Generation parameters (search options, constraints, etc.)
Returns: std::unique_ptr<OgaGenerator> - A unique pointer to the created generator
Throws: std::runtime_error if generator creation fails

Example

auto model = OgaModel::Create("phi-2");
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 200);

auto generator = OgaGenerator::Create(*model, *params);

AppendTokenSequences()

Append token sequences to the generator (typically the encoded prompt).
void AppendTokenSequences(const OgaSequences& sequences)
sequences
const OgaSequences&
required
The token sequences to append (usually from tokenizer encoding)
Throws: std::runtime_error if appending fails

Example

// Encode prompt and append to generator
auto sequences = OgaSequences::Create();
tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences);
generator->AppendTokenSequences(*sequences);
From ~/workspace/source/examples/c/src/model_qa.cpp:128

AppendTokens()

Append individual tokens to the generator.
void AppendTokens(const int32_t* input_ids, size_t input_ids_count)
void AppendTokens(std::span<const int32_t> input_ids)  // C++20
input_ids
const int32_t*
required
Pointer to the token IDs array
input_ids_count
size_t
required
Number of tokens to append

Example

std::vector<int32_t> tokens = {1, 2, 3, 4, 5};
generator->AppendTokens(tokens.data(), tokens.size());

// Or with C++20 span
#if OGA_USE_SPAN
generator->AppendTokens(std::span(tokens));
#endif

GenerateNextToken()

Generate the next token in the sequence.
void GenerateNextToken()
Throws: std::runtime_error if generation fails
This is the core method for token-by-token generation. Call it repeatedly until IsDone() returns true.

Example

while (!generator->IsDone()) {
  generator->GenerateNextToken();
  
  // Get and process the newly generated token
  const auto new_token = generator->GetNextTokens()[0];
  std::cout << stream->Decode(new_token) << std::flush;
}
From ~/workspace/source/examples/c/src/model_qa.cpp:136

IsDone()

Check if generation is complete.
bool IsDone()
Returns: bool - True if generation has finished (reached max length or EOS token)

Example

while (!generator->IsDone()) {
  generator->GenerateNextToken();
  // Process token...
}

GetSequenceData()

Get the raw token data for a sequence.
const int32_t* GetSequenceData(size_t index) const
index
size_t
required
The sequence index (typically 0 for single sequence generation)
Returns: const int32_t* - Pointer to the token data array

Example

// Get the full generated sequence
auto output_sequence = generator->GetSequenceData(0);
size_t output_length = generator->GetSequenceCount(0);
auto output_string = tokenizer->Decode(output_sequence, output_length);

GetSequenceCount()

Get the number of tokens in a sequence.
size_t GetSequenceCount(size_t index) const
index
size_t
required
The sequence index
Returns: size_t - Number of tokens in the sequence

GetSequence() (C++20)

Get a sequence as a span.
std::span<const int32_t> GetSequence(size_t index) const
index
size_t
required
The sequence index
Returns: std::span<const int32_t> - Span view of the token sequence

Example

#if OGA_USE_SPAN
auto sequence = generator->GetSequence(0);
auto output_string = tokenizer->Decode(sequence);
#endif

GetNextTokens()

Get the most recently generated tokens.
std::span<const int32_t> GetNextTokens()       // C++20
std::vector<int32_t> GetNextTokens()           // Pre-C++20
Returns: The newly generated tokens (one per batch element)

Example

generator->GenerateNextToken();
const auto new_token = generator->GetNextTokens()[0];
std::cout << stream->Decode(new_token) << std::flush;

TokenCount()

Get the total number of tokens in the current generation.
size_t TokenCount() const
Returns: size_t - Total number of tokens (prompt + generated)

Example

const int prompt_length = generator->TokenCount();

// Generate tokens...

const int new_tokens = generator->TokenCount() - prompt_length;
std::cout << "Generated " << new_tokens << " new tokens" << std::endl;
From ~/workspace/source/examples/c/src/model_qa.cpp:129

RewindTo()

Rewind the generator to a previous state.
void RewindTo(size_t new_length)
new_length
size_t
required
The token position to rewind to
This is useful for chat scenarios where you want to maintain context but remove recent messages.

Example

const int system_prompt_length = generator->TokenCount();

// Generate response...

// Rewind to system prompt for next turn
generator->RewindTo(system_prompt_length);
From ~/workspace/source/examples/c/src/model_chat.cpp:176

SetRuntimeOption()

Set runtime options for the generator.
void SetRuntimeOption(const char* key, const char* value)
key
const char*
required
The option key (e.g., "terminate_session")
value
const char*
required
The option value

Example

// Terminate generation (useful for signal handling)
generator->SetRuntimeOption("terminate_session", "1");
From ~/workspace/source/examples/c/src/model_qa.cpp:23

SetModelInput()

Set a custom model input tensor.
void SetModelInput(const char* name, OgaTensor& tensor)
name
const char*
required
The input name
tensor
OgaTensor&
required
The input tensor

SetInputs()

Set multiple model inputs at once.
void SetInputs(OgaNamedTensors& named_tensors)
named_tensors
OgaNamedTensors&
required
Named collection of input tensors

GetInput()

Get a model input tensor.
std::unique_ptr<OgaTensor> GetInput(const char* name)
name
const char*
required
The input name
Returns: std::unique_ptr<OgaTensor> - The input tensor

GetOutput()

Get a model output tensor.
std::unique_ptr<OgaTensor> GetOutput(const char* name)
name
const char*
required
The output name
Returns: std::unique_ptr<OgaTensor> - The output tensor

GetLogits()

Get the logits tensor from the last generation step.
std::unique_ptr<OgaTensor> GetLogits()
Returns: std::unique_ptr<OgaTensor> - The logits tensor

SetLogits()

Set custom logits for the next generation step.
void SetLogits(OgaTensor& tensor)
tensor
OgaTensor&
required
The logits tensor to use

IsSessionTerminated()

Check if the generation session has been terminated.
bool IsSessionTerminated() const
Returns: bool - True if the session was terminated (e.g., via SetRuntimeOption)

SetActiveAdapter()

Set the active LoRA adapter for generation.
void SetActiveAdapter(OgaAdapters& adapters, const char* adapter_name)
adapters
OgaAdapters&
required
The adapters collection
adapter_name
const char*
required
Name of the adapter to activate

Complete Examples

Basic Generation

#include "ort_genai.h"
#include <iostream>

int main() {
  OgaHandle handle;
  
  try {
    // Setup
    auto model = OgaModel::Create("phi-2");
    auto tokenizer = OgaTokenizer::Create(*model);
    
    // Encode prompt
    auto sequences = OgaSequences::Create();
    tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences);
    
    // Configure generation
    auto params = OgaGeneratorParams::Create(*model);
    params->SetSearchOption("max_length", 200);
    params->SetSearchOption("batch_size", 1);
    
    // Create generator and start generation
    auto generator = OgaGenerator::Create(*model, *params);
    generator->AppendTokenSequences(*sequences);
    
    // Generate tokens
    while (!generator->IsDone()) {
      generator->GenerateNextToken();
    }
    
    // Decode output
    auto output_sequence = generator->GetSequenceData(0);
    auto output_length = generator->GetSequenceCount(0);
    auto output_string = tokenizer->Decode(output_sequence, output_length);
    
    std::cout << "Output: " << output_string << std::endl;
    
  } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
    return -1;
  }
  
  return 0;
}
Based on: ~/workspace/source/src/ort_genai.h:21

Streaming Generation

From ~/workspace/source/examples/c/src/model_qa.cpp:132:
// Create tokenizer stream for real-time decoding
auto stream = OgaTokenizerStream::Create(*tokenizer);

// Generate and stream output
std::cout << "Output: ";
while (!generator->IsDone()) {
  generator->GenerateNextToken();
  
  const auto new_token = generator->GetNextTokens()[0];
  std::cout << stream->Decode(new_token) << std::flush;
}
std::cout << std::endl;

Chat with Context Management

From ~/workspace/source/examples/c/src/model_chat.cpp:106:
// Encode system prompt
auto sequences = OgaSequences::Create();
tokenizer->Encode(system_prompt.c_str(), *sequences);
generator->AppendTokenSequences(*sequences);
const int system_prompt_length = generator->TokenCount();

// Chat loop
while (true) {
  // Get user input
  std::string user_input;
  std::getline(std::cin, user_input);
  if (user_input == "quit()") break;
  
  // Encode and append user message
  sequences = OgaSequences::Create();
  tokenizer->Encode(user_input.c_str(), *sequences);
  generator->AppendTokenSequences(*sequences);
  
  // Generate response
  const int current_length = generator->TokenCount();
  std::cout << "Assistant: ";
  try {
    while (!generator->IsDone()) {
      generator->GenerateNextToken();
      const auto new_token = generator->GetNextTokens()[0];
      std::cout << stream->Decode(new_token) << std::flush;
    }
  } catch (const std::exception& e) {
    std::cout << "\nError: " << e.what() << std::endl;
    // Rewind on error
    generator->RewindTo(current_length);
  }
  std::cout << "\n\n";
  
  // Optionally rewind to system prompt for next turn
  // generator->RewindTo(system_prompt_length);
}

See Also

- OgaModel - loading the model used for generation
- OgaGeneratorParams - configuring search options and generation constraints
- OgaTokenizer / OgaTokenizerStream - encoding prompts and decoding output tokens
- OgaSequences - token sequence containers passed to AppendTokenSequences()
- OgaAdapters - LoRA adapter collections used with SetActiveAdapter()