The OgaGenerator class is responsible for generating text using a loaded model. It provides methods for token-by-token generation, managing generation state, and accessing generated sequences.
Class Definition
struct OgaGenerator : OgaAbstract {
static std::unique_ptr<OgaGenerator> Create(const OgaModel& model, OgaGeneratorParams& params);
// Generation control
void AppendTokenSequences(const OgaSequences& sequences);
void AppendTokens(const int32_t* input_ids, size_t input_ids_count);
void GenerateNextToken();
bool IsDone();
// Sequence access
const int32_t* GetSequenceData(size_t index) const;
size_t GetSequenceCount(size_t index) const;
std::span<const int32_t> GetNextTokens();
size_t TokenCount() const;
// Advanced features
void RewindTo(size_t new_length);
void SetRuntimeOption(const char* key, const char* value);
void SetModelInput(const char* name, OgaTensor& tensor);
void SetInputs(OgaNamedTensors& named_tensors);
// Tensor access
std::unique_ptr<OgaTensor> GetInput(const char* name);
std::unique_ptr<OgaTensor> GetOutput(const char* name);
std::unique_ptr<OgaTensor> GetLogits();
void SetLogits(OgaTensor& tensor);
bool IsSessionTerminated() const;
void SetActiveAdapter(OgaAdapters& adapters, const char* adapter_name);
};
Defined in: ~/workspace/source/src/ort_genai.h:446
Methods
Create()
Create a generator instance from a model and parameters.
static std::unique_ptr<OgaGenerator> Create(const OgaModel& model, OgaGeneratorParams& params)
model
const OgaModel&
required
The model to use for generation
params
OgaGeneratorParams&
required
Generation parameters (search options, constraints, etc.)
Returns: std::unique_ptr<OgaGenerator> - A unique pointer to the created generator
Throws: std::runtime_error if generator creation fails
Example
auto model = OgaModel::Create("phi-2");
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 200);
auto generator = OgaGenerator::Create(*model, *params);
AppendTokenSequences()
Append token sequences to the generator (typically the encoded prompt).
void AppendTokenSequences(const OgaSequences& sequences)
sequences
const OgaSequences&
required
The token sequences to append (usually from tokenizer encoding)
Throws: std::runtime_error if appending fails
Example
// Encode prompt and append to generator
auto sequences = OgaSequences::Create();
tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences);
generator->AppendTokenSequences(*sequences);
From ~/workspace/source/examples/c/src/model_qa.cpp:128
AppendTokens()
Append individual tokens to the generator.
void AppendTokens(const int32_t* input_ids, size_t input_ids_count)
void AppendTokens(std::span<const int32_t> input_ids) // C++20
input_ids
const int32_t*
required
Pointer to the token IDs array
input_ids_count
size_t
required
Number of tokens to append
Example
std::vector<int32_t> tokens = {1, 2, 3, 4, 5};
generator->AppendTokens(tokens.data(), tokens.size());
// Or with C++20 span
#if OGA_USE_SPAN
generator->AppendTokens(std::span(tokens));
#endif
GenerateNextToken()
Generate the next token in the sequence.
void GenerateNextToken()
Throws: std::runtime_error if generation fails
This is the core method for token-by-token generation. Call this repeatedly until IsDone() returns true.
Example
while (!generator->IsDone()) {
generator->GenerateNextToken();
// Get and process the newly generated token
const auto new_token = generator->GetNextTokens()[0];
std::cout << stream->Decode(new_token) << std::flush;
}
From ~/workspace/source/examples/c/src/model_qa.cpp:136
IsDone()
Check if generation is complete.
bool IsDone()
Returns: bool - True if generation has finished (reached max length or EOS token)
Example
while (!generator->IsDone()) {
generator->GenerateNextToken();
// Process token...
}
GetSequenceData()
Get the raw token data for a sequence.
const int32_t* GetSequenceData(size_t index) const
index
size_t
required
The sequence index (typically 0 for single sequence generation)
Returns: const int32_t* - Pointer to the token data array
Example
// Get the full generated sequence
auto output_sequence = generator->GetSequenceData(0);
size_t output_length = generator->GetSequenceCount(0);
auto output_string = tokenizer->Decode(output_sequence, output_length);
GetSequenceCount()
Get the number of tokens in a sequence.
size_t GetSequenceCount(size_t index) const
index
size_t
required
The sequence index (typically 0 for single sequence generation)
Returns: size_t - Number of tokens in the sequence
GetSequence() (C++20)
Get a sequence as a span. Available when compiling with C++20 span support (OGA_USE_SPAN defined).
std::span<const int32_t> GetSequence(size_t index) const
Returns: std::span<const int32_t> - Span view of the token sequence
Example
#if OGA_USE_SPAN
auto sequence = generator->GetSequence(0);
auto output_string = tokenizer->Decode(sequence);
#endif
GetNextTokens()
Get the most recently generated tokens.
std::span<const int32_t> GetNextTokens() // C++20
std::vector<int32_t> GetNextTokens() // Pre-C++20
Returns: The newly generated tokens (one per batch element)
Example
generator->GenerateNextToken();
const auto new_token = generator->GetNextTokens()[0];
std::cout << stream->Decode(new_token) << std::flush;
TokenCount()
Get the total number of tokens in the current generation.
size_t TokenCount() const
Returns: size_t - Total number of tokens (prompt + generated)
Example
const int prompt_length = generator->TokenCount();
// Generate tokens...
const int new_tokens = generator->TokenCount() - prompt_length;
std::cout << "Generated " << new_tokens << " new tokens" << std::endl;
From ~/workspace/source/examples/c/src/model_qa.cpp:129
RewindTo()
Rewind the generator to a previous state.
void RewindTo(size_t new_length)
new_length
size_t
required
The token position to rewind to
This is useful for chat scenarios where you want to maintain context but remove recent messages.
Example
const int system_prompt_length = generator->TokenCount();
// Generate response...
// Rewind to system prompt for next turn
generator->RewindTo(system_prompt_length);
From ~/workspace/source/examples/c/src/model_chat.cpp:176
SetRuntimeOption()
Set runtime options for the generator.
void SetRuntimeOption(const char* key, const char* value)
key
const char*
required
The option key (e.g., “terminate_session”)
value
const char*
required
The option value, passed as a string
Example
// Terminate generation (useful for signal handling)
generator->SetRuntimeOption("terminate_session", "1");
From ~/workspace/source/examples/c/src/model_qa.cpp:23
SetModelInput()
Set a custom model input tensor.
void SetModelInput(const char* name, OgaTensor& tensor)
SetInputs()
Set multiple model inputs at once.
void SetInputs(OgaNamedTensors& named_tensors)
named_tensors
OgaNamedTensors&
required
Named collection of input tensors
GetInput()
Get a model input tensor.
std::unique_ptr<OgaTensor> GetInput(const char* name)
Returns: std::unique_ptr<OgaTensor> - The input tensor
GetOutput()
Get a model output tensor.
std::unique_ptr<OgaTensor> GetOutput(const char* name)
Returns: std::unique_ptr<OgaTensor> - The output tensor
GetLogits()
Get the logits tensor from the last generation step.
std::unique_ptr<OgaTensor> GetLogits()
Returns: std::unique_ptr<OgaTensor> - The logits tensor
SetLogits()
Set custom logits for the next generation step.
void SetLogits(OgaTensor& tensor)
IsSessionTerminated()
Check if the generation session has been terminated.
bool IsSessionTerminated() const
Returns: bool - True if the session was terminated (e.g., via SetRuntimeOption)
SetActiveAdapter()
Set the active LoRA adapter for generation.
void SetActiveAdapter(OgaAdapters& adapters, const char* adapter_name)
adapters
OgaAdapters&
required
The loaded adapters collection containing the adapter
adapter_name
const char*
required
Name of the adapter to activate
Complete Examples
Basic Generation
#include "ort_genai.h"
#include <iostream>
int main() {
OgaHandle handle;
try {
// Setup
auto model = OgaModel::Create("phi-2");
auto tokenizer = OgaTokenizer::Create(*model);
// Encode prompt
auto sequences = OgaSequences::Create();
tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences);
// Configure generation
auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 200);
params->SetSearchOption("batch_size", 1);
// Create generator and start generation
auto generator = OgaGenerator::Create(*model, *params);
generator->AppendTokenSequences(*sequences);
// Generate tokens
while (!generator->IsDone()) {
generator->GenerateNextToken();
}
// Decode output
auto output_sequence = generator->GetSequenceData(0);
auto output_length = generator->GetSequenceCount(0);
auto output_string = tokenizer->Decode(output_sequence, output_length);
std::cout << "Output: " << output_string << std::endl;
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return -1;
}
return 0;
}
Based on: ~/workspace/source/src/ort_genai.h:21
Streaming Generation
From ~/workspace/source/examples/c/src/model_qa.cpp:132:
// Create tokenizer stream for real-time decoding
auto stream = OgaTokenizerStream::Create(*tokenizer);
// Generate and stream output
std::cout << "Output: ";
while (!generator->IsDone()) {
generator->GenerateNextToken();
const auto new_token = generator->GetNextTokens()[0];
std::cout << stream->Decode(new_token) << std::flush;
}
std::cout << std::endl;
Chat with Context Management
From ~/workspace/source/examples/c/src/model_chat.cpp:106:
// Encode system prompt
auto sequences = OgaSequences::Create();
tokenizer->Encode(system_prompt.c_str(), *sequences);
generator->AppendTokenSequences(*sequences);
const int system_prompt_length = generator->TokenCount();
// Chat loop
while (true) {
// Get user input
std::string user_input;
std::getline(std::cin, user_input);
if (user_input == "quit()") break;
// Encode and append user message
sequences = OgaSequences::Create();
tokenizer->Encode(user_input.c_str(), *sequences);
generator->AppendTokenSequences(*sequences);
// Generate response
const int current_length = generator->TokenCount();
std::cout << "Assistant: ";
try {
while (!generator->IsDone()) {
generator->GenerateNextToken();
const auto new_token = generator->GetNextTokens()[0];
std::cout << stream->Decode(new_token) << std::flush;
}
} catch (const std::exception& e) {
std::cout << "\nError: " << e.what() << std::endl;
// Rewind on error
generator->RewindTo(current_length);
}
std::cout << "\n\n";
// Optionally rewind to system prompt for next turn
// generator->RewindTo(system_prompt_length);
}
See Also