Generator — onnxruntime_genai Python API Reference
The Generator class manages the token generation loop and state.

Constructor

Create a generator from a model and parameters.
import onnxruntime_genai as og

model = og.Model("/path/to/model")
params = og.GeneratorParams(model)
params.set_search_options(max_length=200)

generator = og.Generator(model, params)
model
Model
required
The Model object to generate with
params
GeneratorParams
required
Generation parameters including search options

Methods

append_tokens()

Add input tokens to the generator. Can accept either a numpy array or OgaTensor.
# From numpy array
input_tokens = tokenizer.encode("Hello")
generator.append_tokens(input_tokens)

# From tensor
input_tensor = og.Tensor(input_tokens)
generator.append_tokens(input_tensor)
tokens
numpy.ndarray | OgaTensor
required
Token IDs to add to the input sequence

generate_next_token()

Generate the next token in the sequence.
while not generator.is_done():
    generator.generate_next_token()
This method runs one iteration of the generation loop, including:
  • Model forward pass
  • Sampling from logits
  • Updating internal state

is_done()

Check if generation is complete.
if generator.is_done():
    print("Generation finished")
done
bool
Returns True if generation has finished for every sequence in the batch (e.g. all sequences hit an end-of-sequence token or the max_length limit)

get_next_tokens()

Get the most recently generated token for each sequence in the batch.
new_tokens = generator.get_next_tokens()
print(f"Generated token: {new_tokens[0]}")
tokens
numpy.ndarray
Array of int32 token IDs, one per sequence in the batch

get_sequence()

Get the complete token sequence for a specific batch index.
# Get first sequence in batch
sequence = generator.get_sequence(0)
text = tokenizer.decode(sequence)
index
int
required
Batch index of the sequence to retrieve
sequence
numpy.ndarray
Complete array of token IDs including input and generated tokens

token_count()

Get the total number of tokens processed so far.
count = generator.token_count()
print(f"Processed {count} tokens")
count
int
Total number of tokens processed so far, including both input tokens and generated tokens

set_inputs()

Set model inputs from a NamedTensors object (typically from multimodal processor).
# For multimodal models
processor = model.create_multimodal_processor()
images = og.Images.open("image.jpg")
inputs = processor("<|image_1|>\nWhat is this?", images=images)

generator.set_inputs(inputs)
inputs
NamedTensors
required
Named tensor inputs from a processor

get_logits()

Get the current logits (pre-softmax scores) for the next token.
logits = generator.get_logits()
print(f"Logits shape: {logits.shape}")
logits
numpy.ndarray
Float array of shape [batch_size, vocab_size]

set_logits()

Manually set the logits before sampling the next token.
import numpy as np

# Modify logits (e.g., apply custom bias)
logits = generator.get_logits()
logits[:, bad_token_id] = -float('inf')  # Ban a token
generator.set_logits(logits)
logits
numpy.ndarray
required
Float array of shape [batch_size, vocab_size]

rewind_to()

Rewind the generator to a previous token position.
# Save initial position
initial_length = generator.token_count()

# Generate some tokens
for _ in range(10):
    if not generator.is_done():
        generator.generate_next_token()

# Go back to initial position
generator.rewind_to(initial_length)
length
int
required
Token position to rewind to

get_input()

Get a model input tensor by name.
input_ids = generator.get_input("input_ids")
name
str
required
Name of the input tensor
tensor
numpy.ndarray
The requested input tensor as a numpy array

get_output()

Get a model output tensor by name.
output = generator.get_output("logits")
name
str
required
Name of the output tensor
tensor
numpy.ndarray
The requested output tensor as a numpy array

set_model_input()

Manually set a model input tensor.
import numpy as np

attention_mask = np.ones((1, 100), dtype=np.int32)
generator.set_model_input("attention_mask", attention_mask)
name
str
required
Name of the input tensor
value
numpy.ndarray
required
Tensor data as a numpy array

set_active_adapter()

Switch to a different LoRA adapter.
adapters = og.Adapters(model)
adapters.load("/path/to/adapter", "my_adapter")

generator.set_active_adapter(adapters, "my_adapter")
adapters
Adapters
required
Adapters object containing loaded adapters
adapter_name
str
required
Name of the adapter to activate

set_runtime_option()

Set a runtime option for the generator.
generator.set_runtime_option("cuda_graph", "1")
key
str
required
Option key
value
str
required
Option value

Example Usage

Basic generation:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=200, temperature=0.7)

generator = og.Generator(model, params)

prompt = "The first 4 digits of pi are"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

output = tokenizer.decode(generator.get_sequence(0))
print(output)
Streaming generation:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
stream = tokenizer.create_stream()

params = og.GeneratorParams(model)
params.set_search_options(max_length=200)

generator = og.Generator(model, params)
input_tokens = tokenizer.encode("Tell me a story")
generator.append_tokens(input_tokens)

print("Output: ", end="", flush=True)
while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
Batch generation:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

prompts = [
    "The first 4 digits of pi are",
    "The square root of 2 is",
    "The capital of France is"
]

input_tokens = tokenizer.encode_batch(prompts)

params = og.GeneratorParams(model)
params.set_search_options(batch_size=len(prompts), max_length=100)

generator = og.Generator(model, params)
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

for i in range(len(prompts)):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Prompt {i}: {output}\n")
Manipulating logits:
import onnxruntime_genai as og
import numpy as np

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=50)

generator = og.Generator(model, params)
input_tokens = tokenizer.encode("Once upon a time")
generator.append_tokens(input_tokens)

# Ban certain tokens
banned_tokens = [tokenizer.to_token_id("violence"), tokenizer.to_token_id("hate")]

while not generator.is_done():
    # Get and modify logits
    logits = generator.get_logits()
    for token_id in banned_tokens:
        logits[:, token_id] = -float('inf')
    generator.set_logits(logits)
    
    generator.generate_next_token()

output = tokenizer.decode(generator.get_sequence(0))
print(output)
Rewinding for chat:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=512)

generator = og.Generator(model, params)

# System prompt
system_prompt = "You are a helpful AI assistant."
system_tokens = tokenizer.encode(system_prompt)
generator.append_tokens(system_tokens)
system_length = generator.token_count()

while True:
    user_input = input("User: ")
    if user_input == "quit()":
        break
    
    # Add user message
    user_tokens = tokenizer.encode(user_input)
    generator.append_tokens(user_tokens)
    
    # Generate response
    print("Assistant: ", end="", flush=True)
    stream = tokenizer.create_stream()
    
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        print(stream.decode(new_token), end="", flush=True)
    print()
    
    # Rewind to system prompt to clear history
    generator.rewind_to(system_length)
