The Generator class manages the token generation loop and state.
Constructor
Create a generator from a model and parameters.
import onnxruntime_genai as og
model = og.Model("/path/to/model")
params = og.GeneratorParams(model)
params.set_search_options(max_length=200)
generator = og.Generator(model, params)
model: The Model object to generate with
params: Generation parameters, including search options
Methods
append_tokens()
Add input tokens to the generator. Accepts either a numpy array or an OgaTensor.
# From numpy array
input_tokens = tokenizer.encode("Hello")
generator.append_tokens(input_tokens)
# From tensor
input_tensor = og.Tensor(input_tokens)
generator.append_tokens(input_tensor)
tokens (numpy.ndarray | OgaTensor, required): Token IDs to add to the input sequence
generate_next_token()
Generate the next token in the sequence.
while not generator.is_done():
    generator.generate_next_token()
This method runs one iteration of the generation loop, including:
- Model forward pass
- Sampling from logits
- Updating internal state
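One iteration pairs naturally with get_next_tokens() to read back what was sampled. A minimal sketch, assuming a generator set up as in the examples below:

generator.generate_next_token()          # forward pass, sampling, state update
token = generator.get_next_tokens()[0]   # token just sampled for the first sequence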
is_done()
Check if generation is complete.
if generator.is_done():
    print("Generation finished")
Returns: True if generation has finished for all sequences in the batch
get_next_tokens()
Get the most recently generated token for each sequence in the batch.
new_tokens = generator.get_next_tokens()
print(f"Generated token: {new_tokens[0]}")
Returns: Array of int32 token IDs, one per sequence in the batch
get_sequence()
Get the complete token sequence for a specific batch index.
# Get first sequence in batch
sequence = generator.get_sequence(0)
text = tokenizer.decode(sequence)
Batch index of the sequence to retrieve
Returns: Complete array of token IDs, including both input and generated tokens
token_count()
Get the total number of tokens processed so far.
count = generator.token_count()
print(f"Processed {count} tokens")
Returns: Total number of tokens in the sequence
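Because the count grows as tokens are appended and generated, token_count() can also bound a generation loop directly. A small sketch; the budget value is hypothetical:

max_tokens = 256  # hypothetical token budget
while not generator.is_done() and generator.token_count() < max_tokens:
    generator.generate_next_token()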
set_inputs()
Set model inputs from a NamedTensors object (typically produced by a multimodal processor).
# For multimodal models
processor = model.create_multimodal_processor()
images = og.Images.open("image.jpg")
inputs = processor("<|image_1|>\nWhat is this?", images=images)
generator.set_inputs(inputs)
Named tensor inputs from a processor
get_logits()
Get the current logits (pre-softmax scores) for the next token.
logits = generator.get_logits()
print(f"Logits shape: {logits.shape}")
Returns: Float array of shape [batch_size, vocab_size]
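The raw logits can be softmaxed to preview the most likely next tokens. A sketch assuming numpy and a tokenizer created as in the examples below:

import numpy as np

logits = generator.get_logits()
# Softmax over the vocabulary to get next-token probabilities
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
# Show the five most likely next tokens for the first sequence
for token_id in np.argsort(probs[0])[::-1][:5]:
    print(int(token_id), tokenizer.decode([int(token_id)]), float(probs[0][token_id]))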
set_logits()
Manually set the logits before sampling the next token.
import numpy as np
# Modify logits (e.g., apply custom bias)
logits = generator.get_logits()
logits[:, bad_token_id] = -float('inf') # Ban a token
generator.set_logits(logits)
Float array of shape [batch_size, vocab_size]
rewind_to()
Rewind the generator to a previous token position.
# Save the initial position
initial_length = generator.token_count()

# Generate some tokens
for _ in range(10):
    if not generator.is_done():
        generator.generate_next_token()

# Go back to the initial position
generator.rewind_to(initial_length)
Token position to rewind to
get_input()
Get a model input tensor by name.
input_ids = generator.get_input("input_ids")
Name of the input tensor
Returns: The requested input tensor as a numpy array
get_output()
Get a model output tensor by name.
output = generator.get_output("logits")
Name of the output tensor
Returns: The requested output tensor as a numpy array
set_model_input()
Manually set a model input tensor.
import numpy as np
attention_mask = np.ones((1, 100), dtype=np.int32)
generator.set_model_input("attention_mask", attention_mask)
Name of the model input to set
Tensor data as a numpy array
set_active_adapter()
Switch to a different LoRA adapter.
adapters = og.Adapters(model)
adapters.load("/path/to/adapter", "my_adapter")
generator.set_active_adapter(adapters, "my_adapter")
Adapters object containing loaded adapters
Name of the adapter to activate
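Several adapters can be loaded up front and selected per task. A sketch with hypothetical adapter paths and names:

adapters = og.Adapters(model)
adapters.load("/path/to/summarize_adapter", "summarize")  # hypothetical path and name
adapters.load("/path/to/translate_adapter", "translate")  # hypothetical path and name

# Activate the adapter that matches the current task before generating
generator.set_active_adapter(adapters, "translate")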
set_runtime_option()
Set a runtime option for the generator.
generator.set_runtime_option("cuda_graph", "1")
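Option names and values are passed as strings and depend on the build and execution provider. For instance, recent releases expose a terminate_session option for cancelling an in-flight generation from another thread (treat the exact option name as release-dependent):

# Request cancellation of a running generation loop (option name may vary by release)
generator.set_runtime_option("terminate_session", "1")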
Example Usage
Basic generation:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
params = og.GeneratorParams(model)
params.set_search_options(max_length=200, temperature=0.7)
generator = og.Generator(model, params)
prompt = "The first 4 digits of pi are"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()
output = tokenizer.decode(generator.get_sequence(0))
print(output)
Streaming generation:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
stream = tokenizer.create_stream()
params = og.GeneratorParams(model)
params.set_search_options(max_length=200)
generator = og.Generator(model, params)
input_tokens = tokenizer.encode("Tell me a story")
generator.append_tokens(input_tokens)
print("Output: ", end="", flush=True)
while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
Batch generation:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
prompts = [
"The first 4 digits of pi are",
"The square root of 2 is",
"The capital of France is"
]
input_tokens = tokenizer.encode_batch(prompts)
params = og.GeneratorParams(model)
params.set_search_options(batch_size=len(prompts), max_length=100)
generator = og.Generator(model, params)
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()

for i in range(len(prompts)):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Prompt {i}: {output}\n")
Manipulating logits:
import onnxruntime_genai as og
import numpy as np
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
params = og.GeneratorParams(model)
params.set_search_options(max_length=50)
generator = og.Generator(model, params)
input_tokens = tokenizer.encode("Once upon a time")
generator.append_tokens(input_tokens)
# Ban certain tokens
banned_tokens = [tokenizer.to_token_id("violence"), tokenizer.to_token_id("hate")]
while not generator.is_done():
    # Modify the logits before sampling
    logits = generator.get_logits()
    for token_id in banned_tokens:
        logits[:, token_id] = -float('inf')
    generator.set_logits(logits)
    generator.generate_next_token()
output = tokenizer.decode(generator.get_sequence(0))
print(output)
Rewinding for chat:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
generator = og.Generator(model, params)
# System prompt
system_prompt = "You are a helpful AI assistant."
system_tokens = tokenizer.encode(system_prompt)
generator.append_tokens(system_tokens)
system_length = generator.token_count()
while True:
    user_input = input("User: ")
    if user_input == "quit()":
        break

    # Add the user message
    user_tokens = tokenizer.encode(user_input)
    generator.append_tokens(user_tokens)

    # Generate the response
    print("Assistant: ", end="", flush=True)
    stream = tokenizer.create_stream()
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        print(stream.decode(new_token), end="", flush=True)
    print()

    # Rewind to the system prompt to clear the chat history
    generator.rewind_to(system_length)