The GeneratorParams class configures how the model generates text, including search strategy, sampling parameters, and constraints.
Constructor
Create generation parameters for a model.
import onnxruntime_genai as og
model = og.Model("/path/to/model")
params = og.GeneratorParams(model)
model: The Model object to create parameters for
Methods
set_search_options()
Set search and sampling parameters for generation.
params.set_search_options(
    max_length=200,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1
)
All parameters are passed as keyword arguments:

max_length: Maximum number of tokens to generate (including input)
min_length: Minimum number of tokens to generate (including input)
batch_size: Number of sequences to generate in parallel
num_beams: Number of beams for beam search. Use 1 for greedy search.
num_return_sequences: Number of sequences to return (must be ≤ num_beams)
temperature: Sampling temperature. Lower values make output more deterministic. Typical range: 0.1-2.0.
top_p: Nucleus sampling probability threshold. Sampling is restricted to the smallest set of tokens whose cumulative probability exceeds top_p. Typical range: 0.8-0.95.
top_k: Sample from the K most probable tokens. 0 = disabled.
repetition_penalty: Penalty for repeating tokens. Values > 1.0 discourage repetition. Typical range: 1.0-1.5.
do_sample: Enable random sampling. If false, uses greedy or beam search.
random_seed: Random seed for sampling (for reproducibility)
length_penalty: Exponential penalty to length for beam search. Values > 1.0 encourage longer sequences, < 1.0 encourage shorter.
diversity_penalty: Penalty for similar beams in diverse beam search. Higher values encourage diversity.
no_repeat_ngram_size: Size of n-grams that cannot be repeated. 0 = disabled.
early_stopping: Stop beam search when all beams finish
past_present_share_buffer: Share the buffer between the past and present key-value caches. Required for CUDA graph capture.
chunk_size: Chunk size for prefill chunking during context processing. 0 = disabled, > 0 = enabled.
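Any combination of these options can be passed in a single call. As a minimal sketch of options that do not appear in the examples below (the model path is a placeholder; option names as listed above):

import onnxruntime_genai as og

model = og.Model("/path/to/model")
params = og.GeneratorParams(model)

# Generate between 50 and 200 total tokens and never repeat a 3-gram
params.set_search_options(
    min_length=50,
    max_length=200,
    no_repeat_ngram_size=3
)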
get_search_options()
Retrieve current search options as a dictionary.
options = params.get_search_options()
print(f"Max length: {options['max_length']}")
print(f"Temperature: {options['temperature']}")
Returns a dictionary containing all search option key-value pairs.
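A small round-trip sketch, assuming values set through set_search_options are reflected in the returned dictionary alongside defaults for anything unset:

params.set_search_options(max_length=256, temperature=0.7)
options = params.get_search_options()
assert options["max_length"] == 256
assert abs(options["temperature"] - 0.7) < 1e-6  # stored as a float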
set_guidance()
Set constrained generation guidance using grammars or JSON schemas.
# JSON schema guidance
json_schema = '{"type": "object", "properties": {...}}'
params.set_guidance("json_schema", json_schema)
# Lark grammar guidance
lark_grammar = "start: TEXT\nTEXT: /[^<](.|\\n)*/"
params.set_guidance("lark_grammar", lark_grammar, enable_ff_tokens=False)
type: The guidance type, either "json_schema" or "lark_grammar"
data: The schema or grammar definition as a string
enable_ff_tokens: Enable fast-forward tokens for guidance
Example Usage
Basic search options:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)
# Simple greedy decoding
params.set_search_options(
    max_length=200
)
generator = og.Generator(model, params)
Sampling with temperature:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)
# Creative sampling
params.set_search_options(
    max_length=500,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1
)
generator = og.Generator(model, params)
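To surface tokens as they are sampled instead of decoding at the end, the same parameters can drive a streaming loop. A sketch using the library's TokenizerStream, where get_next_tokens() returns the most recent token for each sequence:

tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

generator.append_tokens(tokenizer.encode("Write a haiku about autumn"))
while not generator.is_done():
    generator.generate_next_token()
    # Decode and print the newest token for the first (only) sequence
    print(tokenizer_stream.decode(generator.get_next_tokens()[0]), end="", flush=True)
print()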
Beam search:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)
# Beam search for better quality
params.set_search_options(
    max_length=200,
    num_beams=5,
    num_return_sequences=3,
    length_penalty=0.8,
    early_stopping=True
)
generator = og.Generator(model, params)
tokenizer = og.Tokenizer(model)
input_tokens = tokenizer.encode("Explain quantum computing")
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()

# Get top 3 sequences
for i in range(3):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Sequence {i+1}: {output}\n")
Batch generation:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
prompts = [
    "The first 4 digits of pi are",
    "The square root of 2 is",
    "The capital of France is"
]
input_tokens = tokenizer.encode_batch(prompts)
params = og.GeneratorParams(model)
params.set_search_options(
    batch_size=len(prompts),
    max_length=100,
    temperature=0.7
)
generator = og.Generator(model, params)
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()

for i in range(len(prompts)):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Output {i}: {output}\n")
JSON schema guidance:
import onnxruntime_genai as og
import json
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
# Define JSON schema for tool calling
json_schema = {
    "type": "array",
    "items": {
        "anyOf": [
            {
                "type": "object",
                "properties": {
                    "name": {"const": "get_weather"},
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {"type": "string"},
                            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                        },
                        "required": ["location"]
                    }
                },
                "required": ["name", "parameters"]
            }
        ]
    },
    "minItems": 1
}
params.set_guidance("json_schema", json.dumps(json_schema))
generator = og.Generator(model, params)
prompt = "What's the weather in Paris?"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()
output = tokenizer.decode(generator.get_sequence(0))
print(f"Structured output: {output}")
Lark grammar guidance:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
params = og.GeneratorParams(model)
params.set_search_options(max_length=200)
# Define grammar that only allows text (no tool calls)
lark_grammar = """start: TEXT
TEXT: /[^<](.|\\n)*/"""
params.set_guidance("lark_grammar", lark_grammar)
generator = og.Generator(model, params)
prompt = "Tell me about Python"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)
while not generator.is_done():
    generator.generate_next_token()
output = tokenizer.decode(generator.get_sequence(0))
print(output)
Reproducible generation:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)
def generate_with_seed(prompt, seed):
    params = og.GeneratorParams(model)
    params.set_search_options(
        max_length=100,
        do_sample=True,
        temperature=0.8,
        random_seed=seed
    )
    generator = og.Generator(model, params)
    input_tokens = tokenizer.encode(prompt)
    generator.append_tokens(input_tokens)
    while not generator.is_done():
        generator.generate_next_token()
    return tokenizer.decode(generator.get_sequence(0))
# Same seed produces same output
output1 = generate_with_seed("Once upon a time", seed=42)
output2 = generate_with_seed("Once upon a time", seed=42)
assert output1 == output2
print("Outputs are identical!")