# GeneratorParams
The GeneratorParams class configures how the model generates text, including search strategy, sampling parameters, and constraints.

Constructor

Create generation parameters for a model.
import onnxruntime_genai as og

model = og.Model("/path/to/model")
params = og.GeneratorParams(model)
model
Model
required
The Model object to create parameters for

Methods

set_search_options()

Set search and sampling parameters for generation.
params.set_search_options(
    max_length=200,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1
)
All parameters are passed as keyword arguments:
max_length
int
Maximum number of tokens to generate (including input)
min_length
int
Minimum number of tokens to generate (including input)
batch_size
int
default:"1"
Number of sequences to generate in parallel
num_beams
int
default:"1"
Number of beams for beam search. Use 1 for greedy search.
num_return_sequences
int
default:"1"
Number of sequences to return (must be ≤ num_beams)
temperature
float
default:"1.0"
Sampling temperature. Lower values make output more deterministic. Typical range: 0.1-2.0.
top_p
float
default:"1.0"
Nucleus sampling probability threshold. Only the smallest set of most-probable tokens whose cumulative probability reaches top_p is considered. Typical range: 0.8-0.95.
top_k
int
default:"0"
Sample from top K tokens. 0 = disabled.
repetition_penalty
float
default:"1.0"
Penalty for repeating tokens. Values > 1.0 discourage repetition. Typical range: 1.0-1.5.
do_sample
bool
default:"False"
Enable random sampling. If false, uses greedy or beam search.
random_seed
int
Random seed for sampling (for reproducibility)
length_penalty
float
default:"1.0"
Exponential penalty to length for beam search. Values < 1.0 encourage longer sequences, > 1.0 encourage shorter.
diversity_penalty
float
default:"0.0"
Penalty for similar beams in diverse beam search. Higher values encourage diversity.
no_repeat_ngram_size
int
default:"0"
Size of n-grams that cannot be repeated. 0 = disabled.
early_stopping
bool
default:"False"
Stop beam search when all beams finish
past_present_share_buffer
bool
default:"True"
Share buffer between past and present key-value cache. Required for CUDA graph capture.
chunk_size
int
default:"0"
Chunk size for prefill chunking during context processing. 0 = disabled, >0 = enabled.

get_search_options()

Retrieve current search options as a dictionary.
options = params.get_search_options()
print(f"Max length: {options['max_length']}")
print(f"Temperature: {options['temperature']}")
options
dict
Dictionary containing all search option key-value pairs

set_guidance()

Set constrained generation guidance using grammars or JSON schemas.
# JSON schema guidance
json_schema = '{"type": "object", "properties": {...}}'
params.set_guidance("json_schema", json_schema)

# Lark grammar guidance
lark_grammar = "start: TEXT\nTEXT: /[^<](.|\\n)*/"
params.set_guidance("lark_grammar", lark_grammar, enable_ff_tokens=False)
type
str
required
Guidance type: "json_schema" or "lark_grammar"
data
str
required
The schema or grammar definition as a string
enable_ff_tokens
bool
default:"False"
Enable fast-forward tokens for guidance

Example Usage

Basic search options:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)

# Simple greedy decoding
params.set_search_options(
    max_length=200
)

generator = og.Generator(model, params)
Sampling with temperature:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)

# Creative sampling
params.set_search_options(
    max_length=500,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1
)

generator = og.Generator(model, params)
Beam search:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
params = og.GeneratorParams(model)

# Beam search for better quality
params.set_search_options(
    max_length=200,
    num_beams=5,
    num_return_sequences=3,
    length_penalty=0.8,
    early_stopping=True
)

generator = og.Generator(model, params)
tokenizer = og.Tokenizer(model)

input_tokens = tokenizer.encode("Explain quantum computing")
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

# Get top 3 sequences
for i in range(3):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Sequence {i+1}: {output}\n")
Batch generation:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

prompts = [
    "The first 4 digits of pi are",
    "The square root of 2 is",
    "The capital of France is"
]

input_tokens = tokenizer.encode_batch(prompts)

params = og.GeneratorParams(model)
params.set_search_options(
    batch_size=len(prompts),
    max_length=100,
    temperature=0.7
)

generator = og.Generator(model, params)
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

for i in range(len(prompts)):
    output = tokenizer.decode(generator.get_sequence(i))
    print(f"Output {i}: {output}\n")
JSON schema guidance:
import onnxruntime_genai as og
import json

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=512)

# Define JSON schema for tool calling
json_schema = {
    "type": "array",
    "items": {
        "anyOf": [
            {
                "type": "object",
                "properties": {
                    "name": {"const": "get_weather"},
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {"type": "string"},
                            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                        },
                        "required": ["location"]
                    }
                },
                "required": ["name", "parameters"]
            }
        ]
    },
    "minItems": 1
}

params.set_guidance("json_schema", json.dumps(json_schema))

generator = og.Generator(model, params)
prompt = "What's the weather in Paris?"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

output = tokenizer.decode(generator.get_sequence(0))
print(f"Structured output: {output}")
Lark grammar guidance:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=200)

# Define grammar that only allows text (no tool calls)
lark_grammar = """start: TEXT
TEXT: /[^<](.|\\n)*/"""

params.set_guidance("lark_grammar", lark_grammar)

generator = og.Generator(model, params)
prompt = "Tell me about Python"
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)

while not generator.is_done():
    generator.generate_next_token()

output = tokenizer.decode(generator.get_sequence(0))
print(output)
Reproducible generation:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-mini")
tokenizer = og.Tokenizer(model)

def generate_with_seed(prompt, seed):
    params = og.GeneratorParams(model)
    params.set_search_options(
        max_length=100,
        do_sample=True,
        temperature=0.8,
        random_seed=seed
    )
    
    generator = og.Generator(model, params)
    input_tokens = tokenizer.encode(prompt)
    generator.append_tokens(input_tokens)
    
    while not generator.is_done():
        generator.generate_next_token()
    
    return tokenizer.decode(generator.get_sequence(0))

# Same seed produces same output
output1 = generate_with_seed("Once upon a time", seed=42)
output2 = generate_with_seed("Once upon a time", seed=42)
assert output1 == output2
print("Outputs are identical!")
