The MultiModalProcessor class handles preprocessing of images, audio, and text for vision-language and audio-language models.

Constructor

Create a multimodal processor by calling create_multimodal_processor() on a Model object.
import onnxruntime_genai as og

model = og.Model("/path/to/model")
processor = model.create_multimodal_processor()

Methods

__call__()

Process text prompts along with images and/or audio into model inputs.
# Single prompt with image
images = og.Images.open("photo.jpg")
inputs = processor("<|image_1|>\nWhat is in this image?", images=images)

# Single prompt with audio
audios = og.Audios.open("speech.wav")
inputs = processor("<|audio_1|>\nTranscribe this audio.", audios=audios)

# Multiple prompts
prompts = ["<|image_1|>\nDescribe this", "<|image_1|>\nWhat colors are present?"]
images = og.Images.open("photo.jpg")
inputs = processor(prompts, images=images)
Parameters:

prompt (str | list[str] | None, default: None)
  Text prompt(s) to process. Can be:
    • A single string
    • A list of strings for batch processing
    • None if only processing media without text

images (Images, default: None)
  Images object containing one or more images

audios (Audios, default: None)
  Audios object containing one or more audio files

Returns:

inputs (NamedTensors)
  Preprocessed inputs ready to pass to Generator.set_inputs()
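
The returned NamedTensors are handed straight to a generator. A minimal sketch of that hand-off (model and inputs as in the snippets above):
params = og.GeneratorParams(model)
generator = og.Generator(model, params)
generator.set_inputs(inputs)  # consume the processor's output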

decode()

Decode token IDs back to text.
text = processor.decode(tokens)
Parameters:

tokens (numpy.ndarray, required)
  Array of int32 token IDs to decode

Returns:

text (str)
  The decoded text string
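
In practice the token array comes from a finished generation; a short sketch (generator set up as in Example Usage below):
# get_sequence(0) returns the int32 token IDs of the first generated sequence
output_tokens = generator.get_sequence(0)
text = processor.decode(output_tokens)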

create_stream()

Create a streaming tokenizer for incremental decoding.
stream = processor.create_stream()

# `generator` is an og.Generator, set up as in Example Usage below
while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
Returns:

stream (TokenizerStream)
  A TokenizerStream object for streaming decoding

Images Class

Load and manage images for multimodal processing.

open()

Load images from file paths.
# Single image
images = og.Images.open("photo.jpg")

# Multiple images
images = og.Images.open("photo1.jpg", "photo2.jpg", "photo3.jpg")
Parameters:

*image_paths (str, required)
  One or more file paths to image files

Returns:

images (Images)
  Images object containing the loaded images

open_bytes()

Load images from bytes in memory.
import io
from PIL import Image

# Convert PIL Image to bytes
img = Image.open("photo.jpg")
buf = io.BytesIO()
img.save(buf, format='PNG')
image_bytes = buf.getvalue()

images = og.Images.open_bytes(image_bytes)
Parameters:

*image_datas (bytes, required)
  One or more bytes objects containing image data

Returns:

images (Images)
  Images object containing the loaded images
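
The bytes need not come from PIL; reading an encoded image file directly from disk works as well. A minimal sketch:
# Read raw encoded image bytes straight from a file
with open("photo.jpg", "rb") as f:
    images = og.Images.open_bytes(f.read())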

Audios Class

Load and manage audio files for multimodal processing.

open()

Load audio files from file paths.
# Single audio
audios = og.Audios.open("speech.wav")

# Multiple audios
audios = og.Audios.open("audio1.wav", "audio2.wav", "audio3.wav")
Parameters:

*audio_paths (str, required)
  One or more file paths to audio files

Returns:

audios (Audios)
  Audios object containing the loaded audio files

open_bytes()

Load audio from bytes in memory.
with open("speech.wav", "rb") as f:
    audio_bytes = f.read()

audios = og.Audios.open_bytes(audio_bytes)
Parameters:

*audio_datas (bytes, required)
  One or more bytes objects containing audio data

Returns:

audios (Audios)
  Audios object containing the loaded audio files
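
Because the method is variadic, several in-memory clips can be loaded in one call; a sketch with hypothetical file names:
# Each argument is an independent bytes object holding one audio file
with open("clip1.wav", "rb") as f1, open("clip2.wav", "rb") as f2:
    audios = og.Audios.open_bytes(f1.read(), f2.read())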

Example Usage

Vision-language model (Phi-3 Vision):
import onnxruntime_genai as og

# Load model and create processor
model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()

# Load image
images = og.Images.open("photo.jpg")

# Process with image tag
prompt = "<|image_1|>\nWhat objects are in this image?"
inputs = processor(prompt, images=images)

# Generate
params = og.GeneratorParams(model)
params.set_search_options(max_length=512, temperature=0.7, do_sample=True)
generator = og.Generator(model, params)
generator.set_inputs(inputs)

print("Output: ", end="", flush=True)
stream = processor.create_stream()

while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
Multiple images:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()

# Load multiple images
images = og.Images.open("image1.jpg", "image2.jpg", "image3.jpg")

# Reference images in prompt
prompt = """<|image_1|>
<|image_2|>
<|image_3|>
Compare these three images and describe their differences."""

inputs = processor(prompt, images=images)

params = og.GeneratorParams(model)
params.set_search_options(max_length=1024)
generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
    generator.generate_next_token()

output = processor.decode(generator.get_sequence(0))
print(output)
Audio transcription:
import onnxruntime_genai as og

model = og.Model("/models/whisper-large")
processor = model.create_multimodal_processor()

# Load audio file
audios = og.Audios.open("speech.wav")

# Process audio
inputs = processor(None, audios=audios)

# Generate transcription
params = og.GeneratorParams(model)
params.set_search_options(max_length=1000)
generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
    generator.generate_next_token()

transcript = processor.decode(generator.get_sequence(0))
print(f"Transcript: {transcript}")
Multimodal with images and audio (Phi-4):
import onnxruntime_genai as og

model = og.Model("/models/phi-4-multimodal")
processor = model.create_multimodal_processor()

# Load both images and audio
images = og.Images.open("scene.jpg")
audios = og.Audios.open("description.wav")

# Process together
prompt = "<|image_1|>\n<|audio_1|>\nDoes the audio description match the image?"
inputs = processor(prompt, images=images, audios=audios)

params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
    generator.generate_next_token()

output = processor.decode(generator.get_sequence(0))
print(output)
Batch processing multiple prompts:
import onnxruntime_genai as og

model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()

images = og.Images.open("photo.jpg")

# Multiple prompts for same image
prompts = [
    "<|image_1|>\nWhat is the main subject?",
    "<|image_1|>\nWhat colors are present?",
    "<|image_1|>\nDescribe the lighting."
]

inputs = processor(prompts, images=images)

params = og.GeneratorParams(model)
params.set_search_options(
    batch_size=len(prompts),
    max_length=512
)

generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
    generator.generate_next_token()

for i, prompt in enumerate(prompts):
    output = processor.decode(generator.get_sequence(i))
    print(f"Q: {prompt}")
    print(f"A: {output}")
    print()
Loading from bytes:
import onnxruntime_genai as og
import requests

model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()

# Download image from URL
url = "https://example.com/image.jpg"
response = requests.get(url)
image_bytes = response.content

images = og.Images.open_bytes(image_bytes)

prompt = "<|image_1|>\nWhat is this?"
inputs = processor(prompt, images=images)

params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
generator = og.Generator(model, params)
generator.set_inputs(inputs)

while not generator.is_done():
    generator.generate_next_token()

output = processor.decode(generator.get_sequence(0))
print(output)
