The MultiModalProcessor class handles preprocessing of images, audio, and text for vision-language and audio-language models.
Constructor
Create a multimodal processor from a model.
import onnxruntime_genai as og
model = og.Model("/path/to/model")
processor = model.create_multimodal_processor()
The processor is created using the create_multimodal_processor() method on a Model object.
Methods
__call__()
Process text prompts along with images and/or audio into model inputs.
# Single prompt with image
images = og.Images.open("photo.jpg")
inputs = processor("<|image_1|>\nWhat is in this image?", images=images)
# Single prompt with audio
audios = og.Audios.open("speech.wav")
inputs = processor("<|audio_1|>\nTranscribe this audio.", audios=audios)
# Multiple prompts
prompts = ["<|image_1|>\nDescribe this", "<|image_1|>\nWhat colors are present?"]
images = og.Images.open("photo.jpg")
inputs = processor(prompts, images=images)
prompt
str | list[str] | None
default: None
Text prompt(s) to process. Can be:
- A single string
- A list of strings for batch processing
- None if only processing media without text
images: Images object containing one or more images (optional)
audios: Audios object containing one or more audio files (optional)
Returns: Preprocessed inputs ready to pass to Generator.set_inputs()
decode()
Decode token IDs back to text.
text = processor.decode(tokens)
Array of int32 token IDs to decode
create_stream()
Create a streaming tokenizer for incremental decoding.
stream = processor.create_stream()
while not generator.is_done():
generator.generate_next_token()
new_token = generator.get_next_tokens()[0]
print(stream.decode(new_token), end="", flush=True)
Returns: A TokenizerStream object for incremental (streaming) decoding
Images Class
Load and manage images for multimodal processing.
open()
Load images from file paths.
# Single image
images = og.Images.open("photo.jpg")
# Multiple images
images = og.Images.open("photo1.jpg", "photo2.jpg", "photo3.jpg")
One or more file paths to image files
Images object containing the loaded images
open_bytes()
Load images from bytes in memory.
import io
from PIL import Image
# Convert PIL Image to bytes
img = Image.open("photo.jpg")
buf = io.BytesIO()
img.save(buf, format='PNG')
image_bytes = buf.getvalue()
images = og.Images.open_bytes(image_bytes)
One or more byte objects containing image data
Images object containing the loaded images
Audios Class
Load and manage audio files for multimodal processing.
open()
Load audio files from file paths.
# Single audio
audios = og.Audios.open("speech.wav")
# Multiple audios
audios = og.Audios.open("audio1.wav", "audio2.wav", "audio3.wav")
One or more file paths to audio files
Audios object containing the loaded audio files
open_bytes()
Load audio from bytes in memory.
with open("speech.wav", "rb") as f:
audio_bytes = f.read()
audios = og.Audios.open_bytes(audio_bytes)
One or more byte objects containing audio data
Audios object containing the loaded audio files
Example Usage
Vision-language model (Phi-3 Vision):
import onnxruntime_genai as og
# Load model and create processor
model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()
# Load image
images = og.Images.open("photo.jpg")
# Process with image tag
prompt = "<|image_1|>\nWhat objects are in this image?"
inputs = processor(prompt, images=images)
# Generate
params = og.GeneratorParams(model)
params.set_search_options(max_length=512, temperature=0.7)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
print("Output: ", end="", flush=True)
stream = processor.create_stream()
while not generator.is_done():
generator.generate_next_token()
new_token = generator.get_next_tokens()[0]
print(stream.decode(new_token), end="", flush=True)
print()
Multiple images:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()
# Load multiple images
images = og.Images.open("image1.jpg", "image2.jpg", "image3.jpg")
# Reference images in prompt
prompt = """<|image_1|>
<|image_2|>
<|image_3|>
Compare these three images and describe their differences."""
inputs = processor(prompt, images=images)
params = og.GeneratorParams(model)
params.set_search_options(max_length=1024)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
while not generator.is_done():
generator.generate_next_token()
output = processor.decode(generator.get_sequence(0))
print(output)
Audio transcription:
import onnxruntime_genai as og
model = og.Model("/models/whisper-large")
processor = model.create_multimodal_processor()
# Load audio file
audios = og.Audios.open("speech.wav")
# Process audio
inputs = processor(None, audios=audios)
# Generate transcription
params = og.GeneratorParams(model)
params.set_search_options(max_length=1000)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
while not generator.is_done():
generator.generate_next_token()
transcript = processor.decode(generator.get_sequence(0))
print(f"Transcript: {transcript}")
Multimodal with images and audio (Phi-4):
import onnxruntime_genai as og
model = og.Model("/models/phi-4-multimodal")
processor = model.create_multimodal_processor()
# Load both images and audio
images = og.Images.open("scene.jpg")
audios = og.Audios.open("description.wav")
# Process together
prompt = "<|image_1|>\n<|audio_1|>\nDoes the audio description match the image?"
inputs = processor(prompt, images=images, audios=audios)
params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
while not generator.is_done():
generator.generate_next_token()
output = processor.decode(generator.get_sequence(0))
print(output)
Batch processing multiple prompts:
import onnxruntime_genai as og
model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()
images = og.Images.open("photo.jpg")
# Multiple prompts for same image
prompts = [
"<|image_1|>\nWhat is the main subject?",
"<|image_1|>\nWhat colors are present?",
"<|image_1|>\nDescribe the lighting."
]
inputs = processor(prompts, images=images)
params = og.GeneratorParams(model)
params.set_search_options(
batch_size=len(prompts),
max_length=512
)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
while not generator.is_done():
generator.generate_next_token()
for i, prompt in enumerate(prompts):
output = processor.decode(generator.get_sequence(i))
print(f"Q: {prompt}")
print(f"A: {output}")
print()
Loading from bytes:
import onnxruntime_genai as og
import requests
from io import BytesIO
model = og.Model("/models/phi-3-vision")
processor = model.create_multimodal_processor()
# Download image from URL
url = "https://example.com/image.jpg"
response = requests.get(url)
image_bytes = response.content
images = og.Images.open_bytes(image_bytes)
prompt = "<|image_1|>\nWhat is this?"
inputs = processor(prompt, images=images)
params = og.GeneratorParams(model)
params.set_search_options(max_length=512)
generator = og.Generator(model, params)
generator.set_inputs(inputs)
while not generator.is_done():
generator.generate_next_token()
output = processor.decode(generator.get_sequence(0))
print(output)