ONNX Runtime provides specialized tools for optimizing transformer-based models including BERT, GPT-2, T5, BART, and other popular architectures. These tools apply graph transformations to fuse operators and improve performance.
Optimizer
optimize_model()
Optimize transformer models with architecture-specific fusions.
from onnxruntime.transformers import optimizer
optimizer.optimize_model(
    input: str | ModelProto,
    model_type: str = "bert",
    num_heads: int = 0,
    hidden_size: int = 0,
    optimization_options: FusionOptions | None = None,
    opt_level: int | None = None,
    use_gpu: bool = False,
    only_onnxruntime: bool = False
)
input: Path to ONNX model file or ModelProto object.
model_type: Model architecture type. Supported values:
- "bert" - BERT and variants (default)
- "gpt2" - GPT-2, GPT-Neo
- "bart" - BART models
- "t5" - T5 models
- "clip" - CLIP vision/text models
- "unet" - UNet (Stable Diffusion)
- "vae" - VAE (Stable Diffusion)
- "phi" - Phi models
num_heads: Number of attention heads. Auto-detected if 0.
hidden_size: Hidden layer size. Auto-detected if 0.
optimization_options: Options controlling which optimizations to apply.
opt_level: Optimization level (0-2). Higher levels apply more aggressive optimizations.
use_gpu: Whether the model will run on GPU. Enables GPU-specific optimizations.
only_onnxruntime: Use only ORT built-in optimizations without custom graph transformations.
Returns: Optimized model object that can be saved to file.
Model Types
Supported Architectures
BERT, RoBERTa, DistilBERT, ALBERT, ELECTRA, DeBERTa, and similar encoder models.
GPT-2, GPT-Neo, GPT-J, and other autoregressive decoder models.
T5, mT5, and encoder-decoder models.
BART, mBART, and similar sequence-to-sequence models.
CLIP vision and text encoders.
UNet models from Stable Diffusion.
VAE encoder/decoder from Stable Diffusion.
Example Usage
Basic BERT Optimization
from onnxruntime.transformers import optimizer
# Optimize for CPU
optimized_model = optimizer.optimize_model(
"bert_base.onnx",
model_type="bert",
num_heads=12,
hidden_size=768,
use_gpu=False
)
optimized_model.save_model_to_file("bert_base_optimized.onnx")
GPU Optimization
# Optimize for GPU with FP16
optimized_model = optimizer.optimize_model(
"gpt2_large.onnx",
model_type="gpt2",
num_heads=20,
hidden_size=1280,
use_gpu=True
)
# Convert to FP16 for mixed precision
optimized_model.convert_float_to_float16(keep_io_types=True)
optimized_model.save_model_to_file("gpt2_large_optimized_fp16.onnx")
Custom Optimization Options
from onnxruntime.transformers.fusion_options import FusionOptions
# Create custom fusion options
fusion_options = FusionOptions("bert")
fusion_options.enable_gelu = True
fusion_options.enable_layer_norm = True
fusion_options.enable_attention = True
fusion_options.enable_skip_layer_norm = True
fusion_options.enable_embed_layer_norm = True
fusion_options.enable_bias_skip_layer_norm = False
optimized_model = optimizer.optimize_model(
"bert_custom.onnx",
model_type="bert",
optimization_options=fusion_options
)
optimized_model.save_model_to_file("bert_custom_optimized.onnx")
T5 Model Optimization
# Optimize T5 encoder
encoder_optimized = optimizer.optimize_model(
"t5_encoder.onnx",
model_type="t5",
num_heads=12,
hidden_size=768,
use_gpu=True,
opt_level=2
)
encoder_optimized.save_model_to_file("t5_encoder_optimized.onnx")
# Optimize T5 decoder
decoder_optimized = optimizer.optimize_model(
"t5_decoder.onnx",
model_type="t5",
num_heads=12,
hidden_size=768,
use_gpu=True,
opt_level=2
)
decoder_optimized.save_model_to_file("t5_decoder_optimized.onnx")
Float16 Conversion
from onnxruntime.transformers import optimizer
optimized_model = optimizer.optimize_model(
"bert.onnx",
model_type="bert",
use_gpu=True
)
# Convert to FP16
optimized_model.convert_float_to_float16(
keep_io_types=True, # Keep input/output as FP32
op_block_list=["Softmax"] # Don't convert Softmax to FP16
)
optimized_model.save_model_to_file("bert_fp16.onnx")
Fusion Options
FusionOptions
Controls which graph fusions to apply.
from onnxruntime.transformers.fusion_options import FusionOptions
options = FusionOptions(model_type="bert")
# Enable/disable specific fusions
options.enable_attention = True # Fuse multi-head attention
options.enable_gelu = True # Fuse GELU activation
options.enable_layer_norm = True # Fuse LayerNormalization
options.enable_skip_layer_norm = True # Fuse SkipLayerNorm
options.enable_embed_layer_norm = True # Fuse EmbedLayerNorm
options.enable_bias_skip_layer_norm = True # Fuse BiasSkipLayerNorm
options.enable_bias_gelu = True # Fuse BiasGelu
options.enable_gelu_approximation = False # Replace GELU with a faster approximation (less accurate)
Optimization Levels
Basic optimizations only. Minimal transformations.
Standard optimizations. Applies common fusions (default for most models).
Aggressive optimizations. Maximum fusions and transformations.
Benchmark Helper
from onnxruntime.transformers import benchmark_helper
import onnxruntime as ort
# Create session for benchmarking
sess = ort.InferenceSession(
"bert_optimized.onnx",
providers=["CUDAExecutionProvider"]
)
# Run benchmark
results = benchmark_helper.benchmark(
sess,
input_data,
warmup_runs=10,
test_runs=100
)
print(f"Average latency: {results['average_latency_ms']:.2f} ms")
print(f"Throughput: {results['throughput_qps']:.2f} QPS")
from onnxruntime.transformers import optimizer
from onnxruntime.quantization import quantize_dynamic, QuantType
# Step 1: Optimize graph structure
optimized_model = optimizer.optimize_model(
"bert.onnx",
model_type="bert",
use_gpu=False
)
optimized_model.save_model_to_file("bert_optimized.onnx")
# Step 2: Quantize to INT8
quantize_dynamic(
"bert_optimized.onnx",
"bert_quantized.onnx",
weight_type=QuantType.QInt8,
per_channel=True,
reduce_range=True,
op_types_to_quantize=["MatMul", "Gemm"]
)
# Now use the quantized model
import onnxruntime as ort
sess = ort.InferenceSession("bert_quantized.onnx")
Model Analysis
from onnxruntime.transformers import optimizer
optimized_model = optimizer.optimize_model("bert.onnx", model_type="bert")
# Print optimization statistics
optimized_model.get_fused_operator_statistics()
# Get model info
print(f"Number of nodes: {len(optimized_model.nodes())}")
print(f"Number of attention layers: {optimized_model.get_attention_count()}")
print(f"Model uses FP16: {optimized_model.use_float16()}")
Export Utilities
Large Model Exporter
from onnxruntime.transformers.large_model_exporter import export_large_model
# Export large models with external data
export_large_model(
model_path="gpt_large.onnx",
output_dir="./exported",
external_data_name="weights.bin",
size_threshold=1024 # 1KB threshold
)
IO Binding Helper
from onnxruntime.transformers.io_binding_helper import TypeHelper, IOBindingHelper
import onnxruntime as ort
import torch
sess = ort.InferenceSession(
"bert_optimized.onnx",
providers=["CUDAExecutionProvider"]
)
# Create IO binding helper
io_helper = IOBindingHelper(sess)
# Bind PyTorch tensors directly
input_ids = torch.randint(0, 1000, (1, 128), device="cuda:0")
attention_mask = torch.ones(1, 128, device="cuda:0")
io_binding = sess.io_binding()
io_helper.bind_input(
"input_ids",
input_ids,
io_binding
)
io_helper.bind_input(
"attention_mask",
attention_mask,
io_binding
)
io_helper.bind_output(
"last_hidden_state",
"cuda",
io_binding
)
sess.run_with_iobinding(io_binding)
outputs = io_binding.get_outputs()
Machine Info
from onnxruntime.transformers.machine_info import get_device_info
# Get system and device information
info = get_device_info()
print(info)
Best Practices
Complete Optimization Pipeline
from onnxruntime.transformers import optimizer
from onnxruntime.quantization import quantize_dynamic, QuantType
import onnxruntime as ort
# 1. Export model from PyTorch/TensorFlow
# torch.onnx.export(...) or tf2onnx.convert(...)
# 2. Optimize graph structure
optimized_model = optimizer.optimize_model(
"model.onnx",
model_type="bert",
num_heads=12,
hidden_size=768,
use_gpu=True,
opt_level=2
)
# 3. Save the FP32 optimized model, then convert to FP16 for GPU
optimized_model.save_model_to_file("model_optimized.onnx")
optimized_model.convert_float_to_float16(keep_io_types=True)
optimized_model.save_model_to_file("model_optimized_fp16.onnx")
# 4. (Optional) Quantize the FP32 optimized model for CPU deployment
quantize_dynamic(
"model_optimized.onnx",
"model_quantized.onnx",
weight_type=QuantType.QInt8,
per_channel=True
)
# 5. Run inference
sess = ort.InferenceSession(
"model_optimized_fp16.onnx",
providers=["CUDAExecutionProvider"]
)
outputs = sess.run(None, inputs)