Overview
LiteLLM Router provides sophisticated load balancing to distribute requests across multiple model deployments. This improves reliability, reduces rate limiting, and optimizes resource utilization.
Quick Start
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4", # Model alias
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-key1"
}
},
{
"model_name": "gpt-4", # Same alias, different deployment
"litellm_params": {
"model": "azure/gpt-4-deployment",
"api_key": "azure-key",
"api_base": "https://..."
}
},
{
"model_name": "gpt-4", # Third deployment
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-key2"
}
}
],
routing_strategy="simple-shuffle" # Load balancing strategy
)
# Requests are distributed across all three deployments
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
Routing Strategies
LiteLLM supports multiple routing strategies to optimize different use cases:
1. Simple Shuffle (Default)
Randomly distributes requests across deployments:
router = Router(
model_list=[...],
routing_strategy="simple-shuffle"
)
Best for: General use, even distribution, no state tracking needed
2. Least Busy
Routes to the deployment with the fewest ongoing requests:
router = Router(
model_list=[...],
routing_strategy="least-busy"
)
Best for: Minimizing queue times, handling variable request durations
3. Latency-Based Routing
Routes to the deployment with the lowest average latency:
router = Router(
model_list=[...],
routing_strategy="latency-based-routing",
routing_strategy_args={
"ttl": 60, # How long to cache latency data (seconds)
"window_size": 100 # Number of requests to track
}
)
Best for: Optimizing response time, geographic distribution
4. Usage-Based Routing (TPM/RPM)
Routes based on tokens-per-minute and requests-per-minute limits:
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"tpm": 100000, # Tokens per minute limit
"rpm": 1000 # Requests per minute limit
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"tpm": 200000,
"rpm": 2000
}
],
routing_strategy="usage-based-routing"
)
Best for: Respecting rate limits, preventing quota exhaustion
5. Usage-Based Routing V2
An improved version of usage-based routing with better tracking of TPM/RPM usage:
router = Router(
model_list=[...],
routing_strategy="usage-based-routing-v2"
)
6. Cost-Based Routing
Routes to the cheapest available deployment:
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"model_info": {
"cost_per_token": {
"input": 0.00003,
"output": 0.00006
}
}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4"},
"model_info": {
"cost_per_token": {
"input": 0.000025, # Cheaper!
"output": 0.00005
}
}
}
],
routing_strategy="cost-based-routing"
)
Best for: Minimizing costs while maintaining availability
Advanced Configuration
Rate Limits and Quotas
Configure deployment limits:
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"tpm": 100000, # Tokens per minute
"rpm": 1000, # Requests per minute
"max_parallel_requests": 10 # Concurrent requests
}
],
default_max_parallel_requests=5 # Default for all deployments
)
Deployment Priorities
Prefer certain deployments over others:
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"model_info": {"priority": 1} # Higher priority
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4"},
"model_info": {"priority": 0} # Lower priority
}
]
)
Region-Based Routing
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_base": "https://us-east.openai.com"
},
"model_info": {"region": "us-east"}
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_base": "https://eu-west.openai.com"
},
"model_info": {"region": "eu-west"}
}
]
)
# Route based on user location
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
metadata={"region": "eu-west"} # Prefer EU deployment
)
Health Checks and Cooldowns
Deployment Cooldowns
Automatically remove unhealthy deployments:
router = Router(
model_list=[...],
allowed_fails=3, # Failures before cooldown
cooldown_time=300, # Cooldown duration (5 minutes)
disable_cooldowns=False
)
Checking Deployment Status
# Get deployment statistics
stats = router.deployment_stats
print(stats)
# Check cooldown status
from litellm.router_utils.cooldown_handlers import _get_cooldown_deployments
cooldown_deployments = _get_cooldown_deployments(router)
print(f"Deployments in cooldown: {cooldown_deployments}")
Pre-Call Checks
Filter deployments before routing:
from litellm.types.router import OptionalPreCallChecks
router = Router(
model_list=[...],
enable_pre_call_checks=True,
optional_pre_call_checks=OptionalPreCallChecks(
check_context_window=True, # Filter by context window
check_rate_limit=True, # Filter by rate limits
check_region=True # Filter by region
)
)
Tag-Based Routing
Route based on deployment tags:
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"model_info": {"tags": ["production", "high-priority"]}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"model_info": {"tags": ["development", "testing"]}
}
],
enable_tag_filtering=True
)
# Only use production deployments
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
metadata={"tags": ["production"]}
)
Deployment Affinity
Stick to the same deployment for a user/session:
router = Router(
model_list=[...],
deployment_affinity_ttl_seconds=3600 # 1 hour
)
# All requests with the same user_api_key in metadata use the same deployment
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
metadata={"user_api_key": "user-123"}
)
This is useful for maintaining conversation context or for debugging a specific deployment.
Monitoring and Observability
Deployment Metrics
from litellm.integrations import CustomLogger
class LoadBalancingLogger(CustomLogger):
def log_success_event(self, kwargs, response_obj, start_time, end_time):
deployment_id = kwargs.get("litellm_params", {}).get("model_id")
latency = end_time - start_time
print(f"Deployment {deployment_id}: {latency}s")
router = Router(
model_list=[...],
set_verbose=True
)
router.callbacks = [LoadBalancingLogger()]
Request Distribution
# Track which deployments are being used
for deployment in router.model_list:
model_id = deployment.get("model_info", {}).get("id")
print(f"Deployment {model_id}: {router.deployment_stats.get(model_id)}")
Best Practices
Load Balancing Tips
- Use multiple deployments - At least 2-3 per model for redundancy
- Set appropriate limits - Configure TPM/RPM based on actual quotas
- Monitor cooldowns - Alert when deployments enter cooldown
- Choose the right strategy - Match the routing strategy to your use case
- Test failover - Verify behavior when deployments fail
- Distribute geographically - Use multiple regions for global apps
Common Patterns
High-Throughput Pattern
router = Router(
model_list=[
# Multiple deployments with high limits
{
"model_name": "gpt-3.5",
"litellm_params": {"model": "gpt-3.5-turbo"},
"tpm": 1000000,
"rpm": 10000,
"max_parallel_requests": 100
},
# ... more deployments
],
routing_strategy="usage-based-routing-v2",
default_max_parallel_requests=50
)
Multi-Region Pattern
router = Router(
model_list=[
# US deployments
{"model_name": "gpt-4", "litellm_params": {...}, "model_info": {"region": "us"}},
# EU deployments
{"model_name": "gpt-4", "litellm_params": {...}, "model_info": {"region": "eu"}},
# Asia deployments
{"model_name": "gpt-4", "litellm_params": {...}, "model_info": {"region": "asia"}}
],
routing_strategy="latency-based-routing"
)
Cost-Optimized Pattern
router = Router(
model_list=[
# Cheap primary deployments
{
"model_name": "gpt-3.5",
"litellm_params": {"model": "gpt-3.5-turbo"},
"model_info": {"priority": 1}
},
# Expensive fallback
{
"model_name": "gpt-3.5",
"litellm_params": {"model": "azure/gpt-35-turbo"},
"model_info": {"priority": 0}
}
],
routing_strategy="cost-based-routing"
)
Async Support
Load balancing works with async operations:
import asyncio
async def make_requests():
tasks = [
router.acompletion(
model="gpt-4",
messages=[{"role": "user", "content": f"Request {i}"}]
)
for i in range(100)
]
responses = await asyncio.gather(*tasks)
return responses
responses = asyncio.run(make_requests())
# Requests are automatically distributed across deployments
Troubleshooting
All Deployments in Cooldown
# Check cooldown status
if not router.healthy_deployments:
print("Warning: All deployments in cooldown!")
# Increase allowed_fails or reduce cooldown_time so deployments are cooled down less often and recover sooner
Uneven Distribution
# For simple-shuffle, ensure random seed is not set
# For usage-based routing, verify TPM/RPM limits are correct
# For latency-based routing, check routing_strategy_args
router = Router(
model_list=[...],
routing_strategy="simple-shuffle",
set_verbose=True # Enable debug logging
)
Rate Limit Issues
# Ensure TPM/RPM limits match your actual quotas
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4"},
"tpm": 90000, # Set slightly below actual limit
"rpm": 900 # Leave buffer for safety
}
],
routing_strategy="usage-based-routing-v2"
)