Overview
The Router class provides intelligent load balancing, fallbacks, and retries across multiple LLM deployments. It’s the core of LiteLLM’s reliability features.
Class Definition
from litellm import Router
router = Router(
model_list=[...],
# Caching
redis_url=None,
cache_responses=False,
# Reliability
num_retries=0,
timeout=600,
fallbacks=[],
context_window_fallbacks=[],
# Routing
routing_strategy="simple-shuffle",
# Rate limiting
allowed_fails=3,
cooldown_time=60,
# And many more options...
)
Constructor Parameters
Model Configuration
List of model deployments with their configurations. Example:
model_list = [
{
"model_name": "gpt-4", # User-facing name
"litellm_params": { # Params for litellm.completion()
"model": "azure/gpt-4",
"api_key": "your-api-key",
"api_base": "https://your-endpoint.openai.azure.com/",
"api_version": "2024-02-01"
},
"tpm": 100000, # Tokens per minute
"rpm": 1000, # Requests per minute
},
{
"model_name": "gpt-4", # Same model_name = load balance
"litellm_params": {
"model": "gpt-4",
"api_key": "openai-key"
}
}
]
Caching
Enable response caching to reduce costs and latency.
Redis connection string for distributed caching. Example:
redis_url="redis://localhost:6379"
Redis host (alternative to redis_url).
Routing Strategy
routing_strategy
string
default:"simple-shuffle"
Strategy for selecting deployments. Options:
"simple-shuffle": Random selection
"least-busy": Route to deployment with fewest ongoing requests
"usage-based-routing": Based on TPM/RPM limits
"latency-based-routing": Route to fastest deployment
"cost-based-routing": Route to cheapest deployment
"usage-based-routing-v2": Improved TPM/RPM routing
Additional arguments for routing strategies. Example:
# For latency-based routing
routing_strategy_args={
"ttl": 60 # Cache latency for 60s
}
Reliability
Number of retry attempts on failure.
Default timeout for requests in seconds.
Fallback configurations for handling failures. Example:
fallbacks=[
{"gpt-4": ["gpt-3.5-turbo", "claude-2"]}
]
Fallbacks specifically for context window exceeded errors. Example:
context_window_fallbacks=[
{"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}
]
Number of failures before a deployment enters cooldown.
Cooldown period in seconds after allowed_fails is reached.
Minimum time to wait before retrying a failed request.
Custom retry policy for different exception types. Example:
retry_policy={
"RateLimitError": {"max_retries": 5},
"Timeout": {"max_retries": 2}
}
Rate Limiting
default_max_parallel_requests
Maximum parallel requests per deployment.
Observability
Log level: "DEBUG" or "INFO".
Core Methods
completion()
Make a completion request with routing and fallbacks.
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
# All standard completion params supported
)
Supports all parameters from litellm.completion().
acompletion()
Async version of completion().
response = await router.acompletion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
embedding()
Generate embeddings with routing.
response = router.embedding(
model="text-embedding-3-small",
input="Text to embed"
)
aembedding()
Async version of embedding().
response = await router.aembedding(
model="text-embedding-3-small",
input="Text to embed"
)
image_generation()
Generate images with routing.
response = router.image_generation(
prompt="A serene landscape",
model="dall-e-3"
)
aimage_generation()
Async version of image_generation().
Usage Examples
Basic Setup
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
"api_key": "your-azure-key",
"api_base": "https://your-endpoint.openai.azure.com/"
}
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_key": "your-openai-key"
}
}
]
)
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
Load Balancing with Rate Limits
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_key": "key-1"
},
"tpm": 100000, # 100K tokens per minute
"rpm": 1000 # 1K requests per minute
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
"api_key": "key-2",
"api_base": "https://endpoint.openai.azure.com/"
},
"tpm": 200000,
"rpm": 2000
}
],
routing_strategy="usage-based-routing" # Respect TPM/RPM limits
)
Fallbacks
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "key"}
},
{
"model_name": "claude-2",
"litellm_params": {"model": "claude-2", "api_key": "key"}
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "key"}
}
],
fallbacks=[
{"gpt-4": ["claude-2", "gpt-3.5-turbo"]}
],
num_retries=3
)
# If gpt-4 fails, tries claude-2, then gpt-3.5-turbo
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Context Window Fallbacks
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "key"}
},
{
"model_name": "gpt-3.5-turbo-16k",
"litellm_params": {"model": "gpt-3.5-turbo-16k", "api_key": "key"}
}
],
context_window_fallbacks=[
{"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}
]
)
# Automatically uses 16k version if context is too long
response = router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Very long prompt..."}]
)
Latency-Based Routing
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "key-1"}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4", "api_key": "key-2", "api_base": "..."}
}
],
routing_strategy="latency-based-routing",
routing_strategy_args={"ttl": 60} # Cache latency for 60s
)
# Routes to the faster deployment
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
With Redis Caching
from litellm import Router
router = Router(
model_list=[...],
redis_url="redis://localhost:6379",
cache_responses=True
)
# First call hits the API
response1 = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "What is 2+2?"}],
caching=True
)
# Second identical call returns cached response
response2 = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "What is 2+2?"}],
caching=True
)
Cooldown on Failures
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "key-1"}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "key-2"}
}
],
allowed_fails=3, # Allow 3 failures
cooldown_time=120 # 2 minute cooldown
)
# After 3 failures, the deployment goes into cooldown
# Router automatically uses other deployments
Cost-Based Routing
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "key"},
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4", "api_key": "key", "api_base": "..."},
}
],
routing_strategy="cost-based-routing"
)
# Routes to the cheapest deployment
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Health Checks
from litellm import Router
router = Router(
model_list=[...],
enable_pre_call_checks=True # Enable health checks before routing
)
# Only routes to healthy deployments
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Model Aliases
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4-turbo",
"litellm_params": {"model": "gpt-4-1106-preview", "api_key": "key"}
}
],
model_group_alias={
"gpt-4": "gpt-4-turbo" # Map gpt-4 -> gpt-4-turbo
}
)
# Both work:
response = router.completion(model="gpt-4", messages=[...])
response = router.completion(model="gpt-4-turbo", messages=[...])
Advanced Features
Custom Callbacks
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyLogger(CustomLogger):
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"Request succeeded in {end_time - start_time}s")
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"Request failed: {response_obj}")
router = Router(
model_list=[...],
success_callback=[MyLogger()],
failure_callback=[MyLogger()]
)
Router with Authentication
from litellm import Router
router = Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
"api_key": "os.environ/AZURE_API_KEY", # Read from env
"api_base": "os.environ/AZURE_API_BASE"
}
}
]
)
Best Practices
- Use rate limits: Set TPM/RPM to prevent hitting provider limits
- Enable fallbacks: Always have backup models configured
- Use caching: Reduce costs and latency with Redis caching
- Set timeouts: Appropriate timeouts prevent hanging requests
- Monitor health: Enable health checks for production
- Use aliases: Abstract model names for easier updates
- Configure retries: Set appropriate retry policies per error type