Overview
The Router class provides load balancing, fallback handling, rate limiting, and intelligent routing across multiple LLM deployments. It’s designed for production environments requiring high availability and reliability.
Basic Usage
from litellm import Router
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "sk-...",
}
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "...",
"api_base": "https://my-resource.openai.azure.com",
"api_version": "2024-02-01"
}
}
]
router = Router(model_list=model_list)
response = router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello!"}]
)
print(response.choices[0].message.content)
Router Configuration
Constructor Parameters
Router(
model_list: List[Dict],
# Caching
redis_url: Optional[str] = None,
cache_responses: bool = False,
# Reliability
num_retries: Optional[int] = None,
timeout: Optional[float] = None,
fallbacks: List = [],
# Routing
routing_strategy: str = "simple-shuffle",
routing_strategy_args: dict = {},
# Rate limiting
allowed_fails: Optional[int] = None,
cooldown_time: Optional[float] = None,
# Settings
set_verbose: bool = False,
**kwargs
)
model_list
List of model deployments to route between. Each deployment must have:
- model_name: Identifier for the model group
- litellm_params: Parameters passed to litellm.completion()
routing_strategy
Strategy for selecting deployments:
- "simple-shuffle": Random selection (default)
- "least-busy": Choose the deployment with the fewest ongoing requests
- "usage-based-routing": Route based on TPM/RPM limits
- "latency-based-routing": Route to the lowest-latency deployment
- "cost-based-routing": Route to the lowest-cost deployment
fallbacks
Fallback model groups to try if the primary fails. Example: fallbacks=[{"gpt-4": ["claude-2", "gpt-3.5-turbo"]}]
num_retries
Number of retries on failure. Default: 0
timeout
Request timeout in seconds. Default: 600
cache_responses
Enable response caching. Default: False
allowed_fails
Number of allowed failures before cooldown. Default: uses the litellm setting
cooldown_time
Time in seconds to cool down a failed deployment. Default: 1 second
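A compact sketch showing how several of these parameters fit together in one constructor call. The deployments and numeric values below are illustrative assumptions, not recommended settings:
from litellm import Router
# Illustrative values; tune retries, timeouts, and cooldowns for your own traffic.
router = Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {"model": "gpt-4", "api_key": "sk-..."}
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."}
        }
    ],
    routing_strategy="usage-based-routing",
    num_retries=2,                               # retry failed calls before falling back
    timeout=30.0,                                # per-request timeout in seconds
    fallbacks=[{"gpt-4": ["gpt-3.5-turbo"]}],    # try gpt-3.5-turbo if gpt-4 fails
    allowed_fails=3,                             # failures tolerated before cooldown
    cooldown_time=60.0                           # seconds a failing deployment is skipped
)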
Model List Configuration
Basic Model List
model_list = [
{
"model_name": "gpt-4", # Identifier
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-..."
}
},
{
"model_name": "gpt-4", # Same identifier = load balancing
"litellm_params": {
"model": "azure/gpt-4",
"api_key": "...",
"api_base": "https://my-resource.openai.azure.com",
"api_version": "2024-02-01"
}
}
]
With Rate Limits
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-..."
},
"tpm": 100000, # Tokens per minute
"rpm": 1000 # Requests per minute
}
]
With Model Info
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-..."
},
"model_info": {
"id": "deployment-1",
"region": "us-east"
}
}
]
Router Methods
Completion
router.completion(
model: str,
messages: List[Dict[str, str]],
**kwargs
) -> ModelResponse
Async Completion
await router.acompletion(
model: str,
messages: List[Dict[str, str]],
**kwargs
) -> ModelResponse
Embedding
router.embedding(
model: str,
input: Union[str, List[str]],
**kwargs
) -> EmbeddingResponse
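A minimal usage sketch, assuming an OpenAI embedding deployment (text-embedding-ada-002) is registered in the model list:
from litellm import Router
# Assumed embedding deployment; swap in your own provider and key.
embedding_router = Router(model_list=[
    {
        "model_name": "text-embedding-ada-002",
        "litellm_params": {"model": "text-embedding-ada-002", "api_key": "sk-..."}
    }
])
response = embedding_router.embedding(
    model="text-embedding-ada-002",
    input=["Hello world", "How are you?"]
)
print(len(response.data))  # one embedding per input string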
Image Generation
router.image_generation(
model: str,
prompt: str,
**kwargs
) -> ImageResponse
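Similarly, a usage sketch assuming a DALL·E 3 deployment (dall-e-3) is in the model list:
from litellm import Router
# Assumed image deployment; adjust the model name and key for your setup.
image_router = Router(model_list=[
    {
        "model_name": "dall-e-3",
        "litellm_params": {"model": "dall-e-3", "api_key": "sk-..."}
    }
])
response = image_router.image_generation(
    model="dall-e-3",
    prompt="A watercolor lighthouse at dusk"
)
print(response.data[0].url)  # URL of the generated image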
Examples
Load Balancing Multiple Providers
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4",
"api_key": "sk-..."
}
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
"api_key": "...",
"api_base": "https://my-resource.openai.azure.com",
"api_version": "2024-02-01"
}
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "claude-3-opus-20240229",
"api_key": "sk-ant-..."
}
}
]
router = Router(model_list=model_list)
# Router automatically selects deployment
for i in range(10):
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": f"Request {i}"}]
)
print(f"Request {i}: {response.choices[0].message.content}")
Fallback Configuration
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-..."}
},
{
"model_name": "claude-3",
"litellm_params": {"model": "claude-3-opus-20240229", "api_key": "sk-ant-..."}
},
{
"model_name": "gpt-3.5",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."}
}
]
router = Router(
model_list=model_list,
fallbacks=[{"gpt-4": ["claude-3", "gpt-3.5"]}],
num_retries=2
)
# If gpt-4 fails, tries claude-3, then gpt-3.5
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Rate Limiting
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-..."},
"tpm": 100000, # 100k tokens per minute
"rpm": 1000 # 1000 requests per minute
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4", "api_key": "..."},
"tpm": 200000,
"rpm": 2000
}
]
router = Router(
model_list=model_list,
routing_strategy="usage-based-routing" # Respects TPM/RPM limits
)
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Cooldown on Failures
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-..."},
"model_info": {"id": "deployment-1"}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4", "api_key": "..."},
"model_info": {"id": "deployment-2"}
}
]
router = Router(
model_list=model_list,
allowed_fails=3, # Allow 3 failures
cooldown_time=60.0 # Then cooldown for 60 seconds
)
# Failed deployment automatically moved to cooldown
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
Redis Caching
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-..."}
}
]
router = Router(
model_list=model_list,
redis_url="redis://localhost:6379",
cache_responses=True
)
# First call hits API
response1 = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "What is 2+2?"}]
)
# Second identical call returns cached response
response2 = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "What is 2+2?"}]
)
Async Usage
import asyncio
from litellm import Router
model_list = [
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-..."}
},
{
"model_name": "gpt-4",
"litellm_params": {"model": "azure/gpt-4", "api_key": "..."}
}
]
router = Router(model_list=model_list)
async def make_requests():
tasks = [
router.acompletion(
model="gpt-4",
messages=[{"role": "user", "content": f"Request {i}"}]
)
for i in range(10)
]
responses = await asyncio.gather(*tasks)
for i, response in enumerate(responses):
print(f"Response {i}: {response.choices[0].message.content}")
asyncio.run(make_requests())
Routing Strategies
Simple Shuffle (Default)
router = Router(
model_list=model_list,
routing_strategy="simple-shuffle" # Random selection
)
Least Busy
router = Router(
model_list=model_list,
routing_strategy="least-busy" # Fewest ongoing requests
)
Usage-Based (TPM/RPM)
router = Router(
model_list=model_list,
routing_strategy="usage-based-routing", # Respects rate limits
routing_strategy_args={"ttl": 60} # Cache window
)
Latency-Based
router = Router(
model_list=model_list,
routing_strategy="latency-based-routing", # Lowest latency
routing_strategy_args={"window_size": 100} # Track last N requests
)
Cost-Based
router = Router(
model_list=model_list,
routing_strategy="cost-based-routing" # Lowest cost per token
)
Advanced Features
Model Aliases
router = Router(
model_list=model_list,
model_group_alias={
"gpt-4-alias": "gpt-4",
"fast-model": "gpt-3.5-turbo"
}
)
# Use alias instead of model name
response = router.completion(
model="gpt-4-alias",
messages=[{"role": "user", "content": "Hello"}]
)
Multiple Deployments Per Model
model_list = [
# OpenAI deployment 1
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-1..."},
"model_info": {"id": "openai-1"}
},
# OpenAI deployment 2
{
"model_name": "gpt-4",
"litellm_params": {"model": "gpt-4", "api_key": "sk-2..."},
"model_info": {"id": "openai-2"}
},
# Azure deployment 1
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4-east",
"api_key": "...",
"api_base": "https://east.openai.azure.com"
},
"model_info": {"id": "azure-east"}
},
# Azure deployment 2
{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4-west",
"api_key": "...",
"api_base": "https://west.openai.azure.com"
},
"model_info": {"id": "azure-west"}
}
]
router = Router(
model_list=model_list,
routing_strategy="least-busy"
)
Debugging and Monitoring
router = Router(
model_list=model_list,
set_verbose=True, # Enable verbose logging
debug_level="DEBUG" # Set log level
)
# Get deployment stats
print(router.deployment_stats)
# View healthy deployments
print(router.healthy_deployments)
Error Handling
from litellm import Router
from litellm.exceptions import RateLimitError, APIError
router = Router(model_list=model_list)
try:
response = router.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)
except RateLimitError:
print("All deployments rate limited")
except APIError as e:
print(f"All deployments failed: {e}")
Best Practices
- Multiple deployments: Configure at least 2 deployments per model for reliability
- Set rate limits: Configure TPM/RPM to prevent hitting provider limits
- Use fallbacks: Define fallback chains for critical applications
- Enable caching: Use Redis caching to reduce costs and latency
- Monitor failures: Set appropriate allowed_fails and cooldown_time
- Choose routing strategy: Select based on your requirements (cost, latency, reliability)
- Configure retries: Set num_retries based on your latency tolerance
- Use async: Leverage acompletion() for concurrent requests
- Use routing_strategy="least-busy" for high concurrency
- Enable Redis caching for repeated queries
- Configure appropriate timeouts to avoid hanging requests
- Use async methods for batch processing
- Monitor deployment_stats to identify slow deployments (see the combined sketch after this list)
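The sketch below combines several of these recommendations (multiple deployments per group, rate limits, fallbacks, retries, cooldowns, and Redis caching) into one configuration. The deployment entries and numeric values are illustrative assumptions:
from litellm import Router
model_list = [
    # Two gpt-4 deployments for load balancing, each with rate limits
    {
        "model_name": "gpt-4",
        "litellm_params": {"model": "gpt-4", "api_key": "sk-..."},
        "tpm": 100000,
        "rpm": 1000
    },
    {
        "model_name": "gpt-4",
        "litellm_params": {
            "model": "azure/gpt-4",
            "api_key": "...",
            "api_base": "https://my-resource.openai.azure.com",
            "api_version": "2024-02-01"
        },
        "tpm": 200000,
        "rpm": 2000
    },
    # Cheaper fallback group
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."}
    }
]
router = Router(
    model_list=model_list,
    routing_strategy="usage-based-routing",       # respect TPM/RPM limits
    fallbacks=[{"gpt-4": ["gpt-3.5-turbo"]}],     # fallback chain for gpt-4
    num_retries=2,
    timeout=30.0,
    allowed_fails=3,
    cooldown_time=60.0,
    redis_url="redis://localhost:6379",           # shared state + response cache
    cache_responses=True
)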