Skip to main content

Common Issues

Container Won’t Start

Symptom: Container exits immediately or fails health checks. Diagnosis:
# Check container logs
docker logs litellm

# Kubernetes
kubectl logs -l app=litellm --tail=100

# Check events
kubectl describe pod litellm-xxx
Common causes:
Error: LITELLM_MASTER_KEY not set
Error: DATABASE_URL not configured
Solution:
# Docker
docker run -e LITELLM_MASTER_KEY=sk-1234 \
           -e DATABASE_URL=postgresql://... \
           litellm

# Kubernetes
kubectl create secret generic litellm-secrets \
  --from-literal=LITELLM_MASTER_KEY=sk-1234 \
  --from-literal=DATABASE_URL=postgresql://...

Database Connection Issues

Symptom: Cannot connect to PostgreSQL. Diagnosis:
# Test connection from container
docker exec -it litellm sh
psql $DATABASE_URL

# Check connection string format
echo $DATABASE_URL
# Should be: postgresql://user:pass@host:5432/dbname
Solutions:
1. Verify Connection String

# Correct format
DATABASE_URL=postgresql://litellm:password@postgres:5432/litellm

# With SSL
DATABASE_URL=postgresql://user:pass@host:5432/db?sslmode=require

# Special characters in password
DATABASE_URL=postgresql://user:p%40ssw0rd@host:5432/db
# @ encoded as %40, : as %3A, # as %23
2. Check Network Connectivity

# Ping database host
docker exec litellm ping postgres

# Telnet to port
docker exec litellm telnet postgres 5432

# Check Docker network
docker network inspect bridge
3. Verify Database Exists

# List databases
psql -h postgres -U litellm -l

# Create if missing
createdb -h postgres -U postgres litellm
4. Check Firewall Rules

# PostgreSQL
# Allow from LiteLLM subnet
sudo ufw allow from 172.17.0.0/16 to any port 5432

# AWS Security Group
aws ec2 authorize-security-group-ingress \
  --group-id sg-xxx \
  --protocol tcp \
  --port 5432 \
  --source-group sg-litellm

API Request Failures

Symptom: 401, 403, 500 errors from LiteLLM. Common errors:
{"error": "Invalid API key"}
{"error": "Authentication failed"}
Causes:
  • Missing Authorization header
  • Invalid master key
  • Expired virtual key
  • Key not found in database
Solutions:
# Check master key
echo $LITELLM_MASTER_KEY

# Verify key format
# Correct header: "Authorization: Bearer sk-1234"
# Wrong:          "Authorization: sk-1234"  (missing the "Bearer " prefix)
curl -X POST http://localhost:4000/v1/chat/completions \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{...}'

# Verify key exists
curl http://localhost:4000/key/info?key=sk-litellm-xxx \
  -H "Authorization: Bearer $LITELLM_MASTER_KEY"

Performance Issues

Symptom: Slow response times, high latency. Diagnosis:
# Check metrics
curl http://localhost:4000/metrics | grep litellm_request_duration

# Trace a request
curl -X POST http://localhost:4000/v1/chat/completions \
  -H "Authorization: Bearer $API_KEY" \
  -H "Content-Type: application/json" \
  -H "X-Litellm-Trace: true" \
  -d '{...}' -w "\nTime: %{time_total}s\n"

# Check resource usage
kubectl top pods -l app=litellm
docker stats litellm
Common causes:
1. Database Slow Queries

-- Find slow queries
SELECT 
  query,
  mean_exec_time,
  calls
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- Check missing indexes
SELECT schemaname, tablename, attname
FROM pg_stats
WHERE schemaname = 'public'
  AND n_distinct > 100
  AND null_frac < 0.1
  AND attname NOT IN (
    SELECT a.attname
    FROM pg_index i
    JOIN pg_attribute a ON a.attrelid = i.indrelid
    WHERE a.attnum = ANY(i.indkey)
  );
2. High Provider Latency

# Check provider latency
curl http://localhost:4000/health/readiness | jq

# Enable provider failover
# In config.yaml:
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: gpt-4o
  - model_name: gpt-4o
    litellm_params:
      model: azure/gpt-4o  # Fallback
3. Insufficient Resources

# Scale up replicas
kubectl scale deployment litellm --replicas=5

# Increase resources
kubectl set resources deployment litellm \
  --limits=cpu=2000m,memory=4Gi \
  --requests=cpu=1000m,memory=2Gi

# Enable autoscaling
kubectl autoscale deployment litellm \
  --min=3 --max=10 --cpu-percent=70
4. No Caching

# Enable Redis caching
general_settings:
  cache: true
  redis_host: redis
  redis_port: 6379

Memory Issues

Symptom: OOM kills, container restarts
# Check memory usage
kubectl top pods
docker stats litellm

# View OOM events
kubectl get events | grep OOMKilled

# Check memory limits
kubectl describe pod litellm-xxx | grep -A5 Limits
Solutions:
# Increase memory limit
resources:
  limits:
    memory: 4Gi  # Up from 2Gi
  requests:
    memory: 2Gi

# Reduce concurrent requests
general_settings:
  max_parallel_requests: 50  # Down from 100

# Enable memory optimization
litellm_settings:
  drop_params: true  # Don't store full request

Health Check Failures

Symptom: Health checks timing out or failing. Diagnosis:
# Test health endpoints
curl http://localhost:4000/health
curl http://localhost:4000/health/liveliness
curl http://localhost:4000/health/readiness

# Check timeout settings
kubectl describe pod litellm-xxx | grep -A10 Liveness
Solutions:
# Increase grace period (migrations take time)
livenessProbe:
  httpGet:
    path: /health/liveliness
    port: 4000
  initialDelaySeconds: 120  # Wait 2 minutes
  periodSeconds: 30
  timeoutSeconds: 10
  failureThreshold: 3

readinessProbe:
  httpGet:
    path: /health/readiness
    port: 4000
  initialDelaySeconds: 120
  periodSeconds: 15
  timeoutSeconds: 10
  failureThreshold: 3

startupProbe:
  httpGet:
    path: /health/readiness
    port: 4000
  initialDelaySeconds: 0
  periodSeconds: 10
  failureThreshold: 30  # Allow 5 minutes (30 * 10s)

Debugging Tools

Enable Debug Logging

# Environment variables
LITELLM_LOG=DEBUG
DETAILED_DEBUG=True

# In config
general_settings:
  detailed_debug: true
View debug logs:
# Docker
docker logs litellm -f --tail=100

# Kubernetes
kubectl logs -f deployment/litellm --all-containers=true

# Filter for errors
kubectl logs deployment/litellm | grep ERROR

Interactive Shell

# Docker
docker exec -it litellm sh

# Kubernetes
kubectl exec -it litellm-xxx -- sh

# Inside container:
ps aux              # Check processes
env | grep LITELLM  # Check environment
ls -la /app         # Check files
cat config.yaml     # View config
python --version    # Check Python
psql $DATABASE_URL  # Test database

Network Debugging

# Test DNS resolution
kubectl exec -it litellm-xxx -- nslookup postgres

# Test connectivity
kubectl exec -it litellm-xxx -- nc -zv postgres 5432

# Trace route
kubectl exec -it litellm-xxx -- traceroute api.openai.com

# Check DNS
kubectl exec -it litellm-xxx -- cat /etc/resolv.conf

Database Debugging

-- Check active connections
SELECT 
  datname,
  usename,
  application_name,
  client_addr,
  state,
  query
FROM pg_stat_activity
WHERE datname = 'litellm';

-- Check table sizes
SELECT 
  schemaname,
  tablename,
  pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Check locks
SELECT 
  pid,
  usename,
  pg_blocking_pids(pid) as blocked_by,
  query
FROM pg_stat_activity
WHERE cardinality(pg_blocking_pids(pid)) > 0;

Error Reference

HTTP Status Codes

Code | Meaning             | Cause                   | Solution
400  | Bad Request         | Invalid request format  | Check request body, headers
401  | Unauthorized        | Missing/invalid API key | Verify Authorization header
403  | Forbidden           | Permission denied       | Check key permissions, budget
404  | Not Found           | Invalid endpoint/model  | Verify URL and model name
429  | Rate Limited        | Too many requests       | Wait and retry, increase limits
500  | Internal Error      | Server error            | Check logs, report bug
502  | Bad Gateway         | Upstream error          | Check provider API status
503  | Service Unavailable | Overloaded/unhealthy    | Scale up, check resources
504  | Gateway Timeout     | Request timeout         | Increase timeout, check provider

Common Error Messages

Cause: Provider API key is invalid or missing. Solution:
# Verify API key
echo $OPENAI_API_KEY

# Test directly
curl https://api.openai.com/v1/models \
  -H "Authorization: Bearer $OPENAI_API_KEY"

# Update in config
kubectl edit secret litellm-secrets
Cause: Provider rate limit reached. Solution:
# Configure multiple deployments
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: gpt-4o
      api_key: os.environ/OPENAI_API_KEY_1
  - model_name: gpt-4o
    litellm_params:
      model: gpt-4o
      api_key: os.environ/OPENAI_API_KEY_2

router_settings:
  routing_strategy: least-busy
Cause: Cannot connect to database. Solution:
# Check DATABASE_URL
echo $DATABASE_URL

# Verify format
# postgresql://user:pass@host:5432/dbname

# Test connection
psql $DATABASE_URL -c "SELECT 1;"

# Run migrations
docker exec litellm prisma migrate deploy
Cause: Cannot connect to Redis. Solution:
# Test Redis connectivity
docker exec litellm redis-cli -h redis ping

# Check Redis is running
kubectl get pods -l app=redis

# Verify environment variables
echo $REDIS_HOST
echo $REDIS_PORT
echo $REDIS_PASSWORD

Support and Resources

Get Help

Discord Community

Join 5000+ users for real-time help

GitHub Issues

Report bugs and request features

Documentation

Complete guides and API reference

Enterprise Support

Dedicated support for production

Reporting Bugs

When reporting issues, include:
# 1. LiteLLM version
litellm --version

# 2. Error logs
docker logs litellm --tail=100 > logs.txt

# 3. Configuration (sanitized)
cat config.yaml | sed 's/api_key:.*/api_key: REDACTED/' > config-sanitized.yaml

# 4. Environment
env | grep LITELLM | sed 's/=.*/=REDACTED/' > env.txt

# 5. System info
uname -a > system.txt
kubectl version >> system.txt

Additional Resources

Next Steps

Monitoring

Set up alerts to catch issues early

Performance

Optimize to prevent issues

Security

Secure your deployment

High Availability

Build resilient systems

Build docs developers (and LLMs) love