Overview
This guide covers best practices, architectural patterns, and operational considerations for running vLLM in production at scale.
Architecture patterns
Single-instance deployment
Simplest deployment for low-to-medium traffic:
┌─────────────┐
│ Client │
└──────┬──────┘
│
v
┌─────────────┐
│ vLLM Pod │
│ (1x GPU) │
└─────────────┘
Use when:
- QPS < 10
- Single model serving
- Development/testing environments
Load-balanced deployment
Multiple replicas behind a load balancer:
┌─────────────┐
│ Client │
└──────┬──────┘
│
v
┌─────────────────┐
│ Load Balancer │
│ (Nginx/K8s) │
└────────┬────────┘
│
┌────┴────┬────────┬────────┐
v v v v
┌───────┐ ┌───────┐ ┌───────┐ ┌───────┐
│ vLLM │ │ vLLM │ │ vLLM │ │ vLLM │
│ Pod 1 │ │ Pod 2 │ │ Pod 3 │ │ Pod N │
└───────┘ └───────┘ └───────┘ └───────┘
Use when:
- QPS > 10
- High availability required
- Horizontal scaling needed
Multi-model deployment
Serve multiple models with routing:
┌─────────────┐
│ Client │
└──────┬──────┘
│
v
┌─────────────────┐
│ Model Router │
└────────┬────────┘
│
┌────┴─────┬──────────┐
v v v
┌────────┐ ┌────────┐ ┌────────┐
│ Model │ │ Model │ │ Model │
│ 7B │ │ 13B │ │ 70B │
└────────┘ └────────┘ └────────┘
Use when:
- Multiple models needed
- Different performance tiers
- Cost optimization
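The model router above can be sketched as a simple lookup from requested model name to the backend pool that serves it; the backend names and URLs here are illustrative placeholders, not part of vLLM:

```python
# Hypothetical model -> backend-pool mapping for the router tier.
MODEL_BACKENDS = {
    "meta-llama/Meta-Llama-3-8B-Instruct": "http://vllm-8b:8000",
    "meta-llama/Meta-Llama-3-70B-Instruct": "http://vllm-70b:8000",
}

def route(model: str) -> str:
    """Return the base URL of the backend pool serving `model`."""
    try:
        return MODEL_BACKENDS[model]
    except KeyError:
        raise ValueError(f"no backend registered for model {model!r}")
```

In practice the router also handles retries and fallbacks (for example, degrading to a smaller tier when the large pool is saturated).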
Load balancing
Nginx configuration
Create an Nginx configuration (nginx.conf):
upstream vllm_backend {
    least_conn;  # Use least connections algorithm
    server vllm0:8000 max_fails=3 fail_timeout=30s;
    server vllm1:8000 max_fails=3 fail_timeout=30s;
    server vllm2:8000 max_fails=3 fail_timeout=30s;
    server vllm3:8000 max_fails=3 fail_timeout=30s;
    keepalive 32;  # Connection pooling to upstreams
}

server {
    listen 80;

    location / {
        proxy_pass http://vllm_backend;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts for long-running generation requests
        proxy_connect_timeout 300s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;
    }

    location /health {
        proxy_pass http://vllm_backend/health;
        proxy_http_version 1.1;
    }
}
Deploy with Docker Compose
version: '3.8'

# Shared settings for all vLLM replicas (YAML anchor)
x-vllm-common: &vllm-common
  image: vllm/vllm-openai:latest
  runtime: nvidia
  volumes:
    - ~/.cache/huggingface:/root/.cache/huggingface
  shm_size: 10gb
  ipc: host
  command: --model meta-llama/Meta-Llama-3-8B-Instruct
  networks:
    - vllm-network

services:
  nginx:
    image: nginx:latest
    ports:
      - "8000:80"
    volumes:
      # Mount into conf.d so the stock nginx.conf (events/http blocks) stays intact
      - ./nginx.conf:/etc/nginx/conf.d/default.conf:ro
    depends_on:
      - vllm0
      - vllm1
      - vllm2
      - vllm3
    networks:
      - vllm-network

  vllm0:
    <<: *vllm-common
    environment:
      - NVIDIA_VISIBLE_DEVICES=0

  vllm1:
    <<: *vllm-common
    environment:
      - NVIDIA_VISIBLE_DEVICES=1

  vllm2:
    <<: *vllm-common
    environment:
      - NVIDIA_VISIBLE_DEVICES=2

  vllm3:
    <<: *vllm-common
    environment:
      - NVIDIA_VISIBLE_DEVICES=3

networks:
  vllm-network:
    driver: bridge
Kubernetes Service with session affinity
Improve prefix-cache hit rates by routing each client's requests to the same pod:
apiVersion: v1
kind: Service
metadata:
  name: vllm-service
spec:
  selector:
    app: vllm
  ports:
    - port: 80
      targetPort: 8000
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600  # 1 hour
Session affinity improves cache hit rates for prefix caching, reducing latency and cost.
Model configuration
Recommended baseline settings for production:
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 1 \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.90 \
    --enable-prefix-caching \
    --enable-chunked-prefill \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 256 \
    --disable-log-requests \
    --trust-remote-code  # only for models that ship custom code; omit otherwise
Key parameters:
| Parameter | Recommended | Purpose |
|---|---|---|
| gpu-memory-utilization | 0.85-0.90 | Leave headroom for fragmentation |
| max-model-len | Model-specific | Reduce for higher throughput |
| max-num-seqs | 128-256 | Balance latency vs throughput |
| enable-prefix-caching | true | Cache common prompts |
| enable-chunked-prefill | true | Reduce TTFT for long prompts |
| disable-log-requests | true | Reduce logging overhead |
Quantization
Reduce memory usage and increase throughput:
vllm serve TheBloke/Llama-2-70B-AWQ \
    --quantization awq \
    --tensor-parallel-size 4
Quantization comparison:
| Method | Memory Savings | Quality | Speed |
|---|---|---|---|
| FP16 (baseline) | 0% | 100% | 1.0x |
| FP8 | 50% | 98-99% | 1.5-2.0x |
| AWQ/GPTQ | 75% | 95-98% | 1.2-1.5x |
Multi-GPU tensor parallelism
For large models, split across multiple GPUs:
# 70B model on 4x A100 GPUs
vllm serve meta-llama/Meta-Llama-3-70B-Instruct \
    --tensor-parallel-size 4 \
    --max-model-len 8192
Tensor parallelism requires high-bandwidth interconnects: NVLink within a node, InfiniBand between nodes. Keep all tensor-parallel ranks on a single multi-GPU node whenever possible.
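As a back-of-the-envelope check on whether a model fits, weight memory per GPU can be estimated from the parameter count (FP16, ignoring KV cache and activations; the helper below is an illustrative sketch, not part of vLLM):

```python
def weight_memory_gib(params_billion: float, bytes_per_param: float = 2.0) -> float:
    """Approximate model-weight memory in GiB (FP16 = 2 bytes per parameter)."""
    return params_billion * 1e9 * bytes_per_param / 1024**3

# 70B weights in FP16, split across tensor-parallel-size 4:
per_gpu_gib = weight_memory_gib(70) / 4  # about 33 GiB per GPU, before KV cache
```

The remainder of each GPU's memory (up to gpu-memory-utilization) goes to the KV cache, which is what actually limits batch size.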
Monitoring and observability
Prometheus metrics
vLLM exposes Prometheus metrics at /metrics:
apiVersion: v1
kind: Service
metadata:
  name: vllm-metrics
  labels:
    app: vllm
spec:
  ports:
    - name: metrics
      port: 8000
      targetPort: 8000
  selector:
    app: vllm
---
apiVersion: monitoring.coreos.com/v1  # ServiceMonitor is a Prometheus Operator CRD
kind: ServiceMonitor
metadata:
  name: vllm-monitor
spec:
  selector:
    matchLabels:
      app: vllm
  endpoints:
    - port: metrics
      path: /metrics
Key metrics to monitor:
- vllm:num_requests_running - Requests actively generating
- vllm:num_requests_waiting - Requests queued for scheduling
- vllm:gpu_cache_usage_perc - KV cache utilization (not overall GPU memory)
- vllm:avg_generation_throughput_toks_per_s - Generation throughput
- vllm:time_to_first_token_seconds - TTFT latency
- vllm:time_per_output_token_seconds - Per-token generation latency
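For quick ad-hoc checks outside Grafana, the metrics endpoint can be scraped and parsed with the standard library alone. A minimal sketch (the endpoint URL is an assumption; labeled series collapse to the last sample seen, which is fine for a spot check but not for real aggregation):

```python
import re

def parse_metrics(text: str) -> dict:
    """Parse Prometheus text format into {metric_name: value}.

    Comment lines are skipped; label sets are ignored, so a metric with
    multiple labeled series keeps only the last value encountered.
    """
    out = {}
    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue
        m = re.match(r"([\w:]+)(?:\{[^}]*\})?\s+([-+.eE\d]+)", line)
        if m:
            out[m.group(1)] = float(m.group(2))
    return out

# Usage (URL assumed):
# import urllib.request
# body = urllib.request.urlopen("http://vllm:8000/metrics").read().decode()
# queued = parse_metrics(body).get("vllm:num_requests_waiting", 0.0)
```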
Grafana dashboard
Example Grafana queries:
# Request rate
rate(vllm:request_success_total[5m])
# Average TTFT
rate(vllm:time_to_first_token_seconds_sum[5m]) / rate(vllm:time_to_first_token_seconds_count[5m])
# P95 generation latency
histogram_quantile(0.95, rate(vllm:e2e_request_latency_seconds_bucket[5m]))
# GPU utilization
vllm:gpu_cache_usage_perc
OpenTelemetry tracing
Enable distributed tracing:
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --otlp-traces-endpoint http://jaeger:4318/v1/traces
Health checks and probes
Kubernetes probes
livenessProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 300
  periodSeconds: 30
  timeoutSeconds: 10
  failureThreshold: 3
readinessProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 60
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3
startupProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 30
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 60  # 60 x 10s = up to 10 minutes for large models to load
Set failureThreshold high enough for large models to load. A 70B model can take 5-10 minutes to initialize.
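Outside Kubernetes, the same wait-for-ready logic can be scripted against the /health endpoint. A small poller sketch (URL and timings are placeholders, standard library only):

```python
import time
import urllib.error
import urllib.request

def wait_until_ready(url: str = "http://localhost:8000/health",
                     timeout_s: float = 600.0,
                     interval_s: float = 10.0) -> bool:
    """Poll the health endpoint until it answers 200 or the deadline passes."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # server still loading the model
        time.sleep(interval_s)
    return False
```

Useful in CI smoke tests or deploy scripts that must not send traffic before the model finishes loading.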
Autoscaling
Horizontal Pod Autoscaler (HPA)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Pods
      pods:
        metric:
          name: vllm_num_requests_running
        target:
          type: AverageValue
          averageValue: "50"  # Scale when >50 concurrent requests per pod
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300  # Wait 5 min before scaling down
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0  # Scale up immediately
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
Pods-type custom metrics require a metrics adapter (for example, prometheus-adapter) that exposes the vLLM metric to the HPA API.
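The HPA's core scaling decision is ceil(currentReplicas × currentMetricValue / targetValue); a quick sketch of that arithmetic with the numbers from the manifest above:

```python
import math

def desired_replicas(current: int, metric_avg: float, target: float) -> int:
    """Core HPA formula: ceil(currentReplicas * currentMetric / targetMetric)."""
    return math.ceil(current * metric_avg / target)

# 4 pods averaging 80 running requests against a target of 50:
# desired_replicas(4, 80, 50) -> ceil(6.4) -> 7 replicas
```

The behavior policies then cap how fast the controller may move toward that desired count.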
SkyPilot autoscaling
service:
  replica_policy:
    min_replicas: 2
    max_replicas: 10
    target_qps_per_replica: 5  # Scale when QPS > 5 per replica
    upscale_delay_seconds: 60
    downscale_delay_seconds: 300
Security best practices
API authentication
Use API keys for authentication:
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --api-key your-secret-key
Client usage:
import openai

client = openai.OpenAI(
    base_url="http://vllm:8000/v1",
    api_key="your-secret-key",
)
Network policies
Restrict pod-to-pod communication:
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: vllm-network-policy
spec:
  podSelector:
    matchLabels:
      app: vllm
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              role: api-gateway
      ports:
        - protocol: TCP
          port: 8000
  egress:
    - to:
        - podSelector:
            matchLabels:
              role: model-storage
Secrets management
Use Kubernetes secrets or cloud secret managers:
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
type: Opaque
stringData:
  token: hf_xxxxxxxxxxxxx
---
# Reference the secret from the container spec:
env:
  - name: HF_TOKEN
    valueFrom:
      secretKeyRef:
        name: hf-token
        key: token
Disaster recovery
Model checkpointing
Store models in persistent storage:
# Pod spec:
volumes:
  - name: model-cache
    persistentVolumeClaim:
      claimName: vllm-models
# Container spec:
volumeMounts:
  - name: model-cache
    mountPath: /root/.cache/huggingface
Multi-region deployment
Deploy across multiple regions for high availability:
┌──────────────┐ ┌──────────────┐
│ Region 1 │ │ Region 2 │
│ (Primary) │ │ (Failover) │
├──────────────┤ ├──────────────┤
│ vLLM Cluster │ │ vLLM Cluster │
│ (3 pods) │ │ (3 pods) │
└──────────────┘ └──────────────┘
│ │
└────────┬───────────┘
v
┌──────────────┐
│ Global Load │
│ Balancer │
└──────────────┘
Cost optimization
Right-size GPU allocation
Match GPU to model size:
- 7B models: L4 (24GB) or A10G (24GB); T4 (16GB) only with quantization or a short context
- 13B models: A100 40GB for FP16; L4 (24GB) or A10G (24GB) with quantization
- 70B models: 4x A100 40GB or 2x A100 80GB for FP16; a single A100 80GB only with 4-bit quantization (AWQ/GPTQ)
Use quantization
Reduce GPU requirements with AWQ/GPTQ/FP8 quantization.
Enable autoscaling
Scale down during off-peak hours; scaling all the way to zero requires KEDA or a serving layer such as SkyServe, since a standard HPA does not go below its minimum replica count.
Batch requests
Use continuous batching to maximize throughput.
Enable prefix caching
Cache common system prompts to reduce compute.
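Prefix caching only pays off when the shared prefix is byte-identical across requests, so keep the system prompt a single fixed constant and append only the varying user text. A client-side sketch (the prompt content and company name are placeholders):

```python
# One fixed system prompt reused verbatim across all requests, so the
# server's prefix cache can serve the shared prefix tokens.
SYSTEM_PROMPT = (
    "You are a support assistant for ExampleCorp. Answer concisely and cite "
    "the relevant policy section when applicable."
)

def build_messages(user_text: str) -> list:
    """Static system prompt first; only the user suffix varies per request."""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]

# client.chat.completions.create(model=..., messages=build_messages("Hi"))
```

Even small per-request variations in the system prompt (timestamps, request IDs) break byte-identity and defeat the cache.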
Troubleshooting
High latency
Symptoms: Slow response times
Solutions:
- Check GPU utilization with nvidia-smi
- Reduce max-model-len to free memory
- Enable chunked prefill
- Add more replicas
- Enable quantization
OOM errors
Symptoms: CUDA out of memory
Solutions:
- Reduce gpu-memory-utilization to 0.85
- Reduce max-num-seqs
- Reduce max-model-len
- Enable quantization
- Use tensor parallelism
Request timeouts
Symptoms: 504 Gateway Timeout
Solutions:
- Increase proxy timeouts in Nginx/K8s
- Increase readinessProbe timeout
- Check for deadlocked requests with metrics
- Review max-num-batched-tokens
Checklist
Before going to production:
- Load balancer configured with health checks and long proxy timeouts
- Prometheus metrics scraped and Grafana dashboards in place
- Liveness, readiness, and startup probes tuned for model load time
- Autoscaling policies load-tested
- API authentication enabled and network policies applied
- Tokens and keys stored as secrets, not in manifests
- Model weights on persistent storage, with a failover region planned
Next steps