Batch Processing

Overview

Batch processing allows you to send asynchronous requests at significantly reduced costs. LiteLLM supports batch APIs across providers including OpenAI and Anthropic.

Quick Start

from litellm import batch_completion

# Submit several independent prompts as one batch job.
questions = ["What is AI?", "What is ML?", "What is DL?"]
batch = batch_completion(
    model="gpt-4o",
    messages=[[{"role": "user", "content": q}] for q in questions],
)

print(f"Batch ID: {batch.id}")
print(f"Status: {batch.status}")

When to Use Batching

Good fits for batching:

  • 50% lower cost compared to the synchronous API
  • Non-urgent, high-volume tasks and offline processing
  • Data classification and labeling
  • Content generation for datasets
  • Evaluation and testing
  • Embedding large corpora
  • Bulk data transformation

Avoid batching for:

  • Real-time applications
  • User-facing features
  • Time-sensitive tasks
  • Interactive workflows

OpenAI Batch API

from litellm import batch_completion_create
import json


def _chat_request(custom_id, question):
    """Return one JSONL batch entry targeting /v1/chat/completions."""
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "messages": [{"role": "user", "content": question}],
            "max_tokens": 100,
        },
    }


# Prepare the request entries for the OpenAI Batch API.
requests = [
    _chat_request("request-1", "What is AI?"),
    _chat_request("request-2", "What is ML?"),
]

# Batch input must be newline-delimited JSON (one request per line).
with open("batch_requests.jsonl", "w") as f:
    f.writelines(json.dumps(req) + "\n" for req in requests)

# Submit the file; results arrive asynchronously within the window.
batch = batch_completion_create(
    input_file_path="batch_requests.jsonl",
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

print(f"Batch ID: {batch.id}")

Anthropic Batch API

from litellm import batch_completion

# Batch several prompts against Claude in a single call.
prompts = ["Analyze this text...", "Summarize this...", "Translate this..."]
batch = batch_completion(
    model="anthropic/claude-3.5-sonnet",
    messages=[[{"role": "user", "content": p}] for p in prompts],
)

print(f"Batch ID: {batch.id}")

Complete Workflow

from litellm import (
    batch_completion_create,
    batch_completion_retrieve,
    batch_completion_list
)
import json
import time

# 1. Prepare batch requests: one entry per task, keyed by a unique custom_id
#    so each result can be matched back to its request.
requests = [
    {
        "custom_id": f"task-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": f"Task {i}"}]
        }
    }
    for i in range(100)
]

# Save to JSONL (newline-delimited JSON — the required batch input format)
with open("requests.jsonl", "w") as f:
    for req in requests:
        f.write(json.dumps(req) + "\n")

# 2. Create batch; metadata is free-form and useful for later bookkeeping
batch = batch_completion_create(
    input_file_path="requests.jsonl",
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"project": "my-project"}
)

print(f"Created batch: {batch.id}")

# 3. Poll for completion — any status in the list below is terminal
while True:
    batch = batch_completion_retrieve(batch_id=batch.id)
    print(f"Status: {batch.status}")
    
    if batch.status in ["completed", "failed", "expired", "cancelled"]:
        break
    
    time.sleep(60)  # Check every minute

# 4. Process results
if batch.status == "completed":
    # Download and process results
    # NOTE(review): this snippet reads "results.jsonl" but never fetches it —
    # the batch's output file must first be downloaded from the provider.
    # Confirm the download step against the batch API docs before running.
    with open("results.jsonl", "r") as f:
        for line in f:
            result = json.loads(line)
            custom_id = result["custom_id"]
            response = result["response"]["body"]["choices"][0]["message"]["content"]
            print(f"{custom_id}: {response}")
else:
    print(f"Batch failed with status: {batch.status}")

List Batches

from litellm import batch_completion_list

# Page through recent batch jobs.
recent = batch_completion_list(limit=10)

for job in recent.data:
    print(f"ID: {job.id}")
    print(f"Status: {job.status}")
    print(f"Created: {job.created_at}")
    print(f"Requests: {job.request_counts}")
    print("---")

Batch with Different Request Types

import json

# A single batch file may mix endpoints — here one chat completion
# and one embedding request share the same JSONL input.
chat_entry = {
    "custom_id": "chat-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "Hello"}]
    },
}
embedding_entry = {
    "custom_id": "embed-1",
    "method": "POST",
    "url": "/v1/embeddings",
    "body": {
        "model": "text-embedding-3-small",
        "input": "Text to embed"
    },
}
requests = [chat_entry, embedding_entry]

# Write one JSON object per line (JSONL).
with open("mixed_batch.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in requests)

Error Handling

from litellm import batch_completion_retrieve
import json

batch = batch_completion_retrieve(batch_id="batch_abc123")

if batch.status == "completed":
    # Each result line carries either an "error" key or a "response" payload.
    with open("results.jsonl", "r") as f:
        for line in f:
            record = json.loads(line)

            if "error" in record:
                print(f"Error in {record['custom_id']}: {record['error']}")
            else:
                # Successful result: extract the response body for processing.
                payload = record["response"]["body"]
                print(f"Success: {record['custom_id']}")

elif batch.status == "failed":
    print(f"Batch failed: {batch.errors}")

Monitoring Progress

from litellm import batch_completion_retrieve
import time

def monitor_batch(batch_id, check_interval=60):
    """Poll a batch job and print progress until it reaches a terminal state.

    Args:
        batch_id: Identifier of the batch to watch.
        check_interval: Seconds to sleep between polls (default 60).
    """
    terminal = ("completed", "failed", "expired", "cancelled")
    while True:
        job = batch_completion_retrieve(batch_id=batch_id)

        tallies = job.request_counts
        total = tallies.get("total", 0)
        done = tallies.get("completed", 0)
        errored = tallies.get("failed", 0)

        # Only report a percentage once the provider has counted requests.
        if total > 0:
            pct = (done + errored) / total * 100
            print(f"Progress: {pct:.1f}% ({done}/{total} completed, {errored} failed)")

        if job.status in terminal:
            print(f"Final status: {job.status}")
            return

        time.sleep(check_interval)

monitor_batch("batch_abc123")

Cost Calculation

from litellm import batch_completion_retrieve

# Compare batch pricing (typically 50% of synchronous) for a finished job.
# NOTE: the per-request prices below are illustrative placeholders, not
# real rates — substitute your model's actual pricing.
batch = batch_completion_retrieve(batch_id="batch_abc123")

if batch.status == "completed":
    batch_cost = batch.request_counts["completed"] * 0.50  # Example
    sync_cost = batch.request_counts["total"] * 1.00  # Example
    savings = sync_cost - batch_cost

    print(f"Batch cost: ${batch_cost:.2f}")
    print(f"Sync cost would be: ${sync_cost:.2f}")
    print(f"Savings: ${savings:.2f} ({savings / sync_cost * 100:.0f}%)")
else:
    # Original version raised NameError here: batch_cost was only defined
    # inside the completed branch but used unconditionally afterwards.
    print(f"Batch not completed (status: {batch.status}); cost not computed")

Best Practices

  • Use descriptive custom_id values for tracking
  • Validate requests before creating batch
  • Keep batch size reasonable (1000-50000 requests)
  • Include metadata for organization
  • Poll status periodically (every 1-5 minutes)
  • Set up alerts for completion/failure
  • Monitor request counts for progress
  • Log batch IDs for tracking
  • Check for errors in individual results
  • Implement retry logic for failed requests
  • Save partial results before processing
  • Have fallback for batch failures
  • Use batching for all non-urgent tasks
  • Combine similar requests into batches
  • Use cheaper models when appropriate
  • Monitor and optimize batch sizes

Limitations

  • Completion time is not guaranteed (usually within 24h)
  • Cannot cancel individual requests
  • No real-time status updates
  • Results available for limited time (check provider docs)
  • Some features may not be available in batch mode

Supported Features

| Feature           | OpenAI | Anthropic |
|-------------------|--------|-----------|
| Chat Completions  | ✓      | ✓         |
| Embeddings        | ✓      | —         |
| Function Calling  | ✓      | ✓         |
| Streaming         | —      | —         |
| Vision            | ✓      | ✓         |
| JSON Mode         | ✓      | —         |
| Max Requests      | 50,000 | Varies    |
| Completion Window | 24h    | Varies    |

(Per-cell support was lost in extraction; verify against each provider's batch API docs.)

LiteLLM Proxy with Batching

# LiteLLM proxy config: expose gpt-4o under a batch-enabled alias.
model_list:
  - model_name: gpt-4o-batch  # alias clients request through the proxy
    litellm_params:
      model: gpt-4o  # underlying provider model
      api_key: os.environ/OPENAI_API_KEY  # resolved from the environment at runtime
      batch_enabled: true  # allow batch endpoints for this model
import openai

# Point the standard OpenAI SDK at the LiteLLM proxy instead of api.openai.com.
client = openai.OpenAI(
    base_url="http://0.0.0.0:4000",
    api_key="sk-1234",
)

# Create batch through proxy
batch = client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id="file-abc123",
)

Build docs developers (and LLMs) love