Skip to main content

Overview

Embeddings convert text into numerical vectors for similarity search, retrieval, and RAG applications. LiteLLM provides a unified interface for embeddings across OpenAI, Cohere, HuggingFace, and more.

Quick Start

from litellm import embedding

# Request an embedding for a one-item batch; the response follows the
# OpenAI shape (one `data` entry per input string).
result = embedding(model="text-embedding-3-small", input=["Text to embed"])

vector = result.data[0].embedding
print(f"Vector length: {len(vector)}")

Basic Usage

from litellm import embedding

# A bare string is accepted for `input` as well as a list of strings.
sentence = "The quick brown fox jumps over the lazy dog"
response = embedding(model="text-embedding-3-small", input=sentence)
vector = response.data[0].embedding

Providers

OpenAI provides the latest high-quality embedding models, with three tiers trading off speed and quality.
from litellm import embedding

# OpenAI embedding model family, newest first:
#   text-embedding-3-small  - fast and efficient
#   text-embedding-3-large  - higher quality
#   text-embedding-ada-002  - previous generation
for model_name in (
    "text-embedding-3-small",
    "text-embedding-3-large",
    "text-embedding-ada-002",
):
    response = embedding(model=model_name, input=["Text to embed"])

Dimensions Control

Some providers allow controlling output dimensions.
from litellm import embedding

# OpenAI: shrink the output vector to save storage and index size.
# text-embedding-3-large emits 3072 dimensions by default.
openai_request = {
    "model": "text-embedding-3-large",
    "input": ["Text to embed"],
    "dimensions": 256,
}
response = embedding(**openai_request)

# Cohere exposes the same `dimensions` knob.
cohere_request = {
    "model": "cohere/embed-english-v3.0",
    "input": ["Text to embed"],
    "dimensions": 384,
}
response = embedding(**cohere_request)

Encoding Format

from litellm import embedding

# encoding_format defaults to "float": each result is a list of floats.
response = embedding(model="text-embedding-3-small",
                     input=["Text to embed"],
                     encoding_format="float")
vector = response.data[0].embedding  # plain Python list of floats

# "base64" packs the same vector into a compact string for transmission.
response = embedding(model="text-embedding-3-small",
                     input=["Text to embed"],
                     encoding_format="base64")

Batch Processing

Process large datasets efficiently.
from litellm import embedding
from typing import List

def embed_in_batches(
    texts: List[str],
    batch_size: int = 100,
    model: str = "text-embedding-3-small",
) -> List[List[float]]:
    """Embed ``texts`` in fixed-size batches to stay under provider limits.

    Args:
        texts: Documents to embed; an empty list yields an empty result.
        batch_size: Number of texts sent per API call; must be positive.
        model: Embedding model to use (previously hard-coded).

    Returns:
        One embedding vector per input text, in the original order.

    Raises:
        ValueError: If ``batch_size`` is not positive (a negative value
            would otherwise silently return an empty list).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    all_embeddings: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = embedding(model=model, input=batch)
        # Each response entry corresponds positionally to one input text.
        all_embeddings.extend(item.embedding for item in response.data)
    return all_embeddings

# Process 1000 documents
texts = [f"Document {i}" for i in range(1000)]
embeddings = embed_in_batches(texts)
from litellm import embedding
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm; the unguarded formula
    divides by the norm product and yields nan (with a NumPy runtime
    warning) for an all-zero vector.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return np.dot(a, b) / denom

# Embed documents
documents = [
    "Python is a programming language",
    "JavaScript is used for web development",
    "Machine learning uses neural networks",
]
doc_embeddings = [
    item.embedding
    for item in embedding(model="text-embedding-3-small", input=documents).data
]

# Embed query
query = "What is Python?"
query_response = embedding(model="text-embedding-3-small", input=[query])
query_embedding = query_response.data[0].embedding

# Score every document against the query and pick the best match.
similarities = [cosine_similarity(query_embedding, vec) for vec in doc_embeddings]
best_idx = np.argmax(similarities)
print(f"Most similar: {documents[best_idx]}")
print(f"Similarity: {similarities[best_idx]:.4f}")

RAG (Retrieval Augmented Generation)

from litellm import embedding, completion
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# 1. Embed knowledge base
knowledge_base = [
    "LiteLLM is a unified interface for LLMs",
    "LiteLLM supports 100+ providers",
    "LiteLLM handles automatic retries and fallbacks",
]
kb_embeddings = [
    item.embedding
    for item in embedding(model="text-embedding-3-small", input=knowledge_base).data
]

# 2. Embed user query
query = "What does LiteLLM do?"
query_response = embedding(model="text-embedding-3-small", input=[query])
query_embedding = query_response.data[0].embedding

# 3. Rank knowledge-base entries by similarity and keep the top-k.
similarities = [cosine_similarity(query_embedding, vec) for vec in kb_embeddings]
top_k = 2
top_indices = np.argsort(similarities)[-top_k:][::-1]
relevant_docs = [knowledge_base[i] for i in top_indices]

# 4. Generate answer with context
context = "\n".join(relevant_docs)
response = completion(
    model="gpt-4o-mini",
    messages=[{
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}"
    }]
)

print(response.choices[0].message.content)

Async Embeddings

import asyncio
from litellm import aembedding

async def embed_async():
    """Embed one text with the async client and return its vector."""
    result = await aembedding(model="text-embedding-3-small",
                              input=["Text to embed"])
    return result.data[0].embedding

vector = asyncio.run(embed_async())

Parallel Processing

import asyncio
from litellm import aembedding

async def embed_multiple_models(text):
    """Fan the same text out to three models concurrently.

    Returns a dict mapping a short label to each model's embedding.
    """
    labeled_models = [
        ("small", "text-embedding-3-small"),
        ("large", "text-embedding-3-large"),
        ("cohere", "cohere/embed-english-v3.0"),
    ]
    # gather() preserves input order, so responses line up with labels.
    responses = await asyncio.gather(
        *(aembedding(model=name, input=[text]) for _, name in labeled_models)
    )
    return {
        label: resp.data[0].embedding
        for (label, _), resp in zip(labeled_models, responses)
    }

results = asyncio.run(embed_multiple_models("Compare embeddings"))

Caching

Cache embeddings to reduce API calls.
from litellm import embedding
import litellm

# Enable LiteLLM's response cache (removed the unused `import hashlib`;
# cache keys are handled internally by LiteLLM).
litellm.cache = litellm.Cache()

def get_cached_embedding(text: str, model: str):
    """Return the embedding for ``text``, served from cache on repeat calls.

    With ``litellm.cache`` configured and ``caching=True``, an identical
    (model, input) request is answered from the cache instead of the API.
    """
    response = embedding(
        model=model,
        input=[text],
        caching=True
    )
    return response.data[0].embedding

# First call - API request
vec1 = get_cached_embedding("Hello world", "text-embedding-3-small")

# Second call - cached
vec2 = get_cached_embedding("Hello world", "text-embedding-3-small")

Usage Tracking

from litellm import embedding

batch = ["Text 1", "Text 2", "Text 3"]
response = embedding(model="text-embedding-3-small", input=batch)

# Token usage
print(f"Tokens used: {response.usage.total_tokens}")
print(f"Prompt tokens: {response.usage.prompt_tokens}")

# Cost (if available)
if hasattr(response, '_hidden_params'):
    # `.get` returns None when the provider reported no cost.
    cost = response._hidden_params.get('response_cost')
    if cost:
        print(f"Cost: ${cost}")

Error Handling

from litellm import embedding
from litellm.exceptions import APIError, RateLimitError

# Catch the specific RateLimitError before the more general APIError so
# rate-limit failures can be handled separately from other API errors.
try:
    response = embedding(
        model="text-embedding-3-small",
        input=["Very long text..." * 10000]  # May exceed token limit
    )
except RateLimitError as e:
    print(f"Rate limit exceeded: {e}")
except APIError as e:
    # APIError exposes the HTTP status code and the provider's message.
    print(f"API error: {e.status_code} - {e.message}")

Model Comparison

| Model                  | Provider    | Dimensions | Max Tokens | Use Case               |
|------------------------|-------------|------------|------------|------------------------|
| text-embedding-3-small | OpenAI      | 1536       | 8191       | General purpose, fast  |
| text-embedding-3-large | OpenAI      | 3072       | 8191       | High quality           |
| embed-english-v3.0     | Cohere      | 1024       | -          | Search, classification |
| all-MiniLM-L6-v2       | HuggingFace | 384        | 256        | Fast, local            |
| bge-large-en-v1.5      | HuggingFace | 1024       | 512        | High quality           |
| nomic-embed-text       | Ollama      | 768        | -          | Local, privacy         |

Best Practices

  • Use text-embedding-3-small for most use cases; prefer smaller models when the quality difference is minimal
  • Use text-embedding-3-large when you need the highest quality
  • Use Cohere for specialized search applications
  • Use Ollama for privacy-sensitive, local applications
  • Batch texts when possible (up to 100-2000 per request depending on provider) to reduce API overhead
  • Use async for concurrent requests
  • Cache embeddings for frequently used texts to avoid re-computing them
  • Consider reducing dimensions if storage or compute is a concern
  • Normalize text before embedding and keep a consistent text format across the corpus
  • Use the same model for queries and documents
  • Test multiple models for your specific use case

Advanced Patterns

Build docs developers (and LLMs) love