embedding() — API Reference

Overview

The embedding() function generates vector embeddings from text, supporting 100+ embedding providers through a unified interface.

Basic Usage

from litellm import embedding

response = embedding(
    model="text-embedding-ada-002",
    input=["Hello world", "How are you?"]
)

print(response.data[0].embedding[:5])  # First 5 dimensions
print(f"Dimensions: {len(response.data[0].embedding)}")

Function Signature

def embedding(
    model: str,
    input: Union[str, List[str]],
    # Optional parameters
    dimensions: Optional[int] = None,
    encoding_format: Optional[str] = None,
    timeout: float = 600,
    api_base: Optional[str] = None,
    api_version: Optional[str] = None,
    api_key: Optional[str] = None,
    user: Optional[str] = None,
    custom_llm_provider: Optional[str] = None,
    **kwargs
) -> EmbeddingResponse

Parameters

model
string
required
The embedding model to use. Examples: text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large
input
Union[str, List[str]]
required
Text or list of texts to generate embeddings for.
# Single text
input="Hello world"

# Multiple texts
input=["Hello", "World", "How are you?"]
dimensions
int
Number of dimensions for the embedding. Only supported by some models (e.g., text-embedding-3 series).
encoding_format
str
Format for the embeddings. Options: "float" (default) or "base64"
timeout
float
Request timeout in seconds. Default: 600 (10 minutes)
api_key
str
API key for the provider. If not provided, reads from environment variables.

Response Format

class EmbeddingResponse:
    object: str                      # "list"
    data: List[Embedding]
    model: str
    usage: Usage

class Embedding:
    object: str                      # "embedding"
    embedding: List[float]           # Vector of floats
    index: int

class Usage:
    prompt_tokens: int
    total_tokens: int

Examples

Single Text Embedding

from litellm import embedding

response = embedding(
    model="text-embedding-ada-002",
    input="The quick brown fox jumps over the lazy dog"
)

vector = response.data[0].embedding
print(f"Embedding dimensions: {len(vector)}")
print(f"First 5 values: {vector[:5]}")
print(f"Tokens used: {response.usage.total_tokens}")

Batch Embeddings

from litellm import embedding

texts = [
    "Machine learning is fascinating",
    "Deep learning is a subset of ML",
    "Natural language processing enables AI",
    "Computer vision recognizes images"
]

response = embedding(
    model="text-embedding-ada-002",
    input=texts
)

for i, emb in enumerate(response.data):
    print(f"Text {i}: {len(emb.embedding)} dimensions")

print(f"\nTotal tokens: {response.usage.total_tokens}")

Different Providers

from litellm import embedding

response = embedding(
    model="text-embedding-ada-002",
    input=["Hello world"]
)

Using Different Dimensions

from litellm import embedding

# OpenAI embedding-3 models support custom dimensions
response_small = embedding(
    model="text-embedding-3-small",
    input=["Hello world"],
    dimensions=512  # Reduce from 1536 to 512
)

response_large = embedding(
    model="text-embedding-3-large",
    input=["Hello world"],
    dimensions=256  # Reduce from 3072 to 256
)

print(f"Small: {len(response_small.data[0].embedding)} dimensions")
print(f"Large: {len(response_large.data[0].embedding)} dimensions")

Semantic Search Example

from litellm import embedding
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two vectors: dot(a, b) / (|a| * |b|)."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Documents to search
documents = [
    "Python is a programming language",
    "Machine learning uses algorithms",
    "Deep learning is a subset of machine learning",
    "Natural language processing analyzes text"
]

# Get embeddings for all documents
doc_response = embedding(
    model="text-embedding-ada-002",
    input=documents
)

doc_embeddings = [d.embedding for d in doc_response.data]

# Search query
query = "Tell me about ML and AI"
query_response = embedding(
    model="text-embedding-ada-002",
    input=query
)
query_embedding = query_response.data[0].embedding

# Calculate similarities
similarities = [
    cosine_similarity(query_embedding, doc_emb)
    for doc_emb in doc_embeddings
]

# Get most similar documents
ranked_docs = sorted(
    zip(documents, similarities),
    key=lambda x: x[1],
    reverse=True
)

print("Most relevant documents:")
for doc, score in ranked_docs:
    print(f"Score: {score:.4f} - {doc}")

Async Embeddings

import asyncio
from litellm import aembedding

async def get_embeddings():
    """Embed three short strings with one async batched call.

    Returns the list of Embedding objects from the response.
    """
    result = await aembedding(
        model="text-embedding-ada-002",
        input=["Hello", "World", "How are you?"],
    )
    return result.data

embeddings = asyncio.run(get_embeddings())
for i, emb in enumerate(embeddings):
    print(f"Embedding {i}: {len(emb.embedding)} dimensions")

Concurrent Async Embeddings

import asyncio
from litellm import aembedding

async def embed_batch(texts: list):
    """Embed each text concurrently (one request per text).

    Returns the embedding vectors in the same order as ``texts``.
    """
    pending = [
        aembedding(model="text-embedding-ada-002", input=[text])
        for text in texts
    ]
    # gather preserves input order, so vectors line up with texts.
    completed = await asyncio.gather(*pending)
    return [resp.data[0].embedding for resp in completed]

async def main():
    """Embed ten placeholder documents and report how many came back."""
    docs = [f"Document {i}" for i in range(10)]
    vectors = await embed_batch(docs)
    print(f"Generated {len(vectors)} embeddings")

asyncio.run(main())

Use Cases

Document Clustering

from litellm import embedding
from sklearn.cluster import KMeans
import numpy as np

documents = [
    "Python programming",
    "Java development",
    "Machine learning algorithms",
    "Deep neural networks",
    "JavaScript frameworks",
    "Artificial intelligence"
]

# Get embeddings
response = embedding(
    model="text-embedding-ada-002",
    input=documents
)

embeddings_array = np.array([d.embedding for d in response.data])

# Cluster documents
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(embeddings_array)

for i, doc in enumerate(documents):
    print(f"Cluster {clusters[i]}: {doc}")

Duplicate Detection

from litellm import embedding
import numpy as np

def find_duplicates(texts: list, threshold: float = 0.95):
    """Find near-duplicate texts by cosine similarity of their embeddings.

    Args:
        texts: Texts to compare (embedded in one batched API call).
        threshold: Minimum cosine similarity for a pair to count as duplicates.

    Returns:
        A list of (i, j, similarity) tuples with i < j, one per pair of
        texts whose embeddings have cosine similarity >= threshold.
    """
    response = embedding(
        model="text-embedding-ada-002",
        input=texts
    )

    embeddings = [d.embedding for d in response.data]
    # Hoist the norms out of the O(n^2) pair loop — each vector's norm is
    # loop-invariant, so computing it once avoids 2*C(n,2) redundant passes.
    norms = [np.linalg.norm(e) for e in embeddings]
    duplicates = []

    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            similarity = np.dot(embeddings[i], embeddings[j]) / (
                norms[i] * norms[j]
            )
            if similarity >= threshold:
                duplicates.append((i, j, similarity))

    return duplicates

texts = [
    "The cat sat on the mat",
    "A feline was sitting on the rug",  # Similar to first
    "Python is a programming language",
    "The dog ran in the park"
]

duplicates = find_duplicates(texts)
for i, j, score in duplicates:
    print(f"Similar (score: {score:.3f}):")
    print(f"  {texts[i]}")
    print(f"  {texts[j]}")

Question Answering System

from litellm import embedding
import numpy as np

class QASystem:
    """Retrieval over a fixed document set using cosine similarity of
    text-embedding-ada-002 embeddings.

    Documents are embedded once at construction time (a single batched
    API call); queries are embedded per call to ``answer``.
    """

    def __init__(self, documents):
        self.documents = documents
        response = embedding(
            model="text-embedding-ada-002",
            input=documents
        )
        self.doc_embeddings = [d.embedding for d in response.data]

    def answer(self, question: str, top_k: int = 3):
        """Return the ``top_k`` documents most similar to ``question``,
        most similar first."""
        # Get question embedding
        response = embedding(
            model="text-embedding-ada-002",
            input=question
        )
        query_embedding = response.data[0].embedding

        # Hoist the query norm: it is invariant across documents, so
        # computing it inside the comprehension wastes a pass per doc.
        query_norm = np.linalg.norm(query_embedding)
        similarities = [
            np.dot(query_embedding, doc_emb) / (
                query_norm * np.linalg.norm(doc_emb)
            )
            for doc_emb in self.doc_embeddings
        ]

        # argsort is ascending; take the last top_k and reverse to get
        # descending similarity order.
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [self.documents[i] for i in top_indices]

# Example usage
knowledge_base = [
    "Python is a high-level programming language",
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "NLP processes human language"
]

qa = QASystem(knowledge_base)
results = qa.answer("What is Python?")
print("Relevant documents:", results)

Error Handling

from litellm import embedding
from litellm.exceptions import (
    AuthenticationError,
    RateLimitError,
    BadRequestError
)

try:
    response = embedding(
        model="text-embedding-ada-002",
        input=["Hello world"]
    )
except AuthenticationError:
    print("Invalid API key")
except RateLimitError:
    print("Rate limit exceeded")
except BadRequestError as e:
    print(f"Invalid request: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Best Practices

  1. Batch processing: Send multiple texts in one request when possible
  2. Choose appropriate dimensions: Smaller dimensions for faster search, larger for accuracy
  3. Normalize vectors: For cosine similarity, normalize embeddings
  4. Cache embeddings: Store embeddings to avoid repeated API calls
  5. Handle rate limits: Use async processing with appropriate delays
  6. Monitor token usage: Track costs, especially with large batches

Performance Tips

# Good: Batch processing
response = embedding(
    model="text-embedding-ada-002",
    input=["text1", "text2", "text3"]  # Single API call
)

# Bad: Individual requests
for text in ["text1", "text2", "text3"]:
    response = embedding(
        model="text-embedding-ada-002",
        input=text  # Multiple API calls
    )
