Chroma is a lightweight, embeddable vector database designed for rapid prototyping and local development. It supports persistent storage, cloud deployments, and ephemeral in-memory databases with minimal configuration.
Key features
Local persistent storage: File-based storage without external dependencies
Ephemeral mode: In-memory databases for testing
Cloud deployment: Hosted Chroma instances
Hybrid search: Experimental support for dense + sparse vectors
Multi-tenancy: Tenant and database isolation
Metadata filtering: Chroma-native filter syntax
Installation
Chroma requires pysqlite3 for compatibility:
pip install chromadb pysqlite3-binary
Connection
Local persistent client
Default mode for development:
from vectordb.databases.chroma import ChromaVectorDB
db = ChromaVectorDB(
persistent = True ,
path = "./chroma_data" ,
collection_name = "my_collection"
)
Ephemeral client
In-memory for testing:
db = ChromaVectorDB(
persistent = False ,
collection_name = "test_collection"
)
Cloud/remote client
db = ChromaVectorDB(
host = "localhost" ,
port = 8000 ,
ssl = True ,
api_key = "your-api-key" ,
tenant = "default_tenant" ,
database = "default_database"
)
From config file
db = ChromaVectorDB( config_path = "configs/chroma.yaml" )
chroma :
host : ${CHROMA_HOST} # null for local
port : 8000
api_key : ${CHROMA_API_KEY}
tenant : "default_tenant"
database : "default_database"
path : "./chroma_data"
persistent : true
collection_name : "documents"
ssl : true
Collection creation
Basic collection
db.create_collection(
name = "articles" ,
get_or_create = True # Returns existing if present
)
With custom embedding function
from chromadb.utils import embedding_functions
sentence_transformer = embedding_functions.SentenceTransformerEmbeddingFunction(
model_name = "all-MiniLM-L6-v2"
)
db.create_collection(
name = "articles" ,
embedding_function = sentence_transformer
)
db.create_collection(
name = "articles" ,
metadata = { "description" : "Article embeddings" , "version" : "1.0" }
)
Upserting documents
From Haystack documents
from haystack import Document
documents = [
Document(
id = "doc-1" ,
content = "Chroma is lightweight" ,
embedding = [ 0.1 , 0.2 , ... ],
meta = { "category" : "database" , "priority" : 1 }
)
]
db.upsert(documents)
From raw dictionaries
data = {
"ids" : [ "doc-1" , "doc-2" ],
"documents" : [ "First doc" , "Second doc" ],
"embeddings" : [[ 0.1 , 0.2 , ... ], [ 0.3 , 0.4 , ... ]],
"metadatas" : [{ "category" : "tech" }, { "category" : "science" }]
}
db.upsert(data)
Chroma requires flat metadata, so VectorDB automatically flattens nested metadata before upserting. Nested dicts use dot notation: {"user.id": 123} instead of {"user": {"id": 123}}.
Querying
Vector search
results = db.query(
query_embedding = [ 0.1 , 0.2 , ... ],
n_results = 10 ,
include_vectors = False
)
# Convert to Haystack Documents
documents = db.query_to_documents(results)
for doc in documents:
print ( f "Score: { doc.score } , Content: { doc.content } " )
Text search with auto-embedding
Requires a collection created with an embedding function:
results = db.query(
query_text = "machine learning algorithms" ,
n_results = 10
)
documents = db.query_to_documents(results)
Chroma uses its own filter syntax:
# Simple equality
results = db.query(
query_embedding = vec,
where = { "category" : "technology" },
n_results = 10
)
# Comparison operators
results = db.query(
query_embedding = vec,
where = { "priority" : { "$gt" : 5 }},
n_results = 10
)
# Multiple conditions (AND)
results = db.query(
query_embedding = vec,
where = {
"$and" : [
{ "category" : "tech" },
{ "priority" : { "$gte" : 3 }}
]
}
)
# OR conditions
results = db.query(
query_embedding = vec,
where = {
"$or" : [
{ "category" : "tech" },
{ "category" : "science" }
]
}
)
# Set membership
results = db.query(
query_embedding = vec,
where = { "status" : { "$in" : [ "active" , "pending" ]}}
)
Document content filtering
results = db.query(
query_embedding = vec,
where_document = { "$contains" : "machine learning" }
)
Hybrid search (experimental)
Chroma’s hybrid search is available on hosted/cloud instances:
from chromadb.execution.expression import Knn, Search
results = db.search(
query_embeddings = [ 0.1 , 0.2 , ... ],
n_results = 10 ,
where = { "category" : "tech" }
)
Hybrid search requires Chroma 0.6.0+ and hosted/cloud environments. Local instances fall back to standard query().
Multi-tenancy
Tenant and database isolation
# Create tenant-scoped instance
tenant_db = db.with_tenant(
tenant = "company_a" ,
database = "production"
)
tenant_db.create_collection( "docs" )
tenant_db.upsert(documents)
List collections
collections = db.list_collections()
print (collections) # ["articles", "docs", "embeddings"]
Deleting documents
Delete by IDs
db.delete_documents( ids = [ "doc-1" , "doc-2" ])
Delete by filter
db.delete_documents(
where = { "status" : "archived" }
)
Deleting collections
db.delete_collection( "old_collection" )
Converting results
To Haystack Documents
results = db.query(
query_embedding = vec,
n_results = 10 ,
include = [ "metadatas" , "documents" , "distances" , "embeddings" ]
)
documents = db.query_to_documents(results)
for doc in documents:
print (doc.id, doc.content, doc.score, doc.embedding)
Score conversion
Chroma returns distances (0-2 for cosine). VectorDB converts to similarity scores:
# Chroma distance: 0.0 (identical) to 2.0 (opposite)
# Converted score: 1.0 (perfect match) through 0.0 (orthogonal) to -1.0 (opposite)
score = 1.0 - distance
Chroma requires flat metadata. Nested structures are automatically flattened:
# Input
meta = {
"user" : { "id" : 123 , "name" : "Alice" },
"tags" : [ "tech" , "ai" ]
}
# Automatically flattened to
flat_meta = {
"user.id" : 123 ,
"user.name" : "Alice" ,
"tags" : [ "tech" , "ai" ]
}
Manual flattening:
flat = db.flatten_metadata(nested_metadata)
Best practices
Use persistent client for development
Persistent storage survives process restarts:
# Recommended for development
db = ChromaVectorDB(
persistent = True ,
path = "./chroma_data"
)
# Avoid for development (data lost on exit)
db = ChromaVectorDB( persistent = False )
Metadata design for filtering
Tenant isolation strategy
Use tenant/database scoping for multi-tenancy:
# Good: Logical isolation
tenant_a = db.with_tenant( "tenant_a" , "prod" )
tenant_b = db.with_tenant( "tenant_b" , "prod" )
# Each tenant has isolated collections
tenant_a.create_collection( "docs" )
tenant_b.create_collection( "docs" )
Use get_or_create=True for idempotent collection creation:
# Safe to call multiple times
db.create_collection(
"articles" ,
get_or_create = True
)
# Raises error if exists
db.create_collection(
"articles" ,
get_or_create = False
)
Limitations
Chroma 1.4+ requires each metadata entry to be either a non-empty dict or None, so VectorDB replaces empty dicts with a placeholder entry:
# Handled automatically by VectorDB
metadatas = [{}, { "key" : "value" }]
# Becomes: [{"_": "_"}, {"key": "value"}]
Hybrid search availability
Hybrid search requires:
Chroma 0.6.0+
Hosted/cloud environment
If these requirements are not met (for example, on local instances), search() falls back to a standard query().
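Nested dicts are flattened and uniformly typed lists are kept as lists; other complex values, such as mixed-type lists, are converted to strings: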
meta = { "data" : { "nested" : "value" }} # Flattened
meta = { "data" : [ 1 , 2 , 3 ]} # Kept as list if uniform types
meta = { "data" : [ 1 , "two" , 3.0 ]} # Converted to string: "[1, 'two', 3.0]"
Error handling
try :
db.create_collection( "articles" )
db.upsert(documents)
except ValueError as e:
print ( f "Configuration error: { e } " )
except ConnectionError as e:
print ( f "Chroma connection failed: { e } " )
except Exception as e:
print ( f "Unexpected error: { e } " )
Source reference
Implementation: src/vectordb/databases/chroma.py
Key classes and methods:
ChromaVectorDB.__init__(): src/vectordb/databases/chroma.py:75
create_collection(): src/vectordb/databases/chroma.py:220
upsert(): src/vectordb/databases/chroma.py:269
query(): src/vectordb/databases/chroma.py:329
search(): src/vectordb/databases/chroma.py:435
flatten_metadata(): src/vectordb/databases/chroma.py:593