The query module provides functions to build expressive search queries combining semantic similarity, keyword matching, and filters.
Overview
TopK queries are built using a fluent API:
from topk_sdk.query import select, field, filter, fn
results = client.collection("books").query(
select(
"title", "year",
similarity=fn.semantic_similarity("title", "classic novel")
)
.filter(field("year") > 1900)
.topk(field("similarity"), 10)
)
Query Stages
select()
Create a query with a select stage. Specify which fields to return and compute additional expressions.
from topk_sdk.query import select, field, fn
# Select specific fields
query = select("title", "author")
# Select fields and compute expressions
query = select(
"title",
year=field("published_year"),
similarity=fn.semantic_similarity("title", "animal")
)
Field names to include in the results.
**kwargs
LogicalExpr | FunctionExpr
Named expressions to compute. The name becomes the field name in results.
A new query with the select stage.
filter()
Create a query with a filter stage. Only documents matching the filter are returned.
from topk_sdk.query import filter, field
# Filter by field value
query = filter(field("published_year") > 1980)
# Combine filters
query = filter(
(field("year") >= 1900) & (field("year") <= 2000)
)
expr
LogicalExpr | TextExpr
required
The filter expression. Documents must satisfy this expression to be included.
A new query with the filter stage.
Query.filter()
Add a filter stage to an existing query.
query = select("title").filter(field("year") > 2000)
expr
LogicalExpr | TextExpr
required
The filter expression.
The query with the filter stage added.
Query.sort()
Add a sort stage to sort results by an expression.
from topk_sdk.query import select, field
# Sort ascending (default)
query = select("title", "year").sort(field("year"))
# Sort descending
query = select("title", "year").sort(field("year"), asc=False)
The expression to sort by.
Sort in ascending order if True, descending if False.
The query with the sort stage added.
Query.topk()
Add a top-k stage to return the top k results by a scoring expression.
from topk_sdk.query import select, field, fn
query = select(
"title",
score=fn.semantic_similarity("title", "dystopian")
).topk(field("score"), 10)
The scoring expression. Results are ranked by this value.
The number of top results to return.
If True, return k results with lowest scores. If False (default), return highest scores.
The query with the top-k stage added.
Query.limit()
Add a limit stage to restrict the number of results.
query = select("title").limit(10)
The maximum number of results to return.
The query with the limit stage added.
Query.count()
Add a count stage to return the count of matching documents instead of the documents themselves.
count_query = filter(field("year") > 2000).count()
The query with the count stage added.
Query.rerank()
Add a rerank stage to re-score results using a reranking model.
query = select("title", "description").rerank(
model="cohere/rerank-english-v3.0",
query="best science fiction books",
fields=["title", "description"],
topk_multiple=3
)
The reranking model to use.
The query text to rerank against.
fields
Sequence[str]
default: []
Fields to consider during reranking.
Retrieve this multiple of k candidates before reranking.
The query with the rerank stage added.
Field References
field()
Reference a field from the document.
from topk_sdk.query import field, filter, select, fn
# Use in filters
filter(field("year") > 2000)
# Use in select expressions
select("title", publication_year=field("year"))
# Use in topk
select("title", score=fn.semantic_similarity("title", "novel")).topk(field("score"), 10)
The name of the field to reference.
An expression referencing the field.
literal()
Create a literal expression from a value.
from topk_sdk.query import literal, field
# Usually not needed, values are auto-converted
filter(field("year").eq(literal(2000)))
# This is equivalent:
filter(field("year") == 2000)
Logical Expressions
LogicalExpr objects support rich operations for building filters and computed fields.
Comparison Operators
from topk_sdk.query import field
# Equality
field("genre").eq("fiction") # or field("genre") == "fiction"
field("genre").ne("non-fiction") # or field("genre") != "non-fiction"
# Ordering
field("year").lt(2000) # or field("year") < 2000
field("year").lte(2000) # or field("year") <= 2000
field("year").gt(2000) # or field("year") > 2000
field("year").gte(2000) # or field("year") >= 2000
Arithmetic Operators
from topk_sdk.query import field, select
# Addition, subtraction, multiplication, division
select(
"title",
discounted_price=field("price") * 0.8,
price_diff=field("price") - field("cost"),
total=field("price") + field("tax")
)
Logical Operators
from topk_sdk.query import field, filter
# AND
filter(field("year").gt(1900) & field("year").lt(2000))
# OR
filter(field("genre").eq("fiction") | field("genre").eq("mystery"))
String Operators
from topk_sdk.query import field, filter
# Check if string starts with prefix
filter(field("title").starts_with("The"))
# Check if list/string contains value
filter(field("tags").contains("bestseller"))
# Check if value is in list
filter(field("genre").in_(["fiction", "mystery", "thriller"]))
# Regular expression matching
filter(field("title").regexp_match(r"^The.*", flags="i"))
Keyword Matching
from topk_sdk.query import field, filter
# Match any term (OR)
filter(field("title").match_any(["science", "fiction"]))
# Match all terms (AND)
filter(field("title").match_all(["science", "fiction"]))
Null Handling
from topk_sdk.query import field, filter
# Check for null
filter(field("description").is_null())
filter(field("description").is_not_null())
# Coalesce null values
select("title", rating=field("rating").coalesce(0.0))
Math Functions
from topk_sdk.query import field, select
select(
"title",
abs_diff=field("price").sub(field("cost")).abs(),
sqrt_price=field("price").sqrt(),
squared=field("rating").square(),
log_price=field("price").ln(),
exp_value=field("value").exp()
)
Conditional Expressions
from topk_sdk.query import field, select, fn
# Choose between two values based on condition
select(
"title",
status=field("in_stock").choose("Available", "Out of Stock")
)
# Boost scoring based on condition
select(
"title",
score=fn.semantic_similarity("title", "novel").boost(
field("is_bestseller"),
2.0 # 2x boost for bestsellers
)
)
# Min/max
select(
"title",
min_value=field("price").min(field("msrp")),
max_value=field("rating").max(4.5)
)
Keyword Search
match()
Perform keyword search for documents containing specific keywords or phrases. Use with fields that have a keyword_index().
from topk_sdk.query import match, field, filter, select, fn
# Simple keyword match
query = filter(match("science"))
# Match on specific field
query = filter(match("fiction", field="genre"))
# Match with weight
query = filter(match("bestseller", weight=2.0))
# Match all terms (AND)
query = filter(match("science fiction", all=True))
# Combine with BM25 scoring
query = select(
"title",
text_score=fn.bm25_score()
).filter(
match("dystopian") | match("future")
).topk(field("text_score"), 10)
The keyword or phrase to search for.
The specific field to search in. If not provided, searches all keyword-indexed fields.
Weight for this term in scoring.
If True, all tokens must match (AND). If False, any token can match (OR).
A keyword match expression.
Logical Combinators
all()
Create a logical AND expression combining multiple conditions.
from topk_sdk.query import field, all, filter
query = filter(all([
field("published_year") >= 1900,
field("published_year") <= 2000,
field("title").is_not_null()
]))
exprs
Sequence[LogicalExpr]
required
List of expressions to combine with AND.
A logical AND expression.
any()
Create a logical OR expression combining multiple conditions.
from topk_sdk.query import field, any, filter
query = filter(any([
field("genre") == "fiction",
field("genre") == "mystery",
field("genre") == "thriller"
]))
exprs
Sequence[LogicalExpr]
required
List of expressions to combine with OR.
not_()
Negate a logical expression.
from topk_sdk.query import field, not_, filter
query = filter(not_(field("title").contains("Catcher")))
The expression to negate.
Function Expressions
The fn class provides functions for semantic similarity, vector distance, and keyword scoring.
fn.semantic_similarity()
Calculate semantic similarity between a field and a query string. Requires a semantic_index() on the field.
from topk_sdk.query import select, field, fn
results = client.collection("books").query(
select(
"title",
similarity=fn.semantic_similarity("title", "animal story")
).topk(field("similarity"), 10)
)
The field name with a semantic index.
The query text to compare against.
A semantic similarity scoring function.
fn.vector_distance()
Calculate vector distance between a field and a query vector. Requires a vector_index() on the field.
from topk_sdk.query import select, field, fn
results = client.collection("books").query(
select(
"title",
distance=fn.vector_distance(
"title_embedding",
[0.1, 0.2, 0.3, ...] # Your embedding vector
)
).topk(field("distance"), 10)
)
The field name containing vectors with a vector index.
vector
list[int] | list[float] | dict[int, float] | dict[int, int] | SparseVector | List
required
The query vector. Can be:
- Dense vector:
[0.1, 0.2, 0.3, ...]
- Sparse vector:
{0: 0.5, 10: 0.8, 50: 0.3}
- SparseVector or List instance from topk_sdk.data
Skip the refinement step for approximate search.
A vector distance scoring function.
fn.multi_vector_distance()
Calculate multi-vector distance between a matrix field and a query matrix. Requires a multi_vector_index() on the field.
from topk_sdk.query import select, field, fn
results = client.collection("documents").query(
select(
"title",
distance=fn.multi_vector_distance(
"colbert_embeddings",
[[0.1, 0.2, ...], [0.3, 0.4, ...], ...],
candidates=100
)
).topk(field("distance"), 10)
)
The field name containing matrices with a multi-vector index.
matrix
Matrix | ndarray | list[list[float]] | list[list[int]]
required
The query matrix. Can be:
- List of lists:
[[0.1, 0.2], [0.3, 0.4]]
- Numpy array:
np.array([[0.1, 0.2], [0.3, 0.4]])
- Matrix instance from topk_sdk.data
Limit the number of candidate vectors considered during search.
A multi-vector distance scoring function.
fn.bm25_score()
Calculate BM25 score for keyword search. Use with match() filters.
from topk_sdk.query import select, field, fn, match
results = client.collection("books").query(
select(
"title",
text_score=fn.bm25_score()
)
.filter(match("dystopian") | match("future"))
.topk(field("text_score"), 10)
)
Helper Functions
min()
Compute the minimum of two expressions.
from topk_sdk.query import field, min, select
query = select(
"title",
best_price=min(field("price"), field("sale_price"))
)
left
int | float | str | LogicalExpr
required
First value to compare.
right
int | float | str | LogicalExpr
required
Second value to compare.
max()
Compute the maximum of two expressions.
from topk_sdk.query import field, max, select
query = select(
"title",
max_rating=max(field("user_rating"), field("critic_rating"))
)
left
int | float | str | LogicalExpr
required
First value to compare.
right
int | float | str | LogicalExpr
required
Second value to compare.
abs()
Compute the absolute value of an expression.
from topk_sdk.query import field, abs, filter
query = filter(abs(field("temperature")) > 30)
The expression to compute the absolute value of.
Complete Examples
Semantic Search
from topk_sdk.query import select, field, fn
results = client.collection("books").query(
select(
"title", "author", "year",
similarity=fn.semantic_similarity("title", "space exploration")
)
.filter(field("year") > 1950)
.topk(field("similarity"), 10)
)
Hybrid Search (Semantic + Keyword)
from topk_sdk.query import select, field, fn, match
results = client.collection("books").query(
select(
"title",
semantic_score=fn.semantic_similarity("title", "artificial intelligence"),
keyword_score=fn.bm25_score(),
hybrid_score=fn.semantic_similarity("title", "AI") + fn.bm25_score()
)
.filter(match("technology") | match("computer"))
.topk(field("hybrid_score"), 10)
)
Vector Search with Filters
from topk_sdk.query import select, field, fn
results = client.collection("products").query(
select(
"name", "price", "category",
similarity=fn.vector_distance("image_embedding", user_query_vector)
)
.filter(
(field("price") < 100) &
(field("in_stock") == True) &
field("category").in_(["electronics", "gadgets"])
)
.topk(field("similarity"), 20)
)
Multi-Vector Search (ColBERT)
from topk_sdk.query import select, field, fn
import numpy as np
query_matrix = np.random.rand(10, 128).astype(np.float32)
results = client.collection("documents").query(
select(
"title", "content",
relevance=fn.multi_vector_distance(
"colbert_embeddings",
query_matrix,
candidates=500
)
).topk(field("relevance"), 10)
)