The Search Thing indexes images by generating structured AI summaries using a vision model, then creating searchable embeddings. This enables natural language search over visual content.

How image indexing works

Images are processed through a vision model that analyzes visual content and generates structured metadata:
  1. Read image bytes and convert to base64 data URI
  2. Send to vision model for analysis
  3. Extract structured summary with objects, actions, setting, OCR, and quality
  4. Create embeddings from the summary text
  5. Store in HelixDB for semantic search
Image indexing uses Groq’s vision-enabled models to understand image content without storing the actual image data in the search index.

Image summary generation

The vision model generates detailed, structured summaries:
# From backend/indexer/image_indexer.py:263
async def generate_summary(
    image_data_uri: str,
) -> tuple[dict, str]:
    client = get_groq_client()
    prompt = (
        "You are an expert vision assistant. Provide a concise JSON summary for "
        "the provided image. Respond with JSON only (no code fences). Use the schema: "
        '{"summary": "<1-2 sentences>", "objects": ["..."], "actions": ["..."], '
        '"setting": "<location or scene>", "ocr": "<visible text or empty>", "quality": "<good|low>"}'
    )
    
    response = client.chat.completions.create(
        model="meta-llama/llama-4-maverick-17b-128e-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_data_uri}},
            ],
        }],
        max_tokens=500,
        temperature=0.2,
    )

    # Parse the model's JSON reply into a dict, then build the flat embedding text
    content = response.choices[0].message.content or ""
    summary = _normalize_summary_content(content)
    return summary, _build_embedding_text(summary)
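A minimal usage sketch, assuming GROQ_API_KEY is configured and photo.jpg exists (both placeholders):

# Hypothetical: summarize one local image
from pathlib import Path

data_uri = _bytes_to_data_uri(Path("photo.jpg").read_bytes(), "jpeg")
summary, embedding_text = await generate_summary(data_uri)
print(summary["objects"])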

Structured metadata

Summaries include multiple dimensions of information:
# From backend/indexer/image_indexer.py:182
def _normalize_summary_content(content_str: str) -> dict:
    parsed_obj = json.loads(content_str)
    
    return {
        "summary": parsed_obj.get("summary"),        # Natural language description
        "objects": parsed_obj.get("objects", []),    # Detected objects
        "actions": parsed_obj.get("actions", []),    # Activities or actions
        "setting": parsed_obj.get("setting", ""),    # Location or scene type
        "ocr": parsed_obj.get("ocr", ""),            # Visible text in image
        "quality": parsed_obj.get("quality", ""),    # Image quality assessment
    }
A parsed summary might look like:
{
  "summary": "A laptop computer on a wooden desk with a cup of coffee and notebook.",
  "objects": ["laptop", "desk", "coffee cup", "notebook", "pen"],
  "actions": ["working", "studying"],
  "setting": "home office or workspace",
  "ocr": "MacBook Pro",
  "quality": "good"
}
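Models sometimes wrap replies in Markdown code fences despite the "JSON only" instruction. A defensive parsing variant (a sketch, not the project's actual parser) strips fences before decoding:

def _parse_model_json(content_str: str) -> dict:
    # Strip surrounding ``` fences and an optional "json" language tag
    text = content_str.strip()
    if text.startswith("```"):
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:].strip()
    return json.loads(text)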

Data URI conversion

Images are converted to base64-encoded data URIs for API transmission:
# From backend/indexer/image_indexer.py:12
def _bytes_to_data_uri(image_bytes: bytes, mime_hint: str = "jpeg") -> str:
    encoded = base64.b64encode(image_bytes).decode("utf-8")
    return f"data:image/{mime_hint};base64,{encoded}"

MIME type detection

# From backend/indexer/image_indexer.py:17
def _mime_hint_from_path(path: Path) -> str:
    suffix = path.suffix.lower()
    mapping = {
        ".jpg": "jpeg",
        ".jpeg": "jpeg",
        ".png": "png",
        ".webp": "webp",
        ".gif": "gif",
        ".bmp": "bmp",
        ".tiff": "tiff",
    }
    return mapping.get(suffix, "jpeg")
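The same hint could come from the standard library's mimetypes module; a sketch of an equivalent helper (not the project's code):

import mimetypes

def _mime_hint_stdlib(path: Path) -> str:
    mime, _ = mimetypes.guess_type(path.name)
    if mime and mime.startswith("image/"):
        return mime.removeprefix("image/")
    return "jpeg"  # same fallback as the table above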

Building embedding text

The structured summary is converted to a flat text representation for embedding:
# From backend/indexer/image_indexer.py:240
def _build_embedding_text(summary: dict) -> str:
    parts: list[str] = []
    
    def add(label: str, value: object) -> None:
        if value is None:
            return
        if isinstance(value, list):
            value = ", ".join([str(v) for v in value if v])
        if isinstance(value, str):
            value = value.strip()
        if value:
            parts.append(f"{label}: {value}")
    
    add("summary", summary.get("summary"))
    add("objects", summary.get("objects"))
    add("actions", summary.get("actions"))
    add("setting", summary.get("setting"))
    add("ocr", summary.get("ocr"))
    add("quality", summary.get("quality"))
    
    return " | ".join(parts)
The resulting embedding text reads:
summary: A laptop computer on a wooden desk with a cup of coffee and notebook. | objects: laptop, desk, coffee cup, notebook, pen | actions: working, studying | setting: home office or workspace | ocr: MacBook Pro | quality: good
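Empty or missing fields are skipped by add(), so sparse summaries produce clean text:

>>> _build_embedding_text({"summary": "A cat.", "objects": [], "ocr": ""})
'summary: A cat.'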

Complete indexing process

# From backend/indexer/image_indexer.py:31
async def img_indexer(
    file_paths: List[str] | str,
) -> List[dict]:
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    
    results: List[dict] = []
    for path in file_paths:
        p = Path(path)
        if not p.exists():
            results.append({
                "path": path,
                "image_id": None,
                "indexed": False,
                "error": "Path not found",
            })
            continue
        
        # Step 1: Read image bytes
        image_bytes = p.read_bytes()
        mime_hint = _mime_hint_from_path(p)
        
        # Step 2: Compute content hash (deduplication)
        content_hash = compute_bytes_hash(image_bytes)
        
        existing = await get_image_by_hash(content_hash)
        if existing:
            results.append({
                "path": path,
                "image_id": existing.get("image_id"),
                "indexed": False,
                "error": "Duplicate content hash",
            })
            continue
        
        # Step 3: Generate summary
        data_uri = _bytes_to_data_uri(image_bytes, mime_hint)
        summary_payload, embedding_text = await generate_summary(data_uri)
        
        # Step 4: Create image node and embeddings
        image_id = uuid.uuid4().hex
        await create_img(image_id, content_hash, json.dumps(summary_payload), path=path)
        await create_img_embeddings(image_id, embedding_text, path=path)
        
        results.append({"path": path, "image_id": image_id, "indexed": True})
Image deduplication uses content hashing, so identical images at different paths are only indexed once.
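compute_bytes_hash is imported from elsewhere in the codebase and is not shown here; a minimal SHA-256 stand-in (the actual hash function may differ):

import hashlib

def compute_bytes_hash(data: bytes) -> str:
    # Hex digest over the raw file bytes, so identical files hash identically
    return hashlib.sha256(data).hexdigest()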

Creating image nodes

# From backend/indexer/image_indexer.py:123
async def create_img(image_id: str, content_hash: str, content: str, path: str) -> str:
    image_params = {
        "image_id": image_id,
        "content_hash": content_hash,
        "content": content,  # JSON summary
        "path": path,
    }
    
    def _query() -> str:
        helix_client = get_helix_client()
        response = helix_client.query("CreateImage", image_params)
        return json.dumps(response)
    
    return await asyncio.to_thread(_query)
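Because the HelixDB client is synchronous, each query is pushed to a worker thread with asyncio.to_thread. That also allows several nodes to be created concurrently; a sketch, where items is a hypothetical prepared batch:

await asyncio.gather(*(
    create_img(image_id, content_hash, summary_json, path=path)
    for image_id, content_hash, summary_json, path in items
))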

Creating embeddings

# From backend/indexer/image_indexer.py:167
async def create_img_embeddings(image_id: str, content: str, path: str) -> str:
    image_params = {"image_id": image_id, "content": content, "path": path}
    
    def _query() -> str:
        helix_client = get_helix_client()
        response = helix_client.query(
            "CreateImageEmbeddings",
            image_params,
        )
        return json.dumps(response)
    
    return await asyncio.to_thread(_query)

Querying by hash

# From backend/indexer/image_indexer.py:141
async def get_image_by_hash(content_hash: str) -> dict | None:
    def _query() -> list:
        helix_client = get_helix_client()
        return helix_client.query("GetImageByHash", {"content_hash": content_hash})
    
    response = await asyncio.to_thread(_query)
    if isinstance(response, str):
        try:
            response = json.loads(response)
        except json.JSONDecodeError:
            return None
    
    if isinstance(response, dict):
        image = response.get("image")
        if isinstance(image, list):
            return image[0] if image else None
        if isinstance(image, dict):
            return image
        return None
    
    if isinstance(response, list):
        return response[0] if response else None
    return None
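This mirrors the dedup check inside img_indexer; for example (photo.jpg is a placeholder):

content_hash = compute_bytes_hash(Path("photo.jpg").read_bytes())
existing = await get_image_by_hash(content_hash)
if existing:
    print("Already indexed as", existing.get("image_id"))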

Searching images

# From backend/search.py:135
async def search_images(search_query: str, limit: int = 10) -> dict:
    """
    Search across image embeddings and return file-style results.
    """
    search_params = {"query": search_query, "limit": limit}
    helix_response = get_helix_client().query("SearchImageEmbeddings", search_params)

    results: list[dict] = []
    top_contents: list[str] = []

    # Helix may return the matches as a bare list; guard against other shapes
    entries = helix_response if isinstance(helix_response, list) else []

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        results.append({
            "image_id": entry.get("image_id"),
            "content": entry.get("content"),
            "path": entry.get("path"),
        })
        content = entry.get("content")
        if isinstance(content, str) and content:
            top_contents.append(content)
    
    return {"response": helix_response, "results": results, "query": search_query}

Search examples

results = await search_images("laptop on desk with coffee")
The vision model can detect text in images (OCR), so you can search for images containing specific text content.
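For instance, to find screenshots by the text they contain (hypothetical query; content holds the JSON summary stored at index time):

hits = await search_images("screenshot containing 'connection refused'", limit=5)
for hit in hits["results"]:
    summary = json.loads(hit["content"])
    print(hit["path"], "->", summary.get("ocr"))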

Example usage

from backend.indexer.image_indexer import img_indexer

# Index single image
results = await img_indexer("/path/to/image.jpg")

# Index multiple images
results = await img_indexer([
    "/path/to/image1.jpg",
    "/path/to/image2.png",
    "/path/to/screenshots/error.png"
])

for result in results:
    if result["indexed"]:
        print(f"✓ {result['path']} (ID: {result['image_id']})")
    else:
        print(f"✗ {result['path']}{result.get('error', 'Unknown error')}")

Rust helper functions

The Rust module provides utilities for image processing:
// From src/img.rs:8
#[pyfunction]
pub fn get_base64_bytes(image_path: String) -> PyResult<String> {
    // Decode the image file, then base64-encode the raw pixel buffer
    let img = ImageReader::open(&image_path)?
        .decode()
        .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
    let encoded = STANDARD.encode(img.into_bytes());
    Ok(encoded)
}
While a Rust base64 helper exists, the Python indexer encodes the raw file bytes directly with Python's base64 module. Note that the Rust function encodes the decoded pixel buffer rather than the original file, which is not the payload the Groq data URI expects.
