Skip to main content
While Convex handles real-time updates for the live demo, JARVIS uses MongoDB Atlas for persistent storage of:
  • Raw capture images and embeddings
  • Archived dossiers for cross-session memory
  • Face embedding vectors for similarity search
  • Historical intelligence data
MongoDB is optional in the current implementation. The system works with Convex alone for real-time demo purposes. MongoDB enables persistent memory and vector search for production deployments.

Architecture

JARVIS uses a DatabaseGateway protocol to abstract storage operations:
backend/db/__init__.py
from typing import Protocol, Any

class DatabaseGateway(Protocol):
    """Structural interface for JARVIS storage backends.

    Implementations (Convex, in-memory, MongoDB) satisfy this protocol by
    duck typing — no explicit inheritance is required. All mutating and
    reading operations are async.
    """

    # True when the backend has everything it needs to accept operations
    # (e.g. a connection string is present).
    @property
    def configured(self) -> bool: ...

    async def store_person(self, person_id: str, data: dict[str, Any]) -> str:
        """Persist a person record."""
        ...

    async def get_person(self, person_id: str) -> dict[str, Any] | None:
        """Retrieve a person record by ID."""
        ...

    async def update_person(self, person_id: str, data: dict[str, Any]) -> None:
        """Merge new data into existing person record."""
        ...

    async def store_capture(self, capture_id: str, metadata: dict[str, Any]) -> str:
        """Persist capture metadata."""
        ...
Three implementations exist:
  1. ConvexGateway — Real-time updates via Convex HTTP API
  2. InMemoryDatabaseGateway — Local testing without external dependencies
  3. MongoDBGateway — (Planned) Persistent storage with vector search

Configuration

Set the MongoDB connection string in your environment:
.env
MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/jarvis?retryWrites=true&w=majority
The backend automatically detects MongoDB availability:
backend/config.py
from pydantic import Field
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    """Application settings loaded from environment variables / .env."""

    # Atlas connection string; None (unset) disables MongoDB-backed features.
    mongodb_uri: str | None = Field(default=None, alias="MONGODB_URI")
    
    def service_flags(self) -> dict[str, bool]:
        """Map each optional backing service to whether it is configured."""
        return {
            # bool() also treats an empty-string URI as "not configured".
            "mongodb": bool(self.mongodb_uri),
            # ... other services
        }
Check MongoDB status:
curl http://localhost:8000/api/services
Response:
[
  {
    "name": "mongodb",
    "configured": true,
    "notes": "Persistent raw captures and dossiers"
  }
]

Data Models

Persons Collection

Stores enriched person records with embeddings:
{
  "_id": ObjectId("..."),
  "person_id": "person_abc123",
  "name": "Alice Smith",
  "photo_url": "https://storage.example.com/faces/abc123.jpg",
  "confidence": 0.95,
  "embedding": [0.123, -0.456, ...],  # 512-dim ArcFace vector
  "status": "enriched",
  "dossier": {
    "summary": "AI researcher at OpenAI. Stanford PhD.",
    "title": "Research Scientist",
    "company": "OpenAI",
    "work_history": [
      {
        "role": "Research Scientist",
        "company": "OpenAI",
        "period": "2022-present"
      }
    ],
    "education": [
      {
        "school": "Stanford University",
        "degree": "PhD Computer Science"
      }
    ],
    "social_profiles": {
      "linkedin": "https://linkedin.com/in/alicesmith",
      "twitter": "https://twitter.com/alicesmith",
      "github": "https://github.com/alicesmith"
    },
    "notable_activity": [
      "Published 15 papers on transformer architectures",
      "Co-authored GPT-4 technical report"
    ],
    "conversation_hooks": [
      "Ask about recent work on multimodal models",
      "Discuss Stanford AI Lab research"
    ],
    "risk_flags": []
  },
  "created_at": ISODate("2024-01-15T10:30:00Z"),
  "updated_at": ISODate("2024-01-15T10:45:00Z")
}

Captures Collection

Stores raw capture metadata and image references:
{
  "_id": ObjectId("..."),
  "capture_id": "cap_xyz789",
  "image_url": "s3://jarvis-captures/2024/01/15/xyz789.jpg",
  "source": "glasses",
  "timestamp": ISODate("2024-01-15T10:30:00Z"),
  "status": "identified",
  "person_id": "person_abc123",
  "detection_metadata": {
    "face_count": 1,
    "bbox": [120, 80, 300, 260],
    "landmarks": {...}
  }
}
Vector Search

MongoDB Atlas supports vector search for finding similar faces using embeddings.

Create Vector Search Index

// In MongoDB Atlas UI or via API
{
  "name": "face_embedding_index",
  "type": "vectorSearch",
  "definition": {
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 512,
        "similarity": "cosine"
      }
    ]
  }
}

Similarity Search Example

from motor.motor_asyncio import AsyncIOMotorClient

class MongoDBGateway:
    """MongoDB-backed storage gateway (Motor async driver).

    Provides persistent person storage and Atlas Vector Search over face
    embeddings in the ``jarvis`` database.
    """

    def __init__(self, uri: str):
        # Motor client is lazy: no I/O happens until the first operation.
        self.client = AsyncIOMotorClient(uri)
        self.db = self.client.jarvis
        self.persons = self.db.persons

    async def find_similar_faces(
        self,
        embedding: list[float],
        limit: int = 5,
        min_score: float = 0.7,
        num_candidates: int = 100,
    ) -> list[dict]:
        """Find persons with similar face embeddings.

        Args:
            embedding: Query vector (512-dim ArcFace, cosine similarity).
            limit: Maximum number of matches to return, best first.
            min_score: Minimum ``vectorSearchScore`` to keep a match.
            num_candidates: ANN candidate pool size. Atlas requires
                ``numCandidates >= limit``, so it is clamped up to
                ``limit`` when a larger ``limit`` is requested.

        Returns:
            Up to ``limit`` projected person documents.
        """
        pipeline = [
            {
                "$vectorSearch": {
                    "index": "face_embedding_index",
                    "path": "embedding",
                    "queryVector": embedding,
                    # Atlas rejects numCandidates < limit; previously this was
                    # hard-coded to 100 and broke for limit > 100.
                    "numCandidates": max(num_candidates, limit),
                    "limit": limit
                }
            },
            {
                "$project": {
                    "person_id": 1,
                    "name": 1,
                    "photo_url": 1,
                    "confidence": 1,
                    # The search score is only exposed via $meta in a stage
                    # that follows $vectorSearch.
                    "score": {"$meta": "vectorSearchScore"}
                }
            },
            {
                # Drop weak matches below the caller's threshold.
                "$match": {
                    "score": {"$gte": min_score}
                }
            }
        ]

        results = await self.persons.aggregate(pipeline).to_list(length=limit)
        return results

Usage in Pipeline

backend/identification/pipeline.py
async def identify_person(
    face_image: bytes,
    embedder: FaceEmbedder,
    db: MongoDBGateway
) -> dict | None:
    """Identify the person in *face_image* via embedding similarity.

    Embeds the face, queries MongoDB for the nearest stored embeddings,
    and returns the best-scoring match — or None when nothing clears the
    0.85 confidence threshold.
    """
    embedding = embedder.embed_from_bytes(face_image)

    matches = await db.find_similar_faces(
        embedding=embedding,
        limit=3,
        min_score=0.85  # High confidence threshold
    )

    # Guard clause: no sufficiently similar face on record.
    if not matches:
        return None

    best_match = matches[0]
    logger.info(
        f"Found match: {best_match['name']} "
        f"(score: {best_match['score']:.2f})"
    )
    return best_match

Dual-Write Strategy

For production, JARVIS uses a dual-write pattern:
  1. Write to Convex — Real-time UI updates
  2. Write to MongoDB — Persistent storage and vector search
backend/db/dual_gateway.py
class DualGateway:
    """Writes to both Convex (real-time) and MongoDB (persistent).

    Convex is the source of truth for the live UI, so a Convex failure is
    raised to the caller; a MongoDB failure is only logged.
    """

    # Annotations are strings (forward refs) so this module stays importable
    # regardless of gateway import order.
    def __init__(self, convex: "ConvexGateway", mongo: "MongoDBGateway"):
        self.convex = convex
        self.mongo = mongo

    async def store_person(self, person_id: str, data: dict) -> str:
        """Persist *data* for *person_id* to both backends in parallel.

        Returns:
            The Convex write result.

        Raises:
            Exception: Whatever the Convex write raised. MongoDB errors
                are swallowed after a warning.
        """
        # Write to both in parallel; collect failures instead of cancelling.
        results = await asyncio.gather(
            self.convex.store_person(person_id, data),
            self.mongo.store_person(person_id, data),
            return_exceptions=True,
        )

        # Log any MongoDB failures but don't block on them.
        if isinstance(results[1], Exception):
            logger.warning(f"MongoDB write failed: {results[1]}")

        # BUG FIX: with return_exceptions=True a failed Convex write comes
        # back as an exception *object*; it used to be returned as if it were
        # the result string. Re-raise so callers actually see the failure.
        if isinstance(results[0], Exception):
            raise results[0]

        return results[0]  # Convex result

Data Lifecycle

Capture → Storage Flow

Archival Policy

# Archive old captures after 30 days
# Archive old captures after a retention window (default 30 days)
async def archive_old_captures(days: int = 30) -> None:
    """Move captures older than *days* from Convex to MongoDB cold storage.

    Each capture is archived in MongoDB *before* it is deleted from Convex,
    so a mid-run failure can duplicate work but never lose data.
    """
    from datetime import datetime, timedelta, timezone

    # Aware UTC cutoff: a naive local-time cutoff is ambiguous around DST
    # transitions when converted with .timestamp().
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    # Move from Convex to MongoDB cold storage
    old_captures = await convex.query(
        "captures:listAll",
        {"before": cutoff.timestamp()}
    )

    for capture in old_captures:
        # Archive first, then delete — ordering guarantees durability.
        await mongo.archive_capture(capture)
        await convex.delete_capture(capture["_id"])

Performance Considerations

Face embeddings are 512-dimensional float32 vectors (~2KB each). Store them as BSON binary for efficient vector search:
import struct

def encode_embedding(embedding: list[float]) -> bytes:
    """Serialize *embedding* to contiguous native-endian float32 bytes."""
    packer = struct.Struct(f"{len(embedding)}f")
    return packer.pack(*embedding)

def decode_embedding(data: bytes) -> list[float]:
    """Deserialize native-endian float32 bytes back into a list of floats."""
    # iter_unpack yields one 1-tuple per float; unpack each as we go.
    return [value for (value,) in struct.iter_unpack("f", data)]
Create indexes for common queries:
// Compound index for status queries
db.persons.createIndex({ "status": 1, "updated_at": -1 })

// Text search on names
db.persons.createIndex({ "name": "text" })

// Capture source filtering
db.captures.createIndex({ "source": 1, "timestamp": -1 })
Configure Motor for optimal async performance:
# One shared client per process: the driver pools connections internally,
# so it should be created once and reused across requests.
client = AsyncIOMotorClient(
    mongodb_uri,
    maxPoolSize=50,  # cap on pooled connections
    minPoolSize=10,  # keep warm connections for burst traffic
    maxIdleTimeMS=30000,  # recycle connections idle longer than 30 s
    serverSelectionTimeoutMS=5000  # fail fast (5 s) when the cluster is unreachable
)

Migration Scripts

Export from Convex to MongoDB

scripts/export_to_mongo.py
import asyncio
from backend.db.convex_client import ConvexGateway
from backend.db.mongo_client import MongoDBGateway
from backend.config import get_settings

async def export_persons():
    """One-shot migration: copy every person record from Convex to MongoDB."""
    settings = get_settings()

    # Fail fast with a clear message instead of a cryptic driver error later.
    if not settings.mongodb_uri:
        raise SystemExit("MONGODB_URI is not configured; nothing to export to.")

    convex = ConvexGateway(settings)
    mongo = MongoDBGateway(settings.mongodb_uri)

    # NOTE(review): _query is a private ConvexGateway method — prefer a public
    # listing API if one exists.
    persons = await convex._query("persons:listAll", {})

    for person in persons:
        await mongo.store_person(person["_id"], person)
        # Records that haven't been enriched yet may lack "name"; fall back
        # to the ID rather than crashing mid-export.
        print(f"Exported {person.get('name', person['_id'])}")

if __name__ == "__main__":
    asyncio.run(export_persons())

Next: Supermemory

Learn about cross-session agent memory using Supermemory

Build docs developers (and LLMs) love