Skip to main content

Overview

Implements semantic search over world lore and conversation memory using ChromaDB vector database with sentence-transformers embeddings. Enables context-aware retrieval of relevant information during chat.

RagManager Class

Initialization

class RagManager:
    """Semantic search over world lore and session memory backed by ChromaDB.

    Uses a persistent on-disk ChromaDB client and local
    sentence-transformers embeddings (no external API calls).
    """

    def __init__(self, db_path: str = "./chroma_db"):
        """Create the persistent client and embedding function.

        Args:
            db_path: Path to the ChromaDB persistent storage directory.
        """
        self.db_path = db_path

        # Ensure directory exists
        os.makedirs(db_path, exist_ok=True)

        # Persistent client stores data to disk
        self.client = chromadb.PersistentClient(path=db_path)

        # Use sentence-transformers for local embeddings
        model_name = "all-MiniLM-L6-v2"
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

        # FIX: the lazy `collection` / `memory_collection` properties compare
        # these attributes against None; without initializing them here the
        # first property access raises AttributeError instead of creating
        # the collection.
        self._collection = None
        self._memory_collection = None
db_path
string
Path to ChromaDB persistent storage directory (default: "./chroma_db")
Embedding Model: all-MiniLM-L6-v2
  • 384-dimensional embeddings
  • Fast inference (local, no API calls)
  • Good balance of speed and quality
  • 22.3M parameters

Collections

Lore Collection

@property
def collection(self) -> Collection:
    """Lazily create and cache the "world_lore" collection (cosine HNSW)."""
    cached = self._collection
    if cached is None:
        cached = self.client.get_or_create_collection(
            name="world_lore",
            metadata={"hnsw:space": "cosine"},
            embedding_function=self.embedding_function,
        )
        self._collection = cached
    return cached
Stores world lore chunks for semantic retrieval.
name
string
Collection name: "world_lore"
embedding_function
EmbeddingFunction
SentenceTransformer model for generating embeddings
metadata.hnsw:space
string
Distance metric: "cosine" (cosine distance between embedding vectors; smaller distance means more similar meaning)

Memory Collection

@property
def memory_collection(self) -> Collection:
    """Lazily create and cache the "session_memory" collection (cosine HNSW)."""
    cached = self._memory_collection
    if cached is None:
        cached = self.client.get_or_create_collection(
            name="session_memory",
            metadata={"hnsw:space": "cosine"},
            embedding_function=self.embedding_function,
        )
        self._memory_collection = cached
    return cached
Stores conversation history for episodic memory retrieval.

Lore Management

add_lore()

def add_lore(
    self, world_id: str, lore_id: str, text: str, metadata: Dict[str, Any] = None
)
Adds a lore chunk to the vector database.
world_id
string
required
World identifier (used for filtering queries)
lore_id
string
required
Unique identifier for this lore chunk (e.g., "base_lore_0", uuid.uuid4())
text
string
required
Lore content to embed and store
metadata
Dict[str, Any]
Optional metadata (automatically includes world_id)
Example
# Register one lore chunk; the stored ChromaDB id becomes "fantasy_ruins_001"
# and world_id is injected into the metadata so query_lore can filter by world.
rag_manager.add_lore(
    world_id="fantasy",
    lore_id="ruins_001",
    text="The ancient ruins of Kal'dara hold secrets from the First Age. Only the bravest adventurers dare explore their cursed halls.",
    metadata={"category": "locations", "importance": "high"}
)
Implementation
# Copy the caller's metadata rather than aliasing it: the original
# `meta = metadata or {}` made `meta` the caller's own dict, so setting
# world_id below mutated the caller's argument as a side effect.
meta = dict(metadata) if metadata else {}
meta["world_id"] = world_id  # Ensure world filtering works

# upsert: re-adding the same world_id/lore_id pair overwrites in place.
self.collection.upsert(
    ids=[f"{world_id}_{lore_id}"],
    documents=[text],
    metadatas=[meta]
)

query_lore()

def query_lore(
    self, world_id: str, query: str, n_results: int = 3, max_chars: int = 1000
) -> Tuple[List[str], bool]
Retrieve most relevant lore chunks for a query.
world_id
string
required
World ID to filter results (only returns lore from this world)
query
string
required
User’s question or message (used for semantic search)
n_results
int
Maximum number of chunks to retrieve (default: 3)
max_chars
int
Character limit for context window warning (default: 1000)
results
List[str]
List of relevant lore text chunks, ordered by relevance
context_warning
bool
True if total characters exceed max_chars (potential context bloat)
Example
# Semantic search scoped to one world; `warning` is True when the combined
# retrieved text exceeds max_chars (default 1000).
lore_chunks, warning = rag_manager.query_lore(
    world_id="fantasy",
    query="Tell me about the ancient ruins",
    n_results=2
)

if warning:
    print("Warning: Retrieved context is large")

# Chunks come back ordered by relevance (nearest neighbors first).
for chunk in lore_chunks:
    print(f"Lore: {chunk}")
Implementation
# Embed the query text and fetch the nearest lore chunks, restricted to
# documents whose metadata world_id matches (set by add_lore).
results = self.collection.query(
    query_texts=[query],
    n_results=n_results,
    where={"world_id": world_id}  # Filter by world ID
)

# query() returns parallel per-query lists; [0] selects the results for our
# single query string.
if results["documents"] and results["documents"][0]:
    docs = results["documents"][0]
    
    # Check for context window bloat
    total_chars = sum(len(d) for d in docs)
    context_warning = total_chars > max_chars
    
    return docs, context_warning
# No matches (or empty result set): degrade gracefully.
return [], False

Memory Management

add_memory()

def add_memory(self, session_id: str, memory_id: str, text: str)
Stores a conversation exchange in vector memory.
session_id
string
required
Session identifier (format: f"{character_id}_{session_id}")
memory_id
string
required
Unique identifier for this memory (typically uuid.uuid4())
text
string
required
Formatted conversation exchange (e.g., "User: [question]\nAI: [response]")
Example
import uuid

# Plain string literal: the original used an f-string with no placeholders
# (lint F541) — misleading, since nothing is interpolated.
memory_key = "elara_a1b2c3d4e5f6"
rag_manager.add_memory(
    session_id=memory_key,
    memory_id=str(uuid.uuid4()),
    text="User: What happened in the forest?\nAI: You discovered ancient markings on the trees, suggesting this place was once sacred."
)
Implementation
# Upsert one conversation exchange under the composite key
# "<session_id>_<memory_id>"; session_id metadata enables per-session queries.
doc_id = f"{session_id}_{memory_id}"
self.memory_collection.upsert(
    ids=[doc_id],
    metadatas=[{"session_id": session_id}],
    documents=[text],
)

query_memory()

def query_memory(
    self, session_id: str, query: str, n_results: int = 3, max_chars: int = 1500
) -> Tuple[List[str], bool]
Retrieve relevant conversation history from memory.
session_id
string
required
Session identifier to filter results
query
string
required
Current user message (used for semantic search)
n_results
int
Maximum number of memory chunks to retrieve (default: 3)
max_chars
int
Character limit for context window warning (default: 1500)
results
List[str]
List of relevant conversation exchanges, ordered by relevance
context_warning
bool
True if total characters exceed max_chars
Example
# Plain string literal: the f-prefix in the original had no placeholders (F541).
memory_key = "elara_a1b2c3d4e5f6"
memories, warning = rag_manager.query_memory(
    session_id=memory_key,
    query="What did we discuss about magic?",
    n_results=3
)

# Exchanges come back ordered by semantic relevance to the query.
for memory in memories:
    print(f"Memory: {memory}")
Implementation
# Embed the current message and fetch the most relevant past exchanges,
# restricted to this session's documents.
results = self.memory_collection.query(
    query_texts=[query],
    n_results=n_results,
    where={"session_id": session_id}
)

# Parallel per-query lists; [0] is the single query's hit list.
if results["documents"] and results["documents"][0]:
    docs = results["documents"][0]
    # Warn when the retrieved text would bloat the LLM context window.
    total_chars = sum(len(d) for d in docs)
    context_warning = total_chars > max_chars
    return docs, context_warning
# Nothing retrieved: return an empty, warning-free result.
return [], False

delete_session_memory()

def delete_session_memory(self, session_id: str)
Permanently deletes all vector embeddings for a session.
session_id
string
required
Session identifier whose memory should be deleted
Example
# Plain string literal: the original's f-prefix had no placeholders (F541).
memory_key = "elara_a1b2c3d4e5f6"
rag_manager.delete_session_memory(memory_key)
Implementation
self.memory_collection.delete(where={"session_id": session_id})
This operation is irreversible. All conversation embeddings for the session will be permanently deleted.

Global Instance

rag_manager = RagManager()
A singleton-like instance is created on module import. Usage:
from engine.rag import rag_manager

# Use the shared instance directly; no construction needed.
rag_manager.add_lore("fantasy", "lore_001", "Ancient lore text...")

Usage in Main Flow

Startup Lore Loading

In engine/main.py:startup():
# Load every world definition, register it, and index its lore for RAG.
for data in load_yaml_assets("assets/worlds/*.yaml"):
    w = World(...)
    world_manager.add_world(w)
    
    # Split large lore into chunks
    if w.lore:
        chunks = []
        current_chunk = ""
        chunk_size = 800  # characters per chunk; keeps each embedding focused
        
        # Greedy line-based chunking: flush the current chunk once adding the
        # next line would push a non-empty chunk past chunk_size.
        # NOTE(review): a single line longer than chunk_size becomes its own
        # oversized chunk — presumably acceptable; confirm.
        for line in w.lore.split('\n'):
            if len(current_chunk) + len(line) > chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = line + '\n'
            else:
                current_chunk += line + '\n'
        
        # Flush the trailing partial chunk, if any.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        # Add each chunk separately
        for i, chunk in enumerate(chunks):
            rag_manager.add_lore(w.id, f"base_lore_{i}", chunk)

Chat Context Retrieval

In engine/main.py:websocket_endpoint():
# RAG retrieval: lore scoped to the active world, memory scoped to the
# current character/session pair.
lore_list, _ = rag_manager.query_lore(
    state.ACTIVE_WORLD_ID, prompt, n_results=2
)
mem_key = f"{state.ACTIVE_CHARACTER_ID}_{state.ACTIVE_SESSION_ID}"
mem_list, _ = rag_manager.query_memory(mem_key, prompt, n_results=3)

# Log each retrieved chunk so prompt construction can be audited.
if lore_list:
    log.info(f"\n=== RETRIEVED LORE ({len(lore_list)} chunks) ===")
    for idx, chunk in enumerate(lore_list, start=1):
        log.info(f"[LORE {idx}]\n{chunk}\n")

if mem_list:
    log.info(f"\n=== RETRIEVED MEMORY ({len(mem_list)} chunks) ===")
    for idx, chunk in enumerate(mem_list, start=1):
        log.info(f"[MEMORY {idx}]\n{chunk}\n")

# Include a memory section only when something was actually retrieved.
if mem_list:
    joined_memories = chr(10).join(mem_list)
    full_context = f"--- RECENT MEMORY ---\n{joined_memories}"
else:
    full_context = ""
)

Memory Storage After Response

In engine/llm.py:stream_chat_response():
# Persist the completed exchange to vector memory; skip empty responses.
if full_content:
    if session_id:
        memory_key = f"{char_id}_{session_id}"
    else:
        memory_key = char_id
    exchange = f"User: {prompt}\nAI: {full_content}"
    rag_manager.add_memory(memory_key, str(uuid.uuid4()), exchange)

Complete Example

import uuid
from engine.rag import rag_manager

# === LORE MANAGEMENT ===

# Add world lore (upsert semantics: re-running with the same lore_id overwrites)
rag_manager.add_lore(
    world_id="fantasy",
    lore_id="location_001",
    text="The Tower of Echoes stands at the edge of reality, where the veil between worlds is thinnest.",
    metadata={"type": "location", "region": "borderlands"}
)

rag_manager.add_lore(
    world_id="fantasy",
    lore_id="history_001",
    text="During the War of Shadows, the ancient mages sealed away forbidden knowledge in hidden vaults.",
    metadata={"type": "history", "era": "ancient"}
)

# Query lore; `warning` is True when retrieved text exceeds max_chars
query = "Tell me about magical towers"
lore_results, warning = rag_manager.query_lore(
    world_id="fantasy",
    query=query,
    n_results=2
)

print(f"Query: {query}")
print(f"Context Warning: {warning}")
for i, chunk in enumerate(lore_results, 1):
    print(f"\nLore {i}: {chunk}")

# === MEMORY MANAGEMENT ===

# Session setup: memory keys combine character id and session id
char_id = "elara"
session_id = "a1b2c3d4e5f6"
memory_key = f"{char_id}_{session_id}"

# Store conversation exchanges (each gets a unique uuid as its memory_id)
rag_manager.add_memory(
    session_id=memory_key,
    memory_id=str(uuid.uuid4()),
    text="User: What's your background?\nAI: I'm a wandering scholar who studies ancient magic and forgotten histories."
)

rag_manager.add_memory(
    session_id=memory_key,
    memory_id=str(uuid.uuid4()),
    text="User: Have you been to the Tower of Echoes?\nAI: Yes, I've visited it once. The experience was... unsettling."
)

# Query memory: results are ranked by semantic similarity to the query
query = "Have we talked about your travels?"
memory_results, warning = rag_manager.query_memory(
    session_id=memory_key,
    query=query,
    n_results=3
)

print(f"\nMemory Query: {query}")
for i, memory in enumerate(memory_results, 1):
    print(f"\nMemory {i}: {memory}")

# === CLEANUP ===

# Delete all session memory when done (irreversible)
rag_manager.delete_session_memory(memory_key)
print(f"\nDeleted all memory for session: {memory_key}")

ChromaDB Details

Persistent Storage

self.client = chromadb.PersistentClient(path=db_path)
Data is stored in ./chroma_db/ directory with the following structure:
chroma_db/
├── chroma.sqlite3       # Metadata and document storage
└── [collection_id]/     # Vector embeddings
    └── data_level0.bin

HNSW Index

metadata={"hnsw:space": "cosine"}
  • Algorithm: Hierarchical Navigable Small World (HNSW)
  • Distance Metric: Cosine similarity
  • Performance: O(log N) query time
  • Trade-off: Fast approximate search vs exact nearest neighbors

Embedding Dimensions

  • Model: all-MiniLM-L6-v2
  • Dimensions: 384
  • Format: Float32 array
  • Size: ~1.5KB per document

Performance Considerations

Chunk Size

chunk_size = 800  # characters per chunk
Optimal for:
  • Semantic coherence
  • Embedding quality
  • Context window efficiency

Query Limits

Recommended n_results values:
  • Lore: 2-3 chunks (focused, relevant context)
  • Memory: 3-5 chunks (recent conversation history)

Context Window Management

max_chars = 1000  # for lore
max_chars = 1500  # for memory
Warnings help prevent token limit issues with LLMs.

Error Handling

# Defensive pattern used by the query methods: any ChromaDB failure is
# logged and surfaced as an empty result instead of propagating, so chat
# continues without retrieved context.
try:
    results = self.collection.query(...)
    # ...
except Exception as e:
    log.error(f"Error querying lore: {e}")
    return [], False
All query methods return empty results on error, allowing graceful degradation.

Build docs developers (and LLMs) love