Deep dive into TypeAgent’s six specialized indexes and their purposes
TypeAgent maintains six specialized indexes that enable different query patterns and access methods. Each index serves a specific purpose and is updated incrementally as new knowledge is extracted.
class TermToSemanticRefIndex(ITermToSemanticRefIndex):
    """Inverted index mapping search terms to the semantic refs that mention them."""

    # Maps lowercase terms to semantic reference ordinals with scores
    _map: dict[str, list[ScoredSemanticRefOrdinal]]
# Add entity name to indexawait semantic_ref_index.add_term( "Alice", # Term semantic_ref_ordinal # Reference to semantic ref)# Stored as:# _map["alice"] = [ScoredSemanticRefOrdinal(42, 1.0)]
# Find all references to "Alice"scored_refs = await semantic_ref_index.lookup_term("Alice")# Returns: [ScoredSemanticRefOrdinal(42, 1.0), ...]# Retrieve actual semantic referencesfor scored_ref in scored_refs: semantic_ref = await semantic_refs.get_item( scored_ref.semantic_ref_ordinal ) # semantic_ref.knowledge - Entity, Action, or Topic # semantic_ref.range - TextRange with message location
# Remove specific reference from termawait semantic_ref_index.remove_term( "Alice", semantic_ref_ordinal)# Removes only that specific ordinal, not the entire term
from typeagent.storage.memory.semrefindex import TermToSemanticRefIndex

class TermToSemanticRefIndex:
    # In-memory dictionary
    _map: dict[str, list[ScoredSemanticRefOrdinal]]
    # Fast lookups: O(1)
    # No persistence
SQLite Implementation
from typeagent.storage.sqlite.semrefindex import SqliteTermToSemanticRefIndex

# Table: SemanticRefIndex
# Columns: term (text), semantic_ref_ordinal (int), score (real)
# Index: CREATE INDEX idx_semref_term ON SemanticRefIndex(term)
#
# Persistent storage
# Indexed queries
# Transaction support
# Find entities named "Alice"scored_refs = await property_index.lookup_property( PropertyNames.EntityName.value, "Alice")# Find actions with "discuss" verbscored_refs = await property_index.lookup_property( PropertyNames.Verb.value, "discuss")# Find actions where Alice is the subjectscored_refs = await property_index.lookup_property( PropertyNames.Subject.value, "Alice")
from typeagent.storage.memory.propindex import (
    lookup_property_in_property_index
)
from typeagent.knowpro.collections import TextRangesInScope

# Only search within a specific time range or thread
ranges_in_scope = TextRangesInScope(...)
scored_refs = await lookup_property_in_property_index(
    property_index,
    PropertyNames.EntityName.value,
    "Alice",
    semantic_refs,
    ranges_in_scope  # Filter results to this scope
)
The PropertyIndex enables structured queries that the SemanticRef index cannot:
# SemanticRef index: "What mentions 'blue'?"results = await semantic_ref_index.lookup_term("blue")# Returns all semantic refs with "blue" anywhere# Property index: "What entities have color=blue facet?"results = await property_index.lookup_property( PropertyNames.FacetValue.value, "blue")# Returns only entities with blue as a facet value# Property index: "What actions did Alice perform?"results = await property_index.lookup_property( PropertyNames.Subject.value, "Alice")# Returns only actions where Alice is the subject
class TimestampToTextRangeIndex(ITimestampToTextRangeIndex):
    """Chronological index from timestamps to the messages sent at those times."""

    # Maps ISO timestamp strings to message ordinals
    _timestamp_to_ordinals: dict[str, list[MessageOrdinal]]
from datetime import datetime, timezone# Find messages in date rangestart = datetime(2024, 1, 15, tzinfo=timezone.utc)end = datetime(2024, 1, 16, tzinfo=timezone.utc)message_ordinals = await timestamp_index.get_messages_in_range( start, end)# Returns: [0, 1] (messages 0 and 1 fall in range)# Retrieve actual messagesfor ordinal in message_ordinals: message = await messages.get_item(ordinal) print(f"{message.timestamp}: {message.text}")
# Get earliest and latest timestampsearliest, latest = await timestamp_index.get_time_bounds()print(f"Conversation spans {earliest} to {latest}")# Output: Conversation spans 2024-01-15T10:30:00Z to 2024-01-16T09:00:00Z
class MessageTextIndex(IMessageTextIndex):
    """Embedding-based index over message text for semantic similarity search."""

    # Message ordinals with their embedding vectors
    _embeddings: list[tuple[MessageOrdinal, np.ndarray]]
    # Model for generating embeddings
    _embedding_model: IEmbeddingModel
# Add terms for fuzzy matchingterms = ["discuss", "talk", "speak", "converse", "chat"]await related_terms_index.fuzzy_index.add_terms(terms)# Each term is embedded and stored
# User searches for "discuss"original_term = "discuss"# Find related termsrelated = await related_terms_index.find_related( original_term, max_distance=0.3)# Search for original term AND related termsall_terms = [original_term] + [term for term, _ in related]# ["discuss", "talk", "speak", "converse"]# Query all variationsfor term in all_terms: results = await semantic_ref_index.lookup_term(term) # Combine results
# Create new threadthread_id = await threads.create_thread( name="Project Discussion", initial_message_ordinal=0)# Add messages to threadawait threads.add_to_thread( thread_id, message_ordinals=[1, 2, 3])
# Retrieve all messages in threadmessage_ordinals = await threads.get_thread_messages(thread_id)# Load actual messagesthread_messages = [ await messages.get_item(ordinal) for ordinal in message_ordinals]
# Which thread does this message belong to?thread_id = await threads.get_message_thread( message_ordinal=5)if thread_id: print(f"Message 5 is in thread {thread_id}")else: print("Message 5 is not in any thread")
The MessageText and RelatedTerms indexes use linear (brute-force) similarity search, so query cost grows with the number of stored embeddings. For large datasets, consider an approximate nearest-neighbor (ANN) index such as FAISS or Annoy.