DecipherIt uses Qdrant as its vector database to enable semantic search across research content. This allows users to ask natural language questions and receive contextually relevant answers.
Vector embeddings transform text into numerical representations that capture semantic meaning, enabling similarity-based search.
def _chunk_text(self, text: str) -> List[str]:
    """Split text into chunks based on chunk size and overlap.

    Tokens are whitespace-delimited words; consecutive chunks share
    ``self.chunk_overlap`` tokens so context carries across boundaries.

    Args:
        text: The text to split into chunks

    Returns:
        List of text chunks with specified size and overlap

    Raises:
        ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``
            (the stride between chunk starts would be zero or negative).
    """
    if not text:
        logger.warning("Empty text provided for chunking")
        return []

    tokens = text.split()
    # A whitespace-only string passes the truthiness check above but yields
    # no tokens; the original code then raised IndexError on an empty range.
    if not tokens:
        logger.warning("Empty text provided for chunking")
        return []

    stride = self.chunk_size - self.chunk_overlap
    if stride <= 0:
        # The original let range() raise an opaque error (or, with a
        # negative stride, silently produce no chunks).
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    logger.info(f"Chunking text with size {self.chunk_size} and overlap {self.chunk_overlap}")

    chunks: List[str] = []
    for start in range(0, len(tokens), stride):
        chunks.append(" ".join(tokens[start:start + self.chunk_size]))
        # Once a chunk reaches the end of the token list, stop: any later
        # start would yield a chunk wholly contained in this one (the
        # original appended such a redundant tail chunk when the token
        # count landed on a chunk boundary).
        if start + self.chunk_size >= len(tokens):
            break

    logger.info(f"Created {len(chunks)} chunks")
    return chunks
Chunk Size: 512 tokens
Balances context preservation with granular retrieval
OpenAI embeddings are generated for semantic search:
backend/services/qdrant_service.py
async def _get_embedding(self, text: str) -> List[float]:
    """Get embedding for text using OpenAI."""
    # Lazily set up clients on first use.
    if not self._initialized:
        await self.initialize()

    logger.info(f"Getting embedding using model {self.embedding_model}")

    embedding_response = await self.openai_client.embeddings.create(
        model=self.embedding_model,
        input=text,
    )

    # One input string was sent, so the first (and only) item holds
    # the vector we want.
    first_item = embedding_response.data[0]
    return first_item.embedding
DecipherIt uses text-embedding-3-small for cost-effective, high-quality embeddings with 1536 dimensions.
For operations that need complete notebook content (like mindmap generation):
backend/services/qdrant_service.py
async def get_all_chunks_by_notebook_id(self, notebook_id: str) -> List[Dict[str, Any]]:
    """Get all chunks for a specific notebook ID.

    Args:
        notebook_id: Notebook ID to retrieve chunks for

    Returns:
        List of all chunks for the notebook with their metadata,
        ordered by ``chunk_index``.
    """
    if not self._initialized:
        await self.initialize()

    logger.info(f"Retrieving all chunks for notebook: {notebook_id}")

    # Match only points whose payload carries this notebook_id.
    filter_param = rest.Filter(
        must=[
            rest.FieldCondition(
                key="notebook_id",
                match=rest.MatchValue(value=notebook_id),
            )
        ]
    )

    results: List[Dict[str, Any]] = []
    next_offset = None
    while True:
        # Page through all matching points. A single scroll call is capped
        # at `limit`, so without this loop a notebook with more points than
        # that was silently truncated.
        points, next_offset = await self.qdrant_client.scroll(
            collection_name=self.collection_name,
            scroll_filter=filter_param,
            limit=10000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,  # payloads only; vectors are not needed here
        )
        for point in points:
            payload = point.payload or {}
            results.append({
                "id": point.id,
                "content_chunk": payload.get("content_chunk"),
                "chunk_index": payload.get("chunk_index"),
                "total_chunks": payload.get("total_chunks"),
                "notebook_id": payload.get("notebook_id"),
                "metadata": payload.get("metadata"),
                "url": payload.get("url"),
                "page_title": payload.get("page_title"),
            })
        # Qdrant signals the final page with a null next-page offset.
        if next_offset is None:
            break

    # Sort by chunk_index to maintain order. A payload may hold a null
    # chunk_index (dict.get's default does not apply then); coerce it to 0
    # so the sort never compares None with ints.
    results.sort(key=lambda x: x.get("chunk_index") or 0)

    logger.info(f"Retrieved {len(results)} chunks for notebook {notebook_id}")
    return results
The scroll API efficiently retrieves large result sets without loading all vectors into memory.
async def delete_by_notebook_id(self, notebook_id: str) -> Any:
    """Delete all sources for a specific notebook ID.

    Args:
        notebook_id: Notebook ID to delete

    Returns:
        The status of the Qdrant delete operation.
        NOTE(review): the previous docstring claimed this returned the
        number of deleted points, but ``result.status`` is an operation
        status, not a count — callers should not treat it as one.
    """
    if not self._initialized:
        await self.initialize()

    logger.info(f"Deleting all sources for notebook: {notebook_id}")

    # Select every point whose payload carries this notebook_id.
    filter_param = rest.Filter(
        must=[
            rest.FieldCondition(
                key="notebook_id",
                match=rest.MatchValue(value=notebook_id),
            )
        ]
    )

    result = await self.qdrant_client.delete(
        collection_name=self.collection_name,
        points_selector=rest.FilterSelector(filter=filter_param),
    )

    logger.info(f"Deleted points for notebook {notebook_id}")
    return result.status