FAISS (Facebook AI Similarity Search) is used to store resume embeddings and perform fast similarity searches. The RAG Recruitment Assistant uses FAISS to find the most relevant candidates based on semantic matching.
# Semantic search: retrieve the 3 resumes most similar to the query text.
query = "Desarrollador Python con experiencia en FastAPI"
results = vectorstore.similarity_search(query, k=3)

# Show a short preview of each hit.
for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content[:200])  # First 200 chars
The `lambda_mult` parameter controls the diversity-vs-relevance trade-off (0 = maximum diversity, 1 = maximum relevance).
# Maximal Marginal Relevance search: fetch 20 candidates, then pick the
# 5 that best balance relevance to the query against mutual diversity.
query = "Estudiante de ingeniería con proyectos en React"
diverse_results = vectorstore.max_marginal_relevance_search(
    query=query,
    k=5,
    fetch_k=20,
    lambda_mult=0.7,  # Favor relevance over diversity
)
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Must use same embeddings model that built the index.
# Pin the model explicitly: the library default differs from the
# 384-dim MiniLM model used elsewhere in this guide, and a mismatch
# causes a dimension error when searching the loaded index.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Load from disk (the saved docstore is pickled, so deserialization
# must be explicitly allowed).
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)
print("Vector store loaded successfully")
# Bring InMemoryDocstore into scope — the original snippet used it
# without importing it, which raises NameError.
from langchain_community.docstore.in_memory import InMemoryDocstore

# Default: IndexFlatL2 (exact search, slower for large datasets)
vectorstore = FAISS.from_documents(docs, embeddings)

# For large datasets, use approximate search
import faiss

# Create custom index
dimension = 384  # MiniLM dimension
index = faiss.IndexIVFFlat(
    faiss.IndexFlatL2(dimension),  # quantizer assigning vectors to clusters
    dimension,
    100,  # number of clusters
)
# NOTE(review): an IVF index must be trained on representative vectors
# (index.train(...)) before any vectors can be added to it.
vectorstore = FAISS(
    embedding_function=embeddings.embed_query,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)
Batch Processing
# Process large resume collections in batches
batch_size = 10
archivos = glob.glob("cvs_estudiantes_final/*.pdf")
vectorstore = None

for i in range(0, len(archivos), batch_size):
    batch_files = archivos[i:i + batch_size]

    # Load every PDF in this batch into a single document list.
    batch_docs = []
    for pdf in batch_files:
        loader = PyPDFLoader(pdf)
        batch_docs.extend(loader.load())

    # First batch creates the store; later batches extend it.
    if vectorstore is None:
        vectorstore = FAISS.from_documents(batch_docs, embeddings)
    else:
        vectorstore.add_documents(batch_docs)

    print(f"Processed batch {i//batch_size + 1}")
Memory Management
# Use save_local for large datasets: persist the index to disk FIRST.
# (The original snippet deleted `vectorstore` before calling
# `vectorstore.save_local(...)`, which raises NameError.)
vectorstore.save_local("./index")

# Clear vector store from memory when done
del vectorstore
import gc
gc.collect()

# Load only when needed — with the same embeddings model, and with
# deserialization explicitly allowed (required by load_local).
vectorstore = FAISS.load_local(
    "./index",
    embeddings,
    allow_dangerous_deserialization=True,
)
# Error: RuntimeError: Error in void faiss::IndexFlat::add_core
# Fix: Ensure embeddings model is consistent
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# When loading, use the SAME model. Also pass
# allow_dangerous_deserialization=True — load_local refuses to unpickle
# the saved docstore without it (see the Serialization Error section).
vectorstore = FAISS.load_local(
    "index",
    embeddings,
    allow_dangerous_deserialization=True,
)
Serialization Error
# FAISS stores its docstore via pickle, so loading a saved index
# requires explicitly opting in to deserialization.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)