The PDF RAG Analyzer enables interactive conversations with multiple PDF files using Google Gemini 1.5 Flash and FAISS vector storage. It is specialized for analyzing financial documents, annual reports, and related-party transactions of Indian stock market companies.
from PyPDF2 import PdfReader


def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of several PDFs.

    Args:
        pdf_docs: Iterable of PDF sources; each item is handed directly to
            ``PdfReader``. ``None`` or an empty iterable is treated as
            "no documents".

    Returns:
        One string containing the text of all pages, in document order and
        then page order, joined without any separator. Returns ``""`` when
        there are no documents or no page yields text.
    """
    page_texts = []
    for pdf in pdf_docs or []:
        reader = PdfReader(pdf)
        for page in reader.pages:
            # A page without a text layer (e.g. a scanned image) can give a
            # falsy result; coerce it to "" so concatenation never fails.
            # NOTE(review): whether the result can be None depends on the
            # installed PyPDF2 release - confirm against the pinned version.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# Finance-aware answer prompt. The placeholders {context}, {chat_history} and
# {question} are filled in by the chain's document-combining step.
_FINANCE_QA_TEMPLATE = """
You are a financial analysis expert. Analyze the provided documents carefully.

Context from documents:
{context}

Chat history:
{chat_history}

Question: {question}

When answering:
1. Focus on financial statements, ratios, and metrics
2. Identify irregularities or red flags
3. Analyze related-party transactions
4. Evaluate managerial remuneration
5. Provide data-driven insights

If the answer is not in the documents, say "I don't have that information in the provided documents."

Answer:
"""


def get_conversational_chain():
    """Create the conversational retrieval chain used to answer questions.

    Builds a Gemini chat model, loads the FAISS index stored under
    ``"faiss_index"``, and combines them into a
    ``ConversationalRetrievalChain`` that answers with the finance-aware
    prompt defined in this block.

    Returns:
        A ``ConversationalRetrievalChain`` configured to retrieve the top 5
        chunks per question and to return the source documents alongside
        the answer.

    NOTE(review): ``FAISS`` and ``GoogleGenerativeAIEmbeddings`` are not
    imported in this snippet; they are presumably imported elsewhere in the
    file - confirm.
    """
    # The original built this prompt text but never handed it to the chain,
    # so the default question-answering prompt was silently used instead.
    qa_prompt = PromptTemplate(
        template=_FINANCE_QA_TEMPLATE,
        input_variables=["context", "chat_history", "question"],
    )

    # Initialize Gemini model
    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,  # Lower temperature for factual responses
        google_api_key=os.getenv("GOOGLE_API_KEY"),
    )

    # Load FAISS index
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # SECURITY: allow_dangerous_deserialization=True enables pickle-based
    # loading, which can execute arbitrary code. Only acceptable if the
    # "faiss_index" directory is written by this application itself and is
    # never taken from an untrusted source - TODO confirm.
    vector_store = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,
    )

    # Create retrieval chain
    return ConversationalRetrievalChain.from_llm(
        llm=model,
        retriever=vector_store.as_retriever(
            search_kwargs={"k": 5}  # Retrieve top 5 chunks
        ),
        combine_docs_chain_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
        verbose=True,
    )
# Finance-specific system prompt: lists the analysis areas the assistant
# specializes in, followed by the rules it should always apply when answering.
FINANCE_SYSTEM_PROMPT = """You are a financial analysis expert specializing in:
- Balance sheet analysis
- Cash flow evaluation
- Related-party transaction detection
- Key Managerial Personnel (KMP) remuneration analysis
- Debt-to-equity ratio calculation
- CFO to Net Profit conversion analysis
- Red flag identification in financial statements

Always:
1. Cite specific numbers from the documents
2. Calculate ratios when relevant
3. Highlight unusual or concerning patterns
4. Compare year-over-year trends
5. Flag potential irregularities"""
import os
from datetime import datetime

import pandas as pd
import streamlit as st

# Streamlit entry script: the sidebar collects the API key and PDFs and
# builds the index; the main pane is a chat over the processed documents;
# a sidebar control exports the conversation as CSV.

st.set_page_config(page_title="Chat with PDFs", page_icon="📚")
st.title("📚 Chat with Multiple PDFs")

# Sidebar: PDF upload and processing
with st.sidebar:
    st.header("Upload Documents")

    # Google AI API key input
    api_key = st.text_input(
        "Google AI API Key",
        type="password",
        help="Get your API key from https://ai.google.dev/",
    )
    if api_key:
        # Exported through the environment because the chain factory reads
        # the key with os.getenv("GOOGLE_API_KEY").
        os.environ["GOOGLE_API_KEY"] = api_key

    # PDF upload
    pdf_docs = st.file_uploader(
        "Upload PDF files",
        accept_multiple_files=True,
        type=["pdf"],
    )

    process_clicked = st.button("Process Documents")
    if process_clicked and not pdf_docs:
        # Previously a click without any uploaded file did nothing at all.
        st.warning("Please upload at least one PDF file before processing.")
    elif process_clicked:
        with st.spinner("Processing PDFs..."):
            # Extract text
            raw_text = get_pdf_text(pdf_docs)

            # Create chunks
            text_chunks = get_text_chunks(raw_text)

            if text_chunks:
                st.write(f"Created {len(text_chunks)} text chunks")

                # Create vector store. The return value is not needed here.
                # NOTE(review): presumably this persists the index that the
                # chain later loads from "faiss_index" - confirm.
                get_vector_store(text_chunks)

                st.success("✅ Documents processed successfully!")
                st.session_state.docs_processed = True
            else:
                # Nothing to index, e.g. PDFs without an extractable text
                # layer; skip index creation instead of building it from
                # an empty chunk list.
                st.error("No extractable text was found in the uploaded PDFs.")

# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Display chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("Ask about your documents..."):
    if not st.session_state.get("docs_processed", False):
        st.warning("Please upload and process documents first.")
    else:
        # Add user message
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Get response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                chain = get_conversational_chain()
                response = chain({
                    "question": prompt,
                    "chat_history": st.session_state.chat_history,
                })
                answer = response["answer"]
                st.markdown(answer)

        # Update history: `messages` drives the on-screen transcript,
        # `chat_history` holds the (question, answer) pairs fed to the chain.
        st.session_state.messages.append({"role": "assistant", "content": answer})
        st.session_state.chat_history.append((prompt, answer))

# Export conversation
if st.session_state.messages and st.sidebar.button("💾 Export Conversation"):
    csv = pd.DataFrame(st.session_state.messages).to_csv(index=False)
    st.sidebar.download_button(
        "Download CSV",
        csv,
        f"conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
        "text/csv",
    )
# Balance sheet analysis
"What is the company's debt-to-equity ratio over the last 3 years?"

# Related-party transactions
"Identify all related-party transactions mentioned in the annual report."

# Cash flow analysis
"What is the CFO to Net Profit conversion trend?"

# Remuneration analysis
"Has there been any unusual increase in Key Managerial Personnel pay?"

# Comparative analysis
"Compare the debt levels across the 5 companies in these reports."

# Red flags
"Are there any red flags or irregularities in the financial statements?"