Overview
AutoGen is a framework for building multi-agent conversational systems. MarkItDown enhances AutoGen applications by:
- Converting diverse document formats into clean Markdown for agent consumption
- Enabling agents to work with PDFs, Office documents, images, and more
- Providing structured document content that agents can analyze and discuss
- Supporting document Q&A, summarization, and collaborative analysis workflows
MarkItDown is officially maintained by the AutoGen team and designed specifically for LLM workflows.
Installation
Install both MarkItDown and AutoGen:
pip install 'markitdown[all]' 'pyautogen'
Basic Integration
Document Analysis with UserProxyAgent
Convert documents before passing them to AutoGen agents:
from markitdown import MarkItDown
import autogen
# Initialize MarkItDown (no LLM client: images will not be described)
md = MarkItDown()
# Convert document to Markdown; text_content holds the Markdown string
result = md.convert("quarterly_report.pdf")
document_content = result.text_content
# Configure AutoGen
# NOTE(review): replace "your-api-key" with a real key (e.g. from an env var) before running
config_list = [
{
"model": "gpt-4",
"api_key": "your-api-key"
}
]
# Create assistant agent that performs the document analysis
assistant = autogen.AssistantAgent(
name="document_analyst",
llm_config={"config_list": config_list},
system_message="""You are a document analysis assistant.
Analyze documents and provide insights, summaries, and answer questions."""
)
# Create user proxy agent; human_input_mode="NEVER" runs without prompting a human
user_proxy = autogen.UserProxyAgent(
name="user",
human_input_mode="NEVER",
max_consecutive_auto_reply=0
)
# Start conversation: the converted Markdown is inlined into the opening message
user_proxy.initiate_chat(
assistant,
message=f"""Please analyze this quarterly report and provide:
1. Key financial highlights
2. Main risks identified
3. Strategic initiatives
Document content:
{document_content}"""
)
Document Q&A System
Build a multi-agent document Q&A system:
from markitdown import MarkItDown
from openai import OpenAI
import autogen
from pathlib import Path
class DocumentQASystem:
    """Multi-agent document Q&A built on MarkItDown and AutoGen.

    A reader agent extracts facts, an analyst agent synthesizes them, and a
    user proxy coordinates the group chat.
    """

    def __init__(self, config_list):
        """Create the converter and agents.

        Args:
            config_list: AutoGen LLM config dicts; the first entry must
                contain an "api_key" used for the MarkItDown image LLM too.
        """
        # Bug fix: analyze() reads self.config_list, but it was never stored.
        self.config_list = config_list
        # Initialize MarkItDown with LLM support so embedded images
        # get AI-generated descriptions.
        openai_client = OpenAI(api_key=config_list[0]["api_key"])
        self.md = MarkItDown(
            llm_client=openai_client,
            llm_model="gpt-4o"
        )
        # Document reader agent: pulls specific information out of the text.
        self.reader = autogen.AssistantAgent(
            name="document_reader",
            llm_config={"config_list": config_list},
            system_message="""You are a document reader. Extract specific
information from documents accurately and concisely."""
        )
        # Analyst agent: synthesizes what the reader found.
        self.analyst = autogen.AssistantAgent(
            name="analyst",
            llm_config={"config_list": config_list},
            system_message="""You are an analyst. Synthesize information,
identify patterns, and provide insights."""
        )
        # Coordinator proxy; never asks a human for input.
        self.user_proxy = autogen.UserProxyAgent(
            name="user",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )

    def load_document(self, file_path):
        """Convert a document to Markdown and return its text content."""
        result = self.md.convert(file_path)
        return result.text_content

    def analyze(self, file_path, question):
        """Answer `question` about the document at `file_path` via group chat."""
        document = self.load_document(file_path)
        groupchat = autogen.GroupChat(
            agents=[self.user_proxy, self.reader, self.analyst],
            messages=[],
            max_round=10
        )
        manager = autogen.GroupChatManager(
            groupchat=groupchat,
            llm_config={"config_list": self.config_list}
        )
        # Start the analysis; the whole document is inlined in the message.
        self.user_proxy.initiate_chat(
            manager,
            message=f"""Question: {question}
Document to analyze:
{document}
Please work together to answer the question based on the document."""
        )
# Usage
# NOTE(review): supply a real API key; DocumentQASystem reads config_list[0]["api_key"].
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]
qa_system = DocumentQASystem(config_list)
# Kick off the reader/analyst group chat over the contract PDF.
qa_system.analyze(
"contract.pdf",
"What are the key terms and conditions in this contract?"
)
Multi-Document Analysis
Process multiple documents with collaborative agents:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import json
def multi_document_analysis(document_paths, analysis_task, config_list):
    """Run a collaborative AutoGen group chat over several documents.

    Args:
        document_paths: iterable of file paths to convert with MarkItDown.
        analysis_task: free-text description of the analysis to perform.
        config_list: AutoGen LLM configuration dicts.
    """
    converter = MarkItDown()
    # Convert every document up front, keyed by its file name.
    doc_map = {
        Path(doc_path).name: converter.convert(doc_path).text_content
        for doc_path in document_paths
    }
    # Three specialist agents plus a silent coordinator proxy.
    reader = autogen.AssistantAgent(
        name="document_reader",
        llm_config={"config_list": config_list},
        system_message="Extract and summarize key information from documents."
    )
    comparer = autogen.AssistantAgent(
        name="comparative_analyst",
        llm_config={"config_list": config_list},
        system_message="Compare information across documents and identify patterns."
    )
    synth = autogen.AssistantAgent(
        name="synthesizer",
        llm_config={"config_list": config_list},
        system_message="Synthesize findings into coherent insights and recommendations."
    )
    coordinator = autogen.UserProxyAgent(
        name="user",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=0
    )
    chat = autogen.GroupChat(
        agents=[coordinator, reader, comparer, synth],
        messages=[],
        max_round=15
    )
    manager = autogen.GroupChatManager(
        groupchat=chat,
        llm_config={"config_list": config_list}
    )
    # Stitch the converted documents into one labelled corpus.
    corpus = "\n\n".join(
        f"=== Document: {name} ===\n{content}"
        for name, content in doc_map.items()
    )
    # Hand the task and corpus to the group chat.
    coordinator.initiate_chat(
        manager,
        message=f"""Task: {analysis_task}
Documents to analyze:
{corpus}
Please collaborate to complete the analysis task."""
    )
# Usage
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]
# Compare three versions of the same proposal in one collaborative group chat.
multi_document_analysis(
document_paths=[
"proposal_v1.docx",
"proposal_v2.docx",
"proposal_v3.docx"
],
analysis_task="""Compare the three proposals and identify:
1. Key differences in approach
2. Budget variations
3. Timeline changes
4. Recommended version with justification""",
config_list=config_list
)
RAG with AutoGen
Combine MarkItDown with Retrieval-Augmented Generation:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
class DocumentRAGSystem:
    """RAG over MarkItDown-converted documents using ChromaDB and AutoGen.

    Documents are converted to Markdown, chunked, embedded into an in-memory
    Chroma collection, and retrieved as context for a RAG answering agent.
    """

    def __init__(self, config_list):
        """Set up the converter, vector store, and agents.

        Args:
            config_list: AutoGen LLM config dicts; the first entry must
                contain an "api_key" (also used for embeddings).
        """
        self.md = MarkItDown()
        self.config_list = config_list
        # Initialize vector database (in-process Chroma client).
        self.chroma_client = chromadb.Client()
        self.embedding_function = embedding_functions.OpenAIEmbeddingFunction(
            api_key=config_list[0]["api_key"],
            model_name="text-embedding-3-small"
        )
        self.collection = self.chroma_client.create_collection(
            name="documents",
            embedding_function=self.embedding_function
        )
        # RAG answering agent.
        self.rag_agent = autogen.AssistantAgent(
            name="rag_assistant",
            llm_config={"config_list": config_list},
            system_message="""You are a RAG assistant. Answer questions using
the retrieved document context. Cite specific sections when possible."""
        )
        self.user_proxy = autogen.UserProxyAgent(
            name="user",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )

    def ingest_documents(self, directory_path):
        """Convert and index every file under `directory_path` (recursive).

        Files that fail to convert are skipped with a printed warning.
        """
        docs = []
        metadatas = []
        ids = []
        for file_path in Path(directory_path).rglob("*"):
            if not file_path.is_file():
                continue
            try:
                # Convert to Markdown, then split into overlapping chunks.
                result = self.md.convert(str(file_path))
                chunks = self._chunk_text(result.text_content, chunk_size=1000)
                for j, chunk in enumerate(chunks):
                    docs.append(chunk)
                    metadatas.append({
                        "source": file_path.name,
                        "chunk": j,
                        "path": str(file_path)
                    })
                    # NOTE(review): ids assume unique stems across the tree;
                    # report.pdf and report.docx would collide — verify.
                    ids.append(f"{file_path.stem}_{j}")
            except Exception as e:
                # Best-effort ingestion: report and continue with other files.
                print(f"Failed to process {file_path}: {e}")
        # Add to vector database in one batch.
        if docs:
            self.collection.add(
                documents=docs,
                metadatas=metadatas,
                ids=ids
            )
            print(f"Indexed {len(docs)} chunks from {len(set(m['source'] for m in metadatas))} documents")

    def _chunk_text(self, text, chunk_size=1000, overlap=200):
        """Split `text` into chunks of `chunk_size` chars overlapping by `overlap`.

        Returns an empty list for empty text.

        Raises:
            ValueError: if overlap >= chunk_size — the window would never
                advance (the original implementation looped forever here).
        """
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunks.append(text[start:end])
            start = end - overlap
        return chunks

    def query(self, question, n_results=3):
        """Retrieve the `n_results` most relevant chunks and ask the RAG agent."""
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results
        )
        # Format retrieved chunks, labelled with their source file.
        context = "\n\n".join([
            f"Source: {meta['source']}\n{doc}"
            for doc, meta in zip(results['documents'][0], results['metadatas'][0])
        ])
        # Ask the agent with the retrieved context inlined.
        self.user_proxy.initiate_chat(
            self.rag_agent,
            message=f"""Question: {question}
Relevant document excerpts:
{context}
Please answer the question based on the provided context."""
        )
# Usage
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]
rag_system = DocumentRAGSystem(config_list)
# Ingest documents: converts every file under ./knowledge_base and indexes the chunks.
rag_system.ingest_documents("./knowledge_base")
# Query: retrieves the top-matching chunks and asks the RAG agent to answer.
rag_system.query("What is the company's policy on remote work?")
Automated Document Processing Pipeline
Build an automated pipeline with function calling:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import json
class DocumentProcessor:
    """Document conversion helpers exposed to AutoGen agents as callable tools."""

    def __init__(self, config_list):
        """Store the converter and the LLM config used by the calling agent."""
        self.md = MarkItDown()
        self.config_list = config_list

    def convert_document(self, file_path):
        """Convert a document to Markdown.

        Args:
            file_path: path to the document file.

        Returns:
            On success: {"success": True, "content": <markdown>, "title": <str>}.
            On failure: {"success": False, "error": <message>}.
        """
        try:
            result = self.md.convert(file_path)
        except Exception as e:
            # Bug fix: "success" was hard-coded True and conversion errors
            # escaped as raw exceptions, making the flag meaningless to the
            # agent loop. Report failures as data instead.
            return {"success": False, "error": str(e)}
        return {
            "success": True,
            "content": result.text_content,
            # Fall back to the file stem when the converter found no title.
            "title": result.title or Path(file_path).stem
        }

    def extract_metadata(self, markdown_content):
        """Extract structured metadata from a document.

        Placeholder: metadata extraction is delegated to the agent; this
        method intentionally does nothing and returns None.
        """
        pass
# Define tools for the agent
processor = DocumentProcessor(
config_list=[{"model": "gpt-4", "api_key": "your-api-key"}]
)
# Create agent with function calling.
# The JSON schema below only advertises convert_document to the model;
# the actual Python callable is supplied via function_map on the user proxy.
assistant = autogen.AssistantAgent(
name="document_processor",
llm_config={
"config_list": processor.config_list,
"functions": [
{
"name": "convert_document",
"description": "Convert a document to Markdown format",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the document file"
}
},
"required": ["file_path"]
}
}
]
}
)
user_proxy = autogen.UserProxyAgent(
name="user",
human_input_mode="NEVER",
# Maps the advertised function name to the real implementation.
function_map={
"convert_document": processor.convert_document
}
)
# Process documents automatically
user_proxy.initiate_chat(
assistant,
message="""Please process all PDF files in the ./documents folder:
1. Convert each to Markdown
2. Extract key metadata (title, author, date)
3. Summarize main topics
4. Create a structured index"""
)
Best Practices
1. Chunk Large Documents
def chunk_markdown(text, max_tokens=3000):
    """Split a Markdown document into sections at top-level headers.

    Splits on "\n# " boundaries and keeps the "# " marker on each section
    (the original implementation dropped it, corrupting the Markdown).
    Empty/whitespace-only sections are discarded.

    Args:
        text: the Markdown document.
        max_tokens: reserved for future size-based splitting — currently
            unused; kept for interface compatibility.

    Returns:
        List of section strings; [] for empty input.
    """
    sections = text.split("\n# ")
    chunks = []
    for i, section in enumerate(sections):
        if not section.strip():
            continue
        # The first section is the preamble (or already starts with its own
        # header); later sections lost their "# " to the split — restore it.
        chunks.append(section if i == 0 else "# " + section)
    return chunks
# Convert a large document, split it into sections, and analyze each one
# in its own chat turn so no single message gets too long.
result = md.convert("large_report.pdf")
chunks = chunk_markdown(result.text_content)
for i, chunk in enumerate(chunks):
    user_proxy.initiate_chat(
        assistant,
        message=f"Analyze section {i+1}:\n{chunk}"
    )
2. Preserve Document Structure
MarkItDown's Markdown output preserves headers, lists, and tables - use this structure:
system_message = """When analyzing documents:
- Use the header hierarchy to understand document structure
- Reference specific sections by header name
- Preserve table data in your analysis
- Note when information comes from lists vs paragraphs"""
3. Handle Multiple File Types
def process_any_document(file_path, config_list):
    """Universal document processor: convert any supported file to Markdown.

    Args:
        file_path: path to the document.
        config_list: accepted for interface symmetry with the other helpers;
            not used by the conversion itself.

    Returns:
        The converted Markdown text, or an error string on failure
        (best-effort: never raises).
    """
    converter = MarkItDown()
    try:
        return converter.convert(file_path).text_content
    except Exception as e:
        return f"Error processing {file_path}: {e}"
# Works with PDFs, DOCX, PPTX, images, etc. — convert everything in ./inbox.
for file_path in Path("./inbox").iterdir():
    content = process_any_document(str(file_path), config_list)
    # Process with AutoGen...
4. Combine with Image Descriptions
from openai import OpenAI
# Initialize MarkItDown with an LLM client so embedded images get descriptions
openai_client = OpenAI(api_key="your-api-key")
md = MarkItDown(
llm_client=openai_client,
llm_model="gpt-4o"
)
# Images in the slides will have AI-generated descriptions in the Markdown output
result = md.convert("presentation.pptx")
# AutoGen agents can now reason about image content
Integration Patterns
Pattern 1: Sequential Agent Processing
# Document -> Reader -> Analyst -> Reporter
# NOTE(review): .process() is illustrative pseudo-API — AutoGen agents are
# driven via initiate_chat; adapt before running.
result = md.convert("report.pdf")
# Stage 1: Extract facts
facts = reader_agent.process(result.text_content)
# Stage 2: Analyze
analysis = analyst_agent.process(facts)
# Stage 3: Generate report
final_report = reporter_agent.process(analysis)
Pattern 2: Parallel Document Processing
import asyncio
async def process_documents(file_paths):
    """Convert each file and fan out agent processing concurrently.

    Note: md.convert itself runs synchronously; only the agent calls
    overlap via asyncio.gather.
    """
    pending = [
        agent.process_async(md.convert(doc_path).text_content)
        for doc_path in file_paths
    ]
    return await asyncio.gather(*pending)
Pattern 3: Interactive Document Exploration
# User asks questions, agent navigates the document.
# Each question starts a fresh chat carrying the full document text;
# loop runs until interrupted.
document = md.convert("manual.pdf").text_content
while True:
    question = input("Your question: ")
    user_proxy.initiate_chat(
        assistant,
        message=f"Document: {document}\n\nQuestion: {question}"
    )
Resources
Example: Complete Document Intelligence System
from markitdown import MarkItDown
from openai import OpenAI
import autogen
from pathlib import Path
class DocumentIntelligenceSystem:
    """End-to-end document intelligence: extract, validate, and summarize
    a document via an AutoGen group chat over MarkItDown output."""

    def __init__(self, api_key):
        """Build the converter and the agent team.

        Args:
            api_key: OpenAI API key used for both image descriptions and agents.
        """
        # Initialize MarkItDown with vision support for embedded images.
        openai_client = OpenAI(api_key=api_key)
        self.md = MarkItDown(
            llm_client=openai_client,
            llm_model="gpt-4o"
        )
        config_list = [{"model": "gpt-4", "api_key": api_key}]
        # Bug fix: keep the config for process(); the original rebuilt it by
        # reaching into the private attribute self.md._llm_client.api_key.
        self.config_list = config_list
        # Specialized agents for each pipeline stage.
        self.extractor = autogen.AssistantAgent(
            name="data_extractor",
            llm_config={"config_list": config_list},
            system_message="Extract structured data from documents."
        )
        self.validator = autogen.AssistantAgent(
            name="validator",
            llm_config={"config_list": config_list},
            system_message="Validate extracted data for accuracy and completeness."
        )
        self.summarizer = autogen.AssistantAgent(
            name="summarizer",
            llm_config={"config_list": config_list},
            system_message="Create concise, accurate summaries."
        )
        self.user_proxy = autogen.UserProxyAgent(
            name="coordinator",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )

    def process(self, file_path, task):
        """Convert `file_path` and run `task` through the agent group chat."""
        result = self.md.convert(file_path)
        groupchat = autogen.GroupChat(
            agents=[self.user_proxy, self.extractor, self.validator, self.summarizer],
            messages=[],
            max_round=12
        )
        manager = autogen.GroupChatManager(
            groupchat=groupchat,
            llm_config={"config_list": self.config_list}
        )
        # Execute the task with the converted document inlined.
        self.user_proxy.initiate_chat(
            manager,
            message=f"Task: {task}\n\nDocument: {result.text_content}"
        )
# Usage
system = DocumentIntelligenceSystem(api_key="your-api-key")
# Runs the extract -> validate -> summarize agents over the invoice in one group chat.
system.process(
"invoice.pdf",
"Extract invoice details, validate totals, and summarize transaction."
)