Skip to main content
MarkItDown is built by the AutoGen team at Microsoft, making it a natural fit for AutoGen-powered multi-agent applications. Use MarkItDown to convert documents into LLM-friendly Markdown that AutoGen agents can process and reason about.

Overview

AutoGen is a framework for building multi-agent conversational systems. MarkItDown enhances AutoGen applications by:
  • Converting diverse document formats into clean Markdown for agent consumption
  • Enabling agents to work with PDFs, Office documents, images, and more
  • Providing structured document content that agents can analyze and discuss
  • Supporting document Q&A, summarization, and collaborative analysis workflows
MarkItDown is officially maintained by the AutoGen team and designed specifically for LLM workflows.

Installation

Install both MarkItDown and AutoGen:
pip install 'markitdown[all]' 'pyautogen'

Basic Integration

Document Analysis with UserProxyAgent

Convert documents before passing them to AutoGen agents:
from markitdown import MarkItDown
import autogen

# Initialize MarkItDown
md = MarkItDown()

# Convert document to Markdown
result = md.convert("quarterly_report.pdf")
document_content = result.text_content

# Configure AutoGen
config_list = [
    {
        "model": "gpt-4",
        "api_key": "your-api-key"  # replace with a real API key (or load from env)
    }
]

# Create assistant agent
assistant = autogen.AssistantAgent(
    name="document_analyst",
    llm_config={"config_list": config_list},
    system_message="""You are a document analysis assistant. 
    Analyze documents and provide insights, summaries, and answer questions."""
)

# Create user proxy agent
# Non-interactive proxy: no human input and no automatic replies, so the
# chat is a single request/response exchange.
user_proxy = autogen.UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=0
)

# Start conversation with document content
# NOTE(review): the full document is inlined into the prompt — a very
# large file may exceed the model context window; chunk first if so.
user_proxy.initiate_chat(
    assistant,
    message=f"""Please analyze this quarterly report and provide:
    1. Key financial highlights
    2. Main risks identified
    3. Strategic initiatives
    
    Document content:
    {document_content}"""
)

Document Q&A System

Build a multi-agent document Q&A system:
from markitdown import MarkItDown
from openai import OpenAI
import autogen
from pathlib import Path

class DocumentQASystem:
    """Multi-agent document Q&A: MarkItDown conversion feeding an
    AutoGen reader/analyst group chat.
    """

    def __init__(self, config_list):
        """Build the converter and the agents.

        Args:
            config_list: AutoGen-style LLM config list; the first entry's
                api_key is reused for MarkItDown's image descriptions.
        """
        # BUG FIX: the original never stored config_list, so analyze()
        # raised AttributeError when it read self.config_list.
        self.config_list = config_list

        # Initialize MarkItDown with LLM support for images
        openai_client = OpenAI(api_key=config_list[0]["api_key"])
        self.md = MarkItDown(
            llm_client=openai_client,
            llm_model="gpt-4o"
        )
        
        # Create document reader agent
        self.reader = autogen.AssistantAgent(
            name="document_reader",
            llm_config={"config_list": config_list},
            system_message="""You are a document reader. Extract specific 
            information from documents accurately and concisely."""
        )
        
        # Create analyst agent
        self.analyst = autogen.AssistantAgent(
            name="analyst",
            llm_config={"config_list": config_list},
            system_message="""You are an analyst. Synthesize information, 
            identify patterns, and provide insights."""
        )
        
        # Create user proxy (non-interactive coordinator)
        self.user_proxy = autogen.UserProxyAgent(
            name="user",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )
    
    def load_document(self, file_path):
        """Convert a document to Markdown and return its text content."""
        result = self.md.convert(file_path)
        return result.text_content
    
    def analyze(self, file_path, question):
        """Analyze a document with the multi-agent group chat.

        Args:
            file_path: Document to convert and discuss.
            question: Question the agents should answer from the document.
        """
        # Load document
        document = self.load_document(file_path)
        
        # Create group chat
        groupchat = autogen.GroupChat(
            agents=[self.user_proxy, self.reader, self.analyst],
            messages=[],
            max_round=10
        )
        
        manager = autogen.GroupChatManager(
            groupchat=groupchat,
            llm_config={"config_list": self.config_list}
        )
        
        # Start analysis
        self.user_proxy.initiate_chat(
            manager,
            message=f"""Question: {question}
            
            Document to analyze:
            {document}
            
            Please work together to answer the question based on the document."""
        )

# Usage
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]  # replace with a real key
qa_system = DocumentQASystem(config_list)

# One call converts the PDF and runs the reader/analyst group chat.
qa_system.analyze(
    "contract.pdf",
    "What are the key terms and conditions in this contract?"
)

Multi-Document Analysis

Process multiple documents with collaborative agents:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import json

def multi_document_analysis(document_paths, analysis_task, config_list):
    """Run a collaborative AutoGen analysis over several documents.

    Each document is converted to Markdown with MarkItDown, then a
    reader/comparer/synthesizer group chat works through the task.

    Args:
        document_paths: Paths of the documents to convert and analyze.
        analysis_task: Natural-language description of the analysis.
        config_list: AutoGen LLM configuration list.
    """
    converter = MarkItDown()

    # Convert every input up front, keyed by bare file name.
    documents = {
        Path(path).name: converter.convert(path).text_content
        for path in document_paths
    }

    # Three specialist assistants, built from (name, system prompt) pairs.
    roles = [
        ("document_reader", "Extract and summarize key information from documents."),
        ("comparative_analyst", "Compare information across documents and identify patterns."),
        ("synthesizer", "Synthesize findings into coherent insights and recommendations."),
    ]
    document_reader, comparative_analyst, synthesizer = [
        autogen.AssistantAgent(
            name=agent_name,
            llm_config={"config_list": config_list},
            system_message=prompt,
        )
        for agent_name, prompt in roles
    ]

    # Non-interactive proxy that kicks off (and does not extend) the chat.
    user_proxy = autogen.UserProxyAgent(
        name="user",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=0,
    )

    groupchat = autogen.GroupChat(
        agents=[user_proxy, document_reader, comparative_analyst, synthesizer],
        messages=[],
        max_round=15,
    )

    manager = autogen.GroupChatManager(
        groupchat=groupchat,
        llm_config={"config_list": config_list},
    )

    # Concatenate the documents with explicit source markers so agents
    # can attribute findings to a file.
    documents_text = "\n\n".join(
        f"=== Document: {name} ===\n{content}"
        for name, content in documents.items()
    )

    # Initiate analysis
    user_proxy.initiate_chat(
        manager,
        message=f"""Task: {analysis_task}
        
        Documents to analyze:
        {documents_text}
        
        Please collaborate to complete the analysis task."""
    )

# Usage
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]  # replace with a real key

# Compare three versions of the same proposal in one group chat.
multi_document_analysis(
    document_paths=[
        "proposal_v1.docx",
        "proposal_v2.docx",
        "proposal_v3.docx"
    ],
    analysis_task="""Compare the three proposals and identify:
        1. Key differences in approach
        2. Budget variations
        3. Timeline changes
        4. Recommended version with justification""",
    config_list=config_list
)

RAG with AutoGen

Combine MarkItDown with Retrieval-Augmented Generation:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions

class DocumentRAGSystem:
    """Retrieval-augmented document Q&A using MarkItDown, ChromaDB, and AutoGen.

    Documents are converted to Markdown, chunked, embedded into an
    in-memory Chroma collection, and queried with a RAG assistant agent.
    """

    def __init__(self, config_list):
        """Set up the converter, vector store, and agents.

        Args:
            config_list: AutoGen-style LLM config list; the first entry's
                api_key is also used for the OpenAI embedding function.
        """
        self.md = MarkItDown()
        self.config_list = config_list

        # Initialize vector database (in-memory client)
        self.chroma_client = chromadb.Client()
        self.embedding_function = embedding_functions.OpenAIEmbeddingFunction(
            api_key=config_list[0]["api_key"],
            model_name="text-embedding-3-small"
        )

        self.collection = self.chroma_client.create_collection(
            name="documents",
            embedding_function=self.embedding_function
        )

        # Create RAG agent
        self.rag_agent = autogen.AssistantAgent(
            name="rag_assistant",
            llm_config={"config_list": config_list},
            system_message="""You are a RAG assistant. Answer questions using 
            the retrieved document context. Cite specific sections when possible."""
        )

        self.user_proxy = autogen.UserProxyAgent(
            name="user",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )

    def ingest_documents(self, directory_path):
        """Convert and index every file under *directory_path* (recursive).

        Files that fail conversion are reported and skipped rather than
        aborting the whole ingest.
        """
        docs = []
        metadatas = []
        ids = []

        # The original enumerated this loop but never used the index;
        # plain iteration is clearer.
        for file_path in Path(directory_path).rglob("*"):
            if not file_path.is_file():
                continue
            try:
                # Convert to Markdown
                result = self.md.convert(str(file_path))

                # Chunk the document (simple fixed-size splitting)
                chunks = self._chunk_text(result.text_content, chunk_size=1000)

                for j, chunk in enumerate(chunks):
                    docs.append(chunk)
                    metadatas.append({
                        "source": file_path.name,
                        "chunk": j,
                        "path": str(file_path)
                    })
                    # NOTE(review): ids built from the file stem can collide
                    # when two files share a stem (a.pdf vs a.docx).
                    ids.append(f"{file_path.stem}_{j}")
            except Exception as e:
                # Best-effort ingest: report and continue with the next file.
                print(f"Failed to process {file_path}: {e}")

        # Add to vector database
        if docs:
            self.collection.add(
                documents=docs,
                metadatas=metadatas,
                ids=ids
            )
            print(f"Indexed {len(docs)} chunks from {len(set(m['source'] for m in metadatas))} documents")

    def _chunk_text(self, text, chunk_size=1000, overlap=200):
        """Split *text* into fixed-size character chunks, with *overlap*
        characters shared between consecutive chunks.

        Returns:
            list[str]: the chunks, in order ([] for empty text).

        Raises:
            ValueError: if overlap >= chunk_size — the window would never
                advance and the original loop ran forever.
        """
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunks.append(text[start:end])
            start = end - overlap
        return chunks

    def query(self, question, n_results=3):
        """Retrieve the top *n_results* chunks for *question* and ask the
        RAG agent to answer from them."""
        # Retrieve relevant chunks
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results
        )

        # Format context with the source file name per excerpt
        context = "\n\n".join([
            f"Source: {meta['source']}\n{doc}"
            for doc, meta in zip(results['documents'][0], results['metadatas'][0])
        ])

        # Query with context
        self.user_proxy.initiate_chat(
            self.rag_agent,
            message=f"""Question: {question}
            
            Relevant document excerpts:
            {context}
            
            Please answer the question based on the provided context."""
        )

# Usage
config_list = [{"model": "gpt-4", "api_key": "your-api-key"}]  # replace with a real key
rag_system = DocumentRAGSystem(config_list)

# Ingest documents (recursively converts and indexes ./knowledge_base)
rag_system.ingest_documents("./knowledge_base")

# Query
rag_system.query("What is the company's policy on remote work?")

Automated Document Processing Pipeline

Build an automated pipeline with function calling:
from markitdown import MarkItDown
import autogen
from pathlib import Path
import json

class DocumentProcessor:
    """Pairs a MarkItDown converter with an AutoGen LLM config so its
    methods can be registered as agent tools."""

    def __init__(self, config_list):
        self.md = MarkItDown()
        self.config_list = config_list

    def convert_document(self, file_path):
        """Convert a document to Markdown and report the outcome as a dict."""
        conversion = self.md.convert(file_path)
        # Fall back to the file stem when the converter found no title.
        return {
            "success": True,
            "content": conversion.text_content,
            "title": conversion.title or Path(file_path).stem,
        }

    def extract_metadata(self, markdown_content):
        """Extract structured metadata from document."""
        # This would be implemented by the agent
        pass

# Define tools for the agent
processor = DocumentProcessor(
    config_list=[{"model": "gpt-4", "api_key": "your-api-key"}]
)

# Create agent with function calling
# The "functions" entry is a JSON-Schema tool declaration the model can
# invoke; it must match an entry in the proxy's function_map below.
# NOTE(review): this is the legacy OpenAI function-calling schema —
# confirm it matches the installed pyautogen version's expectations.
assistant = autogen.AssistantAgent(
    name="document_processor",
    llm_config={
        "config_list": processor.config_list,
        "functions": [
            {
                "name": "convert_document",
                "description": "Convert a document to Markdown format",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "file_path": {
                            "type": "string",
                            "description": "Path to the document file"
                        }
                    },
                    "required": ["file_path"]
                }
            }
        ]
    }
)

# The proxy executes tool calls: the declared name maps to the bound method.
user_proxy = autogen.UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    function_map={
        "convert_document": processor.convert_document
    }
)

# Process documents automatically
user_proxy.initiate_chat(
    assistant,
    message="""Please process all PDF files in the ./documents folder:
    1. Convert each to Markdown
    2. Extract key metadata (title, author, date)
    3. Summarize main topics
    4. Create a structured index"""
)

Best Practices

1. Chunk Large Documents

def chunk_markdown(text, max_tokens=3000):
    """Split a Markdown document into top-level (``# ``) header sections.

    Fixes the original implementation, which dropped the ``# `` marker
    from every section after the first (``str.split`` removes the
    separator), leaving the chunks as invalid Markdown.

    Args:
        text: Markdown source to split.
        max_tokens: Accepted for backward compatibility but unused —
            sections are split purely on headers and are NOT re-split
            to this size.

    Returns:
        list[str]: non-empty sections in document order; the leading
        preamble (anything before the first header) is its own section.
    """
    sections = []
    for index, piece in enumerate(text.split("\n# ")):
        if not piece.strip():
            continue
        # Restore the header marker removed by split(); the first piece
        # is the preamble (or starts mid-line) and keeps its text as-is.
        sections.append(piece if index == 0 else "# " + piece)
    return sections

# Convert and chunk
result = md.convert("large_report.pdf")
chunks = chunk_markdown(result.text_content)

# Process each chunk
# NOTE(review): each iteration starts a fresh initiate_chat, so sections
# are analyzed independently — confirm whether context should carry over.
for i, chunk in enumerate(chunks):
    user_proxy.initiate_chat(
        assistant,
        message=f"Analyze section {i+1}:\n{chunk}"
    )

2. Preserve Document Structure

MarkItDown’s Markdown output preserves headers, lists, and tables — use this structure:
# Reusable system prompt instructing agents to exploit the Markdown
# structure (headers, lists, tables) that MarkItDown preserves.
system_message = """When analyzing documents:
- Use the header hierarchy to understand document structure
- Reference specific sections by header name
- Preserve table data in your analysis
- Note when information comes from lists vs paragraphs"""

3. Handle Multiple File Types

def process_any_document(file_path, config_list):
    """Universal document processor."""
    # MarkItDown dispatches on the input format, so one code path
    # handles PDFs, Office files, images, and more.
    converter = MarkItDown()

    try:
        return converter.convert(file_path).text_content
    except Exception as exc:
        # Best-effort: report the failure as a string instead of raising.
        return f"Error processing {file_path}: {exc}"

# Works with PDFs, DOCX, PPTX, images, etc.
# NOTE(review): iterdir() is non-recursive and also yields directories;
# use rglob("*") with an is_file() check for nested folders.
for file_path in Path("./inbox").iterdir():
    content = process_any_document(str(file_path), config_list)
    # Process with AutoGen...

4. Combine with Image Descriptions

from openai import OpenAI

# Initialize with LLM for image descriptions
openai_client = OpenAI(api_key="your-api-key")  # replace with a real key
md = MarkItDown(
    llm_client=openai_client,
    llm_model="gpt-4o"
)

# Images in documents will have AI-generated descriptions
result = md.convert("presentation.pptx")
# AutoGen agents can now reason about image content

Integration Patterns

Pattern 1: Sequential Agent Processing

# Document -> Reader -> Analyst -> Reporter
# NOTE(review): .process() is illustrative pseudo-code for a staged
# pipeline — adapt to your agents' actual API before running.
result = md.convert("report.pdf")

# Stage 1: Extract facts
facts = reader_agent.process(result.text_content)

# Stage 2: Analyze
analysis = analyst_agent.process(facts)

# Stage 3: Generate report
final_report = reporter_agent.process(analysis)

Pattern 2: Parallel Document Processing

import asyncio

async def process_documents(file_paths):
    """Convert documents and hand their contents to the agent concurrently.

    The original version ran every (blocking) ``md.convert`` call
    sequentially on the event loop before gathering, so only the agent
    calls overlapped. Conversion is now offloaded with
    ``asyncio.to_thread`` so the whole per-document pipeline runs in
    parallel.

    NOTE(review): ``agent.process_async`` is illustrative — adapt to the
    agent API you actually use.

    Returns:
        Results in the same order as *file_paths*.
    """
    async def _convert_and_process(path):
        # Run the blocking MarkItDown conversion off the event loop.
        converted = await asyncio.to_thread(md.convert, path)
        return await agent.process_async(converted.text_content)

    return await asyncio.gather(*(_convert_and_process(p) for p in file_paths))

Pattern 3: Interactive Document Exploration

# User asks questions, agent navigates document
document = md.convert("manual.pdf").text_content

# Interactive REPL: there is no exit condition — stop with Ctrl+C
# (or EOF, which makes input() raise EOFError).
while True:
    question = input("Your question: ")
    user_proxy.initiate_chat(
        assistant,
        message=f"Document: {document}\n\nQuestion: {question}"
    )

Resources

Example: Complete Document Intelligence System

from markitdown import MarkItDown
from openai import OpenAI
import autogen
from pathlib import Path

class DocumentIntelligenceSystem:
    """End-to-end document pipeline: MarkItDown conversion (with vision
    support) feeding an extractor/validator/summarizer group chat."""

    def __init__(self, api_key):
        """Create the converter and agents from a single API key."""
        # Initialize MarkItDown with vision support
        openai_client = OpenAI(api_key=api_key)
        self.md = MarkItDown(
            llm_client=openai_client,
            llm_model="gpt-4o"
        )

        # BUG FIX: store the config so process() can reuse it. The
        # original rebuilt it there by reaching into the private
        # MarkItDown._llm_client attribute, which is fragile and breaks
        # if MarkItDown's internals change.
        self.config_list = [{"model": "gpt-4", "api_key": api_key}]
        config_list = self.config_list

        # Create specialized agents
        self.extractor = autogen.AssistantAgent(
            name="data_extractor",
            llm_config={"config_list": config_list},
            system_message="Extract structured data from documents."
        )

        self.validator = autogen.AssistantAgent(
            name="validator",
            llm_config={"config_list": config_list},
            system_message="Validate extracted data for accuracy and completeness."
        )

        self.summarizer = autogen.AssistantAgent(
            name="summarizer",
            llm_config={"config_list": config_list},
            system_message="Create concise, accurate summaries."
        )

        self.user_proxy = autogen.UserProxyAgent(
            name="coordinator",
            human_input_mode="NEVER",
            code_execution_config={"use_docker": False}
        )

    def process(self, file_path, task):
        """Convert *file_path* and run *task* through the agent group chat."""
        # Convert document
        result = self.md.convert(file_path)

        # Create group chat
        groupchat = autogen.GroupChat(
            agents=[self.user_proxy, self.extractor, self.validator, self.summarizer],
            messages=[],
            max_round=12
        )

        manager = autogen.GroupChatManager(
            groupchat=groupchat,
            llm_config={"config_list": self.config_list}
        )

        # Execute task
        self.user_proxy.initiate_chat(
            manager,
            message=f"Task: {task}\n\nDocument: {result.text_content}"
        )

# Usage
system = DocumentIntelligenceSystem(api_key="your-api-key")  # replace with a real key
# Converts the invoice, then runs the extract/validate/summarize chat.
system.process(
    "invoice.pdf",
    "Extract invoice details, validate totals, and summarize transaction."
)

Build docs developers (and LLMs) love