Why MarkItDown + OpenAI?
MarkItDown’s Markdown output is ideal for OpenAI models because:
- GPT models are extensively trained on Markdown and understand it natively
- Markdown is highly token-efficient compared to raw text or HTML
- Document structure (headers, lists, tables) is preserved for better comprehension
- Vision models can generate descriptions for images during document conversion
- Clean, structured output improves prompt engineering and response quality
MarkItDown has built-in support for OpenAI’s vision models to generate image descriptions during conversion.
Installation
pip install 'markitdown[all]' openai
Basic Document Analysis
Convert a document and analyze it with GPT:

from markitdown import MarkItDown
from openai import OpenAI

# Initialize clients
md = MarkItDown()
client = OpenAI(api_key="your-api-key")

# Convert document to Markdown
result = md.convert("quarterly_report.pdf")

# Analyze with GPT; temperature=0 keeps the analysis deterministic/factual
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": "You are a financial analyst. Analyze documents and provide insights."
        },
        {
            "role": "user",
            "content": f"""Analyze this quarterly report and provide:
1. Key financial highlights
2. Major risks
3. Growth opportunities
Document:
{result.text_content}"""
        }
    ],
    temperature=0
)

print(response.choices[0].message.content)
Document Q&A System
Build an interactive document Q&A system:

from markitdown import MarkItDown
from openai import OpenAI
class DocumentQA:
    """Question-answering system for documents.

    Converts a document to Markdown once, then answers follow-up questions
    over a persistent chat-completion conversation history.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.md = MarkItDown()
        self.client = OpenAI(api_key=api_key)
        self.model = model
        # Markdown text of the currently loaded document (None until loaded)
        self.document_content = None
        # Full chat transcript, including the document context messages
        self.conversation_history = []

    def load_document(self, file_path: str):
        """Load and convert a document, resetting the conversation context."""
        result = self.md.convert(file_path)
        self.document_content = result.text_content
        # Initialize conversation with document context: system instructions,
        # the document itself, and an assistant acknowledgement.
        self.conversation_history = [
            {
                "role": "system",
                "content": """You are a helpful assistant that answers questions
about documents. Provide accurate, concise answers based solely on
the document content. Cite specific sections when possible."""
            },
            {
                "role": "user",
                "content": f"Here is the document to analyze:\n\n{self.document_content}"
            },
            {
                "role": "assistant",
                "content": "I've received the document and I'm ready to answer questions about it."
            }
        ]
        print(f"Loaded document ({len(self.document_content)} characters)")

    def ask(self, question: str) -> str:
        """Ask a question about the document."""
        if not self.document_content:
            return "Please load a document first."
        # Add question to conversation
        self.conversation_history.append({
            "role": "user",
            "content": question
        })
        # Get response (temperature=0 for deterministic, factual answers)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self.conversation_history,
            temperature=0
        )
        answer = response.choices[0].message.content
        # Record the answer so follow-up questions have full context
        self.conversation_history.append({
            "role": "assistant",
            "content": answer
        })
        return answer

    def reset(self):
        """Reset conversation history, keeping only the first three messages
        (system prompt, document, and acknowledgement)."""
        self.conversation_history = self.conversation_history[:3]
# Example: interactive Q&A over an employee handbook
handbook_qa = DocumentQA(api_key="your-api-key")
handbook_qa.load_document("employee_handbook.pdf")
for question in (
    "What is the vacation policy?",
    "How many days of paid leave do employees get?",
    "Are there any restrictions on when vacation can be taken?",
):
    print(handbook_qa.ask(question))
Vision-Enhanced Conversion
Use OpenAI’s vision models to describe images during conversion:

from markitdown import MarkItDown
from openai import OpenAI

# Initialize with vision support: MarkItDown will send embedded images to
# the given llm_client/llm_model and inline the generated descriptions.
client = OpenAI(api_key="your-api-key")
md = MarkItDown(
    llm_client=client,
    llm_model="gpt-4o",
    llm_prompt="Describe this image in detail, focusing on key information."
)

# Convert presentation with images
result = md.convert("product_launch.pptx")

# The output now includes AI-generated descriptions for all images
print(result.text_content)

# Analyze the presentation
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "user",
            "content": f"""Summarize this product launch presentation:
{result.text_content}"""
        }
    ]
)

print(response.choices[0].message.content)
Structured Data Extraction
Extract structured data using function calling:

from markitdown import MarkItDown
from openai import OpenAI
import json
def extract_invoice_data(file_path: str, api_key: str):
    """Extract structured data from an invoice.

    Converts the invoice to Markdown, then uses OpenAI function calling
    to pull out a structured record.

    Args:
        file_path: Path to the invoice document.
        api_key: OpenAI API key.

    Returns:
        dict: Parsed invoice fields (invoice_number, date, vendor_name,
        total_amount, currency, line_items).
    """
    # Convert invoice to Markdown
    md = MarkItDown()
    result = md.convert(file_path)

    # Define extraction schema (JSON Schema for the function parameters)
    functions = [
        {
            "name": "extract_invoice",
            "description": "Extract structured information from an invoice",
            "parameters": {
                "type": "object",
                "properties": {
                    "invoice_number": {"type": "string"},
                    "date": {"type": "string"},
                    "vendor_name": {"type": "string"},
                    "total_amount": {"type": "number"},
                    "currency": {"type": "string"},
                    "line_items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "description": {"type": "string"},
                                "quantity": {"type": "number"},
                                "unit_price": {"type": "number"},
                                "total": {"type": "number"}
                            }
                        }
                    }
                },
                "required": ["invoice_number", "total_amount"]
            }
        }
    ]

    # Extract with GPT; forcing function_call guarantees structured output
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": f"Extract invoice data from this document:\n\n{result.text_content}"
            }
        ],
        functions=functions,
        function_call={"name": "extract_invoice"},
        temperature=0
    )

    # Parse function call result (arguments arrive as a JSON string)
    function_args = json.loads(
        response.choices[0].message.function_call.arguments
    )
    return function_args


# Usage
invoice_data = extract_invoice_data("invoice.pdf", "your-api-key")
print(json.dumps(invoice_data, indent=2))
Document Summarization
Summarize long documents efficiently:

from markitdown import MarkItDown
from openai import OpenAI
def summarize_document(
    file_path: str,
    api_key: str,
    summary_type: str = "concise"
):
    """Summarize a document.

    Args:
        file_path: Path to the document to summarize.
        api_key: OpenAI API key.
        summary_type: One of "concise", "executive", "detailed", "bullet".
            Unknown values fall back to "concise".

    Returns:
        str: The generated summary.
    """
    # Convert to Markdown
    md = MarkItDown()
    result = md.convert(file_path)

    # Define prompts for different summary types
    prompts = {
        "concise": "Provide a concise 2-3 paragraph summary of the key points.",
        "executive": """Create an executive summary with:
- Overview (2-3 sentences)
- Key findings (bullet points)
- Recommendations (bullet points)""",
        "detailed": "Provide a detailed summary covering all major sections and key points.",
        "bullet": "Summarize the main points as a bulleted list."
    }

    prompt = prompts.get(summary_type, prompts["concise"])

    # Generate summary; 0.3 allows slightly more natural prose than 0
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are an expert at summarizing documents clearly and accurately."
            },
            {
                "role": "user",
                "content": f"{prompt}\n\nDocument:\n{result.text_content}"
            }
        ],
        temperature=0.3
    )
    return response.choices[0].message.content


# Usage
summary = summarize_document(
    "research_paper.pdf",
    "your-api-key",
    summary_type="executive"
)
print(summary)
Multi-Document Analysis
Analyze and compare multiple documents:

from markitdown import MarkItDown
from openai import OpenAI
from pathlib import Path
def compare_documents(file_paths: list, analysis_task: str, api_key: str):
    """Compare multiple documents.

    Args:
        file_paths: Paths of the documents to convert and compare.
        analysis_task: Instructions describing the comparison to perform.
        api_key: OpenAI API key.

    Returns:
        str: The model's comparative analysis.
    """
    md = MarkItDown()
    client = OpenAI(api_key=api_key)

    # Convert all documents, keyed by filename
    documents = {}
    for path in file_paths:
        result = md.convert(path)
        documents[Path(path).name] = result.text_content

    # Format for analysis: label each document so the model can cite it
    formatted_docs = "\n\n".join([
        f"=== Document: {name} ===\n{content}"
        for name, content in documents.items()
    ])

    # Analyze
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are an expert at analyzing and comparing documents."
            },
            {
                "role": "user",
                "content": f"""{analysis_task}
Documents:
{formatted_docs}"""
            }
        ],
        temperature=0
    )
    return response.choices[0].message.content


# Usage
analysis = compare_documents(
    file_paths=[
        "contract_draft_v1.pdf",
        "contract_draft_v2.pdf",
        "contract_final.pdf"
    ],
    analysis_task="""Compare these contract versions and identify:
1. Major changes between versions
2. New clauses added
3. Terms that were modified
4. Any concerning changes""",
    api_key="your-api-key"
)
print(analysis)
Batch Processing
Process multiple documents efficiently:

from markitdown import MarkItDown
from openai import OpenAI
from pathlib import Path
import asyncio
class BatchDocumentProcessor:
    """Process multiple documents in batch.

    The blocking MarkItDown conversion and OpenAI call are offloaded to
    worker threads via asyncio.to_thread, so asyncio.gather actually
    overlaps the documents. (The previous version declared the methods
    async but never awaited anything, which serialized all work.)
    """

    def __init__(self, api_key: str):
        self.md = MarkItDown()
        self.client = OpenAI(api_key=api_key)

    def _process_sync(self, file_path: str, task: str):
        """Blocking convert + analyze for a single document."""
        # Convert
        result = self.md.convert(file_path)
        # Analyze
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "user",
                    "content": f"{task}\n\nDocument:\n{result.text_content}"
                }
            ],
            temperature=0
        )
        return {
            "file": file_path,
            "result": response.choices[0].message.content
        }

    async def process_document(self, file_path: str, task: str):
        """Process a single document without blocking the event loop."""
        return await asyncio.to_thread(self._process_sync, file_path, task)

    async def process_directory(
        self,
        directory: str,
        task: str,
        glob_pattern: str = "*.pdf"
    ):
        """Process all documents in a directory concurrently.

        Returns a list of {"file": path, "result": analysis} dicts.
        """
        coros = [
            self.process_document(str(file_path), task)
            for file_path in Path(directory).glob(glob_pattern)
        ]
        results = await asyncio.gather(*coros)
        return results
# Usage: analyze every PDF invoice in ./invoices concurrently
async def main():
    processor = BatchDocumentProcessor(api_key="your-api-key")
    results = await processor.process_directory(
        directory="./invoices",
        task="Extract: invoice number, date, total amount, and vendor name.",
        glob_pattern="*.pdf"
    )
    for result in results:
        print(f"\nFile: {result['file']}")
        print(result['result'])

asyncio.run(main())
Assistants API Integration
Use MarkItDown with OpenAI Assistants:

from markitdown import MarkItDown
from openai import OpenAI
import time
class DocumentAssistant:
    """OpenAI Assistant for document analysis.

    Creates an Assistant once at construction time; each loaded document
    gets a fresh conversation thread.
    """

    def __init__(self, api_key: str):
        self.md = MarkItDown()
        self.client = OpenAI(api_key=api_key)
        # Create assistant
        self.assistant = self.client.beta.assistants.create(
            name="Document Analyst",
            instructions="""You are a document analysis assistant.
Analyze documents and provide clear, accurate insights.
When asked about specific details, cite the relevant sections.""",
            model="gpt-4-1106-preview"
        )
        # Conversation thread; created by load_document()
        self.thread = None

    def load_document(self, file_path: str):
        """Load a document into a new conversation thread."""
        # Convert document
        result = self.md.convert(file_path)
        # Create thread with document
        self.thread = self.client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Please analyze this document:\n\n{result.text_content}"
                }
            ]
        )
        print(f"Loaded {file_path}")

    def ask(self, question: str) -> str:
        """Ask a question about the document.

        Raises:
            RuntimeError: If the assistant run ends in a non-completed
                state (e.g. "failed", "cancelled", "expired").
        """
        if not self.thread:
            return "Please load a document first."
        # Add message
        self.client.beta.threads.messages.create(
            thread_id=self.thread.id,
            role="user",
            content=question
        )
        # Run assistant
        run = self.client.beta.threads.runs.create(
            thread_id=self.thread.id,
            assistant_id=self.assistant.id
        )
        # Poll until the run leaves the queued/in-progress states
        while run.status in ["queued", "in_progress"]:
            time.sleep(0.5)
            run = self.client.beta.threads.runs.retrieve(
                thread_id=self.thread.id,
                run_id=run.id
            )
        # Surface failures instead of silently returning a stale message
        if run.status != "completed":
            raise RuntimeError(f"Assistant run ended with status: {run.status}")
        # Get response (the messages list is returned newest-first)
        messages = self.client.beta.threads.messages.list(
            thread_id=self.thread.id
        )
        return messages.data[0].content[0].text.value
# Example: query a policy document through the assistant
doc_assistant = DocumentAssistant(api_key="your-api-key")
doc_assistant.load_document("policy_document.pdf")
for question in (
    "What are the key policies outlined in this document?",
    "Are there any compliance requirements mentioned?",
):
    print(doc_assistant.ask(question))
Custom Analysis Templates
Create reusable analysis templates:

from markitdown import MarkItDown
from openai import OpenAI
from typing import Dict, List
class DocumentAnalyzer:
    """Reusable document analyzer with templates."""

    # Named analysis templates: each entry pairs a system role with a
    # task prompt. Add new entries here to support more analysis styles.
    TEMPLATES = {
        "contract_review": {
            "system": "You are a legal contract reviewer.",
            "prompt": """Review this contract and identify:
1. Key terms and conditions
2. Obligations and responsibilities
3. Termination clauses
4. Potential risks or concerns
5. Missing or unclear provisions"""
        },
        "financial_analysis": {
            "system": "You are a financial analyst.",
            "prompt": """Analyze this financial document:
1. Key financial metrics and trends
2. Revenue and profitability analysis
3. Major expenses and cost drivers
4. Financial risks and concerns
5. Recommendations"""
        },
        "technical_review": {
            "system": "You are a technical documentation reviewer.",
            "prompt": """Review this technical document:
1. Clarity and completeness
2. Technical accuracy
3. Missing information
4. Suggested improvements"""
        }
    }

    def __init__(self, api_key: str):
        self.md = MarkItDown()
        self.client = OpenAI(api_key=api_key)

    def analyze(self, file_path: str, template: str, model: str = "gpt-4"):
        """Analyze document using a template.

        Args:
            file_path: Path to the document.
            template: Key into TEMPLATES selecting the analysis style.
            model: Chat model to use.

        Returns:
            str: The model's analysis.

        Raises:
            ValueError: If `template` is not a known template name.
        """
        # Validate the template first, before the (expensive) conversion,
        # so a typo fails fast instead of after converting the document.
        template_config = self.TEMPLATES.get(template)
        if not template_config:
            raise ValueError(f"Unknown template: {template}")

        # Convert document
        result = self.md.convert(file_path)

        # Analyze
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": template_config["system"]
                },
                {
                    "role": "user",
                    "content": f"{template_config['prompt']}\n\nDocument:\n{result.text_content}"
                }
            ],
            temperature=0
        )
        return response.choices[0].message.content
# Usage
analyzer = DocumentAnalyzer(api_key="your-api-key")

# Contract review
contract_analysis = analyzer.analyze(
    "service_agreement.pdf",
    template="contract_review"
)
print(contract_analysis)

# Financial analysis
financial_analysis = analyzer.analyze(
    "quarterly_report.pdf",
    template="financial_analysis"
)
print(financial_analysis)
Best Practices
Token Management: For large documents, consider chunking the Markdown output or using GPT-4 Turbo with its 128K context window.
Vision Integration: Enable llm_client and llm_model in MarkItDown for documents with important visual content (charts, diagrams, images).
Structured Output: Use function calling for extracting structured data - MarkItDown’s clean Markdown makes extraction more reliable.
Temperature Settings: Use temperature=0 for factual extraction, 0.3-0.7 for summaries and creative analysis.
System Messages: Leverage system messages to define the assistant’s role and analysis approach - this improves consistency.