MarkItDown provides a comprehensive Python API for converting documents to Markdown programmatically.
Basic Usage
The simplest way to use MarkItDown:
from markitdown import MarkItDown
# Create a converter, convert one file, and print the resulting Markdown
md = MarkItDown()
result = md.convert("example.pdf")
print(result.markdown)
Initialization
Default Initialization
from markitdown import MarkItDown
# Create instance with all built-in converters enabled
md = MarkItDown()
With Plugins
# Enable third-party plugins
md = MarkItDown(enable_plugins=True)
Disable Built-in Converters
# Start with no converters (rare use case)
md = MarkItDown(enable_builtins=False)

# Later enable them if needed
md.enable_builtins()
With Custom Configuration
# With custom requests session
import requests

session = requests.Session()
session.headers.update({"User-Agent": "MyApp/1.0"})

# Hand the preconfigured session to the converter
md = MarkItDown(requests_session=session)
Conversion Methods
convert()
The main conversion method accepts multiple input types:
File path (string)
Path object
URL
Binary stream
requests.Response
from markitdown import MarkItDown
md = MarkItDown()

# A plain string path is one of the accepted input types
result = md.convert("document.pdf")
print(result.markdown)
convert_local()
Convert a local file:
from markitdown import MarkItDown
md = MarkItDown()

# Convert a file on the local filesystem
result = md.convert_local("document.pdf")
print(result.markdown)
convert_stream()
Convert from a binary stream:
from markitdown import MarkItDown
import sys
md = MarkItDown()

# From stdin
result = md.convert_stream(sys.stdin.buffer)

# From file handle (opened in binary mode)
with open("document.pdf", "rb") as f:
    result = md.convert_stream(f)
    print(result.markdown)
convert_url() / convert_uri()
Convert from a URL or URI (HTTP/HTTPS, file, or data):
from markitdown import MarkItDown
md = MarkItDown()

# HTTP/HTTPS URLs
result = md.convert_url("https://example.com/document.pdf")

# File URIs
result = md.convert_uri("file:///path/to/document.pdf")

# Data URIs
result = md.convert_uri("data:text/plain;base64,SGVsbG8gV29ybGQ=")
print(result.markdown)
convert_url() is an alias for convert_uri(). The convert_uri() method is preferred for new code.
convert_response()
Convert from a requests Response object:
import requests
from markitdown import MarkItDown
md = MarkItDown()

# Fetch the document, then hand the Response object to the converter
response = requests.get("https://example.com/document.pdf")
result = md.convert_response(response)
print(result.markdown)
StreamInfo
Provide metadata hints about the file being converted:
from markitdown import MarkItDown, StreamInfo
import sys
md = MarkItDown()

# When reading from stdin with known file type
stream_info = StreamInfo(
    extension=".pdf",
    mimetype="application/pdf",
    charset="UTF-8",
)
result = md.convert_stream(
    sys.stdin.buffer,
    stream_info=stream_info,
)
StreamInfo Fields
All fields are optional:
from markitdown import StreamInfo
stream_info = StreamInfo(
    mimetype="application/pdf",         # MIME type
    extension=".pdf",                   # File extension
    charset="UTF-8",                    # Character encoding
    filename="document.pdf",            # Original filename
    local_path="/path/to/file.pdf",     # Local file path
    url="https://example.com/doc.pdf",  # Source URL
)
DocumentConverterResult
All conversion methods return a DocumentConverterResult object:
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("document.pdf")

# Access the markdown content
print(result.markdown)      # Preferred
print(result.text_content)  # Deprecated alias
print(str(result))          # Also works

# Access optional metadata (only printed when a title is present)
if result.title:
    print(f"Title: {result.title}")
Configuration Options
LLM Integration for Image Descriptions
from markitdown import MarkItDown
from openai import OpenAI
# "your-api-key" is a placeholder; substitute a real key
client = OpenAI(api_key="your-api-key")
md = MarkItDown(
    llm_client=client,
    llm_model="gpt-4o",
    llm_prompt="Describe this image in detail.",
)

# Images will now include LLM-generated descriptions
result = md.convert("photo.jpg")
print(result.markdown)
See the LLM Integration guide for details.
Azure Document Intelligence
from markitdown import MarkItDown
# Replace YOUR_ENDPOINT with the name of your own resource
md = MarkItDown(
    docintel_endpoint="https://YOUR_ENDPOINT.cognitiveservices.azure.com/",
)

# Convert using Document Intelligence
result = md.convert("document.pdf")
With custom credential:
from markitdown import MarkItDown
from azure.core.credentials import AzureKeyCredential
# Pass an explicit key credential alongside the endpoint
md = MarkItDown(
    docintel_endpoint="https://YOUR_ENDPOINT.cognitiveservices.azure.com/",
    docintel_credential=AzureKeyCredential("your-api-key"),
)
See the Azure Document Intelligence guide for details.
ExifTool Path
Specify the path to ExifTool for image metadata extraction:
from markitdown import MarkItDown
# Point the converter at a specific ExifTool binary
md = MarkItDown(
    exiftool_path="/usr/local/bin/exiftool",
)
result = md.convert("photo.jpg")
DOCX Style Map
Customize DOCX conversion with a Mammoth style map:
from markitdown import MarkItDown
# One Mammoth mapping rule per line
style_map = """
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
"""

md = MarkItDown(style_map=style_map)
result = md.convert("document.docx")
Data URI Handling
Control whether data URIs are kept in the output:
from markitdown import MarkItDown
md = MarkItDown()

# Keep data URIs (default: truncated)
result = md.convert("page.html", keep_data_uris=True)
Advanced: Custom Converters
Registering Custom Converters
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
from typing import BinaryIO, Any
class CustomConverter(DocumentConverter):
    """Example converter that renders ``.custom`` files as Markdown."""

    def accepts(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> bool:
        """Return True when the stream's reported extension is ``.custom``."""
        # The extension may be missing or differently cased, so normalize first.
        return (stream_info.extension or "").lower() == ".custom"

    def convert(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> DocumentConverterResult:
        """Decode the whole stream as text and place it under a ``# Custom`` heading."""
        # Use the reported charset when there is one; otherwise fall back to UTF-8.
        content = file_stream.read().decode(stream_info.charset or "utf-8")
        return DocumentConverterResult(markdown=f"# Custom\n{content}")
md = MarkItDown()

# Register an instance of the converter, then convert a matching file
md.register_converter(CustomConverter())
result = md.convert("file.custom")
Converter Priority
from markitdown import MarkItDown, PRIORITY_SPECIFIC_FILE_FORMAT , PRIORITY_GENERIC_FILE_FORMAT
md = MarkItDown()

# Register with specific priority (lower = higher priority)
md.register_converter(
    CustomConverter(),
    priority=PRIORITY_SPECIFIC_FILE_FORMAT,  # 0.0 (default)
)

# Generic converters use higher priority value
md.register_converter(
    GenericConverter(),
    priority=PRIORITY_GENERIC_FILE_FORMAT,  # 10.0
)
Error Handling
from markitdown import (
MarkItDown,
MarkItDownException,
MissingDependencyException,
FileConversionException,
UnsupportedFormatException
)
md = MarkItDown()

# Handlers run from most specific to the MarkItDownException catch-all
try:
    result = md.convert("document.xyz")
except MissingDependencyException as e:
    print(f"Missing dependency: {e}")
    print("Install with: pip install markitdown[all]")
except UnsupportedFormatException as e:
    print(f"Unsupported format: {e}")
except FileConversionException as e:
    print(f"Conversion failed: {e}")
    # Access failed attempts; guard the loop in case none were recorded
    for attempt in e.attempts or []:
        print(f"  Converter: {attempt.converter}")
except MarkItDownException as e:
    print(f"General error: {e}")
Complete Example
A comprehensive example combining multiple features:
from markitdown import MarkItDown, StreamInfo
from openai import OpenAI
from pathlib import Path
import sys
def convert_document(input_path, output_path=None):
    """Convert a document to Markdown with LLM support.

    Args:
        input_path: Source handed to ``MarkItDown.convert()``.
        output_path: Optional file to write the Markdown to. When omitted
            (or falsy), the Markdown is printed to stdout instead.

    Any exception raised while converting or writing is reported on stderr
    and the process exits with status 1.
    """
    # Initialize with LLM client ("your-api-key" is a placeholder; do not
    # commit a real key to source control)
    client = OpenAI(api_key="your-api-key")
    md = MarkItDown(
        llm_client=client,
        llm_model="gpt-4o",
        enable_plugins=True,
    )

    try:
        # Convert the document
        result = md.convert(input_path)

        # Write to file or stdout
        if output_path:
            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding, which may not cover every character.
            Path(output_path).write_text(result.markdown, encoding="utf-8")
            print(f"Converted to {output_path}")
        else:
            print(result.markdown)

        # Print metadata if available (stderr keeps stdout clean for the Markdown)
        if result.title:
            print(f"Document title: {result.title}", file=sys.stderr)
    except Exception as e:
        # Top-level boundary of the example: report the failure and exit non-zero
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
# Run the example only when executed as a script, not when imported
if __name__ == "__main__":
    convert_document("example.pdf", "example.md")