Skip to main content
MarkItDown provides a comprehensive Python API for converting documents to Markdown programmatically.

Basic Usage

The simplest way to use MarkItDown:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("example.pdf")
print(result.markdown)

Initialization

Default Initialization

from markitdown import MarkItDown

# Create instance with all built-in converters enabled
md = MarkItDown()

With Plugins

# Enable third-party plugins
md = MarkItDown(enable_plugins=True)

Disable Built-in Converters

# Start with no converters (rare use case)
md = MarkItDown(enable_builtins=False)

# Later enable them if needed
md.enable_builtins()

With Custom Configuration

# With custom requests session
import requests

session = requests.Session()
session.headers.update({"User-Agent": "MyApp/1.0"})

md = MarkItDown(requests_session=session)

Conversion Methods

convert()

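The main conversion method accepts multiple input types: a local file path (str or pathlib.Path), a URL/URI string, a requests Response object, or a binary stream. The simplest case is a local path: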
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("document.pdf")
print(result.markdown)

convert_local()

Convert a local file:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert_local("document.pdf")
print(result.markdown)

convert_stream()

Convert from a binary stream:
from markitdown import MarkItDown
import sys

md = MarkItDown()

# Option 1: read from stdin (pass the binary buffer, sys.stdin.buffer)
result = md.convert_stream(sys.stdin.buffer)

# Option 2: read from a file handle opened in binary mode ("rb")
with open("document.pdf", "rb") as f:
    result = md.convert_stream(f)

# Both options rebind `result`, so only the last conversion is printed here
print(result.markdown)

convert_url() / convert_uri()

Convert from a URL or other URI (HTTP/HTTPS URLs, file URIs, and data URIs are shown below):
from markitdown import MarkItDown

md = MarkItDown()

# Each call below is an independent example; `result` is rebound every time.

# HTTP/HTTPS URLs
result = md.convert_url("https://example.com/document.pdf")

# File URIs
result = md.convert_uri("file:///path/to/document.pdf")

# Data URIs (this payload is the base64 encoding of "Hello World")
result = md.convert_uri("data:text/plain;base64,SGVsbG8gV29ybGQ=")

# Prints only the most recent conversion (the data URI)
print(result.markdown)
convert_url() is an alias for convert_uri(). The convert_uri() method is preferred for new code.

convert_response()

Convert from a requests Response object:
import requests
from markitdown import MarkItDown

md = MarkItDown()
response = requests.get("https://example.com/document.pdf")
result = md.convert_response(response)
print(result.markdown)

StreamInfo

Provide metadata hints about the file being converted:
from markitdown import MarkItDown, StreamInfo
import sys

md = MarkItDown()

# When reading from stdin with known file type
stream_info = StreamInfo(
    extension=".pdf",
    mimetype="application/pdf",
    charset="UTF-8"
)

result = md.convert_stream(
    sys.stdin.buffer,
    stream_info=stream_info
)

StreamInfo Fields

All fields are optional:
from markitdown import StreamInfo

stream_info = StreamInfo(
    mimetype="application/pdf",      # MIME type
    extension=".pdf",                 # File extension
    charset="UTF-8",                  # Character encoding
    filename="document.pdf",          # Original filename
    local_path="/path/to/file.pdf",  # Local file path
    url="https://example.com/doc.pdf" # Source URL
)

DocumentConverterResult

All conversion methods return a DocumentConverterResult object:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("document.pdf")

# Access the markdown content
print(result.markdown)  # Preferred
print(result.text_content)  # Deprecated alias
print(str(result))  # Also works

# Access optional metadata
if result.title:
    print(f"Title: {result.title}")

Configuration Options

LLM Integration for Image Descriptions

from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI(api_key="your-api-key")

md = MarkItDown(
    llm_client=client,
    llm_model="gpt-4o",
    llm_prompt="Describe this image in detail."
)

# Images will now include LLM-generated descriptions
result = md.convert("photo.jpg")
print(result.markdown)
See the LLM Integration guide for details.

Azure Document Intelligence

from markitdown import MarkItDown

md = MarkItDown(
    docintel_endpoint="https://YOUR_ENDPOINT.cognitiveservices.azure.com/"
)

# Convert using Document Intelligence
result = md.convert("document.pdf")
With a custom credential:
from markitdown import MarkItDown
from azure.core.credentials import AzureKeyCredential

md = MarkItDown(
    docintel_endpoint="https://YOUR_ENDPOINT.cognitiveservices.azure.com/",
    docintel_credential=AzureKeyCredential("your-api-key")
)
See the Azure Document Intelligence guide for details.

ExifTool Path

Specify the path to ExifTool for image metadata extraction:
from markitdown import MarkItDown

md = MarkItDown(
    exiftool_path="/usr/local/bin/exiftool"
)

result = md.convert("photo.jpg")

DOCX Style Map

Customize DOCX conversion with a Mammoth style map:
from markitdown import MarkItDown

style_map = """
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
"""

md = MarkItDown(style_map=style_map)
result = md.convert("document.docx")

Data URI Handling

Control whether data URIs are kept in the output:
from markitdown import MarkItDown

md = MarkItDown()

# Keep data URIs (default: truncated)
result = md.convert("page.html", keep_data_uris=True)

Advanced: Custom Converters

Registering Custom Converters

from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
from typing import BinaryIO, Any

class CustomConverter(DocumentConverter):
    """Example converter for files that use the ``.custom`` extension.

    The file contents are decoded as text and emitted beneath a
    ``# Custom`` Markdown heading.
    """

    def accepts(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> bool:
        """Return True when the stream's extension is ``.custom``.

        Only ``stream_info`` is inspected; ``file_stream`` is not read here.
        The comparison is case-insensitive so ``.CUSTOM`` is accepted too.
        """
        # StreamInfo fields are optional, so the extension may be None.
        return (stream_info.extension or "").lower() == ".custom"

    def convert(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> DocumentConverterResult:
        """Read the whole stream and wrap its text in a Markdown document.

        The bytes are decoded with ``stream_info.charset`` when one is
        provided, falling back to UTF-8 otherwise.
        """
        encoding = stream_info.charset or "utf-8"
        content = file_stream.read().decode(encoding)
        return DocumentConverterResult(markdown=f"# Custom\n{content}")

md = MarkItDown()
md.register_converter(CustomConverter())

result = md.convert("file.custom")

Converter Priority

from markitdown import MarkItDown, PRIORITY_SPECIFIC_FILE_FORMAT, PRIORITY_GENERIC_FILE_FORMAT

md = MarkItDown()

# NOTE: CustomConverter and GenericConverter are not defined in this snippet;
# they stand for your own DocumentConverter subclasses.

# Register with a specific priority (a lower value means higher priority)
md.register_converter(
    CustomConverter(),
    priority=PRIORITY_SPECIFIC_FILE_FORMAT  # 0.0 (default)
)

# Generic converters use a higher priority value
md.register_converter(
    GenericConverter(),
    priority=PRIORITY_GENERIC_FILE_FORMAT  # 10.0
)

Error Handling

from markitdown import (
    MarkItDown,
    MarkItDownException,
    MissingDependencyException,
    FileConversionException,
    UnsupportedFormatException
)

md = MarkItDown()

# The specific exception types are listed before MarkItDownException, which
# acts as the catch-all for any remaining library error.
try:
    result = md.convert("document.xyz")
except MissingDependencyException as e:
    print(f"Missing dependency: {e}")
    print("Install with: pip install markitdown[all]")
except UnsupportedFormatException as e:
    print(f"Unsupported format: {e}")
except FileConversionException as e:
    print(f"Conversion failed: {e}")
    # Access failed attempts; `attempts` is optional and may be None, so fall
    # back to an empty list instead of iterating over None.
    for attempt in e.attempts or []:
        print(f"  Converter: {attempt.converter}")
except MarkItDownException as e:
    print(f"General error: {e}")

Complete Example

A comprehensive example combining multiple features:
from markitdown import MarkItDown, StreamInfo
from openai import OpenAI
from pathlib import Path
import sys

def convert_document(input_path, output_path=None):
    """Convert a document to Markdown with LLM support.

    Args:
        input_path: Source handed to ``MarkItDown.convert()``.
        output_path: File to write the Markdown to. When omitted (or
            otherwise falsy) the Markdown is printed to stdout instead.

    On any failure the error is printed to stderr and the process exits
    with status 1.
    """

    # Initialize with LLM client
    client = OpenAI(api_key="your-api-key")
    md = MarkItDown(
        llm_client=client,
        llm_model="gpt-4o",
        enable_plugins=True
    )

    try:
        # Convert the document
        result = md.convert(input_path)

        # Write to file or stdout
        if output_path:
            # Write UTF-8 explicitly: converted documents often contain
            # non-ASCII text, and the platform default encoding may not be
            # able to represent it.
            Path(output_path).write_text(result.markdown, encoding="utf-8")
            print(f"Converted to {output_path}")
        else:
            print(result.markdown)

        # Print metadata if available (to stderr, keeping stdout for Markdown)
        if result.title:
            print(f"Document title: {result.title}", file=sys.stderr)

    except Exception as e:
        # Top-level boundary of the example script: report and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

if __name__ == "__main__":
    convert_document("example.pdf", "example.md")

Build docs developers (and LLMs) love