Skip to main content

Function Signature

The zerox() function is an asynchronous API that performs OCR (Optical Character Recognition) to markdown using vision models. It processes PDF files, images, and various document formats and converts them into markdown format.
async def zerox(
    cleanup: bool = True,
    concurrency: int = 10,
    file_path: Optional[str] = "",
    image_density: int = 300,
    image_height: tuple[Optional[int], int] = (None, 1056),
    maintain_format: bool = False,
    model: str = "gpt-4o-mini",
    output_dir: Optional[str] = None,
    temp_dir: Optional[str] = None,
    custom_system_prompt: Optional[str] = None,
    select_pages: Optional[Union[int, Iterable[int]]] = None,
    **kwargs
) -> ZeroxOutput

Import

from pyzerox import zerox

Basic Usage

The Python SDK uses async/await syntax. Make sure to call zerox() within an async function and use asyncio.run() to execute.

Process a PDF from URL

import asyncio
from pyzerox import zerox
import os

# Set up your API key
os.environ["OPENAI_API_KEY"] = "your-api-key"

async def main():
    """OCR a remote PDF to markdown and return the ZeroxOutput result."""
    result = await zerox(
        file_path="https://example.com/document.pdf",
        model="gpt-4o-mini"
    )
    print(result)
    # Return the result so asyncio.run(main()) yields it —
    # previously main() returned None and the assignment below was useless.
    return result

# Run the async function
result = asyncio.run(main())

Process a Local File

import asyncio
from pyzerox import zerox
import os

os.environ["OPENAI_API_KEY"] = "your-api-key"

async def main():
    """Convert a local PDF to markdown, also writing the output to disk."""
    # output_dir makes zerox persist the markdown alongside returning it.
    return await zerox(
        file_path="./path/to/document.pdf",
        model="gpt-4o-mini",
        output_dir="./output"  # Save markdown to file
    )

result = asyncio.run(main())

Process Specific Pages

import asyncio
from pyzerox import zerox

async def main():
    """Run OCR on a subset of the document's pages."""
    # Process only pages 1, 3, and 5 (1-indexed).
    pages_to_process = [1, 3, 5]
    return await zerox(
        file_path="document.pdf",
        model="gpt-4o-mini",
        select_pages=pages_to_process
    )

result = asyncio.run(main())

Maintain Format Across Pages

import asyncio
from pyzerox import zerox

async def main():
    """OCR a PDF while carrying formatting context from page to page.

    Useful for documents with tables spanning multiple pages.
    """
    return await zerox(
        file_path="document.pdf",
        model="gpt-4o-mini",
        maintain_format=True
    )

result = asyncio.run(main())

Using Different Model Providers

The Python SDK uses LiteLLM to support multiple vision model providers.

OpenAI

import asyncio
import os

from pyzerox import zerox

os.environ["OPENAI_API_KEY"] = "your-api-key"

async def main():
    # `await` is only valid inside an async function — the original snippet's
    # bare top-level `await` is a SyntaxError in a plain script.
    return await zerox(
        file_path="document.pdf",
        model="gpt-4o-mini"  # or "gpt-4o"
    )

result = asyncio.run(main())

Azure OpenAI

import asyncio
import os

from pyzerox import zerox

os.environ["AZURE_API_KEY"] = "your-azure-api-key"
os.environ["AZURE_API_BASE"] = "https://example-endpoint.openai.azure.com"
os.environ["AZURE_API_VERSION"] = "2023-05-15"

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="azure/gpt-4o-mini"  # Format: azure/<deployment_name>
    )

result = asyncio.run(main())

Google Gemini

import asyncio
import os

from pyzerox import zerox

os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="gemini/gemini-1.5-flash"  # Format: gemini/<model_name>
    )

result = asyncio.run(main())

Anthropic Claude

import asyncio
import os

from pyzerox import zerox

os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="claude-3-opus-20240229"
    )

result = asyncio.run(main())

AWS Bedrock

import asyncio
import os

from pyzerox import zerox

os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-key"
os.environ["AWS_REGION_NAME"] = "us-east-1"

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0"
    )

result = asyncio.run(main())

Vertex AI

import asyncio
import json

from pyzerox import zerox

# Load service account credentials
with open('path/to/vertex_ai_service_account.json', 'r') as file:
    vertex_credentials = json.load(file)

# zerox expects the credentials as a JSON string, not a dict.
vertex_credentials_json = json.dumps(vertex_credentials)

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="vertex_ai/gemini-1.5-flash-001",
        vertex_credentials=vertex_credentials_json
    )

result = asyncio.run(main())

Custom System Prompts

The Python SDK supports custom system prompts, unlike the Node.js SDK, which uses a fixed prompt.
import asyncio

# DEFAULT_SYSTEM_PROMPT is imported for reference — inspect it to see the
# prompt your custom one replaces.
from pyzerox import zerox, DEFAULT_SYSTEM_PROMPT

# Use custom prompt
custom_prompt = """
Convert this document to markdown.
Focus on extracting tables and charts.
Return only markdown with no explanations.
"""

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="gpt-4o-mini",
        custom_system_prompt=custom_prompt
    )

result = asyncio.run(main())

Advanced Configuration

import asyncio

from pyzerox import zerox

async def main():
    # `await` must run inside an async function (see Basic Usage);
    # the original snippet used an invalid top-level `await`.
    return await zerox(
        file_path="document.pdf",
        model="gpt-4o-mini",

        # Processing options
        concurrency=5,              # Process 5 pages concurrently
        maintain_format=True,       # Maintain formatting across pages
        select_pages=[1, 2, 3],     # Process only specific pages

        # Image conversion options
        image_density=300,          # DPI for image conversion
        image_height=(None, 1056),  # Max height for images

        # File management
        output_dir="./output",      # Save markdown to file
        temp_dir="./temp",          # Custom temp directory
        cleanup=True,               # Clean up temp files after processing

        # Model customization
        custom_system_prompt="...",  # Override default prompt

        # Additional LiteLLM parameters (forwarded via **kwargs)
        temperature=0.1,
        max_tokens=4096
    )

result = asyncio.run(main())

Parameters

Detailed parameter documentation

Response

Response structure and fields

Build docs developers (and LLMs) love