Translate text elements and table cells in documents while preserving structure and embedded images.
Overview
This example demonstrates:
Converting PDFs with image generation
Translating text elements and table cells
Preserving document structure during translation
Saving original and translated Markdown with embedded images
Basic Translation Setup
from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# Scale factor handed to the PDF pipeline for rendered images.
IMAGE_RESOLUTION_SCALE = 2.0

# Input PDF and the directory that will receive the Markdown output.
input_doc_path = Path("document.pdf")
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

# Configure pipeline to preserve images: set the image scale and request
# both page images and picture images.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)

# Attach the options to the PDF format, then convert the input document.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)
conv_res = doc_converter.convert(input_doc_path)
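Setting images_scale keeps the generated images on the converted document and controls their resolution (a scale of 1.0 corresponds to 72 DPI); generate_page_images and generate_picture_images select which images are produced. Without these settings, images are discarded after conversion to save memory.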
Translation Function
Define Translation Function
Create a translate() function using your preferred translation API.
Iterate Document Elements
Loop through TextItems and TableItems.
Translate Content
Apply translation to text while preserving structure.
Export Results
Save both original and translated documents.
def translate(text: str, src: str = "en", dest: str = "de") -> str:
    """Translate ``text`` from language ``src`` to language ``dest``.

    This is a placeholder that returns ``text`` unchanged. Replace the body
    with a call to your preferred translation service, for example:

    - Google Translate API
    - DeepL API
    - OpenAI API
    - Azure Translator
    - etc.

    Args:
        text: The text to translate.
        src: Source language code (defaults to ``"en"``).
        dest: Target language code (defaults to ``"de"``).

    Returns:
        The translated text. The placeholder returns ``text`` as-is.
    """
    # Example with googletrans (uncomment to use):
    # from googletrans import Translator
    # translator = Translator()
    # translated = translator.translate(text, src=src, dest=dest)
    # return translated.text

    # Placeholder - replace with actual translation.
    # The function is called once per text element and table cell, so a
    # print() here would flood stdout. warnings.warn goes through the warnings
    # machinery instead, which by default reports each call site only once.
    import warnings

    warnings.warn("Using placeholder translation function", stacklevel=2)
    return text
Translate Document
conv_doc = conv_res.document
# Use the stem (file name without extension) so the outputs are named
# "document-orig.md" rather than "document.pdf-orig.md".
doc_filename = conv_res.input.file.stem

# Save original
md_filename = output_dir / f"{doc_filename}-orig.md"
conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Translate all text elements in place; the document structure is untouched
# because only the text fields of existing items are replaced.
for element, _level in conv_doc.iterate_items():
    if isinstance(element, TextItem):
        element.orig = element.text  # Store original
        # Skip blank text: nothing to translate, and it saves an API call.
        if element.text.strip():
            element.text = translate(text=element.text, src="en", dest="de")
    elif isinstance(element, TableItem):
        for cell in element.data.table_cells:
            # Skip empty cells instead of sending them to the translator.
            if cell.text.strip():
                cell.text = translate(text=cell.text, src="en", dest="de")

# Save translated version
md_filename = output_dir / f"{doc_filename}-translated.md"
conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
Translation API Examples
Google Translate
DeepL
OpenAI
Azure Translator
from googletrans import Translator
def translate(text: str, src: str = "en", dest: str = "de") -> str:
    """Translate ``text`` from ``src`` to ``dest`` using googletrans.

    A fresh ``Translator`` is created on every call; the ``text`` attribute
    of the result it returns is handed back to the caller.
    """
    result = Translator().translate(text, src=src, dest=dest)
    return result.text
Complete Example
from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
def translate(text: str, src: str = "en", dest: str = "de") -> str:
    """Placeholder translator: hand ``text`` back unchanged.

    Swap the body for a call to a real translation service. ``src`` and
    ``dest`` are accepted for that purpose but are not used by this stub.
    """
    # Your translation code here
    return text
def translate_document(
    input_path: Path,
    output_dir: Path,
    src: str = "en",
    dest: str = "de",
) -> Path:
    """Translate a PDF document and save both versions as Markdown.

    The PDF is converted with page and picture images enabled, exported as
    ``<stem>-<src>.md``, translated in place (text items and table cells),
    and exported again as ``<stem>-<dest>.md``. Both exports embed images.

    Args:
        input_path: Path of the PDF to convert.
        output_dir: Directory for the Markdown files; created if missing.
        src: Source language code, also used in the original's file name.
        dest: Target language code, also used in the translation's file name.

    Returns:
        Path of the translated Markdown file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configure pipeline
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    # Convert document
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        },
    )
    conv_res = doc_converter.convert(input_path)
    document = conv_res.document
    # Use the stem so outputs are "document-en.md", not "document.pdf-en.md".
    doc_filename = conv_res.input.file.stem

    # Save original
    orig_path = output_dir / f"{doc_filename}-{src}.md"
    document.save_as_markdown(orig_path, image_mode=ImageRefMode.EMBEDDED)
    print(f"Saved original: {orig_path}")

    # Translate in place; only text fields change, structure is preserved.
    for element, _level in document.iterate_items():
        if isinstance(element, TextItem):
            # Keep the source-language text before overwriting it.
            element.orig = element.text
            # Blank text has nothing to translate; skip the API call.
            if element.text.strip():
                element.text = translate(element.text, src=src, dest=dest)
        elif isinstance(element, TableItem):
            for cell in element.data.table_cells:
                # Skip empty cells instead of sending them to the translator.
                if cell.text.strip():
                    cell.text = translate(cell.text, src=src, dest=dest)

    # Save translated
    trans_path = output_dir / f"{doc_filename}-{dest}.md"
    document.save_as_markdown(trans_path, image_mode=ImageRefMode.EMBEDDED)
    print(f"Saved translated: {trans_path}")
    return trans_path
if __name__ == "__main__":
    # Script entry point: translate the sample PDF from English to German
    # and write the Markdown output into the "scratch" directory.
    translate_document(Path("document.pdf"), Path("scratch"), src="en", dest="de")
Requirements
Python 3.9+
docling package
Translation API library (googletrans, deepl, openai, etc.)
# Install Docling
pip install docling
# Install translation library of choice (the commands below are
# alternatives; install the one your translate() implementation uses)
pip install googletrans==4.0.0rc1 # Google Translate
# or
pip install deepl # DeepL
# or
pip install openai # OpenAI