The PDF skill provides comprehensive PDF processing capabilities including reading and extracting text/tables, combining or splitting PDFs, rotating pages, adding watermarks, creating new PDFs from scratch, filling forms, encrypting/decrypting, extracting images, and performing OCR on scanned documents.
Use this skill for any task involving PDF files - reading, creating, modifying, or converting. For advanced features and detailed examples, consult the skill’s REFERENCE.md file. For PDF form filling, see FORMS.md.
from pypdf import PdfWriter, PdfReader

# Merge several PDFs: copy every page of each input, in order, into one writer.
merged = PdfWriter()
source_paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]

for source_path in source_paths:
    source = PdfReader(source_path)
    for page in source.pages:
        merged.add_page(page)

# Write the combined document in binary mode.
with open("merged.pdf", "wb") as out_file:
    merged.write(out_file)
Split PDF
# Split a PDF into one single-page file per page (page_1.pdf, page_2.pdf, ...).
# PdfReader and PdfWriter are expected to be imported from pypdf already.
source = PdfReader("input.pdf")

for i, page in enumerate(source.pages):
    # A fresh writer per iteration keeps each output file to exactly one page.
    single_page = PdfWriter()
    single_page.add_page(page)

    with open(f"page_{i+1}.pdf", "wb") as out_file:
        single_page.write(out_file)
import pdfplumber

# Print the extracted text of every page, in page order.
with pdfplumber.open("document.pdf") as document:
    for page in document.pages:
        print(page.extract_text())
Extract Tables
# Print every table found on each page, one row per printed line.
# pdfplumber is expected to be imported already.
with pdfplumber.open("document.pdf") as document:
    for i, page in enumerate(document.pages):
        for j, table in enumerate(page.extract_tables()):
            print(f"Table {j+1} on page {i+1}:")
            for row in table:
                print(row)
Advanced Table Extraction to Excel
import pandas as pd

# Turn each non-empty table into a DataFrame; its first row supplies the
# column names and the remaining rows are the data.
# pdfplumber is expected to be imported already.
frames = []
with pdfplumber.open("document.pdf") as document:
    for page in document.pages:
        for table in page.extract_tables():
            if not table:  # Skip empty tables
                continue
            header, *body = table
            frames.append(pd.DataFrame(body, columns=header))

# Combine all tables; nothing is written when no table was found.
if frames:
    combined_df = pd.concat(frames, ignore_index=True)
    combined_df.to_excel("extracted_tables.xlsx", index=False)
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

page_width, page_height = letter
pdf = canvas.Canvas("hello.pdf", pagesize=letter)

# Vertical positions are written as offsets subtracted from the page height,
# i.e. measured down from the top edge of the page.
left_margin = 100
rule_y = page_height - 140

# Add text
pdf.drawString(left_margin, page_height - 100, "Hello World!")
pdf.drawString(left_margin, page_height - 120, "This is a PDF created with reportlab")

# Add a line
pdf.line(left_margin, rule_y, 400, rule_y)

# Save
pdf.save()
Create Multi-Page PDF
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet

styles = getSampleStyleSheet()
doc = SimpleDocTemplate("report.pdf", pagesize=letter)

# The story is the ordered list of content elements handed to doc.build().
story = [
    # Page 1: title, a small gap, then the body text
    Paragraph("Report Title", styles['Title']),
    Spacer(1, 12),
    Paragraph("This is the body of the report. " * 20, styles['Normal']),
    PageBreak(),
    # Page 2
    Paragraph("Page 2", styles['Heading1']),
    Paragraph("Content for page 2", styles['Normal']),
]

# Build PDF
doc.build(story)
Subscripts and Superscripts
IMPORTANT: Never use Unicode subscript/superscript characters (₀₁₂₃, ⁰¹²³) in ReportLab PDFs. Built-in fonts don’t include these glyphs, causing them to render as solid black boxes.
Instead, use ReportLab’s XML markup tags in Paragraph objects:
from reportlab.platypus import Paragraph
from reportlab.lib.styles import getSampleStyleSheet

styles = getSampleStyleSheet()
normal = styles['Normal']  # both examples share the same paragraph style

# Subscripts: wrap the lowered text in a <sub> tag
chemical = Paragraph("H<sub>2</sub>O", normal)

# Superscripts: wrap the raised text in a <super> tag
squared = Paragraph("x<super>2</super> + y<super>2</super>", normal)
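For canvas-drawn text (not Paragraph objects), this markup is not available: draw the subscript or superscript as a separate string in a smaller font size, at a slightly lowered or raised y-position.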
import pytesseract
from pdf2image import convert_from_path

# Convert PDF to images
images = convert_from_path('scanned.pdf')

# OCR each page. The pieces are collected in a list and joined once at the
# end, which avoids rebuilding an ever-growing string on every iteration.
chunks = []
for i, image in enumerate(images):
    chunks.append(f"Page {i+1}:\n")  # page header
    chunks.append(pytesseract.image_to_string(image))
    chunks.append("\n\n")  # blank line between pages

text = "".join(chunks)
print(text)
# Extract embedded images with pdfimages (part of poppler-utils).
# -j saves DCT (JPEG) encoded images as .jpg files.
pdfimages -j input.pdf output_prefix

# Output files are numbered after the prefix:
# output_prefix-000.jpg, output_prefix-001.jpg, etc.