PII Detection and Obfuscation

Detect and obfuscate PII (Personally Identifiable Information) using Hugging Face NER models or GLiNER.

Overview

This example demonstrates:

Converting PDFs and detecting PII
Using Hugging Face token-classification models
Using GLiNER for fine-grained PII detection
Obfuscating detected entities with stable IDs
Saving original and redacted versions

Basic PII Obfuscation

pii_obfuscate.py

from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Hugging Face model ID used for both the tokenizer and the
# token-classification (NER) weights below.
HF_MODEL = "dslim/bert-base-NER"

# Build NER pipeline: load the tokenizer and model for HF_MODEL, then wrap
# both in a token-classification pipeline callable named `ner`.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForTokenClassification.from_pretrained(HF_MODEL)
ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): "simple" presumably merges token-level predictions into
    # entity groups (results carrying an "entity_group" key) -- confirm
    # against the transformers documentation.
    aggregation_strategy="simple"
)

Convert PDF with Images

Configure Pipeline

Enable image generation to preserve visuals in the output.

Convert Document

Process the PDF and save the original Markdown.

Detect PII

Run the NER model on text elements and table cells.

Obfuscate Entities

Replace detected PII with stable type-based IDs.

# Input PDF and the directory that receives the generated Markdown files.
input_doc_path = Path("document.pdf")
output_dir = Path("scratch")
# Create the output directory (and any missing parents); no error if it
# already exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Enable page and picture image generation so visuals are preserved in the
# output.
pipeline_options = PdfPipelineOptions()
# NOTE(review): presumably the scale factor applied to generated images --
# confirm against the Docling pipeline options reference.
pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Register the options above for PDF inputs only.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Convert the PDF; the result exposes the parsed document as `.document`
# and the input file as `.input.file`.
conv_res = doc_converter.convert(input_doc_path)

# Save the original (un-obfuscated) document as Markdown.
# The file name is built from `conv_res.input.file.name` plus "-orig.md".
# NOTE(review): ImageRefMode.EMBEDDED presumably inlines images into the
# Markdown file -- confirm against the docling-core documentation.
md_filename = output_dir / f"{conv_res.input.file.name}-orig.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

Simple PII Obfuscator

import re
from typing import Dict

class SimplePiiObfuscator:
    """Track PII strings and replace them with stable IDs per entity type.

    The same surface string always maps to the same replacement ID for the
    lifetime of the instance, so repeated mentions across several calls to
    :meth:`obfuscate_text` stay consistent. IDs have the form
    ``"<type>-<n>"`` (for example ``"person-1"``), with one counter per
    coarse entity type.

    Args:
        ner_callable: Callable that takes a string and returns an iterable
            of dicts. Each dict is read for an ``"entity_group"`` (or
            ``"entity"``) label and a ``"word"`` surface string.
    """

    def __init__(self, ner_callable):
        self.ner = ner_callable
        # Surface string -> replacement ID, shared across all calls.
        self.entity_map: Dict[str, str] = {}
        # Per-type running counters used to mint the next ID.
        self.counters: Dict[str, int] = {
            "person": 0,
            "org": 0,
            "location": 0,
            "misc": 0,
        }
        # Map model labels to coarse types
        self.label_map = {
            "PER": "person",
            "PERSON": "person",
            "ORG": "org",
            "ORGANIZATION": "org",
            "LOC": "location",
            "LOCATION": "location",
            "GPE": "location",
            "MISC": "misc",
        }
        # Only entities of these coarse types are obfuscated; everything
        # else (including unknown labels, which map to "misc") is left as is.
        self.allowed_types = {"person", "org", "location"}

    def _next_id(self, typ: str) -> str:
        """Increment the counter for *typ* and return the new ``typ-n`` ID."""
        self.counters[typ] += 1
        return f"{typ}-{self.counters[typ]}"

    def obfuscate_text(self, text: str) -> str:
        """Return *text* with detected entities replaced by stable IDs.

        Args:
            text: Text to scan. Empty or falsy input is returned unchanged
                without calling the NER callable.

        Returns:
            The text with every occurrence of each detected entity string
            replaced by its ID. Text is returned unchanged when no entity
            of an allowed type is found.
        """
        if not text:
            return text

        entities = []
        for r in self.ner(text):
            raw_label = r.get("entity_group") or r.get("entity") or "MISC"
            label = self.label_map.get(raw_label, "misc")
            if label not in self.allowed_types:
                continue
            # `or ""` also covers a "word" key that is present but None.
            word = re.sub(r"\s+", " ", r.get("word") or "").strip()
            if word:
                entities.append((word, label))

        # Map entities to stable IDs, minting a new one on first sight.
        unique_words = {}
        for word, label in entities:
            if word not in self.entity_map:
                self.entity_map[word] = self._next_id(label)
            unique_words[word] = self.entity_map[word]

        if not unique_words:
            # An empty alternation would match the empty string everywhere.
            return text

        # Longest alternatives first so that, at a given position, the
        # longest entity wins over any shorter entity it contains.
        longest_first = sorted(unique_words, key=len, reverse=True)
        # Entity strings were whitespace-normalised above, so let every
        # single space match any whitespace run in the original text
        # (e.g. a name broken across lines).
        pattern = re.compile(
            "|".join(
                r"\s+".join(re.escape(part) for part in word.split(" "))
                for word in longest_first
            )
        )

        def _replacement(match):
            # Normalise the matched text the same way the keys were built.
            surface = re.sub(r"\s+", " ", match.group(0))
            return unique_words[surface]

        # A single pass over the original text: inserted IDs are never
        # rescanned, so a short entity such as "ion" cannot corrupt an
        # already inserted ID such as "location-1".
        return pattern.sub(_replacement, text)

Obfuscate Document

# One obfuscator for the whole document so a given entity string receives
# the same replacement ID everywhere it appears.
obfuscator = SimplePiiObfuscator(ner)

# Walk every item of the converted document and rewrite its text in place:
# text items directly, tables cell by cell. Other item types are skipped.
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TextItem):
        element.text = obfuscator.obfuscate_text(element.text)
    elif isinstance(element, TableItem):
        for cell in element.data.table_cells:
            cell.text = obfuscator.obfuscate_text(cell.text)

# Save obfuscated version
# The file name is built from `conv_res.input.file.name` plus
# "-obfuscated.md".
md_filename = output_dir / f"{conv_res.input.file.name}-obfuscated.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Print entity mapping
# Caution: this prints the original PII strings next to their replacement
# IDs.
print("\nObfuscated Entities:")
for original, replacement in obfuscator.entity_map.items():
    print(f"{original} => {replacement}")

Using GLiNER (Advanced)

from gliner import GLiNER

# Model ID of the GLiNER PII checkpoint to load.
GLINER_MODEL = "urchade/gliner_multi_pii-v1"

# Load the GLiNER model.
model = GLiNER.from_pretrained(GLINER_MODEL)
# Entity labels of interest for PII detection.
# NOTE(review): presumably passed to the model's prediction call; that call
# is not shown in this snippet -- confirm against the full example.
labels = [
    "person",
    "email",
    "phone number",
    "passport number",
    "Social Security Number",
    "driver licence",
    "full address",
    "company",
]

Run via Command Line

# Use Hugging Face NER (default)
python docs/examples/pii_obfuscate.py

# Use GLiNER
python docs/examples/pii_obfuscate.py --engine gliner
# or
PII_ENGINE=gliner python docs/examples/pii_obfuscate.py

Requirements

Hugging Face NER

pip install docling transformers

GLiNER

pip install docling gliner
# For CPU-only environments:
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu

This example is a demonstration only. For production PII detection, use models built specifically for PII and evaluate them thoroughly on your own data.

Conversion

Advanced Processing

RAG & AI Workflows

PII Detection and Obfuscation

Overview

Basic PII Obfuscation

Convert PDF with Images

Simple PII Obfuscator

Obfuscate Document

Using GLiNER (Advanced)

Run via Command Line

Requirements

Hugging Face NER

GLiNER

Build docs developers (and LLMs) love

Conversion

Advanced Processing

RAG & AI Workflows

​Overview

​Basic PII Obfuscation

​Convert PDF with Images

​Simple PII Obfuscator

​Obfuscate Document

​Using GLiNER (Advanced)

​Run via Command Line

​Requirements

​Hugging Face NER

​GLiNER

Build docs developers (and LLMs) love

Overview

Basic PII Obfuscation

Convert PDF with Images

Simple PII Obfuscator

Obfuscate Document

Using GLiNER (Advanced)

Run via Command Line

Requirements

Hugging Face NER

GLiNER