Skip to main content
Detect and obfuscate PII (Personally Identifiable Information) using Hugging Face NER models or GLiNER.

Overview

This example demonstrates:
  • Converting PDFs and detecting PII
  • Using Hugging Face token-classification models
  • Using GLiNER for fine-grained PII detection
  • Obfuscating detected entities with stable IDs
  • Saving original and redacted versions

Basic PII Obfuscation

pii_obfuscate.py
from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Identifier of the checkpoint that both the tokenizer and the
# token-classification model below are loaded from.
HF_MODEL = "dslim/bert-base-NER"

# Build NER pipeline
# `ner` is the callable that is later handed to SimplePiiObfuscator.
# NOTE(review): aggregation_strategy="simple" is presumably what makes each
# result carry an "entity_group" key (the key the obfuscator reads first) —
# confirm against the transformers pipeline documentation.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForTokenClassification.from_pretrained(HF_MODEL)
ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

Convert PDF with Images

1. Configure Pipeline

   Enable image generation to preserve visuals in the output.

2. Convert Document

   Process the PDF and save the original Markdown.

3. Detect PII

   Run the NER model on text elements and table cells.

4. Obfuscate Entities

   Replace detected PII with stable, type-based IDs.

# Input PDF and the directory that receives the Markdown exports.
input_doc_path = Path("document.pdf")
output_dir = Path("scratch")
# Create the output directory, including missing parents; an existing
# directory is not an error.
output_dir.mkdir(parents=True, exist_ok=True)

# Enable image generation so visuals are preserved in the output.
pipeline_options = PdfPipelineOptions()
# NOTE(review): presumably a rendering scale factor for generated images —
# confirm against the docling pipeline options documentation.
pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Register the options for the PDF input format only.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

conv_res = doc_converter.convert(input_doc_path)

# Save original
# The output name is built from the input file's `name` attribute
# (presumably including its extension, e.g. "document.pdf-orig.md" — verify).
md_filename = output_dir / f"{conv_res.input.file.name}-orig.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

Simple PII Obfuscator

import re
from typing import Dict

class SimplePiiObfuscator:
    """Tracks PII strings and replaces with stable IDs per entity type.

    The same entity string always maps to the same replacement ID for the
    lifetime of the instance, so repeated calls (e.g. one per paragraph or
    table cell) produce consistent output.

    Attributes:
        ner: Callable taking a text string and returning an iterable of
            dict-like results. Each result is read for ``"entity_group"``
            (or ``"entity"``) and ``"word"``.
        entity_map: Whitespace-normalized entity string -> replacement ID.
        counters: Number of IDs handed out so far, per coarse type.
        label_map: Model label -> coarse type.
        allowed_types: Coarse types that are actually obfuscated; anything
            else (including unmapped labels, which fall back to "misc") is
            left untouched.
    """

    def __init__(self, ner_callable):
        """Store the NER callable and initialise empty mapping state."""
        self.ner = ner_callable
        self.entity_map: Dict[str, str] = {}
        self.counters: Dict[str, int] = {
            "person": 0,
            "org": 0,
            "location": 0,
            "misc": 0,
        }
        # Map model labels to coarse types
        self.label_map = {
            "PER": "person",
            "PERSON": "person",
            "ORG": "org",
            "ORGANIZATION": "org",
            "LOC": "location",
            "LOCATION": "location",
            "GPE": "location",
            "MISC": "misc",
        }
        self.allowed_types = {"person", "org", "location"}

    def _next_id(self, typ: str) -> str:
        """Return the next sequential ID for ``typ``, e.g. ``"person-1"``."""
        # .get() keeps this working for a type that has no counter yet.
        self.counters[typ] = self.counters.get(typ, 0) + 1
        return f"{typ}-{self.counters[typ]}"

    def obfuscate_text(self, text: str) -> str:
        """Replace every allowed entity found in ``text`` with its stable ID.

        Args:
            text: Text to scan. Falsy input (``""`` or ``None``) is returned
                unchanged without calling the NER callable.

        Returns:
            ``text`` with each detected entity of an allowed type replaced
            by its ID. Entities seen for the first time are registered in
            ``entity_map``.
        """
        if not text:
            return text

        # Entities detected in *this* text, mapped to their stable IDs.
        replacements: Dict[str, str] = {}
        for r in self.ner(text):
            raw_label = r.get("entity_group") or r.get("entity") or "MISC"
            label = self.label_map.get(raw_label, "misc")
            if label not in self.allowed_types:
                continue
            # `or ""` guards against a result whose "word" is None.
            word = re.sub(r"\s+", " ", r.get("word") or "").strip()
            if not word:
                continue
            if word not in self.entity_map:
                self.entity_map[word] = self._next_id(label)
            replacements[word] = self.entity_map[word]

        # Replace longest matches first so that a short entity cannot eat
        # part of a longer one that contains it.
        obfuscated = text
        for old in sorted(replacements, key=len, reverse=True):
            new = replacements[old]
            # `old` was whitespace-normalized above, but the text may hold
            # the same entity with a double space, tab or line break between
            # its tokens. Matching the gaps with \s+ keeps those occurrences
            # from slipping through unredacted.
            pattern = r"\s+".join(re.escape(part) for part in old.split(" "))
            # A callable replacement is inserted literally (no template
            # escapes are interpreted).
            obfuscated = re.sub(pattern, lambda _m, new=new: new, obfuscated)

        return obfuscated

Obfuscate Document

# One obfuscator instance is shared by every element, so its entity map
# (and therefore the IDs) is shared across the whole document.
obfuscator = SimplePiiObfuscator(ner)

# Rewrite the converted document in place: text items directly, tables cell
# by cell. Items of any other type are left untouched.
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TextItem):
        element.text = obfuscator.obfuscate_text(element.text)
    elif isinstance(element, TableItem):
        for cell in element.data.table_cells:
            cell.text = obfuscator.obfuscate_text(cell.text)

# Save obfuscated version
md_filename = output_dir / f"{conv_res.input.file.name}-obfuscated.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Print entity mapping
# NOTE: this writes the original, unredacted strings to stdout.
print("\nObfuscated Entities:")
for original, replacement in obfuscator.entity_map.items():
    print(f"{original} => {replacement}")

Using GLiNER (Advanced)

from gliner import GLiNER

# Identifier of the GLiNER checkpoint to load.
GLINER_MODEL = "urchade/gliner_multi_pii-v1"

# NOTE: this assigns the module-level name `model`, which the Hugging Face
# snippet also assigns; if both run in one module, this binding wins.
model = GLiNER.from_pretrained(GLINER_MODEL)
# Entity labels of interest. This snippet only defines the list; presumably
# it is passed to the GLiNER model at prediction time — verify against the
# full example script.
labels = [
    "person",
    "email",
    "phone number",
    "passport number",
    "Social Security Number",
    "driver licence",
    "full address",
    "company",
]

Run via Command Line

# Use Hugging Face NER (default)
python docs/examples/pii_obfuscate.py

# Use GLiNER
python docs/examples/pii_obfuscate.py --engine gliner
# or select the engine through the PII_ENGINE environment variable
PII_ENGINE=gliner python docs/examples/pii_obfuscate.py

Requirements

Hugging Face NER

pip install docling transformers

GLiNER

pip install docling gliner
# For CPU-only environments:
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu

Note: This example is a demonstration only. For production PII detection, use models built specifically for PII and evaluate them thoroughly before relying on the results.

Build docs developers (and LLMs) love