Detect and obfuscate PII (Personally Identifiable Information) using Hugging Face NER models or GLiNER.
Overview
This example demonstrates:
Converting PDFs and detecting PII
Using Hugging Face token-classification models
Using GLiNER for fine-grained PII detection
Obfuscating detected entities with stable IDs
Saving original and redacted versions
Basic PII Obfuscation
from pathlib import Path
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
# Name of the pretrained token-classification checkpoint to load.
HF_MODEL = "dslim/bert-base-NER"

# Load the tokenizer and the model from the same checkpoint so they match.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForTokenClassification.from_pretrained(HF_MODEL)

# Wrap both in a token-classification pipeline. The obfuscator below reads the
# "entity_group" and "word" fields of each result.
# NOTE(review): "simple" aggregation is expected to merge word pieces into
# whole entities - confirm against the transformers pipeline documentation.
ner = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
Convert PDF with Images
Configure Pipeline
Enable image generation to preserve visuals in output.
Convert Document
Process the PDF and save original Markdown.
Detect PII
Run the NER model on text elements and table cells.
Obfuscate Entities
Replace each detected entity with a stable, type-based ID (for example, person-1 or org-2), so the same string always maps to the same ID.
# Input PDF and the directory that receives the Markdown exports.
input_doc_path = Path("document.pdf")
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

# Generate page and picture images during conversion so the Markdown export
# can embed them (see ImageRefMode.EMBEDDED below).
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Apply the image-enabled options to PDF inputs only.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)
conv_res = doc_converter.convert(input_doc_path)

# Save original
# The file name keeps the input's full name, e.g. "document.pdf-orig.md".
md_filename = output_dir / f"{conv_res.input.file.name}-orig.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
Simple PII Obfuscator
import re
from typing import Dict
class SimplePiiObfuscator:
    """Replace PII strings found by an NER callable with stable, typed IDs.

    Each distinct entity string receives an ID of the form ``<type>-<n>``
    (for example ``person-1``) the first time it is seen. The assignment is
    kept in ``entity_map``, so the same string maps to the same ID across
    every call made on one instance.
    """

    def __init__(self, ner_callable):
        """Store the NER callable and initialise the ID bookkeeping.

        Args:
            ner_callable: Callable that takes a text string and returns an
                iterable of dict-like results. Each result is read with
                ``.get`` for ``"entity_group"`` (falling back to
                ``"entity"``) and for ``"word"``.
        """
        self.ner = ner_callable
        # Normalized entity text -> replacement ID already assigned to it.
        self.entity_map: Dict[str, str] = {}
        # Last number handed out for each coarse entity type.
        self.counters: Dict[str, int] = {
            "person": 0,
            "org": 0,
            "location": 0,
            "misc": 0,
        }
        # Map model labels to coarse types
        self.label_map: Dict[str, str] = {
            "PER": "person",
            "PERSON": "person",
            "ORG": "org",
            "ORGANIZATION": "org",
            "LOC": "location",
            "LOCATION": "location",
            "GPE": "location",
            "MISC": "misc",
        }
        # Only these coarse types are replaced; "misc" is detected but kept.
        self.allowed_types = {"person", "org", "location"}

    @staticmethod
    def _normalize(value: str) -> str:
        """Collapse every whitespace run in *value* to one space and strip it."""
        return re.sub(r"\s+", " ", value).strip()

    def _next_id(self, typ: str) -> str:
        """Return the next ID for *typ*, e.g. ``"person-3"``."""
        self.counters[typ] = self.counters.get(typ, 0) + 1
        return f"{typ}-{self.counters[typ]}"

    def obfuscate_text(self, text: str) -> str:
        """Return *text* with every allowed entity replaced by its stable ID.

        Args:
            text: Text to scan. Falsy input (``""`` or ``None``) is returned
                unchanged without calling the NER callable.

        Returns:
            The text with each occurrence of a detected person, organisation
            or location replaced by its ID. New entities are recorded in
            ``entity_map`` as a side effect.
        """
        if not text:
            return text

        # Entities found in this text, in detection order: word -> ID.
        replacements: Dict[str, str] = {}
        for result in self.ner(text):
            raw_label = result.get("entity_group") or result.get("entity") or "MISC"
            label = self.label_map.get(raw_label, "misc")
            if label not in self.allowed_types:
                continue
            word = self._normalize(result.get("word", ""))
            if not word:
                continue
            if word not in self.entity_map:
                self.entity_map[word] = self._next_id(label)
            replacements[word] = self.entity_map[word]

        if not replacements:
            return text

        # Replace longest matches first so a short entity cannot pre-empt a
        # longer one that starts at the same position.
        longest_first = sorted(replacements, key=len, reverse=True)
        # Entity words are whitespace-normalized, while the text may separate
        # the same tokens with line breaks or several spaces; allow any
        # whitespace run between tokens so those occurrences still match.
        pattern = re.compile(
            "|".join(
                r"\s+".join(re.escape(token) for token in word.split(" "))
                for word in longest_first
            )
        )
        # Substitute in one pass: chaining one re.sub per entity would let a
        # later entity (e.g. "son") rewrite an ID inserted earlier
        # ("person-1").
        return pattern.sub(
            lambda match: replacements[self._normalize(match.group(0))], text
        )
Obfuscate Document
# One obfuscator for the whole document, so an entity keeps the same ID in
# every text element and table cell.
obfuscator = SimplePiiObfuscator(ner)

# Rewrite the text of text items and of each table cell in place.
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TextItem):
        element.text = obfuscator.obfuscate_text(element.text)
    elif isinstance(element, TableItem):
        for cell in element.data.table_cells:
            cell.text = obfuscator.obfuscate_text(cell.text)

# Save obfuscated version
md_filename = output_dir / f"{conv_res.input.file.name}-obfuscated.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Print entity mapping
# NOTE: this writes the original PII values to stdout; keep it for inspecting
# the demo only.
print("\nObfuscated Entities:")
for original, replacement in obfuscator.entity_map.items():
    print(f"{original} => {replacement}")
Using GLiNER (Advanced)
GLiNER Setup
GLiNER Obfuscator
from gliner import GLiNER
# Name of the pretrained GLiNER checkpoint to load.
GLINER_MODEL = "urchade/gliner_multi_pii-v1"

# NOTE: this rebinds `model`, which held the Hugging Face model earlier.
model = GLiNER.from_pretrained(GLINER_MODEL)

# Entity labels of interest.
# NOTE(review): presumably passed to the GLiNER model at prediction time;
# the call is not shown in this snippet.
labels = [
    # People and contact details
    "person",
    "email",
    "phone number",
    # Identity documents
    "passport number",
    "Social Security Number",
    "driver licence",
    # Addresses and organisations
    "full address",
    "company",
]
Run via Command Line
# Use Hugging Face NER (default)
python docs/examples/pii_obfuscate.py
# Use GLiNER
python docs/examples/pii_obfuscate.py --engine gliner
# or select the engine through the PII_ENGINE environment variable
# (no spaces around "=", otherwise the shell runs PII_ENGINE as a command)
PII_ENGINE=gliner python docs/examples/pii_obfuscate.py
Requirements
Hugging Face NER
# Install Docling together with Hugging Face Transformers
pip install docling transformers
GLiNER
# Install Docling together with GLiNER
pip install docling gliner
# For CPU-only environments: install torch with the PyTorch CPU wheel index
# added as an extra package source
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
This example is a demonstration only. For production PII detection, use models built specifically for PII and evaluate them thoroughly on your own data.