Skip to main content

Overview

Each pipeline function corresponds to a CLI command but takes explicit parameters instead of reading from config files. Use these from Jupyter notebooks, web apps, or anywhere you want to use sift-kg as a library.

run_pipeline

from sift_kg import run_pipeline
Run the full pipeline: extract → build → narrate. Skips resolve/apply-merges (those require human review).

Signature

def run_pipeline(
    doc_dir: Path,
    model: str,
    domain: DomainConfig,
    output_dir: Path,
    max_cost: float | None = None,
    include_narrative: bool = True,
) -> Path

Parameters

doc_dir
Path
required
Directory containing documents (PDF, text, HTML, 75+ formats)
model
str
required
LLM model string (e.g. "openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20241022")
domain
DomainConfig
required
Domain configuration object loaded via load_domain()
output_dir
Path
required
Output directory for all artifacts (extractions, graph, narratives)
max_cost
float | None
default:"None"
Budget cap in USD. Pipeline stops if cost exceeds this limit.
include_narrative
bool
default:"True"
Whether to generate narrative summary at the end

Returns

output_dir
Path
Path to output directory containing all pipeline artifacts

Example

from pathlib import Path
from sift_kg import load_domain, run_pipeline

domain = load_domain(bundled_name="schema-free")

output_dir = run_pipeline(
    doc_dir=Path("./documents"),
    model="openai/gpt-4o-mini",
    domain=domain,
    output_dir=Path("./output"),
    max_cost=10.0,
    include_narrative=True,
)

print(f"Pipeline complete! Check {output_dir}")

run_extract

from sift_kg import run_extract
Extract entities and relations from all documents in a directory.

Signature

def run_extract(
    doc_dir: Path,
    model: str,
    domain: DomainConfig,
    output_dir: Path,
    max_cost: float | None = None,
    concurrency: int = 4,
    chunk_size: int = 10000,
    force: bool = False,
    extractor: str = "kreuzberg",
    ocr: bool = False,
    ocr_backend: str = "tesseract",
    ocr_language: str = "eng",
    rpm: int = 40,
) -> list[DocumentExtraction]

Parameters

doc_dir
Path
required
Directory containing documents to extract from
model
str
required
LLM model string (e.g. "openai/gpt-4o-mini")
domain
DomainConfig
required
Domain configuration
output_dir
Path
required
Where to save extraction JSON files
max_cost
float | None
default:"None"
Budget cap in USD
concurrency
int
default:"4"
Concurrent LLM calls per document
chunk_size
int
default:"10000"
Characters per text chunk. Larger values mean fewer API calls, but each call carries a longer context.
force
bool
default:"False"
Re-extract all documents, ignoring cached results
extractor
str
default:"kreuzberg"
Extraction backend — "kreuzberg" (default) or "pdfplumber"
ocr
bool
default:"False"
Enable OCR for scanned documents
ocr_backend
str
default:"tesseract"
OCR engine — "tesseract", "easyocr", "paddleocr", or "gcv"
ocr_language
str
default:"eng"
OCR language code (ISO 639-3, e.g. "eng", "spa", "fra")
rpm
int
default:"40"
Max requests per minute for rate limiting

Returns

extractions
list[DocumentExtraction]
List of extraction results, one per document

Example

from pathlib import Path
from sift_kg import load_domain, run_extract

domain = load_domain(bundled_name="biomedical")

extractions = run_extract(
    doc_dir=Path("./papers"),
    model="openai/gpt-4o-mini",
    domain=domain,
    output_dir=Path("./output"),
    chunk_size=15000,
    concurrency=8,
    ocr=True,
    max_cost=5.0,
)

print(f"Extracted {len(extractions)} documents")

run_build

from sift_kg import run_build
Build knowledge graph from extraction results. Also flags relations for review and saves the graph.

Signature

def run_build(
    output_dir: Path,
    domain: DomainConfig,
    review_threshold: float = 0.7,
    postprocess: bool = True,
) -> KnowledgeGraph

Parameters

output_dir
Path
required
Directory with extraction JSON files (from run_extract)
domain
DomainConfig
required
Domain configuration (used for review_required types)
review_threshold
float
default:"0.7"
Flag relations below this confidence for human review
postprocess
bool
default:"True"
Whether to remove redundant edges during graph construction

Returns

kg
KnowledgeGraph
Populated knowledge graph saved to output_dir/graph_data.json

Example

from pathlib import Path
from sift_kg import load_domain, run_build

domain = load_domain(bundled_name="schema-free")

kg = run_build(
    output_dir=Path("./output"),
    domain=domain,
    review_threshold=0.6,
    postprocess=True,
)

print(f"Built graph: {kg.entity_count} entities, {kg.relation_count} relations")

run_resolve

from sift_kg import run_resolve
Find duplicate entities using LLM-based resolution. Generates merge proposals for human review.

Signature

def run_resolve(
    output_dir: Path,
    model: str,
    domain: DomainConfig | None = None,
    use_embeddings: bool = False,
    concurrency: int = 4,
    rpm: int = 40,
) -> MergeFile

Parameters

output_dir
Path
required
Directory with graph_data.json
model
str
required
LLM model string for entity comparison
domain
DomainConfig | None
default:"None"
Domain configuration (provides system context for smarter resolution)
use_embeddings
bool
default:"False"
Use semantic clustering for batching candidates (requires sift-kg[embeddings])
concurrency
int
default:"4"
Concurrent LLM calls
rpm
int
default:"40"
Max requests per minute

Returns

merge_file
MergeFile
Merge file with DRAFT proposals saved to output_dir/merge_proposals.yaml

Example

from pathlib import Path
from sift_kg import load_domain, run_resolve

domain = load_domain(bundled_name="biomedical")

merge_file = run_resolve(
    output_dir=Path("./output"),
    model="openai/gpt-4o-mini",
    domain=domain,
    use_embeddings=True,
    concurrency=8,
)

print(f"Found {len(merge_file.proposals)} merge candidates")
print("Review and edit ./output/merge_proposals.yaml")

run_apply_merges

from sift_kg import run_apply_merges
Apply confirmed entity merges and relation rejections after human review.

Signature

def run_apply_merges(output_dir: Path) -> dict

Parameters

output_dir
Path
required
Directory with graph_data.json and review files (merge_proposals.yaml, relation_review.yaml)

Returns

stats
dict
Stats dict with keys:
  • merges_applied (int): Number of entity merges applied
  • rejected_count (int): Number of relations rejected

Example

from pathlib import Path
from sift_kg import run_apply_merges

# After reviewing merge_proposals.yaml and changing status: DRAFT → CONFIRMED
stats = run_apply_merges(output_dir=Path("./output"))

print(f"Applied {stats['merges_applied']} merges")
print(f"Rejected {stats['rejected_count']} relations")

run_narrate

from sift_kg import run_narrate
Generate narrative summary from the knowledge graph using community detection and LLM summarization.

Signature

def run_narrate(
    output_dir: Path,
    model: str,
    system_context: str = "",
    include_entity_descriptions: bool = True,
    max_cost: float | None = None,
    communities_only: bool = False,
) -> Path

Parameters

output_dir
Path
required
Directory with graph_data.json
model
str
required
LLM model string
system_context
str
default:""
Optional domain context injected into LLM prompts
include_entity_descriptions
bool
default:"True"
Generate per-entity descriptions (more expensive)
max_cost
float | None
default:"None"
Budget cap in USD
communities_only
bool
default:"False"
Only regenerate community labels (~$0.01 cost)

Returns

output_path
Path
Path to generated narrative.md or communities.json

Example

from pathlib import Path
from sift_kg import run_narrate

narrative_path = run_narrate(
    output_dir=Path("./output"),
    model="openai/gpt-4o-mini",
    system_context="This is a biomedical research corpus.",
    include_entity_descriptions=True,
    max_cost=2.0,
)

print(f"Narrative saved to {narrative_path}")

run_view

from sift_kg import run_view
Generate interactive graph visualization with optional pre-filters.

Signature

def run_view(
    output_dir: Path,
    to: Path | None = None,
    open_browser: bool = True,
    top_n: int | None = None,
    min_confidence: float | None = None,
    source_doc: str | None = None,
    neighborhood: str | None = None,
    depth: int = 1,
    community: str | None = None,
) -> Path

Parameters

output_dir
Path
required
Directory with graph_data.json
to
Path | None
default:"None"
Output HTML path (default: output_dir/graph.html)
open_browser
bool
default:"True"
Whether to open the visualization in a browser automatically
top_n
int | None
default:"None"
Show only top N entities by degree (useful for large graphs)
min_confidence
float | None
default:"None"
Hide nodes/edges below this confidence threshold
source_doc
str | None
default:"None"
Show only entities from this document
neighborhood
str | None
default:"None"
Center visualization on entity ID (e.g. "person:alice")
depth
int
default:"1"
Number of hops to include around the center entity (only used when neighborhood is set)
community
str | None
default:"None"
Focus on a specific community label

Returns

html_path
Path
Path to generated interactive HTML file

Example

from pathlib import Path
from sift_kg import run_view

# Full graph
html_path = run_view(
    output_dir=Path("./output"),
    min_confidence=0.5,
    open_browser=True,
)

# Neighborhood view
html_path = run_view(
    output_dir=Path("./output"),
    neighborhood="person:alice",
    depth=2,
    open_browser=False,
)

print(f"Visualization: {html_path}")

run_export

from sift_kg import run_export
Export the knowledge graph to various formats.

Signature

def run_export(
    output_dir: Path,
    fmt: str = "json",
    export_path: Path | None = None,
) -> Path

Parameters

output_dir
Path
required
Directory with graph_data.json
fmt
str
default:"json"
Export format — "json", "graphml", "gexf", "csv", or "sqlite"
export_path
Path | None
default:"None"
Where to write output (default: output_dir/graph.{fmt})

Returns

export_path
Path
Path to the exported file, or to a directory of CSV files when fmt="csv"

Example

from pathlib import Path
from sift_kg import run_export

# Export to GraphML for Gephi/Cytoscape
graphml_path = run_export(
    output_dir=Path("./output"),
    fmt="graphml",
)

# Export to SQLite database
db_path = run_export(
    output_dir=Path("./output"),
    fmt="sqlite",
    export_path=Path("./graph.db"),
)

# Export to CSV files (nodes.csv + edges.csv)
csv_dir = run_export(
    output_dir=Path("./output"),
    fmt="csv",
)

print(f"Exported to {graphml_path}, {db_path}, {csv_dir}")

Build docs developers (and LLMs) love