Overview
Each pipeline function corresponds to a CLI command but takes explicit parameters instead of reading from config files. Use these from Jupyter notebooks, web apps, or anywhere you want sift-kg as a library.
run_pipeline
from sift_kg import run_pipeline
Run the full pipeline: extract → build → narrate. Skips resolve/apply-merges (those require human review).
Signature
def run_pipeline(
doc_dir: Path,
model: str,
domain: DomainConfig,
output_dir: Path,
max_cost: float | None = None,
include_narrative: bool = True,
) -> Path
Parameters
Directory containing documents (PDF, text, HTML, 75+ formats)
LLM model string (e.g. "openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20241022")
Domain configuration object loaded via load_domain()
Output directory for all artifacts (extractions, graph, narratives)
max_cost
float | None
default: None
Budget cap in USD. Pipeline stops if cost exceeds this limit.
Whether to generate narrative summary at the end
Returns
Path to output directory containing all pipeline artifacts
Example
from pathlib import Path
from sift_kg import load_domain, run_pipeline
domain = load_domain(bundled_name="schema-free")
output_dir = run_pipeline(
doc_dir=Path("./documents"),
model="openai/gpt-4o-mini",
domain=domain,
output_dir=Path("./output"),
max_cost=10.0,
include_narrative=True,
)
print(f"Pipeline complete! Check {output_dir}")
run_extract
from sift_kg import run_extract
Extract entities and relations from all documents in a directory.
Signature
def run_extract(
doc_dir: Path,
model: str,
domain: DomainConfig,
output_dir: Path,
max_cost: float | None = None,
concurrency: int = 4,
chunk_size: int = 10000,
force: bool = False,
extractor: str = "kreuzberg",
ocr: bool = False,
ocr_backend: str = "tesseract",
ocr_language: str = "eng",
rpm: int = 40,
) -> list[DocumentExtraction]
Parameters
Directory containing documents to extract from
LLM model string (e.g. "openai/gpt-4o-mini")
Where to save extraction JSON files
max_cost
float | None
default: None
Budget cap in USD
Concurrent LLM calls per document
Characters per text chunk. Larger = fewer API calls but longer context.
Re-extract all documents, ignoring cached results
Extraction backend — "kreuzberg" (default) or "pdfplumber"
Enable OCR for scanned documents
OCR engine — "tesseract", "easyocr", "paddleocr", or "gcv"
OCR language code (ISO 639-3, e.g. "eng", "spa", "fra")
Max requests per minute for rate limiting
Returns
List of extraction results, one per document
Example
from pathlib import Path
from sift_kg import load_domain, run_extract
domain = load_domain(bundled_name="biomedical")
extractions = run_extract(
doc_dir=Path("./papers"),
model="openai/gpt-4o-mini",
domain=domain,
output_dir=Path("./output"),
chunk_size=15000,
concurrency=8,
ocr=True,
max_cost=5.0,
)
print(f"Extracted {len(extractions)} documents")
run_build
from sift_kg import run_build
Build knowledge graph from extraction results. Also flags relations for review and saves the graph.
Signature
def run_build(
output_dir: Path,
domain: DomainConfig,
review_threshold: float = 0.7,
postprocess: bool = True,
) -> KnowledgeGraph
Parameters
Directory with extraction JSON files (from run_extract)
Domain configuration (used for review_required types)
Flag relations below this confidence for human review
Whether to remove redundant edges during graph construction
Returns
Populated knowledge graph saved to output_dir/graph_data.json
Example
from pathlib import Path
from sift_kg import load_domain, run_build
domain = load_domain(bundled_name="schema-free")
kg = run_build(
output_dir=Path("./output"),
domain=domain,
review_threshold=0.6,
postprocess=True,
)
print(f"Built graph: {kg.entity_count} entities, {kg.relation_count} relations")
run_resolve
from sift_kg import run_resolve
Find duplicate entities using LLM-based resolution. Generates merge proposals for human review.
Signature
def run_resolve(
output_dir: Path,
model: str,
domain: DomainConfig | None = None,
use_embeddings: bool = False,
concurrency: int = 4,
rpm: int = 40,
) -> MergeFile
Parameters
Directory with graph_data.json
LLM model string for entity comparison
domain
DomainConfig | None
default: None
Domain configuration (provides system context for smarter resolution)
Use semantic clustering for batching candidates (requires sift-kg[embeddings])
Concurrent LLM calls for entity comparison
Max requests per minute for rate limiting
Returns
Merge file with DRAFT proposals saved to output_dir/merge_proposals.yaml
Example
from pathlib import Path
from sift_kg import load_domain, run_resolve
domain = load_domain(bundled_name="biomedical")
merge_file = run_resolve(
output_dir=Path("./output"),
model="openai/gpt-4o-mini",
domain=domain,
use_embeddings=True,
concurrency=8,
)
print(f"Found {len(merge_file.proposals)} merge candidates")
print("Review and edit ./output/merge_proposals.yaml")
run_apply_merges
from sift_kg import run_apply_merges
Apply confirmed entity merges and relation rejections after human review.
Signature
def run_apply_merges(output_dir: Path) -> dict
Parameters
Directory with graph_data.json and review files (merge_proposals.yaml, relation_review.yaml)
Returns
Stats dict with keys:
merges_applied (int): Number of entity merges applied
rejected_count (int): Number of relations rejected
Example
from pathlib import Path
from sift_kg import run_apply_merges
# After reviewing merge_proposals.yaml and changing status: DRAFT → CONFIRMED
stats = run_apply_merges(output_dir=Path("./output"))
print(f"Applied {stats['merges_applied']} merges")
print(f"Rejected {stats['rejected_count']} relations")
run_narrate
from sift_kg import run_narrate
Generate narrative summary from the knowledge graph using community detection and LLM summarization.
Signature
def run_narrate(
output_dir: Path,
model: str,
system_context: str = "",
include_entity_descriptions: bool = True,
max_cost: float | None = None,
communities_only: bool = False,
) -> Path
Parameters
Directory with graph_data.json
Optional domain context injected into LLM prompts
include_entity_descriptions
bool
default: True
Generate per-entity descriptions (more expensive)
max_cost
float | None
default: None
Budget cap in USD
Only regenerate community labels (~$0.01 cost)
Returns
Path to generated narrative.md or communities.json
Example
from pathlib import Path
from sift_kg import run_narrate
narrative_path = run_narrate(
output_dir=Path("./output"),
model="openai/gpt-4o-mini",
system_context="This is a biomedical research corpus.",
include_entity_descriptions=True,
max_cost=2.0,
)
print(f"Narrative saved to {narrative_path}")
run_view
from sift_kg import run_view
Generate interactive graph visualization with optional pre-filters.
Signature
def run_view(
output_dir: Path,
to: Path | None = None,
open_browser: bool = True,
top_n: int | None = None,
min_confidence: float | None = None,
source_doc: str | None = None,
neighborhood: str | None = None,
depth: int = 1,
community: str | None = None,
) -> Path
Parameters
Directory with graph_data.json
to
Path | None
default: None
Output HTML path (default: output_dir/graph.html)
Whether to open the visualization in a browser automatically
Show only top N entities by degree (useful for large graphs)
min_confidence
float | None
default: None
Hide nodes/edges below this confidence threshold
Show only entities from this document
Center visualization on entity ID (e.g. "person:alice")
Number of hops for neighborhood filter (used with neighborhood)
Focus on a specific community label
Returns
Path to generated interactive HTML file
Example
from pathlib import Path
from sift_kg import run_view
# Full graph
html_path = run_view(
output_dir=Path("./output"),
min_confidence=0.5,
open_browser=True,
)
# Neighborhood view
html_path = run_view(
output_dir=Path("./output"),
neighborhood="person:alice",
depth=2,
open_browser=False,
)
print(f"Visualization: {html_path}")
run_export
from sift_kg import run_export
Export the knowledge graph to various formats.
Signature
def run_export(
output_dir: Path,
fmt: str = "json",
export_path: Path | None = None,
) -> Path
Parameters
Directory with graph_data.json
Export format — "json", "graphml", "gexf", "csv", or "sqlite"
export_path
Path | None
default: None
Where to write output (default: output_dir/graph.{fmt})
Returns
Path to the exported file or directory (for CSV format)
Example
from pathlib import Path
from sift_kg import run_export
# Export to GraphML for Gephi/Cytoscape
graphml_path = run_export(
output_dir=Path("./output"),
fmt="graphml",
)
# Export to SQLite database
db_path = run_export(
output_dir=Path("./output"),
fmt="sqlite",
export_path=Path("./graph.db"),
)
# Export to CSV files (nodes.csv + edges.csv)
csv_dir = run_export(
output_dir=Path("./output"),
fmt="csv",
)
print(f"Exported to {graphml_path}, {db_path}, {csv_dir}")