Nectr builds a Neo4j knowledge graph that tracks code ownership, file relationships, and PR history. The graph is used to answer questions like:
Who are the experts on these files?
What past PRs touched the same code?
Which files are high-risk (frequent bug fixes)?
Which files are dead (never touched by any PR)?
The graph is built incrementally: Repository + File nodes are created when you connect a repo, and PullRequest + Developer nodes are added after each review.
The knowledge graph provides structural context (who touched what) while Mem0 provides semantic context (what patterns exist). Together they power Nectr’s contextual intelligence.
Called when you connect a repo (and on manual rescan). Fetches the full recursive file tree from GitHub and creates Repository + File nodes + CONTAINS edges.
# app/services/graph_builder.py:93-194
async def build_repo_graph(
    owner: str,
    repo: str,
    access_token: str,
    batch_size: int = 200,
) -> int:
    """Build (or refresh) the Repository/File subgraph for one repository.

    Called when a repo is connected and on manual rescan. Fetches the full
    recursive file tree from GitHub, upserts a Repository node and one File
    node per blob (linked via CONTAINS), then removes File nodes whose paths
    are no longer present in the tree.

    Args:
        owner: GitHub owner/organization login.
        repo: Repository name.
        access_token: GitHub token used for the tree fetch.
        batch_size: Number of File nodes upserted per Cypher round-trip.
            Defaults to 200 (the previous hard-coded value).

    Returns:
        Number of blobs (files) found in the repository tree.
    """
    repo_full_name = f"{owner}/{repo}"

    # Fetch recursive tree from GitHub.
    # NOTE(review): GitHub truncates very large recursive trees; if
    # _fetch_file_tree returns a partial set, the stale-delete below would
    # remove File nodes for files that still exist — confirm truncation is
    # detected upstream before relying on the cleanup.
    blobs = await _fetch_file_tree(owner, repo, access_token)

    async with get_session() as session:
        # Upsert the Repository node and stamp the scan time (UTC ISO-8601).
        await session.run(
            """
            MERGE (r:Repository {full_name: $full_name})
            SET r.scanned_at = $now
            """,
            full_name=repo_full_name,
            now=datetime.now(timezone.utc).isoformat(),
        )

        # Batch-upsert File nodes so each Cypher payload stays bounded.
        for i in range(0, len(blobs), batch_size):
            chunk = blobs[i : i + batch_size]
            files_data = [
                {
                    "path": b["path"],
                    "language": _lang_from_path(b["path"]),
                    "size": b.get("size", 0),
                }
                for b in chunk
            ]
            await session.run(
                """
                UNWIND $files AS f
                MERGE (file:File {repo: $repo, path: f.path})
                SET file.language = f.language,
                    file.size = f.size
                WITH file
                MERGE (r:Repository {full_name: $repo})
                MERGE (r)-[:CONTAINS]->(file)
                """,
                repo=repo_full_name,
                files=files_data,
            )

        # Remove stale File nodes (files deleted from the repo).
        # DETACH DELETE also drops any TOUCHES edges historical PRs had to
        # these files.
        current_paths = [b["path"] for b in blobs]
        await session.run(
            """
            MATCH (r:Repository {full_name: $repo})-[:CONTAINS]->(f:File)
            WHERE NOT f.path IN $paths
            DETACH DELETE f
            """,
            repo=repo_full_name,
            paths=current_paths,
        )

    return len(blobs)
Performance: Processes 200 files per batch. A 5000-file repo takes ~3 seconds.
GitHub caps the recursive tree API at ~100k entries, so for monorepos the returned tree may be truncated. Nectr logs a warning but continues with the partial set — note that building from a truncated tree means the stale-file cleanup can delete File nodes for files that still exist in the repository.
Called after each PR review. Creates PullRequest + Developer nodes and edges (TOUCHES, AUTHORED_BY, CLOSES, CONTRIBUTED_TO).
# app/services/graph_builder.py:197-288
async def index_pr(
    repo_full_name: str,
    pr_number: int,
    title: str,
    author: str,
    files_changed: list[str],
    verdict: str,
    issue_numbers: list[int] | None = None,
) -> None:
    """Record one reviewed PR in the knowledge graph.

    Upserts the PullRequest node, then adds its edges: AUTHORED_BY and
    CONTRIBUTED_TO for the author, TOUCHES for each changed file, and
    CLOSES for each linked issue number.
    """
    reviewed_at = datetime.now(timezone.utc).isoformat()

    async with get_session() as session:
        # The PullRequest node must exist before the edge queries below,
        # since they MATCH on it rather than MERGE it.
        await session.run(
            """
            MERGE (pr:PullRequest {repo: $repo, number: $number})
            SET pr.title = $title,
                pr.author = $author,
                pr.verdict = $verdict,
                pr.reviewed_at = $now
            """,
            repo=repo_full_name,
            number=pr_number,
            title=title,
            author=author,
            verdict=verdict,
            now=reviewed_at,
        )

        # Developer node plus AUTHORED_BY / CONTRIBUTED_TO edges.
        # Repository is MERGEd (not MATCHed) so the edge is never dropped
        # when indexing races the initial repo scan.
        if author:
            await session.run(
                """
                MERGE (d:Developer {login: $login})
                WITH d
                MATCH (pr:PullRequest {repo: $repo, number: $number})
                MERGE (pr)-[:AUTHORED_BY]->(d)
                WITH d
                MERGE (r:Repository {full_name: $repo})
                MERGE (d)-[:CONTRIBUTED_TO]->(r)
                """,
                login=author,
                repo=repo_full_name,
                number=pr_number,
            )

        # One TOUCHES edge per changed file; File nodes are created on
        # demand so PRs touching brand-new files still link correctly.
        if files_changed:
            touched = [
                {"path": p, "language": _lang_from_path(p)} for p in files_changed
            ]
            await session.run(
                """
                UNWIND $files AS f
                MATCH (pr:PullRequest {repo: $repo, number: $number})
                MERGE (file:File {repo: $repo, path: f.path})
                ON CREATE SET file.language = f.language
                MERGE (pr)-[:TOUCHES]->(file)
                """,
                repo=repo_full_name,
                number=pr_number,
                files=touched,
            )

        # CLOSES edges for any issues this PR resolves.
        if issue_numbers:
            await session.run(
                """
                UNWIND $issue_nums AS issue_num
                MERGE (i:Issue {repo: $repo, number: issue_num})
                WITH i, issue_num
                MATCH (pr:PullRequest {repo: $repo, number: $pr_num})
                MERGE (pr)-[:CLOSES]->(i)
                """,
                repo=repo_full_name,
                issue_nums=issue_numbers,
                pr_num=pr_number,
            )
Race condition fix: Uses MERGE for Repository node (not MATCH) so CONTRIBUTED_TO edge is never silently dropped if index_pr() runs before build_repo_graph() completes.
Returns developers who have most frequently touched the given files.
# app/services/graph_builder.py:294-324
async def get_file_experts(
    repo_full_name: str,
    file_paths: list[str],
    top_k: int = 5,
) -> list[dict]:
    """Return the developers who have most frequently touched the given files.

    Counts AUTHORED_BY/TOUCHES co-occurrences, so a developer whose PRs
    touched several of the requested paths is counted once per touch.
    """
    query = """
        UNWIND $paths AS path
        MATCH (pr:PullRequest {repo: $repo})-[:TOUCHES]->(f:File {repo: $repo, path: path})
        MATCH (pr)-[:AUTHORED_BY]->(d:Developer)
        RETURN d.login AS login, count(*) AS touch_count
        ORDER BY touch_count DESC
        LIMIT $top_k
    """
    async with get_session() as session:
        cursor = await session.run(
            query,
            repo=repo_full_name,
            paths=file_paths,
            top_k=top_k,
        )
        experts: list[dict] = []
        async for record in cursor:
            experts.append(
                {"login": record["login"], "touch_count": record["touch_count"]}
            )
        return experts
Returns past PRs that touched the same files (structural similarity).
# app/services/graph_builder.py:327-374
async def get_related_prs(
    repo_full_name: str,
    file_paths: list[str],
    exclude_pr: int | None = None,
    top_k: int = 5,
) -> list[dict]:
    """Return past PRs that touched the same files (structural similarity).

    PRs are ranked by how many of the given paths they touched. Pass the
    current PR's number as ``exclude_pr`` so it does not match itself; PRs
    without a verdict (not yet reviewed) are skipped.
    """
    query = """
        UNWIND $paths AS path
        MATCH (pr:PullRequest {repo: $repo})-[:TOUCHES]->(f:File {repo: $repo, path: path})
        WHERE ($exclude IS NULL OR pr.number <> $exclude)
          AND pr.verdict IS NOT NULL
        WITH pr, count(DISTINCT f) AS overlap
        ORDER BY overlap DESC
        LIMIT $top_k
        RETURN pr.number AS number, pr.title AS title,
               pr.author AS author, pr.verdict AS verdict, overlap
    """
    async with get_session() as session:
        cursor = await session.run(
            query,
            repo=repo_full_name,
            paths=file_paths,
            exclude=exclude_pr,
            top_k=top_k,
        )
        related: list[dict] = []
        async for record in cursor:
            related.append(
                {
                    "number": record["number"],
                    "title": record["title"],
                    "author": record["author"],
                    "verdict": record["verdict"],
                    "overlap": record["overlap"],
                }
            )
        return related
Batch-fetch Issue nodes and which PRs closed them.
# app/services/graph_builder.py:377-413
async def get_linked_issues(
    repo_full_name: str,
    issue_numbers: list[int],
) -> list[dict]:
    """Batch-fetch Issue nodes and which PRs closed them.

    Returns one dict per requested number:
        {"number": int, "found": bool, "closed_by": list[int]}

    ``found`` is False when no Issue node exists for that number. The query
    already computed this flag but the previous implementation dropped it,
    so callers could not tell "unknown issue" apart from "known issue that
    no PR has closed" (both returned an empty ``closed_by``). Adding the key
    is backward-compatible for existing callers.
    """
    async with get_session() as session:
        result = await session.run(
            """
            UNWIND $nums AS num
            OPTIONAL MATCH (i:Issue {repo: $repo, number: num})
            OPTIONAL MATCH (pr:PullRequest)-[:CLOSES]->(i)
            RETURN num, i IS NOT NULL AS found, collect(pr.number) AS closed_by
            """,
            repo=repo_full_name,
            nums=issue_numbers,
        )
        return [
            {"number": r["num"], "found": r["found"], "closed_by": r["closed_by"]}
            async for r in result
        ]
Files touched by the most PRs (high churn / high importance).
// Files touched by the most PRs (high churn / high importance).
MATCH (pr:PullRequest {repo: $repo})-[:TOUCHES]->(f:File)
RETURN f.path AS path, f.language AS language, count(pr) AS pr_count
ORDER BY pr_count DESC LIMIT 10
Files that repeatedly get REQUEST_CHANGES verdict (fragile, needs attention).
// Files that repeatedly drew a REQUEST_CHANGES verdict (fragile areas).
MATCH (pr:PullRequest {repo: $repo, verdict: "REQUEST_CHANGES"})-[:TOUCHES]->(f:File)
RETURN f.path AS path, f.language AS language, count(pr) AS risk_count
ORDER BY risk_count DESC LIMIT 8
For each heavily-touched file, who is the dominant contributor.
// For each heavily-touched file, the single dominant contributor.
// Rows are ordered (path, touches DESC) before collect(), so element [0]
// of each per-path collection is the top contributor for that file.
MATCH (pr:PullRequest {repo: $repo})-[:AUTHORED_BY]->(d:Developer),
      (pr)-[:TOUCHES]->(f:File)
WITH f.path AS path, d.login AS dev, count(*) AS touches
ORDER BY path, touches DESC
WITH path,
     collect({dev: dev, touches: touches})[0] AS top_owner,
     sum(touches) AS total_touches
// Skip one-off files: at least 2 total touches required.
WHERE total_touches >= 2
RETURN path, top_owner.dev AS owner, top_owner.touches AS owner_touches, total_touches
ORDER BY total_touches DESC LIMIT 10
Per developer: which top-level directories they contribute to most.
// Per developer: the top-level directories they contribute to most.
// Files directly in the repo root are bucketed under '(root)'.
MATCH (pr:PullRequest {repo: $repo})-[:AUTHORED_BY]->(d:Developer),
      (pr)-[:TOUCHES]->(f:File)
WITH d.login AS dev,
     CASE WHEN size(split(f.path, '/')) > 1
          THEN split(f.path, '/')[0]
          ELSE '(root)' END AS directory,
     count(*) AS touches
ORDER BY dev, touches DESC
// [0..4] keeps each developer's four most-touched directories.
WITH dev,
     collect({directory: directory, touches: touches})[0..4] AS top_dirs,
     sum(touches) AS total_touches
RETURN dev, top_dirs, total_touches
ORDER BY total_touches DESC LIMIT 8
Nectr creates constraints and indexes on startup to ensure fast queries:
// Unique constraints (each also backs an index on the constrained keys)
CREATE CONSTRAINT repo_full_name IF NOT EXISTS FOR (r:Repository) REQUIRE r.full_name IS UNIQUE
CREATE CONSTRAINT file_repo_path IF NOT EXISTS FOR (f:File) REQUIRE (f.repo, f.path) IS UNIQUE
CREATE CONSTRAINT pr_repo_number IF NOT EXISTS FOR (pr:PullRequest) REQUIRE (pr.repo, pr.number) IS UNIQUE
CREATE CONSTRAINT dev_login IF NOT EXISTS FOR (d:Developer) REQUIRE d.login IS UNIQUE
CREATE CONSTRAINT issue_repo_number IF NOT EXISTS FOR (i:Issue) REQUIRE (i.repo, i.number) IS UNIQUE

// Additional indexes for common queries
CREATE INDEX file_language IF NOT EXISTS FOR (f:File) ON (f.language)
CREATE INDEX pr_verdict IF NOT EXISTS FOR (pr:PullRequest) ON (pr.verdict)