Overview
Artifact Miner uses the DeepRepoAnalyzer to extract developer skills from repository artifacts through multiple signal sources. The system combines regex pattern matching, dependency analysis, file structure inspection, git history mining, and infrastructure detection to build a comprehensive skill profile.

DeepRepoAnalyzer Architecture
Core Components
The analyzer orchestrates multiple signal extractors:class DeepRepoAnalyzer:
"""Per-repo analyzer that relies on user additions for attribution."""
def __init__(self, enable_llm: bool = False) -> None:
self.extractor = SkillExtractor(enable_llm=enable_llm)
self._validate_insight_rules()
def analyze(
self,
repo_path: str,
repo_stat: Any,
user_email: str,
user_contributions: Dict | None = None,
consent_level: str = "none",
user_stats: Any = None,
) -> DeepAnalysisResult:
"""Run baseline skill extraction, then derive insights."""
skills = self.extractor.extract_skills(
repo_path=repo_path,
repo_stat=repo_stat,
user_email=user_email,
user_contributions=user_contributions or {},
consent_level=consent_level,
)
insights = self._derive_insights(skills)
git_stats = self._extract_git_stats(
repo_path, user_email, user_contributions, user_stats
)
infra_signals = self._extract_infra_signals(repo_path, user_contributions)
repo_quality = self._extract_repo_quality(repo_path, user_contributions)
return DeepAnalysisResult(
skills=skills,
insights=insights,
git_stats=git_stats,
infra_signals=infra_signals,
repo_quality=repo_quality,
)
Data Models
# ExtractedSkill:

@dataclass
class ExtractedSkill:
    """A single skill finding with supporting evidence and a proficiency score."""

    skill: str
    category: str
    evidence: List[str] = field(default_factory=list)
    proficiency: float = 0.0

    def add_evidence(self, items: Iterable[str]) -> None:
        """Append evidence snippets, skipping anything already recorded."""
        seen = set(self.evidence)
        for snippet in items:
            if snippet in seen:
                continue
            self.evidence.append(snippet)
            seen.add(snippet)
@dataclass
class DeepAnalysisResult:
    """Bundle of baseline skills plus every derived analysis layer."""

    skills: List[ExtractedSkill]  # baseline extraction output
    insights: List[Insight]  # higher-order combinations of skills
    git_stats: GitStatsResult | None = None  # contribution metrics, when computed
    infra_signals: InfraSignalsResult | None = None  # CI/Docker/build detection
    repo_quality: RepoQualityResult | None = None  # tests/docs/tooling signals
Skill Signals
1. Code Signals
Detect skills through regex pattern matching on user code additions:def iter_code_pattern_hits(additions_text: str, ecosystems: Set[str]) -> Iterator[Tuple[CodePattern, int]]:
"""Yield patterns and hit counts for additions text, respecting ecosystem gates."""
for pattern in CODE_REGEX_PATTERNS:
if pattern.ecosystems:
if not ecosystems.intersection(set(pattern.ecosystems)):
continue
hits = len(re.findall(pattern.regex, additions_text, flags=re.MULTILINE))
if hits:
yield pattern, hits
# Regex-based skill detectors applied to a user's code additions.
CODE_REGEX_PATTERNS = [
    # Concurrency: async/await keywords and asyncio usage.
    CodePattern(
        skill="Asynchronous Programming",
        category="Concurrency",
        regex=r"\b(async def|await|asyncio)\b",
        ecosystems=["python"],
    ),
    # Web: FastAPI/Flask-style route decorators.
    CodePattern(
        skill="REST API Design",
        category="Web Development",
        regex=r"@(app|router)\.(get|post|put|delete|patch)",
        ecosystems=["python"],
    ),
    # Error handling: custom exception classes derived from Exception.
    CodePattern(
        skill="Exception Design",
        category="Error Handling",
        regex=r"class\s+\w+Error\(.*Exception\):",
        ecosystems=["python"],
    ),
]
for pattern, hits in iter_code_pattern_hits(additions_text, ecosystems):
    skill = get_or_create_skill(pattern.skill, pattern.category)
    # Keep up to three matched snippets as concrete evidence for the skill.
    evidence_snippets = re.findall(pattern.regex, additions_text)[:3]
    skill.add_evidence(evidence_snippets)
    # Each hit batch nudges the proficiency score upward.
    skill.proficiency += calculate_proficiency(hits)
2. Dependency Signals
# Detect skills from package manager files: Python (requirements.txt,
# pyproject.toml, Pipfile) and JavaScript (package.json).
# NOTE: this originally appeared as two separate ``DEPENDENCY_PATTERNS``
# assignments; the second silently overwrote the first, dropping every
# Python entry. They are merged into a single mapping here.
DEPENDENCY_PATTERNS = {
    # Python ecosystem
    "FastAPI": {"packages": ["fastapi"], "category": "Web Frameworks"},
    "SQLAlchemy": {"packages": ["sqlalchemy"], "category": "Database ORMs"},
    "Pydantic": {"packages": ["pydantic"], "category": "Data Validation"},
    # JavaScript ecosystem
    "React": {"packages": ["react"], "category": "Frontend Frameworks"},
    "Express": {"packages": ["express"], "category": "Web Frameworks"},
    "TypeScript": {"packages": ["typescript"], "category": "Languages"},
}

# Example evidence
evidence = "fastapi==0.104.1"  # from requirements.txt
3. File Signals
Detect skills from file structure and naming patterns:FILE_PATTERNS = [
FilePattern(
skill="Docker",
category="DevOps",
files=["Dockerfile", "docker-compose.yml", ".dockerignore"]
),
FilePattern(
skill="CI/CD",
category="DevOps",
files=[".github/workflows/*.yml", ".gitlab-ci.yml", "Jenkinsfile"]
),
FilePattern(
skill="Database Migrations",
category="Database",
files=["alembic/", "migrations/", "**/migrations/*.py"]
),
]
# Example evidence collected from file-structure matches.
evidence = [
    "Dockerfile",
    "docker-compose.yml",
    ".github/workflows/ci.yml",
]
4. Git Signals
# Extract contribution patterns from git history:

@dataclass
class GitStatsResult:
    """Per-user git contribution metrics for a single repository."""

    commit_count_window: int = 0  # commits in the last 90 days
    commit_frequency: float = 0.0  # commits per week
    contribution_percent: float = 0.0  # user's share of total commits, in %
    first_commit_date: Any = None
    last_commit_date: Any = None
    has_branches: bool = False
    branch_count: int = 0
    has_tags: bool = False
    merge_commits: int = 0
def detect_git_patterns(repo_path: str, touched_paths: list[str] | None = None) -> Dict:
    """Detect advanced git workflow patterns for *repo_path*.

    Returns a dict with branch, tag, and merge-commit indicators.
    ``touched_paths`` is accepted for interface compatibility but is not used
    by the current detection passes.
    """

    def _git_lines(*args: str) -> list[str]:
        # text=True decodes for us; splitlines() yields [] for empty output,
        # unlike "".split("\n") which yields [""] and inflates counts by one.
        out = subprocess.check_output(["git", *args], cwd=repo_path, text=True)
        return out.strip().splitlines()

    patterns: Dict = {}

    # Branch workflow: more than the single current branch implies branching.
    branches = _git_lines("branch", "-a")
    patterns["has_branches"] = len(branches) > 1
    patterns["branch_count"] = len(branches)

    # Release management (tags)
    tags = _git_lines("tag")
    patterns["has_tags"] = bool(tags)

    # Merge commits (collaboration indicator)
    merge_commits = _git_lines("log", "--merges", "--oneline")
    patterns["merge_commits"] = len(merge_commits)

    return patterns
5. Infrastructure Signals
# Detect DevOps and infrastructure skills:

@dataclass
class InfraSignalsResult:
    """Infrastructure and DevOps configuration signals found in a repo."""

    ci_cd_tools: List[str] = field(default_factory=list)
    docker_tools: List[str] = field(default_factory=list)
    env_build_tools: List[str] = field(default_factory=list)
    all_tools: List[str] = field(default_factory=list)  # union of the buckets above
def get_infra_signals(repo_path: str, touched_paths: list[str] | None = None) -> Dict:
"""Extract infrastructure and DevOps configuration signals."""
signals = {
"ci_cd_tools": [],
"docker_tools": [],
"env_build_tools": []
}
# CI/CD detection
ci_patterns = {
".github/workflows/": "GitHub Actions",
".gitlab-ci.yml": "GitLab CI",
"Jenkinsfile": "Jenkins",
".circleci/": "CircleCI",
}
# Docker detection
docker_patterns = {
"Dockerfile": "Docker",
"docker-compose.yml": "Docker Compose",
".dockerignore": "Docker",
}
# Environment/build tools
build_patterns = {
"Makefile": "Make",
"pyproject.toml": "Python Build Tools",
"package.json": "npm",
}
for path in Path(repo_path).rglob("*"):
for pattern, tool in ci_patterns.items():
if pattern in str(path):
signals["ci_cd_tools"].append(tool)
# ... similar for docker and build tools
signals["all_tools"] = (
signals["ci_cd_tools"] +
signals["docker_tools"] +
signals["env_build_tools"]
)
return signals
6. Repository Quality Signals
# Assess engineering practices:

@dataclass
class RepoQualityResult:
    """Repository quality signals: testing, documentation, and code quality."""

    # Testing
    test_file_count: int = 0
    has_tests: bool = False
    test_frameworks: List[str] = field(default_factory=list)
    # Documentation
    has_readme: bool = False
    has_changelog: bool = False
    has_contributing: bool = False
    has_docs_dir: bool = False
    # Quality tooling
    has_lint_config: bool = False
    has_precommit: bool = False
    has_type_check: bool = False
    quality_tools: List[str] = field(default_factory=list)
def get_repo_quality_signals(repo_path: str, touched_paths: list[str] | None = None) -> RepoQualityResult:
    """Assess engineering practices: tests, documentation, and quality tooling.

    ``touched_paths`` is accepted for interface compatibility but not used.
    """
    result = RepoQualityResult()
    root = Path(repo_path)

    # Test detection: count matching test files, and treat each test directory
    # as one signal (avoids the fragile trailing-slash rglob patterns).
    for file_pattern in ("test_*.py", "*_test.py"):
        result.test_file_count += sum(1 for _ in root.rglob(file_pattern))
    for dir_name in ("tests", "__tests__"):
        result.test_file_count += sum(1 for p in root.rglob(dir_name) if p.is_dir())
    result.has_tests = result.test_file_count > 0

    # Framework detection (config-file markers only).
    if (root / "pytest.ini").exists():
        result.test_frameworks.append("pytest")
    if (root / "jest.config.js").exists():
        result.test_frameworks.append("jest")

    # Documentation signals.
    result.has_readme = any(root.glob("README*"))
    result.has_changelog = any(root.glob("CHANGELOG*"))
    # BUG FIX: has_contributing existed on the dataclass but was never set.
    result.has_contributing = any(root.glob("CONTRIBUTING*"))
    result.has_docs_dir = (root / "docs").exists()

    # Quality tooling.
    result.has_lint_config = any([
        (root / ".pylintrc").exists(),
        (root / ".eslintrc.js").exists(),
    ])
    result.has_precommit = (root / ".pre-commit-config.yaml").exists()
    result.has_type_check = (root / "mypy.ini").exists()

    # BUG FIX: quality_tools was declared on the dataclass but never populated.
    if result.has_lint_config:
        result.quality_tools.append("linter")
    if result.has_precommit:
        result.quality_tools.append("pre-commit")
    if result.has_type_check:
        result.quality_tools.append("mypy")

    return result
Chronology Tracking
Skill Timeline
Track when skills were first demonstrated:@dataclass
class SkillChronologyItem:
"""Chronological skill entry showing when a skill was first demonstrated."""
date: datetime | None # From project's first commit
skill: str
project: str
proficiency: float | None = None
category: str | None = None
Fetch Chronology
def fetch_skill_chronology(
    db: Session,
    *,
    project_path_prefixes: list[str] | None = None,
) -> list[SkillChronologyItem]:
    """Return skills ordered by when they were first demonstrated (oldest first)."""
    # NOTE(review): project_path_prefixes is accepted but never applied as a
    # filter below — confirm whether prefix filtering was intended here.
    rows = (
        db.query(ProjectSkill, Skill, RepoStat)
        .join(Skill, ProjectSkill.skill_id == Skill.id)
        .join(RepoStat, ProjectSkill.repo_stat_id == RepoStat.id)
        .filter(RepoStat.deleted_at.is_(None))  # ignore soft-deleted repos
        .all()
    )

    entries = [
        SkillChronologyItem(
            date=repo_stat.first_commit,
            skill=skill.name,
            project=repo_stat.project_name,
            proficiency=project_skill.proficiency,
            category=skill.category,
        )
        for project_skill, skill, repo_stat in rows
    ]

    # Oldest first; undated entries sort last via the datetime.max sentinel.
    entries.sort(key=lambda entry: entry.date or datetime.max)
    return entries
API Response
GET /skills/chronology
[
{
"date": "2024-01-15T10:00:00",
"skill": "Python",
"project": "data-pipeline",
"proficiency": 0.75,
"category": "Programming Languages"
},
{
"date": "2024-03-20T14:30:00",
"skill": "FastAPI",
"project": "api-service",
"proficiency": 0.85,
"category": "Web Frameworks"
},
{
"date": "2024-06-10T09:15:00",
"skill": "Docker",
"project": "microservices",
"proficiency": 0.65,
"category": "DevOps"
}
]
Insight Derivation
Higher-Order Insights
# The analyzer derives insights from skill combinations:
# Each rule maps an insight title to the skill names that can trigger it and a
# short "why it matters" rationale; a rule fires when ANY listed skill matches.
_INSIGHT_RULES: Dict[str, Dict[str, Any]] = {
    "Complexity awareness": {
        "skills": {"Resource Management"},
        "why": "Resource caps and chunking show attention to cost/complexity under load.",
    },
    "Data structure and optimization": {
        "skills": {"Advanced Collections", "Algorithm Optimization"},
        "why": "Specialized collections and optimization tools indicate performance-minded choices.",
    },
    "Abstraction and encapsulation": {
        "skills": {"Dataclass Design", "Abstract Interfaces", "Data Validation"},
        "why": "Structured modeling and interfaces reflect design thinking beyond scripts.",
    },
    "Robustness and error handling": {
        "skills": {"Exception Design", "Context Management", "Error Handling", "Logging"},
        "why": "Custom exceptions, managed resources, and logging reduce brittleness.",
    },
    "Async and concurrency": {
        "skills": {"Asynchronous Programming"},
        "why": "Async patterns enable scalable, non-blocking operations.",
    },
    "API design and architecture": {
        "skills": {"REST API Design", "Dependency Injection", "Data Validation"},
        "why": "Clean API design with validation and DI shows architectural maturity.",
    },
}
Insight Generation
def _derive_insights(self, skills: List[ExtractedSkill]) -> List[Insight]:
    """Map extracted skills onto the higher-order insight rules."""
    by_name = {entry.skill: entry for entry in skills}
    results: List[Insight] = []
    for title, rule in self._INSIGHT_RULES.items():
        matched = [by_name[name] for name in rule["skills"] if name in by_name]
        if not matched:
            # A rule fires only when at least one of its skills was extracted.
            continue
        # Up to two evidence snippets per matched skill, capped at five total.
        evidence = [
            snippet for entry in matched for snippet in entry.evidence[:2]
        ]
        results.append(
            Insight(
                title=title,
                evidence=evidence[:5],
                why_it_matters=rule["why"],
            )
        )
    return results
Related Features
Evidence Tracking
Learn how extracted skills are stored as evidence
Portfolio Analysis
See how skills contribute to portfolio ranking
Resume Generation
Generate resume items from skill analysis
Skills API
Complete API documentation for skills endpoints