When to Create an Adapter
Create a custom adapter when:
- You have a complex system with multiple optimizable components
- You need custom evaluation logic beyond simple scoring
- You want domain-specific feedback generation
- You’re integrating with an existing framework or codebase
For simple cases, use optimize_anything() directly. Adapters are for advanced integration scenarios.

Adapter Skeleton
Every adapter follows this pattern:
from gepa.core.adapter import GEPAAdapter, EvaluationBatch
from typing import TypedDict, Any
# 1. Define your data types
class MyDataInst(TypedDict):
    """One input example fed to your system."""

    input: str
    expected: str


class MyTrajectory(TypedDict):
    """Execution trace captured for reflection."""

    steps: list[str]
    output: str


class MyOutput(TypedDict):
    """Final output produced for one example."""

    prediction: str
# 2. Implement the adapter
class MyAdapter(GEPAAdapter[MyDataInst, MyTrajectory, MyOutput]):
    """Skeleton adapter: implement the two hooks GEPA calls."""

    def evaluate(
        self,
        batch: list[MyDataInst],
        candidate: dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch[MyTrajectory, MyOutput]:
        # Execute your system and return results
        ...

    def make_reflective_dataset(
        self,
        candidate: dict[str, str],
        eval_batch: EvaluationBatch[MyTrajectory, MyOutput],
        components_to_update: list[str],
    ) -> dict[str, list[dict[str, Any]]]:
        # Extract feedback for reflection
        ...
Step-by-Step Tutorial
Step 1: Define Data Types
Start by defining the shape of your data:
from typing import TypedDict
class TranslationTask(TypedDict):
    """One translation request plus its reference answer."""

    source_text: str
    target_language: str
    reference_translation: str


class TranslationTrace(TypedDict):
    """What actually happened while translating one example."""

    source_text: str
    intermediate_steps: list[str]
    final_translation: str
    detected_language: str


class TranslationOutput(TypedDict):
    """Final translation result returned to GEPA."""

    translation: str
    confidence: float
Step 2: Implement evaluate()
This method runs your system with candidate parameters:def evaluate(
self,
batch: list[TranslationTask],
candidate: dict[str, str],
capture_traces: bool = False,
) -> EvaluationBatch[TranslationTrace, TranslationOutput]:
outputs = []
scores = []
trajectories = [] if capture_traces else None
# Build system with candidate parameters
system_prompt = candidate.get("system_prompt", "")
translation_template = candidate.get("translation_template", "")
for task in batch:
# Build the prompt
prompt = translation_template.format(
source=task["source_text"],
target_lang=task["target_language"],
)
# Run your model
translation = self.llm.generate(
system=system_prompt,
user=prompt,
)
# Score the result
score = self.compute_bleu_score(
translation,
task["reference_translation"],
)
# Collect results
outputs.append({
"translation": translation,
"confidence": score,
})
scores.append(score)
# Capture trace if requested
if capture_traces:
trajectories.append({
"source_text": task["source_text"],
"intermediate_steps": [], # Could capture reasoning
"final_translation": translation,
"detected_language": "auto",
})
return EvaluationBatch(
outputs=outputs,
scores=scores,
trajectories=trajectories,
)
Critical: Never raise exceptions for individual example failures. Return a score of 0.0 and continue processing.
Step 3: Implement make_reflective_dataset()
Extract structured feedback for the reflection LLM:def make_reflective_dataset(
self,
candidate: dict[str, str],
eval_batch: EvaluationBatch[TranslationTrace, TranslationOutput],
components_to_update: list[str],
) -> dict[str, list[dict[str, Any]]]:
reflective_data = {}
for component in components_to_update:
examples = []
# Process each trajectory
for i, trace in enumerate(eval_batch.trajectories):
output = eval_batch.outputs[i]
score = eval_batch.scores[i]
# Generate feedback
if score >= 0.8:
feedback = f"Good translation! Score: {score:.2f}"
else:
feedback = (
f"Translation needs improvement. Score: {score:.2f}\n"
f"Source: {trace['source_text']}\n"
f"Generated: {trace['final_translation']}\n"
f"Consider: Better handling of idioms and context."
)
# Create reflective example
example = {
"Inputs": {
"source_text": trace["source_text"],
},
"Generated Outputs": {
"translation": trace["final_translation"],
},
"Feedback": feedback,
}
examples.append(example)
reflective_data[component] = examples
return reflective_data
Step 4: Initialize Your Adapter
Add initialization logic:class TranslationAdapter(GEPAAdapter[TranslationTask, TranslationTrace, TranslationOutput]):
def __init__(self, llm_model: str, reference_metric: str = "bleu"):
self.llm = self.create_llm_client(llm_model)
self.metric = reference_metric
def create_llm_client(self, model: str):
"""Initialize your LLM client."""
import litellm
return lambda **kwargs: litellm.completion(
model=model,
messages=[{"role": "system", "content": kwargs["system"]},
{"role": "user", "content": kwargs["user"]}],
).choices[0].message.content
def compute_bleu_score(self, prediction: str, reference: str) -> float:
"""Compute BLEU score."""
# Simplified - use actual BLEU implementation
from difflib import SequenceMatcher
return SequenceMatcher(None, prediction, reference).ratio()
# ... evaluate() and make_reflective_dataset() methods
Complete Example: Code Generation Adapter
from gepa.core.adapter import GEPAAdapter, EvaluationBatch
from typing import TypedDict, Any
import subprocess
import tempfile
import os
class CodeTask(TypedDict):
problem: str
test_cases: list[dict[str, Any]]
reference_solution: str
class CodeTrace(TypedDict):
problem: str
generated_code: str
test_results: list[dict]
error_message: str | None
class CodeOutput(TypedDict):
code: str
passed_tests: int
total_tests: int
class CodeGenAdapter(GEPAAdapter[CodeTask, CodeTrace, CodeOutput]):
    """Adapter that generates Python code with an LLM and scores it by
    running each task's test cases in a subprocess.

    Candidates carry a ``system_prompt`` and a ``code_generation_prompt``.
    """

    def __init__(self, llm_model: str, timeout: int = 5):
        import litellm
        self.llm = litellm
        self.model = llm_model
        self.timeout = timeout  # seconds allowed per test-case execution

    def evaluate(
        self,
        batch: list[CodeTask],
        candidate: dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch[CodeTrace, CodeOutput]:
        """Generate code for each task and score by test pass rate.

        Generation failures never raise: the error text becomes the "code",
        its tests fail, and the example scores 0.0.
        """
        outputs = []
        scores = []
        trajectories = [] if capture_traces else None

        system_prompt = candidate.get("system_prompt", "")
        code_template = candidate.get("code_generation_prompt", "")

        for task in batch:
            # Generate code
            prompt = code_template.format(problem=task["problem"])
            try:
                response = self.llm.completion(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                )
                code = self.extract_code(response.choices[0].message.content)
            except Exception as e:
                # Keep going: a failed generation simply scores zero below.
                code = f"# Error generating code: {e}"

            # Test the code
            test_results, error = self.run_tests(code, task["test_cases"])
            passed = sum(1 for r in test_results if r["passed"])
            total = len(test_results)

            # Score based on test passage rate
            score = passed / total if total > 0 else 0.0

            outputs.append({
                "code": code,
                "passed_tests": passed,
                "total_tests": total,
            })
            scores.append(score)

            if capture_traces:
                trajectories.append({
                    "problem": task["problem"],
                    "generated_code": code,
                    "test_results": test_results,
                    "error_message": error,
                })

        return EvaluationBatch(
            outputs=outputs,
            scores=scores,
            trajectories=trajectories,
        )

    def make_reflective_dataset(
        self,
        candidate: dict[str, str],
        eval_batch: EvaluationBatch[CodeTrace, CodeOutput],
        components_to_update: list[str],
    ) -> dict[str, list[dict[str, Any]]]:
        """Turn execution traces into feedback records for reflection."""
        reflective_data = {}
        for component in components_to_update:
            examples = []
            for i, trace in enumerate(eval_batch.trajectories):
                output = eval_batch.outputs[i]
                score = eval_batch.scores[i]

                # Build detailed feedback
                feedback_parts = [
                    f"Tests passed: {output['passed_tests']}/{output['total_tests']}",
                ]

                if score < 1.0:
                    failed_tests = [
                        r for r in trace["test_results"] if not r["passed"]
                    ]
                    if failed_tests:
                        feedback_parts.append("\nFailed test cases:")
                        for test in failed_tests[:3]:  # Show first 3
                            feedback_parts.append(
                                f" Input: {test['input']}\n"
                                f" Expected: {test['expected']}\n"
                                f" Got: {test['actual']}"
                            )

                if trace["error_message"]:
                    feedback_parts.append(
                        f"\nRuntime error: {trace['error_message']}"
                    )

                example = {
                    "Inputs": {"problem": trace["problem"]},
                    "Generated Outputs": trace["generated_code"],
                    "Feedback": "\n".join(feedback_parts),
                }
                examples.append(example)

            reflective_data[component] = examples
        return reflective_data

    def extract_code(self, response: str) -> str:
        """Extract code from markdown code blocks.

        Handles a missing closing fence explicitly: the previous version
        sliced with ``end == -1`` and silently dropped the last character.
        """
        if "```python" in response:
            start = response.find("```python") + 9
            end = response.find("```", start)
            if end == -1:  # unterminated fence: take everything after it
                return response[start:].strip()
            return response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end == -1:
                return response[start:].strip()
            return response[start:end].strip()
        return response.strip()

    def run_tests(
        self,
        code: str,
        test_cases: list[dict],
    ) -> tuple[list[dict], str | None]:
        """Execute *code* once per test case and compare stdout to expected.

        Returns (per-test result dicts, last error message or None).
        Timeouts and runtime errors mark the test failed instead of raising.
        """
        results = []
        error_message = None

        # Write code to a temporary file so it can be run as a script
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.py', delete=False
        ) as f:
            f.write(code)
            code_file = f.name

        try:
            for test in test_cases:
                try:
                    # Run code with the test input on stdin
                    result = subprocess.run(
                        ['python', code_file],
                        input=str(test["input"]),
                        capture_output=True,
                        text=True,
                        timeout=self.timeout,
                    )
                    actual = result.stdout.strip()
                    expected = str(test["expected"])
                    passed = actual == expected
                    results.append({
                        "input": test["input"],
                        "expected": expected,
                        "actual": actual,
                        "passed": passed,
                    })
                except subprocess.TimeoutExpired:
                    error_message = "Code execution timeout"
                    results.append({
                        "input": test["input"],
                        "expected": test["expected"],
                        "actual": "[TIMEOUT]",
                        "passed": False,
                    })
                except Exception as e:
                    error_message = str(e)
                    results.append({
                        "input": test["input"],
                        "expected": test["expected"],
                        "actual": f"[ERROR: {e}]",
                        "passed": False,
                    })
        finally:
            # Always remove the temp file, even on unexpected errors
            os.unlink(code_file)

        return results, error_message
# Usage
adapter = CodeGenAdapter(llm_model="openai/gpt-4o")

train_data = [
    {
        "problem": "Write a function to reverse a string",
        "test_cases": [
            {"input": "hello", "expected": "olleh"},
            {"input": "world", "expected": "dlrow"},
        ],
        "reference_solution": "def reverse(s): return s[::-1]",
    },
]

from gepa import optimize

result = optimize(
    seed_candidate={
        "system_prompt": "You are a Python expert.",
        "code_generation_prompt": "Write Python code to solve: {problem}",
    },
    trainset=train_data,
    # NOTE(review): the demo reuses the training set as the validation set;
    # use a separate valset in real runs.
    valset=train_data,
    adapter=adapter,
    reflection_lm="openai/gpt-4o",
    max_metric_calls=20,
)

print("Best prompts:")
print(result.best_candidate)
Advanced Patterns
Multi-Objective Scoring
def evaluate(self, batch, candidate, capture_traces=False):
    # ... run evaluations ... (the elided code presumably builds
    # outputs, primary_scores, and trajectories used below)

    # One dict of named objective values per output, alongside the
    # primary scalar scores.
    objective_scores = []
    for output in outputs:
        objective_scores.append({
            "accuracy": compute_accuracy(output),
            "speed": compute_speed(output),
            "cost": compute_cost(output),
        })

    return EvaluationBatch(
        outputs=outputs,
        scores=primary_scores,
        trajectories=trajectories,
        objective_scores=objective_scores,  # Multi-objective tracking
    )
Custom Proposal Logic
from gepa.core.adapter import ProposalFn
class MyAdapter(GEPAAdapter[...]):
    """Adapter that overrides GEPA's default proposal step."""

    def __init__(self):
        # Point GEPA at our own proposer instead of the built-in one.
        self.propose_new_texts = self.custom_proposer

    def custom_proposer(
        self,
        candidate: dict[str, str],
        reflective_dataset: dict[str, list[dict]],
        components_to_update: list[str],
    ) -> dict[str, str]:
        """Produce one replacement text per component to update."""
        # Domain-specific proposal logic, applied per component.
        return {
            name: self.my_proposal_algorithm(reflective_dataset[name])
            for name in components_to_update
        }
Parallel Evaluation
from concurrent.futures import ThreadPoolExecutor
def evaluate(self, batch, candidate, capture_traces=False):
    """Evaluate the batch concurrently with a thread pool."""
    system_prompt = candidate["system_prompt"]

    def _score_one(example):
        # Run the system and score a single example.
        out = self.run_system(example, system_prompt)
        return out, self.compute_score(out, example)

    # Fan the batch out across worker threads; map preserves input order.
    with ThreadPoolExecutor(max_workers=8) as pool:
        scored = list(pool.map(_score_one, batch))

    outputs = [out for out, _ in scored]
    scores = [s for _, s in scored]
    return EvaluationBatch(outputs=outputs, scores=scores)
Testing Your Adapter
# Test evaluate()
adapter = MyAdapter(...)
test_batch = [{"input": "test", "expected": "result"}]
test_candidate = {"prompt": "Test prompt"}

result = adapter.evaluate(test_batch, test_candidate, capture_traces=True)
# One output and one score per input example
assert len(result.outputs) == len(test_batch)
assert len(result.scores) == len(test_batch)
# Traces must be captured when capture_traces=True
assert result.trajectories is not None

# Test make_reflective_dataset()
reflective = adapter.make_reflective_dataset(
    test_candidate,
    result,
    ["prompt"],
)
# Every requested component yields a non-empty list of feedback examples
assert "prompt" in reflective
assert len(reflective["prompt"]) > 0
assert "Feedback" in reflective["prompt"][0]
Best Practices Summary
Next Steps
Adapter Overview
Learn about the adapter architecture
DSPy Integration
Study a production adapter example
Evaluation Metrics
Design better scoring functions
Configuration
Configure GEPA for your adapter