Evaluations (evals) enable you to systematically test agent performance, validate outputs, and ensure quality. They can be used for testing, monitoring, and continuous improvement.

BaseEval

The BaseEval class is similar to guardrails but designed for evaluation and logging rather than blocking execution.

from agno.eval import BaseEval
from agno.run.agent import RunInput

class MyEval(BaseEval):
    def check(self, run_input: RunInput) -> None:
        """Evaluate and log, don't block."""
        # Log metrics, don't raise exceptions
        print(f"Input length: {len(run_input.message)}")
    
    async def async_check(self, run_input: RunInput) -> None:
        """Async evaluation."""
        print(f"Input length: {len(run_input.message)}")

Usage with Agents

from agno.agent import Agent
from agno.eval import BaseEval
from agno.run.agent import RunInput

class ResponseQuality(BaseEval):
    def check(self, run_input: RunInput) -> None:
        response = run_input.run_response
        if response:
            # Log quality metrics
            length = len(response.content or "")
            print(f"Response length: {length}")
            
            # Store for analysis
            self.log_metric("response_length", length)
    
    async def async_check(self, run_input: RunInput) -> None:
        self.check(run_input)

agent = Agent(
    model="gpt-4o",
    post_hooks=[ResponseQuality()]  # Runs after response
)
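
A quick usage sketch (the prompt is illustrative): because ResponseQuality is registered as a post_hook, its check runs automatically after each response.

# Running the agent triggers ResponseQuality.check() once the response is generated.
response = agent.run("Give a one-sentence definition of an eval.")
print(response.content)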

Example Evaluations

from agno.agent import Agent
from agno.eval import BaseEval
from agno.run.agent import RunInput

class LatencyEval(BaseEval):
    def __init__(self):
        self.latencies = []
    
    def check(self, run_input: RunInput) -> None:
        response = run_input.run_response
        if response and response.metrics:
            latency = response.metrics.time_to_first_token
            if latency:
                self.latencies.append(latency)
                print(f"TTFT: {latency:.2f}s")
                
                # Calculate running average
                avg = sum(self.latencies) / len(self.latencies)
                print(f"Avg TTFT: {avg:.2f}s")
    
    async def async_check(self, run_input: RunInput) -> None:
        self.check(run_input)

agent = Agent(
    model="gpt-4o",
    post_hooks=[LatencyEval()]
)

Eval Datasets

Create eval datasets for systematic testing:

from agno.agent import Agent
from agno.eval import BaseEval

class DatasetEval(BaseEval):
    def __init__(self, dataset):
        self.dataset = dataset
        self.results = []
    
    def run_evals(self, agent):
        """Run evaluations on dataset."""
        for item in self.dataset:
            response = agent.run(item["input"])
            
            result = {
                "input": item["input"],
                "expected": item["expected"],
                "actual": response.content,
                "passed": self.evaluate(item["expected"], response.content)
            }
            
            self.results.append(result)
        
        # Print summary
        passed = sum(1 for r in self.results if r["passed"])
        total = len(self.results)
        print(f"Passed: {passed}/{total} ({100*passed/total:.1f}%)")
    
    def evaluate(self, expected, actual):
        """Evaluate if actual matches expected."""
        # Your evaluation logic
        return expected.lower() in actual.lower()

# Usage
dataset = [
    {"input": "What is 2+2?", "expected": "4"},
    {"input": "What is the capital of France?", "expected": "Paris"},
]

dataset_eval = DatasetEval(dataset)
agent = Agent(model="gpt-4o")
dataset_eval.run_evals(agent)
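
To establish a baseline for later runs, you can persist the results; a minimal sketch using the standard library (the output path is illustrative):

import json

# Save per-item results so future runs can be compared against this baseline.
with open("eval_results.json", "w") as f:
    json.dump(dataset_eval.results, f, indent=2)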

Best Practices

  1. Don’t block: Evals should log/measure, not raise exceptions
  2. Collect metrics: Store metrics for analysis
  3. Multiple evals: Use different evals for different aspects (see the sketch after this list)
  4. Datasets: Create comprehensive eval datasets
  5. Baselines: Establish baselines to measure improvement
  6. CI/CD: Run evals in CI/CD pipelines (see the sketch after this list)
  7. Monitoring: Use evals in production for monitoring
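
A minimal sketch combining several of these practices, assuming the ResponseQuality, LatencyEval, and DatasetEval classes and the dataset defined above (the 90% pass-rate baseline is illustrative):

# Attach multiple evals to one agent; each post-hook measures a different aspect.
agent = Agent(
    model="gpt-4o",
    post_hooks=[ResponseQuality(), LatencyEval()],
)

# A pytest-style check that can run in a CI/CD pipeline: fail the build if the
# dataset pass rate drops below the established baseline.
def test_dataset_pass_rate():
    dataset_eval = DatasetEval(dataset)
    dataset_eval.run_evals(agent)
    passed = sum(1 for r in dataset_eval.results if r["passed"])
    assert passed / len(dataset_eval.results) >= 0.9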
