Experiments in Phoenix allow you to systematically evaluate your AI application’s performance on a dataset. Each experiment runs your task function on every example, captures outputs and traces, and optionally evaluates results using metrics and quality checks.

Basic Experiment

Here’s a simple experiment that runs a task on a dataset:
from phoenix.client import Client
from phoenix.experiments import run_experiment

client = Client()

# Get your dataset
dataset = client.datasets.get_dataset(dataset="qa-dataset")

# Define your task
def answer_question(input):
    """Your AI application logic"""
    question = input["question"]
    # Call your model/LLM here
    answer = generate_answer(question)
    return {"answer": answer}

# Run the experiment
experiment = run_experiment(
    dataset=dataset,
    task=answer_question,
    experiment_name="baseline-v1",
    experiment_description="Initial baseline using GPT-4"
)

print(experiment)
This will:
  1. Execute answer_question on each example in the dataset
  2. Capture outputs and execution traces
  3. Store results in Phoenix
  4. Display a summary of the experiment

Task Functions

Task functions can access different parts of the example:

Single Parameter (Input Only)

def simple_task(input):
    """Most common pattern - receives the input dict"""
    question = input["question"]
    return {"answer": generate_answer(question)}

Multiple Parameters

def task_with_reference(input, expected, metadata):
    """
    Tasks may declare any of these parameters by name:
    - input: The input data
    - expected/reference: The expected output (aliases)
    - metadata: Example metadata
    - example: The full Example object
    """
    question = input["question"]
    difficulty = metadata.get("difficulty", "unknown")
    
    # Use difficulty to adjust generation
    answer = generate_answer(question, difficulty=difficulty)
    return {"answer": answer}

Async Tasks

import asyncio

async def async_task(input):
    """Async tasks enable concurrent execution"""
    question = input["question"]
    # Async LLM call
    answer = await async_generate_answer(question)
    return {"answer": answer}

# Run with higher concurrency for async tasks
experiment = run_experiment(
    dataset=dataset,
    task=async_task,
    concurrency=10  # Run 10 examples concurrently
)

Task Output Format

Tasks must return JSON-serializable data:
# Valid return types
return {"answer": "text response"}  # Dict
return "simple string response"      # String
return 42                             # Number
return True                           # Boolean
return ["item1", "item2"]            # List
return None                           # None

# Can also return nested structures
return {
    "answer": "The answer is 42",
    "confidence": 0.95,
    "sources": ["doc1", "doc2"],
    "metadata": {
        "model": "gpt-4",
        "tokens": 150
    }
}

Experiments with Evaluators

Evaluate experiment outputs using built-in or custom evaluators:
from phoenix.experiments import run_experiment
from phoenix.experiments.evaluators import create_evaluator

# Run experiment with evaluation
experiment = run_experiment(
    dataset=dataset,
    task=answer_question,
    evaluators=[
        # Check if output exactly matches expected
        create_evaluator("exact_match"),
        # Measure semantic similarity
        create_evaluator("semantic_similarity"),
    ],
    experiment_name="evaluated-v1"
)

# Results include evaluation scores
print(experiment)

Evaluator Functions

Evaluators receive the task output and example data:
from phoenix.experiments.types import EvaluationResult

def my_evaluator(output, expected, input, metadata):
    """
    Parameters:
    - output: The task's output
    - expected/reference: Expected output from dataset (aliases)
    - input: The input from the example
    - metadata: Example metadata

    Return types:
    - EvaluationResult object with score, label, explanation
    - float (interpreted as score)
    - bool (0 or 1 score, "True"/"False" label)
    - str (interpreted as label)
    - (score, explanation) tuple
    """
    # Your evaluation logic
    is_correct = output["answer"] == expected["answer"]
    
    return EvaluationResult(
        score=1.0 if is_correct else 0.0,
        label="correct" if is_correct else "incorrect",
        explanation=f"Answer matches: {is_correct}"
    )
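
Evaluators don't have to construct an EvaluationResult explicitly. Here is a minimal sketch of the simpler return types listed in the docstring above, assuming (as with tasks) that an evaluator may declare only the parameters it needs; the "required_keyword" metadata key is purely illustrative:
def length_ratio(output, expected):
    """A float return value is interpreted as the score"""
    return min(len(output["answer"]) / max(len(expected["answer"]), 1), 1.0)

def has_answer(output, expected):
    """A bool return value becomes a 0/1 score with a "True"/"False" label"""
    return output["answer"].strip() != ""

def keyword_present(output, metadata):
    """A (score, explanation) tuple sets both score and explanation"""
    keyword = metadata.get("required_keyword", "")
    found = keyword in output["answer"]
    return (1.0 if found else 0.0, f"keyword {keyword!r} present: {found}")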

Adding Evaluations to Existing Experiments

You can add evaluations to experiments that have already run:
from phoenix.experiments import evaluate_experiment

# Run experiment without evaluation
experiment = run_experiment(
    dataset=dataset,
    task=answer_question,
    experiment_name="baseline-v1"
)

# Later, add evaluation
evaluated = evaluate_experiment(
    experiment=experiment,
    evaluators=[
        create_evaluator("exact_match"),
        custom_quality_check  # any user-defined evaluator function (e.g. my_evaluator above)
    ]
)

print(evaluated)

Experiment Configuration

Concurrency

Control how many examples run in parallel:
experiment = run_experiment(
    dataset=dataset,
    task=async_task,
    concurrency=10,  # Run 10 examples at a time (async only)
)

Timeout

Set a timeout for long-running tasks:
experiment = run_experiment(
    dataset=dataset,
    task=slow_task,
    timeout=300,  # 5 minutes per example
)

Rate Limiting

Handle rate limits gracefully:
from openai import RateLimitError

experiment = run_experiment(
    dataset=dataset,
    task=openai_task,
    rate_limit_errors=[RateLimitError],  # Automatically retry on rate limits
)

Dry Run

Test your experiment on a subset without storing results:
# Run on 5 random examples without storing
experiment = run_experiment(
    dataset=dataset,
    task=answer_question,
    dry_run=5,  # Test on 5 examples
    print_summary=True
)

Experiment Metadata

Add rich metadata to track experiment context:
experiment = run_experiment(
    dataset=dataset,
    task=answer_question,
    experiment_name="gpt4-turbo-v1",
    experiment_description="Testing GPT-4 Turbo with new prompt template",
    experiment_metadata={
        "model": "gpt-4-turbo-preview",
        "temperature": 0.7,
        "prompt_version": "v2.1",
        "git_commit": "abc123def",
        "team": "ml-platform"
    }
)

Accessing Results

Summary Statistics

# Print experiment summary
print(experiment)

# Access task summary
print(f"Total runs: {experiment.task_summary.stats['n_runs'].values[0]}")
print(f"Errors: {experiment.task_summary.stats['n_errors'].values[0]}")

# Access evaluation summaries
for eval_summary in experiment.eval_summaries:
    print(eval_summary)

Individual Runs

# Iterate over all runs
for run in experiment:
    print(f"Example: {run.dataset_example_id}")
    print(f"Output: {run.output}")
    print(f"Error: {run.error}")
    print(f"Trace ID: {run.trace_id}")

# Access specific run by index
first_run = experiment[0]
print(first_run.output)

# Get run with example data
print(first_run.input)       # Example input
print(first_run.expected)    # Expected output
print(first_run.metadata)    # Example metadata

DataFrame Export

# Export experiment runs to DataFrame
runs_df = experiment.as_dataframe()
print(runs_df.columns)
# ['error', 'output', 'input', 'expected', 'metadata', 'example_id']

# Export evaluations to DataFrame
evals_df = experiment.get_evaluations()
print(evals_df.columns)
# ['name', 'error', 'score', 'label', 'explanation', 'error', 'output', 'input', 'expected', 'metadata', 'example_id']

# Analyze results
avg_score = evals_df['score'].mean()
print(f"Average score: {avg_score:.2f}")

Comparing Experiments

Compare multiple experiments in the Phoenix UI:
# Run baseline
baseline = run_experiment(
    dataset=dataset,
    task=gpt35_task,
    experiment_name="baseline-gpt35"
)

# Run improved version
improved = run_experiment(
    dataset=dataset,
    task=gpt4_task,
    experiment_name="improved-gpt4"
)

# View comparison in UI
print(f"Compare at: {improved.url}")
The Phoenix UI provides:
  • Side-by-side output comparison
  • Evaluation score differences
  • Trace viewing for debugging
  • Statistical summaries
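
A rough programmatic comparison is also possible by exporting each experiment's evaluations to DataFrames, as shown earlier. This sketch assumes the same evaluators were attached to both runs (e.g. via evaluators= or evaluate_experiment):
# Compare mean evaluation scores between the two experiments
baseline_scores = baseline.get_evaluations().groupby("name")["score"].mean()
improved_scores = improved.get_evaluations().groupby("name")["score"].mean()

print("Baseline:\n", baseline_scores)
print("Improved:\n", improved_scores)
print("Delta:\n", improved_scores - baseline_scores)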

Best Practices

Start Small

Test your task on a few examples with dry_run before running the full experiment.

Use Async

Implement async tasks with appropriate concurrency for faster experiments.

Handle Errors

Handle errors inside your task so a single failing example doesn't derail the entire experiment.
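
For example, a task can catch exceptions itself and return a structured fallback output so every example still produces a record (a minimal sketch; the fallback shape is an illustrative choice):
def robust_task(input):
    """Catch failures inside the task so one bad example doesn't halt the run"""
    try:
        answer = generate_answer(input["question"])
        return {"answer": answer}
    except Exception as exc:
        # Return a structured fallback instead of raising
        return {"answer": "", "error": str(exc)}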

Rich Metadata

Add detailed metadata to experiments for better tracking and comparison.

Next Steps

Evaluators

Learn about built-in and custom evaluators

Dataset Versioning

Manage dataset versions and exports
