Compare two agent versions side-by-side to measure which performs better
Pairwise evaluation compares two agent versions on the same inputs and determines which produces better outputs. This is especially useful when absolute scoring is difficult but relative comparison is easier.
from langsmith import evaluate

# Run an experiment for each agent version against the same dataset so the
# resulting experiments can later be compared pairwise.
results_v1 = evaluate(
    agent_v1,
    data="officeflow-dataset",
    experiment_prefix="agent-v1",
)

results_v2 = evaluate(
    agent_v2,
    data="officeflow-dataset",
    experiment_prefix="agent-v2",
)

# The generated experiment names are needed for the pairwise comparison step.
print(f"v1 experiment: {results_v1.experiment_name}")
print(f"v2 experiment: {results_v2.experiment_name}")
4. Create Pairwise Evaluator
5. Write an evaluator that compares two outputs:
from openai import OpenAI

client = OpenAI()


def conciseness_evaluator(inputs: dict, outputs: list[dict]) -> list[int]:
    """Compare two responses for conciseness using an LLM judge.

    Args:
        inputs: Dataset example inputs; must contain a "question" key.
        outputs: The two candidate outputs, each with an "answer" key.

    Returns:
        [1, 0] if the first response wins, [0, 1] if the second wins,
        [0, 0] for a tie (or when the judge's verdict cannot be parsed).
    """
    prompt = f"""
    Which response is MORE CONCISE while still providing all crucial information?

    Question: {inputs['question']}

    Response A: {outputs[0]['answer']}

    Response B: {outputs[1]['answer']}

    Output ONLY a single number:
    1 if Response A is more concise
    2 if Response B is more concise
    0 if they are roughly equal
    """

    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": "You are a conciseness evaluator. Respond with only: 0, 1, or 2."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )

    # FIX: the judge may occasionally reply with extra text or nothing at all;
    # a bare int(...) would raise and abort the whole evaluation run.
    # Treat any unparseable verdict as a tie instead.
    try:
        preference = int(response.choices[0].message.content.strip())
    except (ValueError, TypeError):
        preference = 0

    if preference == 1:
        return [1, 0]  # A wins
    elif preference == 2:
        return [0, 1]  # B wins
    else:
        return [0, 0]  # Tie
7. Run Pairwise Comparison
8. Compare the experiments:
from langsmith import evaluate

# Pass the two experiment names as a tuple to trigger pairwise mode.
evaluate(
    ("agent-v1-3e016f9c", "agent-v2-7d7ee287"),  # Experiment names
    evaluators=[conciseness_evaluator],
    randomize_order=True,  # Prevent position bias
)
This example from the OfficeFlow course compares two agent versions on conciseness:
eval_conciseness_pairwise.py
"""Pairwise conciseness evaluator for comparing two experiments.Run this AFTER run_agents.py, using the experiment names it outputs.Usage: uv run python eval_conciseness_pairwise.py <experiment-a> <experiment-b> uv run python eval_conciseness_pairwise.py agent-v4-3e016f9c agent-v5-7d7ee287"""from openai import OpenAIfrom langsmith import evaluateclient = OpenAI()CONCISENESS_PROMPT = """You are evaluating two responses to the same customer question.Determine which response is MORE CONCISE while still providing all crucial information.**Conciseness** means getting straight to the point, avoiding filler, and not repeating information.**Crucial information** includes direct answers, necessary context, and required next steps.A shorter response is NOT automatically better if it omits crucial information.**Question:** {question}**Response A:**{response_a}**Response B:**{response_b}Output your verdict as a single number:1 if Response A is more concise while preserving crucial information2 if Response B is more concise while preserving crucial information0 if they are roughly equal"""def conciseness_evaluator(inputs: dict, outputs: list[dict]) -> list[int]: response = client.chat.completions.create( model="gpt-5-nano", messages=[ {"role": "system", "content": "You are a conciseness evaluator. 
Respond with only a single number: 0, 1, or 2."}, {"role": "user", "content": CONCISENESS_PROMPT.format( question=inputs["question"], response_a=outputs[0].get("answer", "N/A"), response_b=outputs[1].get("answer", "N/A"), )} ], ) preference = int(response.choices[0].message.content.strip()) if preference == 1: return [1, 0] # A wins elif preference == 2: return [0, 1] # B wins else: return [0, 0] # Tieif __name__ == "__main__": import sys if len(sys.argv) != 3: print("Usage: python eval_conciseness_pairwise.py <experiment-a> <experiment-b>") print("Example: python eval_conciseness_pairwise.py agent-v4-3e016f9c agent-v5-7d7ee287") sys.exit(1) evaluate( (sys.argv[1], sys.argv[2]), evaluators=[conciseness_evaluator], randomize_order=True, )
`randomize_order=True` is critical — it prevents position bias, where the LLM judge might systematically favor whichever response appears first or second.
def pairwise_evaluator(inputs: dict, outputs: list[dict]) -> list[int]:
    # NOTE(review): illustrative pseudocode — `is better` references an
    # undefined name `better` and is a placeholder for a real comparison;
    # this skeleton shows only the expected return shape of a pairwise
    # evaluator: a two-element score list aligned with `outputs`.
    # Determine which is better
    if outputs[0]["answer"] is better:
        return [1, 0]  # First wins
    elif outputs[1]["answer"] is better:
        return [0, 1]  # Second wins
    else:
        return [0, 0]  # Tie
def pairwise_with_strength(inputs: dict, outputs: list[dict]) -> list[float]:
    """Return relative preference strengths for two outputs.

    Each output is scored independently on a 0-1 scale, then the pair is
    normalized to sum to 1.0. When both scores are 0 there is no signal
    either way, so the split is an even 0.5/0.5.
    """
    quality_a = evaluate_quality(outputs[0])
    quality_b = evaluate_quality(outputs[1])

    combined = quality_a + quality_b
    if combined <= 0:
        return [0.5, 0.5]  # Equal if both scored 0
    return [quality_a / combined, quality_b / combined]
from openai import OpenAI
import json

client = OpenAI()


def multi_criteria_pairwise(inputs: dict, outputs: list[dict]) -> list[dict]:
    """Compare two responses on several criteria and return detailed scores.

    Each returned dict carries an overall win/loss "score" plus the full
    per-criterion verdicts under "details".
    """
    prompt = f"""
    Compare these two responses on multiple criteria:

    Question: {inputs['question']}

    Response A: {outputs[0]['answer']}

    Response B: {outputs[1]['answer']}

    For each criterion, output which response is better (1 for A, 2 for B, 0 for tie):
    - Conciseness
    - Accuracy
    - Helpfulness
    - Professionalism

    Return JSON:
    {{
        "conciseness": 1 or 2 or 0,
        "accuracy": 1 or 2 or 0,
        "helpfulness": 1 or 2 or 0,
        "professionalism": 1 or 2 or 0
    }}
    """
    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )
    verdicts = json.loads(response.choices[0].message.content)

    # Tally per-criterion wins to decide the overall winner.
    per_criterion = list(verdicts.values())
    a_wins = per_criterion.count(1)
    b_wins = per_criterion.count(2)

    return [
        {"score": 1 if a_wins > b_wins else 0, "details": verdicts},
        {"score": 1 if b_wins > a_wins else 0, "details": verdicts},
    ]
GOOD_PROMPT = """Which response is MORE CONCISE while still providing all crucial information?**Conciseness** means:- Getting straight to the point- Avoiding unnecessary filler words- Not repeating information**Crucial information** includes:- Direct answer to the question- Necessary context- Required next steps"""VAGUE_PROMPT = """Which response is better?"""
# Two agents built from different system prompts.
agent_a = ChatAgent(prompt="You are a helpful assistant.")
agent_b = ChatAgent(prompt="You are a concise assistant who answers in under 50 words.")

# Run the pairwise conciseness comparison between their experiments.
evaluate(
    (experiment_a, experiment_b),
    evaluators=[conciseness_evaluator],
)
For more than two versions, run multiple pairwise comparisons:
multi_version.py
versions = ["v1", "v2", "v3"]# Compare all pairsfor i, v1 in enumerate(versions): for v2 in versions[i+1:]: print(f"Comparing {v1} vs {v2}") evaluate((v1, v2), evaluators=[your_evaluator])