The evaluate() function runs experiments to test your LLM application against datasets, measuring performance with custom evaluators.
Basic usage
from langsmith import evaluate
def my_app(inputs: dict) -> dict:
# Your LLM application
return {"output": process(inputs["input"])}
results = evaluate(
my_app,
data="my-dataset",
evaluators=[accuracy_evaluator],
experiment_prefix="baseline"
)
print(results.metrics)
Parameters
target
Callable | Runnable | str | UUID | tuple
required
The system to evaluate. Can be:
- Function:
(dict) -> dict or (dict, dict) -> dict (with example)
- LangChain Runnable: Any
Runnable object
- Experiment ID: String or UUID of existing experiment
- Tuple of experiment IDs: For comparative evaluation
# Function
def my_function(inputs):
return {"output": inputs["input"].upper()}
evaluate(my_function, data="dataset")
# LangChain Runnable
from langchain_core.runnables import RunnableLambda
evaluate(RunnableLambda(lambda x: x), data="dataset")
# Existing experiment
evaluate("abc-123", data="dataset", evaluators=[...])
# Compare two experiments
evaluate(("exp-1", "exp-2"), evaluators=[comparison_evaluator])
data
str | UUID | Iterable[Example] | Dataset
Dataset to evaluate on. Can be:
- Dataset name (string)
- Dataset ID (UUID)
- List/iterator of examples
- Dataset object
# By name
evaluate(target, data="my-qa-dataset")
# By ID
from uuid import UUID
evaluate(target, data=UUID("..."))
# Inline examples
evaluate(target, data=[
{"inputs": {"question": "What is 2+2?"}, "outputs": {"answer": "4"}},
{"inputs": {"question": "What is 3+3?"}, "outputs": {"answer": "6"}}
])
evaluators
Sequence[EVALUATOR_T] | None
List of evaluators to run on each example. Each evaluator receives the run and example.
from langsmith import evaluate
from langsmith.evaluation import EvaluationResult
def accuracy_evaluator(run, example):
prediction = run.outputs["output"]
reference = example.outputs["answer"]
return EvaluationResult(
key="accuracy",
score=1.0 if prediction == reference else 0.0
)
evaluate(
target,
data="dataset",
evaluators=[accuracy_evaluator]
)
summary_evaluators
Sequence[SUMMARY_EVALUATOR_T] | None
Evaluators that run on the entire dataset after all examples are processed.
def dataset_summary(runs, examples):
total = len(runs)
successful = sum(1 for r in runs if not r.error)
return EvaluationResult(
key="success_rate",
score=successful / total
)
evaluate(
target,
data="dataset",
summary_evaluators=[dataset_summary]
)
experiment_prefix
Prefix for the experiment name. The full name will be {prefix}-{hash}.
evaluate(
target,
data="dataset",
experiment_prefix="gpt-4-baseline"
)
# Creates experiment named "gpt-4-baseline-a1b2c3"
description
Free-form description of the experiment.
evaluate(
target,
data="dataset",
description="Testing new prompt template with chain-of-thought"
)
metadata
Metadata to attach to the experiment.
evaluate(
target,
data="dataset",
metadata={
"model": "gpt-4",
"temperature": 0.7,
"version": "v2"
}
)
max_concurrency
Maximum number of concurrent evaluations. 0 for sequential, None for unlimited.
# Run 10 examples concurrently
evaluate(
target,
data="dataset",
max_concurrency=10
)
num_repetitions
Number of times to run each example. Useful for testing variance. Default is 1.
# Run each example 3 times
evaluate(
target,
data="dataset",
num_repetitions=3
)
client
LangSmith client to use. Uses the default client if not provided.
blocking
Whether to block until evaluation completes. Default is True.
# Non-blocking evaluation
results = evaluate(
target,
data="dataset",
blocking=False
)
# Continue other work...
results.wait() # Block later
upload_results
Whether to upload results to LangSmith. Default is True.
error_handling
How to handle errors in individual runs:
"log": Include failed runs in results (default)
"ignore": Skip failed runs entirely
evaluate(
target,
data="dataset",
error_handling="ignore" # Skip failures
)
Return value
Object containing experiment results and metrics.
ExperimentResults properties
results = evaluate(target, data="dataset", evaluators=[...])
# Access metrics
print(results.metrics) # Aggregate scores
# Get experiment info
print(results.experiment_name)
print(results.experiment_url) # Link to view in UI
# Access individual runs
for run in results.runs:
print(run.id, run.outputs)
# Wait for completion (if blocking=False)
results.wait()
experiment_name
Name of the created experiment.
experiment_url
URL to view the experiment in the LangSmith UI.
metrics
Aggregated evaluation metrics (averages, counts, etc.).
runs
List of all runs from the evaluation.
Examples
Evaluate a function
from langsmith import evaluate
from langsmith.evaluation import EvaluationResult
def qa_function(inputs: dict) -> dict:
# Your QA logic
return {"answer": generate_answer(inputs["question"])}
def correctness_evaluator(run, example):
prediction = run.outputs.get("answer", "")
reference = example.outputs.get("answer", "")
return EvaluationResult(
key="correctness",
score=1.0 if prediction.lower() == reference.lower() else 0.0
)
results = evaluate(
qa_function,
data="qa-dataset",
evaluators=[correctness_evaluator],
experiment_prefix="baseline"
)
print(f"Accuracy: {results.metrics['correctness']}")
Evaluate with LangChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langsmith import evaluate
prompt = ChatPromptTemplate.from_template(
"Answer this question: {question}"
)
model = ChatOpenAI(model="gpt-3.5-turbo")
chain = prompt | model
evaluate(
chain,
data="qa-dataset",
evaluators=[correctness_evaluator],
experiment_prefix="gpt-3.5"
)
Compare experiments
from langsmith import evaluate
from langsmith.evaluation import ComparisonEvaluationResult
def pairwise_evaluator(runs, example):
"""Compare two runs and pick the better one."""
run_a, run_b = runs
# Your comparison logic
better = run_a if len(run_a.outputs["answer"]) > len(run_b.outputs["answer"]) else run_b
return ComparisonEvaluationResult(
key="preference",
scores={
run_a.id: 1.0 if better == run_a else 0.0,
run_b.id: 1.0 if better == run_b else 0.0
}
)
# Compare two existing experiments
results = evaluate(
("experiment-1-id", "experiment-2-id"),
evaluators=[pairwise_evaluator]
)
Custom dataset inline
evaluate(
my_function,
data=[
Example(
inputs={"query": "What is LangSmith?"},
outputs={"answer": "A platform for LLM development"}
),
Example(
inputs={"query": "What is LangChain?"},
outputs={"answer": "A framework for LLM applications"}
)
],
evaluators=[accuracy_evaluator]
)
evaluate_existing
Re-evaluate an existing experiment with new evaluators.
from langsmith import evaluate_existing
# Run new evaluators on an existing experiment
results = evaluate_existing(
"experiment-id-or-url",
evaluators=[new_evaluator],
summary_evaluators=[dataset_summary]
)
ID or URL of the existing experiment.
Evaluators to run on the existing runs.
summary_evaluators
Sequence[SUMMARY_EVALUATOR_T]
Summary evaluators for the dataset.
See the evaluators documentation for details on creating custom evaluators.