Datasets are collections of examples used to evaluate, test, and improve your LLM applications. Each example contains inputs and, optionally, expected outputs that serve as ground truth for testing.
## What is a dataset?
A dataset in LangSmith contains:
- Examples - Individual test cases with inputs and outputs
- Metadata - Information about the dataset (description, creation date)
- Schemas - Optional input/output schemas for validation (see the sketch after this list)
- Splits - Groups like “train” and “test” for organizing examples
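The optional schemas are standard JSON Schema documents attached at creation time. A minimal sketch, assuming your SDK version supports the `inputs_schema` and `outputs_schema` parameters on `create_dataset` (the dataset name is illustrative):

```python
from langsmith import Client

client = Client()

# Hypothetical dataset name; the inputs_schema/outputs_schema parameters
# are assumed to be supported by your langsmith SDK version.
dataset = client.create_dataset(
    dataset_name="qa-with-schemas",
    inputs_schema={
        "type": "object",
        "properties": {"question": {"type": "string"}},
        "required": ["question"],
    },
    outputs_schema={
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
)
```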
## Dataset types
LangSmith supports three dataset types:
### Key-value (kv)
General-purpose datasets with arbitrary input/output dictionaries:
```json
{
  "inputs": {
    "question": "What is the capital of France?",
    "context": "France is a country in Europe."
  },
  "outputs": {
    "answer": "Paris",
    "confidence": 0.95
  }
}
```
### LLM
For testing language model completions:
```json
{
  "inputs": {
    "prompt": "Complete this sentence: The capital of France is"
  },
  "outputs": {
    "completion": "Paris."
  }
}
```
### Chat
For testing conversational applications:
```json
{
  "inputs": {
    "messages": [
      {"role": "user", "content": "What's the weather?"}
    ]
  },
  "outputs": {
    "messages": [
      {"role": "assistant", "content": "I can't check real-time weather."}
    ]
  }
}
```
## Creating datasets
### From scratch
```python
from langsmith import Client

client = Client()

# Create a dataset
dataset = client.create_dataset(
    dataset_name="qa-evaluation",
    description="Question answering test cases",
    data_type="kv",  # or "llm" or "chat"
)
print(f"Created dataset: {dataset.id}")
```
### Adding examples
```python
# Add examples one at a time
client.create_example(
    inputs={"question": "What is LangSmith?"},
    outputs={"answer": "LangSmith is a platform for LLM observability"},
    dataset_id=dataset.id,
)

# Add multiple examples
examples = [
    {
        "inputs": {"question": "What is tracing?"},
        "outputs": {"answer": "Tracing captures execution flow"},
    },
    {
        "inputs": {"question": "What are evaluators?"},
        "outputs": {"answer": "Evaluators score LLM outputs"},
    },
]
client.create_examples(
    inputs=[e["inputs"] for e in examples],
    outputs=[e["outputs"] for e in examples],
    dataset_id=dataset.id,
)
```
### From CSV or existing data
```python
import pandas as pd

# From a pandas DataFrame (in practice this might come from pd.read_csv)
df = pd.DataFrame({
    "question": ["What is AI?", "What is ML?"],
    "answer": ["Artificial Intelligence", "Machine Learning"],
})

# Create a dataset from the DataFrame, one example at a time
dataset = client.create_dataset(
    dataset_name="from-csv",
    data_type="kv",
)
for _, row in df.iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["answer"]},
        dataset_id=dataset.id,
    )

# Or add all examples in one bulk call
dataset = client.create_dataset(
    dataset_name="from-list",
    data_type="kv",
)
client.create_examples(
    inputs=df["question"].apply(lambda x: {"question": x}).tolist(),
    outputs=df["answer"].apply(lambda x: {"answer": x}).tolist(),
    dataset_id=dataset.id,
)
```
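If your data lives in an actual CSV file, the SDK also exposes `Client.upload_csv`, which creates the dataset and its examples in a single call. A sketch, assuming a hypothetical `qa_examples.csv` with `question` and `answer` columns (check your SDK version for the exact signature):

```python
# "qa_examples.csv" is a hypothetical file with "question" and "answer" columns.
dataset = client.upload_csv(
    csv_file="qa_examples.csv",
    input_keys=["question"],
    output_keys=["answer"],
    name="from-csv-upload",
    description="Examples uploaded directly from a CSV file",
)
```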
### From production runs
Capture real usage patterns:
```python
# Get high-rated runs from a production project
runs = client.list_runs(
    project_name="production-app",
    filter='and(eq(feedback_key, "user_rating"), gte(feedback_score, 4))',
)

# Create a dataset from those high-quality runs
dataset = client.create_dataset(
    dataset_name="golden-examples",
    description="High-rated production examples",
)
for run in runs:
    client.create_example(
        inputs=run.inputs,
        outputs=run.outputs,
        dataset_id=dataset.id,
        source_run_id=run.id,  # Link back to the original run
    )
```
## Managing datasets
### Reading datasets
```python
# List all datasets
datasets = client.list_datasets()
for dataset in datasets:
    print(f"{dataset.name}: {dataset.example_count} examples")

# Get a specific dataset
dataset = client.read_dataset(dataset_name="qa-evaluation")
print(f"Dataset: {dataset.name}")
print(f"Created: {dataset.created_at}")
print(f"Examples: {dataset.example_count}")

# List examples in a dataset
examples = client.list_examples(dataset_name="qa-evaluation")
for example in examples:
    print(f"Input: {example.inputs}")
    print(f"Output: {example.outputs}")
```
### Updating examples
```python
# Fetch an example, then update it
example = next(client.list_examples(dataset_name="qa-evaluation"))
client.update_example(
    example_id=example.id,
    inputs={"question": "Updated question"},
    outputs={"answer": "Updated answer"},
    metadata={"reviewed": True, "reviewer": "alice"},
)
```
### Deleting datasets and examples
```python
# Delete a single example
client.delete_example(example_id=example.id)

# Delete a dataset (and all of its examples)
client.delete_dataset(dataset_name="qa-evaluation")
```
## Dataset splits
Organize examples into splits for training and testing:
```python
# Add examples with splits
client.create_example(
    inputs={"question": "Training question"},
    outputs={"answer": "Training answer"},
    dataset_id=dataset.id,
    split="train",
)
client.create_example(
    inputs={"question": "Test question"},
    outputs={"answer": "Test answer"},
    dataset_id=dataset.id,
    split="test",
)

# Filter examples by split
train_examples = client.list_examples(
    dataset_name="qa-evaluation",
    splits=["train"],
)
test_examples = client.list_examples(
    dataset_name="qa-evaluation",
    splits=["test"],
)
```
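When uploading many examples at once, you can assign splits programmatically rather than by hand. A minimal sketch (the example data is illustrative) that routes roughly 80% of examples to `train` and the rest to `test`:

```python
import random

# Illustrative data; in practice these would be your real test cases.
examples = [
    {"inputs": {"question": f"Question {i}"}, "outputs": {"answer": f"Answer {i}"}}
    for i in range(50)
]

for ex in examples:
    client.create_example(
        inputs=ex["inputs"],
        outputs=ex["outputs"],
        dataset_id=dataset.id,
        split="train" if random.random() < 0.8 else "test",  # ~80/20 split
    )
```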
## Using datasets in evaluations
Datasets integrate seamlessly with the evaluation framework:
```python
def my_application(inputs: dict) -> dict:
    # Your application logic
    return {"answer": process(inputs["question"])}

# Evaluate using a dataset name
results = client.evaluate(
    my_application,
    data="qa-evaluation",
    evaluators=[accuracy_evaluator, relevance_evaluator],
)

# Or use the dataset ID
results = client.evaluate(
    my_application,
    data=dataset.id,
    evaluators=[accuracy_evaluator],
)

# Or filter to specific splits
results = client.evaluate(
    my_application,
    data=client.list_examples(
        dataset_name="qa-evaluation",
        splits=["test"],
    ),
    evaluators=[accuracy_evaluator],
)
```
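The evaluators above (`accuracy_evaluator`, `relevance_evaluator`) are referenced but not defined. One minimal way to define `accuracy_evaluator`, assuming an SDK version whose `evaluate` accepts plain functions with `outputs` and `reference_outputs` parameters; exact-match scoring is a deliberately simple stand-in:

```python
def accuracy_evaluator(outputs: dict, reference_outputs: dict) -> dict:
    # Exact-match comparison against the dataset's reference answer.
    # Real evaluators often use fuzzy matching or an LLM judge instead.
    score = float(outputs.get("answer") == reference_outputs.get("answer"))
    return {"key": "accuracy", "score": score}
```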
## Dataset metadata
Add rich metadata to track dataset evolution:
```python
# Create a dataset with metadata
dataset = client.create_dataset(
    dataset_name="qa-v2",
    description="Updated Q&A dataset with more examples",
    metadata={
        "version": "2.0",
        "created_by": "data-team",
        "source": "production-logs",
        "quality_threshold": 4.0,
    },
)

# Update dataset metadata
client.update_dataset(
    dataset_id=dataset.id,
    description="Enhanced Q&A dataset",
    metadata={
        **dataset.metadata,
        "last_updated": "2024-03-15",
        "num_reviews": 100,
    },
)
```
## Best practices
- **Start small and iterate.** Begin with 10-20 carefully curated examples. Add more as you identify edge cases.
- **Use diverse examples.** Include examples that cover different scenarios, edge cases, and potential failure modes.
- **Capture production data.** Regularly add high-quality production examples to keep your dataset realistic and up to date.
- **Version your datasets.** Use naming conventions (e.g., `qa-v1`, `qa-v2`) or metadata to track dataset versions.
Be cautious about including sensitive data in datasets. Review and sanitize production data before adding it.
## Next steps
- Use datasets in evaluations to assess quality
- Understand tracing to see how examples are processed
- Learn about dataset versioning in the LangSmith UI