Datasets are collections of examples used to evaluate, test, and improve your LLM applications. Each example contains inputs and optionally expected outputs, forming a ground truth for testing.

What is a dataset?

A dataset in LangSmith contains:
  • Examples - Individual test cases with inputs and outputs
  • Metadata - Information about the dataset (description, creation date)
  • Schemas - Optional input/output schemas for validation
  • Splits - Groups like “train” and “test” for organizing examples

Dataset types

LangSmith supports three dataset types:

Key-value (kv)

General-purpose datasets with arbitrary input/output dictionaries:
{
  "inputs": {
    "question": "What is the capital of France?",
    "context": "France is a country in Europe."
  },
  "outputs": {
    "answer": "Paris",
    "confidence": 0.95
  }
}

LLM

For testing language model completions:
{
  "inputs": {
    "prompt": "Complete this sentence: The capital of France is"
  },
  "outputs": {
    "completion": "Paris."
  }
}

Chat

For testing conversational applications:
{
  "inputs": {
    "messages": [
      {"role": "user", "content": "What's the weather?"}
    ]
  },
  "outputs": {
    "messages": [
      {"role": "assistant", "content": "I can't check real-time weather."}
    ]
  }
}
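
The create_example API shown in the next section works for chat datasets as well; a minimal sketch, assuming the message payload shape mirrors the JSON above (the dataset name chat-eval is illustrative, and chat datasets may enforce a stricter schema server-side):

from langsmith import Client

client = Client()

# Create a chat-type dataset and add one conversational example
chat_dataset = client.create_dataset(
    dataset_name="chat-eval",
    data_type="chat",
)

client.create_example(
    inputs={"messages": [{"role": "user", "content": "What's the weather?"}]},
    outputs={"messages": [{"role": "assistant", "content": "I can't check real-time weather."}]},
    dataset_id=chat_dataset.id,
)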

Creating datasets

From scratch

from langsmith import Client

client = Client()

# Create a dataset
dataset = client.create_dataset(
    dataset_name="qa-evaluation",
    description="Question answering test cases",
    data_type="kv",  # or "llm" or "chat"
)

print(f"Created dataset: {dataset.id}")

Adding examples

# Add examples one at a time
client.create_example(
    inputs={"question": "What is LangSmith?"},
    outputs={"answer": "LangSmith is a platform for LLM observability"},
    dataset_id=dataset.id,
)

# Add multiple examples
examples = [
    {
        "inputs": {"question": "What is tracing?"},
        "outputs": {"answer": "Tracing captures execution flow"},
    },
    {
        "inputs": {"question": "What are evaluators?"},
        "outputs": {"answer": "Evaluators score LLM outputs"},
    },
]

client.create_examples(
    inputs=[e["inputs"] for e in examples],
    outputs=[e["outputs"] for e in examples],
    dataset_id=dataset.id,
)

From CSV or existing data

import pandas as pd

# From a pandas DataFrame
df = pd.DataFrame({
    "question": ["What is AI?", "What is ML?"],
    "answer": ["Artificial Intelligence", "Machine Learning"],
})

# Create dataset from DataFrame
dataset = client.create_dataset(
    dataset_name="from-csv",
    data_type="kv",
)

for _, row in df.iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["answer"]},
        dataset_id=dataset.id,
    )

# Or create the dataset first, then bulk-add examples with create_examples
dataset = client.create_dataset(
    dataset_name="from-list",
    data_type="kv",
)

client.create_examples(
    inputs=df["question"].apply(lambda x: {"question": x}).tolist(),
    outputs=df["answer"].apply(lambda x: {"answer": x}).tolist(),
    dataset_id=dataset.id,
)
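
If your data already lives in a CSV file, the client's upload_csv helper creates the dataset and its examples in one call; a minimal sketch, assuming a hypothetical examples.csv file with question and answer columns:

# Upload a CSV directly; input_keys/output_keys name the CSV columns
dataset = client.upload_csv(
    csv_file="examples.csv",  # hypothetical file name
    input_keys=["question"],
    output_keys=["answer"],
    name="from-csv-upload",
    description="Examples uploaded from CSV",
)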

From production runs

Capture real usage patterns:
# Get runs from a production project
runs = client.list_runs(
    project_name="production-app",
    filter='and(eq(feedback_key, "user_rating"), gte(feedback_score, 4))',
)

# Create dataset from high-quality runs
dataset = client.create_dataset(
    dataset_name="golden-examples",
    description="High-rated production examples",
)

for run in runs:
    client.create_example(
        inputs=run.inputs,
        outputs=run.outputs,
        dataset_id=dataset.id,
        source_run_id=run.id,  # Link back to original run
    )

Managing datasets

Reading datasets

# List all datasets
datasets = client.list_datasets()
for dataset in datasets:
    print(f"{dataset.name}: {dataset.example_count} examples")

# Get a specific dataset
dataset = client.read_dataset(dataset_name="qa-evaluation")
print(f"Dataset: {dataset.name}")
print(f"Created: {dataset.created_at}")
print(f"Examples: {dataset.example_count}")

# List examples in a dataset
examples = client.list_examples(dataset_name="qa-evaluation")
for example in examples:
    print(f"Input: {example.inputs}")
    print(f"Output: {example.outputs}")

Updating examples

# Update an example
example = next(client.list_examples(dataset_name="qa-evaluation"))

client.update_example(
    example_id=example.id,
    inputs={"question": "Updated question"},
    outputs={"answer": "Updated answer"},
    metadata={"reviewed": True, "reviewer": "alice"},
)

Deleting datasets and examples

# Delete an example
client.delete_example(example_id=example.id)

# Delete a dataset (and all its examples)
client.delete_dataset(dataset_name="qa-evaluation")

Dataset splits

Organize examples into splits for training and testing:
# Add examples with splits
client.create_example(
    inputs={"question": "Training question"},
    outputs={"answer": "Training answer"},
    dataset_id=dataset.id,
    split="train",
)

client.create_example(
    inputs={"question": "Test question"},
    outputs={"answer": "Test answer"},
    dataset_id=dataset.id,
    split="test",
)

# Filter examples by split
train_examples = client.list_examples(
    dataset_name="qa-evaluation",
    splits=["train"],
)

test_examples = client.list_examples(
    dataset_name="qa-evaluation",
    splits=["test"],
)

Using datasets in evaluations

Datasets integrate seamlessly with the evaluation framework:
def my_application(inputs: dict) -> dict:
    # Your application logic; process() is a stand-in for your own pipeline
    return {"answer": process(inputs["question"])}

# Evaluate using a dataset
results = client.evaluate(
    my_application,
    data="qa-evaluation",  # Use dataset name
    evaluators=[accuracy_evaluator, relevance_evaluator],
)

# Or use dataset ID
results = client.evaluate(
    my_application,
    data=dataset.id,
    evaluators=[accuracy_evaluator],
)

# Or filter to specific splits
results = client.evaluate(
    my_application,
    data=client.list_examples(
        dataset_name="qa-evaluation",
        splits=["test"],
    ),
    evaluators=[accuracy_evaluator],
)
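
The evaluators passed above (accuracy_evaluator, relevance_evaluator) are ordinary functions. A minimal sketch of one, assuming the run/example callback signature and the answer keys used in this dataset:

def accuracy_evaluator(run, example) -> dict:
    # Compare the application's answer against the reference answer
    predicted = (run.outputs or {}).get("answer", "")
    expected = (example.outputs or {}).get("answer", "")
    return {"key": "accuracy", "score": int(predicted == expected)}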

Dataset metadata and versioning

Add rich metadata to track dataset evolution:
# Create dataset with metadata
dataset = client.create_dataset(
    dataset_name="qa-v2",
    description="Updated Q&A dataset with more examples",
    metadata={
        "version": "2.0",
        "created_by": "data-team",
        "source": "production-logs",
        "quality_threshold": 4.0,
    },
)

# Update dataset metadata
client.update_dataset(
    dataset_id=dataset.id,
    description="Enhanced Q&A dataset",
    metadata={
        **dataset.metadata,
        "last_updated": "2024-03-15",
        "num_reviews": 100,
    },
)

Best practices

  • Start small and iterate - Begin with 10-20 carefully curated examples. Add more as you identify edge cases.
  • Use diverse examples - Include examples that cover different scenarios, edge cases, and potential failure modes.
  • Capture production data - Regularly add high-quality production examples to keep your dataset realistic and up-to-date.
  • Version your datasets - Use naming conventions (e.g., qa-v1, qa-v2) or metadata to track dataset versions.

Be cautious about including sensitive data in datasets. Review and sanitize production data before adding it (a redaction sketch follows below).
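
One way to sanitize is to drop known-sensitive fields before upload; a minimal sketch with a hypothetical blocklist, reusing the runs and dataset from the production-runs section above (adapt the field names to your data):

SENSITIVE_KEYS = {"email", "phone", "user_id"}  # hypothetical field names

def redact(payload: dict) -> dict:
    # Remove sensitive keys before the example is uploaded
    return {k: v for k, v in payload.items() if k not in SENSITIVE_KEYS}

for run in runs:
    client.create_example(
        inputs=redact(run.inputs),
        outputs=redact(run.outputs or {}),
        dataset_id=dataset.id,
        source_run_id=run.id,
    )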

Next steps

  • Use datasets in evaluations to assess quality
  • Understand tracing to see how examples are processed
  • Learn about dataset versioning in the LangSmith UI
