Skip to main content

Overview

Verifiers uses the HuggingFace datasets library for loading and manipulating training and evaluation data. Each dataset row becomes a rollout during generation or evaluation.
from datasets import Dataset
import verifiers as vf

dataset = Dataset.from_list([
    {"prompt": [{"role": "user", "content": "What is 2+2?"}], "answer": "4"},
    {"prompt": [{"role": "user", "content": "What is 3*5?"}], "answer": "15"},
])

env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)

Dataset Schema

Required Columns

Datasets are automatically processed by the environment to include:
  • example_id - Integer ID for grouping rollouts (auto-generated if missing)
  • prompt - Messages to send to the model (list of chat messages)
  • task - Task identifier for routing in EnvGroup (defaults to env_id)

Optional Columns

  • answer - Ground truth for scoring (string)
  • info - Structured metadata (dict or JSON string)
dataset = Dataset.from_list([
    {
        "prompt": [{"role": "user", "content": "Solve: 2x + 3 = 7"}],
        "answer": "x = 2",
        "info": '{"difficulty": 3, "topic": "algebra"}'
    },
])
When using info, prefer JSON strings if rows have different schemas (different fields or nested structures). The environment automatically parses JSON strings into dicts during rollout initialization.

Building Prompts

Direct Prompt Construction

Provide a prompt column with pre-formatted chat messages:
dataset = Dataset.from_list([
    {
        "prompt": [
            {"role": "system", "content": "You are a math tutor."},
            {"role": "user", "content": "What is 2+2?"}
        ],
        "answer": "4"
    },
])

Question-Based Construction

Use a question column and let the environment wrap it:
dataset = Dataset.from_list([
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*5?", "answer": "15"},
])

# Automatically converts to:
# {"prompt": [{"role": "user", "content": "What is 2+2?"}], "answer": "4"}

System Prompts and Few-Shot Examples

Add system prompts and few-shot examples via environment parameters:
env = vf.SingleTurnEnv(
    dataset=dataset,
    system_prompt="You are a helpful math tutor.",
    few_shot=[
        {"role": "user", "content": "What is 1+1?"},
        {"role": "assistant", "content": "1+1 equals 2."}
    ],
    rubric=rubric
)
Result:
[
    {"role": "system", "content": "You are a helpful math tutor."},
    {"role": "user", "content": "What is 1+1?"},
    {"role": "assistant", "content": "1+1 equals 2."},
    {"role": "user", "content": "What is 2+2?"}
]
Behavior:
  • If dataset has prompt column: system_prompt is prepended (if not already present), few_shot is ignored
  • If dataset has question column: both system_prompt and few_shot are applied

Dataset Builders (Lazy Loading)

For large datasets or when running multiple environment replicas, defer dataset loading using a DatasetBuilder — a callable that returns a Dataset:
from datasets import Dataset, load_dataset
import verifiers as vf

def get_dataset_builder(split: str = "train", seed: int = 42) -> vf.DatasetBuilder:
    """Returns a builder that lazily loads the dataset."""
    def build() -> Dataset:
        ds = load_dataset("gsm8k", "main", split=split)
        ds = ds.shuffle(seed=seed)
        return ds
    return build

def load_environment():
    dataset_builder = get_dataset_builder(split="train")
    eval_builder = get_dataset_builder(split="test")
    
    return vf.SingleTurnEnv(
        dataset=dataset_builder,      # built on first access
        eval_dataset=eval_builder,    # built on first access
        rubric=rubric,
    )
When to use builders:
  • Dataset loading is expensive (e.g., downloading from HuggingFace Hub)
  • Multiple environment replicas would otherwise each load their own copy of the dataset
  • You want to parameterize dataset creation without loading immediately
Lazy loading behavior:
# Dataset not loaded yet
env = vf.SingleTurnEnv(dataset=dataset_builder, rubric=rubric)

# Triggers loading
train_ds = env.get_dataset()

# Already loaded, returns cached dataset
train_ds_again = env.get_dataset()
When a raw Dataset is passed (not a builder), it’s loaded eagerly during environment initialization for backwards compatibility.

Training vs Evaluation Datasets

Environments support separate datasets for training and evaluation:
train_ds = Dataset.from_list([...])
eval_ds = Dataset.from_list([...])

env = vf.SingleTurnEnv(
    dataset=train_ds,
    eval_dataset=eval_ds,
    rubric=rubric
)

# Uses train_ds
train_results = await env.generate(train_ds, client=client, model="gpt-4")

# Uses eval_ds
eval_results = await env.evaluate(client=client, model="gpt-4")
Fallback behavior:
  • If eval_dataset is not provided, evaluate() falls back to dataset
  • If neither is provided, environment initialization raises ValueError

Dataset Access Methods

get_dataset()

Retrieve the training dataset with optional sampling:
# Get full dataset
train_ds = env.get_dataset()

# Get first 100 examples
train_ds = env.get_dataset(n=100)

# Shuffle and get first 50
train_ds = env.get_dataset(n=50, seed=42)
Type signature:
def get_dataset(self, n: int = -1, seed: int | None = None) -> Dataset:
    ...

get_eval_dataset()

Retrieve the evaluation dataset:
# Get full eval dataset
eval_ds = env.get_eval_dataset()

# Get first 20 examples, shuffled
eval_ds = env.get_eval_dataset(n=20, seed=42)
Type signature:
def get_eval_dataset(self, n: int = -1, seed: int | None = None) -> Dataset:
    ...

Dataset Formatting

The environment automatically formats datasets during initialization:

Example ID Assignment

# Input dataset
dataset = Dataset.from_list([
    {"question": "What is 2+2?"},
    {"question": "What is 3*5?"},
])

# After formatting
# example_id column added: [0, 1]

Prompt Construction

# Input with question column
dataset = Dataset.from_list([{"question": "What is 2+2?"}])

env = vf.SingleTurnEnv(
    dataset=dataset,
    system_prompt="You are helpful.",
    rubric=rubric
)

# Formatted dataset has prompt column:
# {
#     "prompt": [
#         {"role": "system", "content": "You are helpful."},
#         {"role": "user", "content": "What is 2+2?"}
#     ]
# }

Task Assignment

env = vf.SingleTurnEnv(
    dataset=dataset,
    env_id="math-qa",
    rubric=rubric
)

# Each row gets task="math-qa"

Loading Example Datasets

Verifiers includes built-in example datasets:
import verifiers as vf

# Load example dataset
dataset = vf.load_example_dataset("gsm8k-sample")

env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)

Creating Datasets Programmatically

From Lists

from datasets import Dataset

data = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*5?", "answer": "15"},
]

dataset = Dataset.from_list(data)

From HuggingFace Hub

from datasets import load_dataset

dataset = load_dataset("gsm8k", "main", split="train")

From Pandas DataFrame

import pandas as pd
from datasets import Dataset

df = pd.DataFrame({
    "question": ["What is 2+2?", "What is 3*5?"],
    "answer": ["4", "15"]
})

dataset = Dataset.from_pandas(df)

Using make_dataset()

Static helper for creating datasets from rollout inputs:
import verifiers as vf

inputs = [
    {"prompt": [{"role": "user", "content": "Hello"}], "answer": "Hi"},
    {"prompt": [{"role": "user", "content": "Goodbye"}], "answer": "Bye"},
]

dataset = vf.Environment.make_dataset(inputs)

Dataset Transformations

Use the datasets library’s .map() for preprocessing:
from datasets import load_dataset

dataset = load_dataset("gsm8k", "main", split="train")

# Extract answer from GSM8K format
def extract_answer(example):
    answer_text = example["answer"].split("####")[1].strip()
    return {"answer": answer_text}

dataset = dataset.map(extract_answer)
Configure mapping parallelism:
env = vf.SingleTurnEnv(
    dataset=dataset,
    map_kwargs={"num_proc": 4},  # use 4 processes for .map()
    rubric=rubric
)

Info Column Patterns

Simple Metadata

dataset = Dataset.from_list([
    {
        "question": "What is 2+2?",
        "answer": "4",
        "info": {"difficulty": 1, "category": "arithmetic"}
    },
])

Heterogeneous Schemas (JSON Strings)

dataset = Dataset.from_list([
    {
        "question": "Solve: 2x + 3 = 7",
        "info": '{"type": "algebra", "variables": ["x"]}'
    },
    {
        "question": "What is the capital of France?",
        "info": '{"type": "geography", "country": "France"}'
    },
])

Tool Definitions in Info

Store per-example tool definitions:
dataset = Dataset.from_list([
    {
        "question": "Calculate the square root of 16",
        "info": {
            "tool_defs": [
                {
                    "name": "sqrt",
                    "description": "Calculate square root",
                    "parameters": {"type": "object", "properties": {"x": {"type": "number"}}}
                }
            ]
        }
    },
])
The environment automatically extracts and normalizes info["tool_defs"] during state initialization.

Dataset Persistence

Saving to Disk

dataset.save_to_disk("./my_dataset")

# Load later
from datasets import load_from_disk
dataset = load_from_disk("./my_dataset")

Pushing to HuggingFace Hub

dataset.push_to_hub("username/dataset-name")

# Load later
from datasets import load_dataset
dataset = load_dataset("username/dataset-name")

Rollouts Per Example

Generate multiple rollouts per dataset row for best-of-N sampling or diversity:
# Dataset has 100 examples
dataset = Dataset.from_list([...])  # 100 rows

# Generate 4 rollouts per example = 400 total rollouts
results = await env.evaluate(
    client=client,
    model="gpt-4",
    num_examples=100,
    rollouts_per_example=4
)

# Results grouped by example_id for pass@k metrics
Implementation:
# Environment internally repeats the dataset
if rollouts_per_example > 1:
    inputs = dataset.repeat(rollouts_per_example)

Example: Complete Dataset Pipeline

from datasets import load_dataset
import verifiers as vf

def get_dataset_builder():
    def build():
        # Load from HuggingFace
        ds = load_dataset("gsm8k", "main", split="train")
        
        # Extract answer
        def extract_answer(example):
            answer = example["answer"].split("####")[1].strip()
            return {"answer": answer}
        
        ds = ds.map(extract_answer)
        
        # Format question as prompt
        def format_question(example):
            return {
                "question": example["question"],
                "answer": example["answer"],
                "info": {"original_answer": example["answer"]}
            }
        
        ds = ds.map(format_question)
        
        return ds
    
    return build

def load_environment():
    dataset_builder = get_dataset_builder()
    
    async def correct_answer(completion, answer, parser) -> float:
        parsed = parser.parse_answer(completion)
        return 1.0 if parsed == answer else 0.0
    
    parser = vf.XMLParser(fields=["reasoning", "answer"])
    rubric = vf.Rubric(funcs=[correct_answer], parser=parser)
    
    return vf.SingleTurnEnv(
        dataset=dataset_builder,
        system_prompt="Solve the math problem step by step.",
        parser=parser,
        rubric=rubric,
        env_id="gsm8k"
    )
When using DatasetBuilder, ensure the builder function is deterministic if you need reproducible dataset ordering across runs. Use fixed seeds for shuffling operations.

Build docs developers (and LLMs) love