Tool environments extend multi-turn interaction by giving models access to external functions. They’re essential for agents that need to search, calculate, execute code, or interact with APIs.

Overview

Verifiers provides three tool environment types:
Environment        Use Case                            Tools
ToolEnv            Stateless, idempotent tools         Python functions
MCPEnv             MCP server tools                    MCP protocol servers
StatefulToolEnv    Tools requiring per-rollout state   Python functions with hidden args

ToolEnv: Stateless Tools

ToolEnv is designed for simple, stateless tools where each call is independent. Tools are Python functions with type hints and docstrings.

Defining Tools

Tools are extracted from function signatures:
async def calculate(expression: str) -> str:
    """Evaluate a mathematical expression.
    
    Args:
        expression: A mathematical expression to evaluate (e.g. "2 + 2 * 3")
    
    Returns:
        The result of the evaluation.
    """
    try:
        result = eval(expression)
        return str(result)
    except Exception as e:
        return f"Error: {e}"

async def lookup(term: str) -> str:
    """Look up a term in the knowledge base.
    
    Args:
        term: The term to search for.
    
    Returns:
        Information about the term.
    """
    # Your lookup logic here
    return knowledge_base.get(term, "Term not found")
Key components:
  • Function name → tool name
  • Type hints → parameter types
  • Docstring → tool description
  • Args section → parameter descriptions
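Verifiers uses these pieces to build the tool schema that the model sees. As a rough illustration (not the library's exact output), the schema for a tool like lookup is approximately the standard OpenAI function-calling format:

# Approximate shape of the schema exposed for lookup().
# Illustrative only; the exact JSON ToolEnv produces may differ in detail.
lookup_schema = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a term in the knowledge base.",
        "parameters": {
            "type": "object",
            "properties": {
                "term": {
                    "type": "string",
                    "description": "The term to search for.",
                },
            },
            "required": ["term"],
        },
    },
}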
Always use async def for tools to avoid blocking the event loop, even if the function doesn’t await anything internally.
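If a tool has to call a blocking library, one option is to push the blocking work onto a worker thread with asyncio.to_thread. A minimal sketch, where fetch_from_database is a hypothetical synchronous helper:

import asyncio

def fetch_from_database(term: str) -> str:
    # Hypothetical blocking helper (e.g. a synchronous DB or HTTP client).
    ...

async def lookup(term: str) -> str:
    """Look up a term in the knowledge base.

    Args:
        term: The term to search for.

    Returns:
        Information about the term.
    """
    # Run the blocking call in a worker thread so the event loop stays responsive.
    return await asyncio.to_thread(fetch_from_database, term)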

Creating a Tool Environment

1. Define your tools
import verifiers as vf
from datasets import load_dataset

async def search_pages(query: str) -> list[dict]:
    """Search for relevant articles.
    
    Args:
        query: The search query.
    
    Returns:
        List of dicts with page_id and title.
    """
    # Your search logic
    return results

async def read_page(page_id: str) -> str:
    """Read a page's content.
    
    Args:
        page_id: The ID of the page to read.
    
    Returns:
        The page content.
    """
    # Your read logic
    return content
2. Create the environment
def load_environment():
    dataset = load_dataset("my-questions", split="train")
    
    async def correct_answer(judge, prompt, completion, answer, state) -> float:
        verdict = await judge(prompt, completion, answer, state)
        return 1.0 if "correct" in verdict.lower() else 0.0
    
    judge_rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")
    judge_rubric.add_reward_func(correct_answer)
    
    return vf.ToolEnv(
        dataset=dataset,
        tools=[search_pages, read_page],
        rubric=judge_rubric,
        max_turns=10,
    )
3. Run evaluation
prime env install my-tool-env
prime eval run my-tool-env -m gpt-4.1-mini -n 10
Let’s examine the wiki-search environment:
environments/wiki_search/wiki_search.py
import asyncio
import os
import chromadb
from chromadb.utils import embedding_functions
from datasets import load_dataset
from openai import AsyncOpenAI
import verifiers as vf
from verifiers.rubrics.judge_rubric import JudgeRubric

def load_environment(
    max_turns: int = 10,
    judge_model: str = "gpt-4.1-mini",
    judge_base_url: str = "https://api.openai.com/v1",
    corpus_dataset: str = "willcb/rare-wiki-pages",
    **kwargs,
):
    # Load corpus into memory
    corpus = load_dataset(corpus_dataset, split="train")
    page_id_to_content = {row["id"]: row["content"] for row in corpus}
    
    # Initialize ChromaDB for embedding search
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-3-small",
    )
    client = chromadb.PersistentClient(path=".chroma_db")
    collection = client.get_or_create_collection(
        name="wiki_titles",
        embedding_function=openai_ef,
    )
    
    # Define tools
    async def search_pages(query: str) -> list[dict]:
        """Search for top 10 relevant articles using title embedding similarity.
        
        Args:
            query: The query to search for.
        
        Returns:
            List of dicts with page_id and title.
        """
        results = await asyncio.to_thread(
            collection.query, query_texts=[query], n_results=10
        )
        return [
            {"page_id": results["ids"][0][i], "title": results["metadatas"][0][i]["title"]}
            for i in range(len(results["ids"][0]))
        ]
    
    async def read_section(section_id: str) -> str:
        """Read a section of a page.
        
        Args:
            section_id: The ID of the section to read (format: page_id:section_name).
        
        Returns:
            The content of the section.
        """
        page_id, section_name = section_id.split(":", 1)
        content = page_id_to_content[page_id]
        # Extract section...
        return section_content
    
    tools = [search_pages, read_section]
    
    # Setup judge rubric
    judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=os.environ["OPENAI_API_KEY"])
    judge_rubric = JudgeRubric(
        judge_client=judge_client,
        judge_model=judge_model,
        parser=vf.Parser(),
    )
    
    async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0
    
    judge_rubric.add_reward_func(judge_reward_func, weight=1.0)
    
    dataset = load_dataset("willcb/wiki-trivia-questions-v4", split="train")
    
    return vf.ToolEnv(
        dataset=dataset,
        system_prompt="Use the provided Wikipedia search tools to help answer questions.",
        rubric=judge_rubric,
        tools=tools,
        max_turns=max_turns,
    )
Key features:
  • Vector search with ChromaDB embeddings
  • Async tool execution for performance
  • LLM-as-judge for evaluation
  • Modular tool design (search → read)
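One detail the excerpt omits is indexing: the collection has to be populated before search_pages can return results. A minimal sketch, placed inside load_environment after the collection is created, assuming each corpus row also carries a title field:

    # Populate the collection once, if it is empty (sketch; assumes a "title" column).
    if collection.count() == 0:
        collection.add(
            ids=[row["id"] for row in corpus],
            documents=[row["title"] for row in corpus],
            metadatas=[{"title": row["title"]} for row in corpus],
        )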

MCPEnv: MCP Server Tools

MCPEnv integrates with Model Context Protocol (MCP) servers, allowing you to use any MCP-compatible tool server.

Basic MCP Setup

import verifiers as vf
from datasets import load_dataset

mcp_servers = [
    {
        "name": "fetch",
        "command": "uvx",
        "args": ["mcp-server-fetch"],
    },
    {
        "name": "filesystem",
        "command": "npx",
        "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"],
    },
]

def load_environment():
    dataset = load_dataset("my-tasks", split="train")
    rubric = vf.Rubric(funcs=[my_reward_func])
    
    return vf.MCPEnv(
        mcp_servers=mcp_servers,
        dataset=dataset,
        rubric=rubric,
        max_turns=15,
    )
The environment automatically:
  • Starts MCP server processes
  • Connects via stdio
  • Discovers available tools
  • Handles tool calls
  • Manages server lifecycle
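The my_reward_func placeholder above can be any ordinary reward function; a minimal hypothetical example that checks the final response against the expected answer:

def my_reward_func(completion, answer, **kwargs) -> float:
    """Reward 1.0 if the expected answer appears in the final assistant message."""
    response = completion[-1]["content"]
    return 1.0 if answer.lower() in response.lower() else 0.0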

MCP Server Configuration

from verifiers.envs.experimental.mcp_env import MCPServerConfig

mcp_servers = [
    MCPServerConfig(
        name="my-server",
        command="uvx",
        args=["mcp-server-package"],
        env={"API_KEY": "your-key"},  # Environment variables
        description="My custom MCP server",
    )
]
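Rather than hardcoding secrets, the env mapping can be filled from the parent process environment. Same fields as above; MY_SERVER_API_KEY is a hypothetical variable name:

import os
from verifiers.envs.experimental.mcp_env import MCPServerConfig

mcp_servers = [
    MCPServerConfig(
        name="my-server",
        command="uvx",
        args=["mcp-server-package"],
        # Forward the key from the parent process instead of hardcoding it.
        env={"API_KEY": os.environ["MY_SERVER_API_KEY"]},
        description="My custom MCP server",
    )
]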

StatefulToolEnv: Persistent State

StatefulToolEnv is for tools that need per-rollout state (e.g., sandbox containers, database sessions, game state).

Concept: Hidden Arguments

Some tool parameters should be injected by the environment but hidden from the model:
class MySandboxEnv(vf.StatefulToolEnv):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Add tool with hidden 'sandbox_id' parameter
        self.add_tool(self.run_code, args_to_skip=["sandbox_id"])
    
    async def setup_state(self, state, **kwargs):
        """Initialize sandbox for this rollout."""
        state["sandbox_id"] = await create_sandbox()
        return await super().setup_state(state, **kwargs)
    
    def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
        """Inject hidden arguments before tool execution."""
        if tool_name == "run_code":
            tool_args["sandbox_id"] = state["sandbox_id"]
        return tool_args
    
    async def run_code(self, code: str, sandbox_id: str) -> str:
        """Execute code in the sandbox.
        
        Args:
            code: The Python code to execute.
        
        Returns:
            The output of the code execution.
        """
        return await execute_in_sandbox(sandbox_id, code)
The model sees: run_code(code: str)
The environment calls: run_code(code=user_input, sandbox_id=state["sandbox_id"])

Built-in Stateful Environments

Verifiers includes production-ready stateful environments. For example, SandboxEnv runs each rollout's tool calls inside an isolated sandbox:
import verifiers as vf
from datasets import load_dataset

def load_environment():
    dataset = load_dataset("my-bash-tasks", split="train")
    rubric = vf.Rubric(funcs=[my_reward])
    
    return vf.SandboxEnv(
        dataset=dataset,
        rubric=rubric,
        max_turns=20,
        cpu_cores=2,
        memory_gb=4,
        timeout_minutes=60,
    )
PythonEnv provides a persistent Python REPL with:
  • Package installation via pip_install_packages
  • Sandboxed execution via Prime Sandboxes
  • Automatic resource cleanup

Sandbox Configuration

Both SandboxEnv and PythonEnv accept detailed configuration:
vf_env = vf.PythonEnv(
    dataset=dataset,
    rubric=rubric,
    # Sandbox resources
    cpu_cores=2,
    memory_gb=4,
    disk_size_gb=10,
    gpu_count=0,
    timeout_minutes=60,
    # Python-specific
    pip_install_packages="numpy pandas matplotlib",
    max_startup_wait_seconds=120,
    # Optional labels for organization
    labels=["experiment-1", "math-tasks"],
)

Tool Error Handling

Control how tool errors are handled:
vf_env = vf.ToolEnv(
    tools=[my_tool],
    stop_errors=[vf.ToolParseError],  # Stop rollout on parse errors
    error_formatter=lambda e: f"Tool error: {e}",
    max_turns=10,
)
When a tool error occurs:
  • If error type is in stop_errors → rollout stops immediately
  • Otherwise → error message returned to model (chance to recover)
Error hierarchy:
vf.ToolError
├── vf.ToolParseError    # Failed to parse tool call JSON
└── vf.ToolCallError     # Tool execution failed
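Because error_formatter receives the raised exception, it can branch on these types to give the model more actionable feedback. A small sketch:

def format_tool_error(e: Exception) -> str:
    # Give the model a specific hint when the tool call itself was malformed.
    if isinstance(e, vf.ToolParseError):
        return f"Could not parse tool call ({e}). Emit valid JSON arguments and retry."
    return f"Tool error: {e}"

vf_env = vf.ToolEnv(
    tools=[my_tool],
    error_formatter=format_tool_error,
    max_turns=10,
)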

Tool Metrics

Tool environments automatically track:
  • total_tool_calls — Total number of tool invocations
  • {tool_name}_calls — Per-tool call counts
Example output:
{
  "reward": 0.85,
  "correct_answer": 0.85,
  "num_turns": 7,
  "total_tool_calls": 12,
  "search_pages_calls": 3,
  "read_section_calls": 9
}
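These per-rollout metrics can be aggregated after an eval run; a small sketch, assuming each rollout's metrics are available as a dict shaped like the example above:

def average_tool_calls(rollouts: list[dict]) -> float:
    """Mean number of tool calls per rollout."""
    if not rollouts:
        return 0.0
    return sum(r["total_tool_calls"] for r in rollouts) / len(rollouts)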

Advanced Patterns

Dynamic Tool Registration

Add tools after initialization:
env = vf.ToolEnv(dataset=dataset, tools=[tool1], rubric=rubric)
env.add_tool(tool2)
env.remove_tool(tool1)

Tool Return Types

Tools can return various types:
# String (most common)
async def get_weather(city: str) -> str:
    return f"Weather in {city}: sunny, 72°F"

# Structured data (serialized to JSON)
async def search(query: str) -> list[dict]:
    return [{"title": "...", "url": "..."}]

# Complex types (serialized automatically)
async def analyze(text: str) -> dict:
    return {"sentiment": 0.8, "entities": [...]}

Combining Multiple Tool Sets

# Define tool groups
search_tools = [search_pages, view_sections, read_section]
calculation_tools = [calculate, solve_equation]

vf_env = vf.ToolEnv(
    dataset=dataset,
    tools=search_tools + calculation_tools,
    rubric=rubric,
    max_turns=15,
)

Common Patterns

Rate Limiting

import asyncio
from collections import defaultdict

rate_limiter = defaultdict(lambda: asyncio.Semaphore(5))  # 5 concurrent calls

async def rate_limited_api_call(endpoint: str, **params) -> str:
    """Call external API with rate limiting."""
    async with rate_limiter[endpoint]:
        response = await external_api.call(endpoint, **params)
        return response
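Note that a semaphore caps concurrency, not request rate. If the API enforces a requests-per-second limit, spacing calls by a minimum interval works; a sketch reusing the same hypothetical external_api client:

import asyncio
import time

_min_interval = 0.2  # at most ~5 requests per second
_last_call = 0.0
_interval_lock = asyncio.Lock()

async def throttled_api_call(endpoint: str, **params) -> str:
    """Call the external API, spacing request starts at least _min_interval apart."""
    global _last_call
    async with _interval_lock:
        wait = _min_interval - (time.monotonic() - _last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _last_call = time.monotonic()
    return await external_api.call(endpoint, **params)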

Caching Tool Results

from functools import lru_cache

@lru_cache(maxsize=1000)
def get_page_content(page_id: str) -> str:
    """Cached page content lookup."""
    return database.get(page_id)

async def read_page(page_id: str) -> str:
    """Read a page (cached)."""
    return get_page_content(page_id)
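lru_cache works here because get_page_content is synchronous. Decorating an async function with lru_cache would cache the coroutine object itself, which fails on a second await; for async tools a small explicit cache is safer. A sketch, where fetch_page is a hypothetical async fetch:

_page_cache: dict[str, str] = {}

async def read_page_cached(page_id: str) -> str:
    """Read a page, caching results for the lifetime of the process."""
    if page_id not in _page_cache:
        _page_cache[page_id] = await fetch_page(page_id)  # hypothetical async fetch
    return _page_cache[page_id]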

Tool Chaining

async def search_and_summarize(query: str) -> str:
    """Search and return a summary.
    
    Args:
        query: Search query.
    
    Returns:
        Summary of top result.
    """
    results = await search_pages(query)
    if not results:
        return "No results found"
    
    top_result = results[0]
    content = await read_page(top_result["page_id"])
    summary = await summarize(content)
    return summary

Testing Tool Environments

1. Test tools independently
import asyncio

async def test_tools():
    result = await search_pages("python programming")
    print(f"Found {len(result)} pages")
    
    content = await read_page(result[0]["page_id"])
    print(f"Content length: {len(content)}")

asyncio.run(test_tools())
2. Run with debug mode

prime eval run my-tool-env -m gpt-4.1-mini -n 2 -v

This shows:
  • Tool calls with arguments
  • Tool results
  • Timing information
3. Check tool usage metrics

prime eval run my-tool-env -m gpt-4.1-mini -n 10 -s

Look for:
  • Average tool calls per rollout
  • Which tools are most frequently used
  • Tool call success rates
