Tool environments extend multi-turn interaction by giving models access to external functions. They’re essential for agents that need to search, calculate, execute code, or interact with APIs.
Overview
Verifiers provides three tool environment types:
| Environment | Use Case | Tools |
|---|---|---|
| ToolEnv | Stateless, idempotent tools | Python functions |
| MCPEnv | MCP server tools | MCP protocol servers |
| StatefulToolEnv | Tools requiring per-rollout state | Python functions with hidden args |
ToolEnv is designed for simple, stateless tools where each call is independent. Tools are Python functions with type hints and docstrings.
Tools are extracted from function signatures:
async def calculate(expression: str) -> str:
    """Evaluate a mathematical expression.

    Args:
        expression: A mathematical expression to evaluate (e.g. "2 + 2 * 3")

    Returns:
        The result of the evaluation.
    """
    # SECURITY: eval() on model-supplied text is arbitrary code execution.
    # Walk the AST instead and allow only arithmetic on numeric literals.
    import ast
    import operator

    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }

    def _eval(node):
        # Recursively reduce the expression tree; anything outside the
        # whitelist (calls, names, attributes, ...) raises ValueError.
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.operand))
        raise ValueError(f"Unsupported expression: {expression!r}")

    try:
        result = _eval(ast.parse(expression, mode="eval"))
        return str(result)
    except Exception as e:
        # Surface failures to the model as text so it can recover.
        return f"Error: {e}"
async def lookup(term: str) -> str:
    """Look up a term in the knowledge base.

    Args:
        term: The term to search for.

    Returns:
        Information about the term.
    """
    # EAFP: a missing key is the expected miss case.
    try:
        return knowledge_base[term]
    except KeyError:
        return "Term not found"
Key components:
- Function name → tool name
- Type hints → parameter types
- Docstring → tool description
- Args section → parameter descriptions
Always use async def for tools to avoid blocking the event loop, even if the function doesn’t await anything internally.
import verifiers as vf
async def search_pages(query: str) -> list[dict]:
    """Search for relevant articles.

    Args:
        query: The search query.

    Returns:
        List of dicts with page_id and title.
    """
    # Your search logic
    # NOTE(review): `results` is undefined in this snippet — it is a
    # placeholder for your retrieval backend's output, expected to be a
    # JSON-serializable list of {"page_id": ..., "title": ...} dicts.
    return results
async def read_page(page_id: str) -> str:
    """Read a page's content.

    Args:
        page_id: The ID of the page to read.

    Returns:
        The page content.
    """
    # Your read logic
    # NOTE(review): `content` is undefined in this snippet — it is a
    # placeholder for the page text fetched from your storage layer.
    return content
def load_environment():
    """Build a ToolEnv wiring the search/read tools to an LLM-judged rubric."""

    async def correct_answer(completion, answer, judge) -> float:
        # Judge only the final assistant message against the reference answer.
        final_response = completion[-1]["content"]
        verdict = await judge(final_response, answer)
        return float("correct" in verdict.lower())

    rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")
    rubric.add_reward_func(correct_answer)

    return vf.ToolEnv(
        dataset=load_dataset("my-questions", split="train"),
        tools=[search_pages, read_page],
        rubric=rubric,
        max_turns=10,
    )
prime env install my-tool-env
prime eval run my-tool-env -m gpt-4.1-mini -n 10
Real Example: Wiki Search
Let’s examine the wiki-search environment:
environments/wiki_search/wiki_search.py
import asyncio
import os
import chromadb
from datasets import load_dataset
from openai import AsyncOpenAI
import verifiers as vf
from verifiers.rubrics.judge_rubric import JudgeRubric
def load_environment(
    max_turns: int = 10,
    judge_model: str = "gpt-4.1-mini",
    corpus_dataset: str = "willcb/rare-wiki-pages",
    judge_base_url: str | None = None,
    **kwargs,
):
    """Build the wiki-search ToolEnv: embedding search over page titles plus
    section reads, scored by an LLM judge.

    Args:
        max_turns: Maximum model/tool turns per rollout.
        judge_model: Model name used by the judge rubric.
        corpus_dataset: HF dataset id holding the wiki page corpus.
        judge_base_url: Optional OpenAI-compatible base URL for the judge
            client (fix: the original snippet referenced this name without
            defining it).
        **kwargs: Accepted for forward compatibility; unused here.
    """
    # Load corpus into memory for O(1) page lookup by id.
    corpus = load_dataset(corpus_dataset, split="train")
    page_id_to_content = {row["id"]: row["content"] for row in corpus}

    # Initialize ChromaDB for embedding search.
    # Fix: `embedding_functions` lives in chromadb.utils and was never imported.
    from chromadb.utils import embedding_functions

    # OpenAIEmbeddingFunction reads OPENAI_API_KEY from the environment.
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )
    client = chromadb.PersistentClient(path=".chroma_db")
    collection = client.get_or_create_collection(
        name="wiki_titles",
        embedding_function=openai_ef,
    )

    # --- Tools ------------------------------------------------------------
    async def search_pages(query: str) -> list[dict]:
        """Search for top 10 relevant articles using title embedding similarity.

        Args:
            query: The query to search for.

        Returns:
            List of dicts with page_id and title.
        """
        # collection.query is blocking; run it off the event loop.
        results = await asyncio.to_thread(
            collection.query, query_texts=[query], n_results=10
        )
        return [
            {
                "page_id": results["ids"][0][i],
                "title": results["metadatas"][0][i]["title"],
            }
            for i in range(len(results["ids"][0]))
        ]

    async def read_section(section_id: str) -> str:
        """Read a section of a page.

        Args:
            section_id: The ID of the section to read (format: page_id:section_name).

        Returns:
            The content of the section.
        """
        page_id, section_name = section_id.split(":", 1)
        # Return a recoverable error string instead of raising on bad ids.
        if page_id not in page_id_to_content:
            return f"Error: unknown page_id {page_id!r}"
        content = page_id_to_content[page_id]
        # Fix: the original returned an undefined `section_content` name.
        # Placeholder extraction — return the full page until corpus-specific
        # section parsing (by `section_name`) is implemented.
        section_content = content  # TODO: extract only `section_name`
        return section_content

    tools = [search_pages, read_section]

    # --- Judge rubric -----------------------------------------------------
    judge_client = AsyncOpenAI(
        base_url=judge_base_url, api_key=os.environ["OPENAI_API_KEY"]
    )
    judge_rubric = JudgeRubric(
        judge_client=judge_client,
        judge_model=judge_model,
        parser=vf.Parser(),
    )

    async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
        # Binary reward: judge replies containing "yes" count as correct.
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    judge_rubric.add_reward_func(judge_reward_func, weight=1.0)

    dataset = load_dataset("willcb/wiki-trivia-questions-v4", split="train")

    return vf.ToolEnv(
        dataset=dataset,
        system_prompt="Use the provided Wikipedia search tools to help answer questions.",
        rubric=judge_rubric,
        tools=tools,
        max_turns=max_turns,
    )
Key features:
- Vector search with ChromaDB embeddings
- Async tool execution for performance
- LLM-as-judge for evaluation
- Modular tool design (search → read)
MCPEnv integrates with Model Context Protocol (MCP) servers, allowing you to use any MCP-compatible tool server.
Basic MCP Setup
import verifiers as vf
# Each entry describes one MCP server subprocess (stdio transport):
# name → tool namespace, command/args → how to launch the server.
mcp_servers = [
    {
        # HTTP fetch tool server, launched via uv's tool runner.
        "name": "fetch",
        "command": "uvx",
        "args": ["mcp-server-fetch"],
    },
    {
        # Filesystem server scoped to /tmp, launched via npx.
        "name": "filesystem",
        "command": "npx",
        "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"],
    },
]
def load_environment():
    """Assemble an MCPEnv from the module-level `mcp_servers` configuration."""
    task_dataset = load_dataset("my-tasks", split="train")
    task_rubric = vf.Rubric(funcs=[my_reward_func])
    env = vf.MCPEnv(
        mcp_servers=mcp_servers,
        dataset=task_dataset,
        rubric=task_rubric,
        max_turns=15,
    )
    return env
The environment automatically:
- Starts MCP server processes
- Connects via stdio
- Discovers available tools
- Handles tool calls
- Manages server lifecycle
MCP Server Configuration
from verifiers.envs.experimental.mcp_env import MCPServerConfig
# Typed configuration objects instead of raw dicts; fields mirror the dict form.
mcp_servers = [
    MCPServerConfig(
        name="my-server",
        command="uvx",
        args=["mcp-server-package"],
        env={"API_KEY": "your-key"},  # Environment variables
        description="My custom MCP server",
    )
]
StatefulToolEnv is for tools that need per-rollout state (e.g., sandbox containers, database sessions, game state).
Concept: Hidden Arguments
Some tool parameters should be injected by the environment but hidden from the model:
class MySandboxEnv(vf.StatefulToolEnv):
    # Demonstrates the hidden-argument pattern: the model's tool schema shows
    # run_code(code) only, while the environment injects sandbox_id from
    # per-rollout state before execution.

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Add tool with hidden 'sandbox_id' parameter: args_to_skip removes it
        # from the schema shown to the model.
        self.add_tool(self.run_code, args_to_skip=["sandbox_id"])

    async def setup_state(self, state, **kwargs):
        """Initialize sandbox for this rollout."""
        # Runs once per rollout before any tool call; the sandbox id lives in
        # `state` for update_tool_args to pick up.
        state["sandbox_id"] = await create_sandbox()
        return await super().setup_state(state, **kwargs)

    def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
        """Inject hidden arguments before tool execution."""
        # Invoked for every tool call; only run_code needs augmentation.
        if tool_name == "run_code":
            tool_args["sandbox_id"] = state["sandbox_id"]
        return tool_args

    async def run_code(self, code: str, sandbox_id: str) -> str:
        """Execute code in the sandbox.

        Args:
            code: The Python code to execute.

        Returns:
            The output of the code execution.
        """
        # sandbox_id is deliberately absent from the Args section above: the
        # docstring is the model-facing description and the parameter is hidden.
        return await execute_in_sandbox(sandbox_id, code)
The model sees: run_code(code: str)
The environment calls: run_code(code=user_input, sandbox_id=state["sandbox_id"])
Built-in Stateful Environments
Verifiers includes production-ready stateful environments:
import verifiers as vf
def load_environment():
    """Create a SandboxEnv (2 CPU cores, 4 GB RAM, 60-minute timeout)."""
    bash_dataset = load_dataset("my-bash-tasks", split="train")
    bash_rubric = vf.Rubric(funcs=[my_reward])
    return vf.SandboxEnv(
        dataset=bash_dataset,
        rubric=bash_rubric,
        max_turns=20,
        cpu_cores=2,
        memory_gb=4,
        timeout_minutes=60,
    )
PythonEnv provides a persistent Python REPL with:
- Package installation via `pip_install_packages`
- Sandboxed execution via Prime Sandboxes
- Automatic resource cleanup
Sandbox Configuration
Both SandboxEnv and PythonEnv accept detailed configuration:
# PythonEnv: persistent REPL running inside a Prime sandbox sized as below.
vf_env = vf.PythonEnv(
    dataset=dataset,
    rubric=rubric,
    # Sandbox resources
    cpu_cores=2,
    memory_gb=4,
    disk_size_gb=10,
    gpu_count=0,  # no GPU needed for plain Python tasks
    timeout_minutes=60,
    # Python-specific
    pip_install_packages="numpy pandas matplotlib",  # space-separated package list
    max_startup_wait_seconds=120,  # allow time for sandbox boot + pip installs
    # Optional labels for organization
    labels=["experiment-1", "math-tasks"],
)
Control how tool errors are handled:
# stop_errors lists exception types that abort the rollout; any other tool
# error is formatted and echoed back to the model so it can retry.
vf_env = vf.ToolEnv(
    tools=[my_tool],
    stop_errors=[vf.ToolParseError],  # Stop rollout on parse errors
    error_formatter=lambda e: f"Tool error: {e}",  # shapes the message the model sees
    max_turns=10,
)
When a tool error occurs:
- If the error type is in `stop_errors` → the rollout stops immediately
- Otherwise → error message returned to model (chance to recover)
Error hierarchy:
vf.ToolError
├── vf.ToolParseError # Failed to parse tool call JSON
└── vf.ToolCallError # Tool execution failed
Tool environments automatically track:
- `total_tool_calls` — Total number of tool invocations
- `{tool_name}_calls` — Per-tool call counts
Example output:
{
"reward": 0.85,
"correct_answer": 0.85,
"num_turns": 7,
"total_tool_calls": 12,
"search_pages_calls": 3,
"read_section_calls": 9
}
Advanced Patterns
Add tools after initialization:
# Tools can be registered or dropped after the environment is constructed.
env = vf.ToolEnv(dataset=dataset, tools=[tool1], rubric=rubric)
env.add_tool(tool2)
env.remove_tool(tool1)
Tools can return various types:
# String (most common)
async def get_weather(city: str) -> str:
    # Assemble the reply from fixed fragments around the city name.
    fragments = ("Weather in ", city, ": sunny, 72°F")
    return "".join(fragments)
# Structured data (serialized to JSON)
async def search(query: str) -> list[dict]:
    hit = dict(title="...", url="...")
    return [hit]
# Complex types (serialized automatically)
async def analyze(text: str) -> dict:
    report = {"sentiment": 0.8}
    report["entities"] = [...]
    return report
# Define tool groups
search_tools = [search_pages, view_sections, read_section]
calculation_tools = [calculate, solve_equation]

# ToolEnv takes a flat list, so groups compose by simple concatenation.
vf_env = vf.ToolEnv(
    dataset=dataset,
    tools=search_tools + calculation_tools,
    rubric=rubric,
    max_turns=15,
)
Common Patterns
Rate Limiting
import asyncio
from collections import defaultdict
# One semaphore per endpoint, created lazily on first use; at most five
# in-flight calls to any single endpoint.
rate_limiter = defaultdict(lambda: asyncio.Semaphore(5))  # 5 concurrent calls

async def rate_limited_api_call(endpoint: str, **params) -> str:
    """Call external API with rate limiting."""
    # The semaphore is released when the context exits, even on exceptions.
    # NOTE(review): `external_api` is a placeholder client, not defined here.
    async with rate_limiter[endpoint]:
        response = await external_api.call(endpoint, **params)
        return response
from functools import lru_cache
@lru_cache(maxsize=1000)
def get_page_content(page_id: str) -> str:
    """Cached page content lookup."""
    # Bounded LRU (1000 entries) keeps memory in check across rollouts.
    # NOTE(review): `database` is a placeholder, not defined in this snippet.
    return database.get(page_id)

async def read_page(page_id: str) -> str:
    """Read a page (cached)."""
    # NOTE(review): assumes database.get is fast; if it does real I/O,
    # wrap the call in asyncio.to_thread to avoid blocking the event loop.
    return get_page_content(page_id)
async def search_and_summarize(query: str) -> str:
    """Search and return a summary.

    Args:
        query: Search query.

    Returns:
        Summary of top result.
    """
    # Guard clause: nothing to summarize when the search comes back empty.
    results = await search_pages(query)
    if not results:
        return "No results found"
    page_text = await read_page(results[0]["page_id"])
    return await summarize(page_text)
import asyncio
async def test_tools():
    # Smoke-test the tools directly, outside any environment or rollout.
    result = await search_pages("python programming")
    print(f"Found {len(result)} pages")
    content = await read_page(result[0]["page_id"])
    print(f"Content length: {len(content)}")

asyncio.run(test_tools())
prime eval run my-tool-env -m gpt-4.1-mini -n 2 -v
Verbose mode (`-v`) shows:
- Tool calls with arguments
- Tool results
- Timing information
prime eval run my-tool-env -m gpt-4.1-mini -n 10 -s
The summary (`-s`) reports:
- Average tool calls per rollout
- Which tools are most frequently used
- Tool call success rates
Next Steps