import verifiers as vf
from datasets import Dataset
import numpy as np
class TicTacToeEnv(vf.MultiTurnEnv):
    """Multi-turn tic-tac-toe game between the model and a scripted opponent.

    The board is a 3x3 integer NumPy array kept in ``state["board"]``:
    ``1`` marks the model's pieces (X), ``-1`` the environment's pieces (O)
    and ``0`` an empty cell. The model always moves first. Once the game is
    decided, ``state["winner"]`` is set to ``"model"``, ``"environment"`` or
    ``"draw"``, which makes :meth:`game_ended` report that the rollout is over.
    """

    # Order in which the scripted opponent tries cells:
    # center first, then the corners, then the edges.
    _MOVE_PREFERENCE = (
        (1, 1),
        (0, 0), (0, 2), (2, 0), (2, 2),
        (0, 1), (1, 0), (1, 2), (2, 1),
    )

    def __init__(self, **kwargs):
        """Create the environment.

        All keyword arguments are forwarded to ``vf.MultiTurnEnv``.
        ``max_turns`` defaults to 9 but may be overridden by the caller;
        using ``setdefault`` avoids a duplicate-keyword ``TypeError`` when
        the caller passes ``max_turns`` explicitly.
        """
        kwargs.setdefault("max_turns", 9)
        super().__init__(**kwargs)

    async def setup_state(self, state: vf.State) -> vf.State:
        """Initialize an empty board and reset the game bookkeeping."""
        state["board"] = np.zeros((3, 3), dtype=int)
        state["current_player"] = 1  # 1 = X (model), -1 = O (environment)
        state["winner"] = None
        return await super().setup_state(state)

    async def env_response(
        self,
        messages: vf.Messages,
        state: vf.State,
    ) -> vf.Messages:
        """Apply the model's move, then answer with the environment's move.

        Args:
            messages: Conversation so far; the last entry is expected to hold
                the model's move as ``<row>r</row><col>c</col>``.
            state: Rollout state containing ``"board"`` and ``"winner"``.

        Returns:
            A single-element list with one user message: an error prompt if
            the move was malformed, out of range or on an occupied cell (the
            board is left untouched in that case), the final result plus the
            board if the game just ended, or the rendered board otherwise.
        """
        # Parse the text of the latest message; the parser works on a string,
        # not on the whole message list.
        last_msg = messages[-1]["content"]
        parsed = self.parser.parse(last_msg)
        try:
            row, col = int(parsed.row), int(parsed.col)
        except (ValueError, TypeError, AttributeError):
            # TypeError covers a missing tag, where the field is None and
            # int(None) fails; ValueError covers non-numeric text.
            return [{"role": "user", "content": "Invalid format. Use <row>0</row><col>0</col>."}]

        if not (0 <= row < 3 and 0 <= col < 3):
            return [{"role": "user", "content": "Invalid position. Use 0-2 for row and col."}]

        board = state["board"]
        if board[row, col] != 0:
            return [{"role": "user", "content": "That position is taken. Try again."}]

        # Apply model's move
        board[row, col] = 1

        # Check win/draw
        if self.check_winner(board) == 1:
            state["winner"] = "model"
            return [{"role": "user", "content": f"You win!\n{self.render_board(board)}"}]
        if np.all(board != 0):
            state["winner"] = "draw"
            return [{"role": "user", "content": f"Draw!\n{self.render_board(board)}"}]

        # Environment's move (simple strategy). At least one cell is free
        # here because the full-board case returned above.
        env_row, env_col = self.make_env_move(board)
        board[env_row, env_col] = -1

        # Check if environment won
        if self.check_winner(board) == -1:
            state["winner"] = "environment"
            return [{"role": "user", "content": f"I win!\n{self.render_board(board)}"}]

        # Game continues
        return [{"role": "user", "content": self.render_board(board)}]

    @vf.stop
    async def game_ended(self, state: vf.State) -> bool:
        """Return True once a winner or a draw has been recorded."""
        return state.get("winner") is not None

    def check_winner(self, board):
        """Return 1 if X has three in a line, -1 if O has, otherwise 0.

        A line sums to +3 or -3 only when all three of its cells belong to
        the same player, so the first cell of such a line names the winner.
        The result is converted to a plain ``int``.
        """
        for i in range(3):
            if abs(board[i, :].sum()) == 3:
                return int(board[i, 0])
            if abs(board[:, i].sum()) == 3:
                return int(board[0, i])
        if abs(board.diagonal().sum()) == 3:
            return int(board[0, 0])
        if abs(np.fliplr(board).diagonal().sum()) == 3:
            return int(board[0, 2])
        return 0

    def make_env_move(self, board):
        """Pick the environment's move: center, then corners, then edges.

        Returns:
            ``(row, col)`` of the first empty cell in preference order.

        Raises:
            ValueError: If the board has no empty cell, instead of silently
                returning ``None``.
        """
        for r, c in self._MOVE_PREFERENCE:
            if board[r, c] == 0:
                return r, c
        raise ValueError("No empty cells left on the board.")

    def render_board(self, board):
        """Render the board as three lines of space-separated symbols.

        Empty cells are shown as ``.``, the model's pieces as ``X`` and the
        environment's pieces as ``O``.
        """
        symbols = {0: ".", 1: "X", -1: "O"}
        return "\n".join(
            " ".join(symbols[int(cell)] for cell in row) for row in board
        )
# Load environment
def load_environment(num_examples: int = 100):
    """Build the tic-tac-toe environment with its dataset, parser and rubric.

    Args:
        num_examples: Number of identical opening prompts to put in the
            dataset. Defaults to 100, the previously hard-coded size.

    Returns:
        A ``TicTacToeEnv`` configured with an XML parser for the ``row`` and
        ``col`` fields and a rubric of two reward functions, each weighted
        1.0: ``model_won`` (1.0 when the model wins) and ``draw_bonus``
        (0.5 when the game is a draw).

    Raises:
        ValueError: If ``num_examples`` is smaller than 1.
    """
    if num_examples < 1:
        raise ValueError("num_examples must be a positive integer.")

    opening = "Let's play tic-tac-toe. You are X. Make your move using <row>0</row><col>0</col> format."
    dataset = Dataset.from_list([
        {"prompt": [{"role": "user", "content": opening}]}
        for _ in range(num_examples)
    ])

    parser = vf.XMLParser(["row", "col"])

    async def model_won(state) -> float:
        """Reward 1.0 when the recorded winner is the model."""
        return 1.0 if state.get("winner") == "model" else 0.0

    async def draw_bonus(state) -> float:
        """Reward 0.5 when the game ended in a draw."""
        return 0.5 if state.get("winner") == "draw" else 0.0

    rubric = vf.Rubric(
        funcs=[model_won, draw_bonus],
        weights=[1.0, 1.0],
        parser=parser,
    )
    return TicTacToeEnv(dataset=dataset, parser=parser, rubric=rubric)