Configuration System
The benchmark uses a centralized configuration system in config.py with four main sections:
GameConfig
Board size, word counts, turn limits
LLMConfig
Temperature, tokens, timeouts, retries
OrchestratorConfig
Logging, validation, results storage
DataConfig
Word lists, file paths, caching
Game Configuration
Default Settings
from config import Config
# Use default configuration
config = Config.default()
# Access game settings
config.game.BOARD_SIZE # 25
config.game.BLUE_WORDS # 9
config.game.RED_WORDS # 8
config.game.NEUTRAL_WORDS # 7
config.game.BOMB_COUNT # 1
config.game.MAX_TURNS # 50
config.game.STARTING_TEAM # "BLUE"
Custom Board Sizes
Mini Game
Large Game
Custom Starting Team
from config import Config
# 9-word board for quick games
mini_config = Config.custom_game(board_size=9)
# Automatically calculates:
# - Blue: 4 words (starting team)
# - Red: 3 words
# - Neutral: 1 word
# - Bomb: 1 word
from config import Config
# 49-word board for extended games
large_config = Config.custom_game(board_size=49)
# Automatically calculates:
# - Blue: 17 words (starting team)
# - Red: 16 words
# - Neutral: 15 words
# - Bomb: 1 word
from config import Config
# Red team starts
red_start = Config.custom_game(
board_size=25,
starting_team="RED"
)
# Now red has 9 words, blue has 8
Board size must be odd for fair play. The system automatically validates and calculates proportional word distributions.
Hint Count Limits
from config import GameConfig
config = GameConfig(
MIN_HINT_COUNT=1, # Minimum hint number
MAX_HINT_COUNT=9, # Maximum hint number
)
Guess Limits
config.game.MAX_GUESSES_PER_TURN = None # Default: hint_count + 1
config.game.MAX_GUESSES_PER_TURN = 5 # Hard limit: 5 guesses
Setting MAX_GUESSES_PER_TURN=None allows hint_count + 1 guesses (standard Codenames rules).
LLM Configuration
Temperature Settings
from config import LLMConfig
llm_config = LLMConfig(
DEFAULT_TEMPERATURE=0.7, # Balance creativity/consistency
MIN_TEMPERATURE=0.0, # Deterministic
MAX_TEMPERATURE=2.0, # Maximum creativity
)
Temperature is configured per-model in model_config.py. Reasoning models (o-series) require temperature=1.0.
Response Settings
llm_config = LLMConfig(
MAX_TOKENS=1024, # Maximum response length
RESPONSE_TIMEOUT=30, # Timeout in seconds
)
Retry Logic
llm_config = LLMConfig(
MAX_RETRIES=3, # Number of retry attempts
RETRY_DELAY=1.0, # Base delay (exponential backoff)
)
Retry behavior (with RETRY_DELAY=1.0 and exponential backoff):
- First retry: 1 second
- Second retry: 2 seconds
- Third retry: 4 seconds
- Each with +0-2s jitter
Default Models
llm_config = LLMConfig(
OPENAI_DEFAULT_MODEL="gpt-4o-mini",
ANTHROPIC_DEFAULT_MODEL="claude-sonnet-4-5-20250929",
GEMINI_DEFAULT_MODEL="gemini-2.5-flash",
GROK_DEFAULT_MODEL="grok-beta",
DEEPSEEK_DEFAULT_MODEL="deepseek-chat",
LLAMA_DEFAULT_MODEL="llama-3.3-70b",
)
Orchestrator Configuration
Logging
from config import OrchestratorConfig
orchestrator_config = OrchestratorConfig(
VERBOSE_DEFAULT=True, # Print game progress
LOG_TO_FILE=False, # Save logs to file
LOG_FILE_PATH="game_logs/", # Log directory
)
Results Storage
orchestrator_config = OrchestratorConfig(
SAVE_RESULTS=True, # Save game results
RESULTS_PATH="game_results/", # Results directory
RESULT_FORMAT="json", # Format: json, csv, or both
)
Timing
orchestrator_config = OrchestratorConfig(
TURN_DELAY=0.0, # Delay between turns (for demos)
GUESS_DELAY=0.0, # Delay between guesses (for demos)
)
Validation
orchestrator_config = OrchestratorConfig(
STRICT_VALIDATION=True, # Enforce strict rules
ALLOW_INVALID_HINTS=False, # Reject hints matching board words
)
Disabling STRICT_VALIDATION can lead to invalid game states. Use only for testing.
Data Configuration
Word Lists
from config import DataConfig
data_config = DataConfig(
WORD_LIST_PATH="utils/words.csv",
MIN_WORDS_IN_POOL=100, # Minimum words needed
CACHE_WORD_LIST=True, # Cache in memory
)
File Paths
data_config = DataConfig(
ENV_FILE=".env",
ENV_EXAMPLE_FILE=".env.example",
)
Reproducibility
data_config = DataConfig(
DEMO_RANDOM_SEED=42, # Set for reproducible demos
)
Using Configuration
In GameRunner
from config import Config
from orchestrator import GameRunner
config = Config.custom_game(board_size=49)
runner = GameRunner(
board=board,
blue_hint_giver=blue_hint,
blue_guesser=blue_guess,
red_hint_giver=red_hint,
red_guesser=red_guess,
max_turns=config.game.MAX_TURNS,
verbose=config.orchestrator.VERBOSE_DEFAULT,
config=config.orchestrator,
llm_config=config.llm,
)
In Board Creation
from game import Board
from config import Config
config = Config.custom_game(board_size=49)
words = generate_word_list(config.game.BOARD_SIZE)
board = Board(words, config=config.game)
In Benchmarks
from benchmark import BenchmarkRunner
from config import Config
config = Config.default()
config.game.MAX_TURNS = 30 # Shorter games
runner = BenchmarkRunner(
games_per_combination=5,
verbose=True
)
runner.config = config
result = runner.run()
Complete Configuration Example
from config import Config, GameConfig, LLMConfig, OrchestratorConfig
# Create custom configuration
config = Config()
# Customize game settings
config.game = GameConfig(
BOARD_SIZE=25,
BLUE_WORDS=9,
RED_WORDS=8,
NEUTRAL_WORDS=7,
BOMB_COUNT=1,
MAX_TURNS=40,
MAX_GUESSES_PER_TURN=None,
STARTING_TEAM="BLUE",
MIN_HINT_COUNT=1,
MAX_HINT_COUNT=9,
)
# Customize LLM settings
config.llm = LLMConfig(
DEFAULT_TEMPERATURE=0.7,
MAX_TOKENS=1024,
RESPONSE_TIMEOUT=30,
MAX_RETRIES=3,
RETRY_DELAY=1.0,
)
# Customize orchestrator
config.orchestrator = OrchestratorConfig(
VERBOSE_DEFAULT=True,
SAVE_RESULTS=True,
RESULTS_PATH="my_results/",
RESULT_FORMAT="json",
STRICT_VALIDATION=True,
)
# Validate configuration
config.validate()
# Export to dict
config_dict = config.to_dict()
Configuration Validation
The system automatically validates configurations:
from config import GameConfig
# Valid configuration
valid_config = GameConfig(
BOARD_SIZE=25,
BLUE_WORDS=9,
RED_WORDS=8,
NEUTRAL_WORDS=7,
BOMB_COUNT=1,
)
valid_config.validate() # Returns True
# Invalid configuration
invalid_config = GameConfig(
BOARD_SIZE=26, # Even number!
BLUE_WORDS=10,
RED_WORDS=10,
NEUTRAL_WORDS=5,
BOMB_COUNT=1,
)
invalid_config.validate() # Raises ValueError
Validation checks:
- Board size is odd
- Board size ≥ 9
- Word counts sum to board size
- Hint counts are valid
Model-Specific Configuration
Model settings are in model_config.py:
from model_config import MODEL_CONFIGS, get_model_config
from agents.llm import BAMLModel
# Get model configuration
config = get_model_config(BAMLModel.GPT5_MINI)
print(config) # {'temperature': 0.7}
# O-series models have restrictions
config = get_model_config(BAMLModel.O3_MINI)
print(config) # {'temperature': 1.0}
See Model Selection for details.
Environment Variables
Configuration also supports environment variables:
# API Keys
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
GOOGLE_API_KEY=AIza...
# Optional: Override defaults
MAX_TURNS=30
VERBOSE=true
Best Practices
Use Config.default() for standard games
config = Config.default()
Only customize when you need non-standard behavior.
Validate custom configurations
config = Config()
config.game = GameConfig(...)
config.validate() # Always validate!
Store configurations with results
result = runner.run()
config_dict = config.to_dict()
# Save together
output = {
'result': result.to_dict(),
'config': config_dict,
}
Use custom_game() for variant sizes
# Better than manual configuration
mini = Config.custom_game(board_size=9)
# Instead of:
manual = Config()
manual.game.BOARD_SIZE = 9
manual.game.BLUE_WORDS = ... # Error-prone!
Next Steps
Running Games
Apply your configuration to games
Model Selection
Configure model-specific settings
Benchmarking
Use configuration in benchmarks
Cost Management
Review cost configurations