Skip to main content

Configuration System

The benchmark uses a centralized configuration system in config.py with four main sections:

GameConfig

Board size, word counts, turn limits

LLMConfig

Temperature, tokens, timeouts, retries

OrchestratorConfig

Logging, validation, results storage

DataConfig

Word lists, file paths, caching

Game Configuration

Default Settings

from config import Config

# Use default configuration
config = Config.default()

# Access game settings
config.game.BOARD_SIZE        # 25
config.game.BLUE_WORDS         # 9
config.game.RED_WORDS          # 8
config.game.NEUTRAL_WORDS      # 7
config.game.BOMB_COUNT         # 1
config.game.MAX_TURNS          # 50
config.game.STARTING_TEAM      # "BLUE"

Custom Board Sizes

from config import Config

# 9-word board for quick games
mini_config = Config.custom_game(board_size=9)

# Automatically calculates:
# - Blue: 4 words (starting team)
# - Red: 3 words
# - Neutral: 1 word
# - Bomb: 1 word
Board size must be odd for fair play. The system validates this automatically and calculates a proportional word distribution for each team.

Hint Count Limits

from config import GameConfig

config = GameConfig(
    MIN_HINT_COUNT=1,   # Minimum hint number
    MAX_HINT_COUNT=9,   # Maximum hint number
)

Guess Limits

config.game.MAX_GUESSES_PER_TURN = None  # Default: hint_count + 1
config.game.MAX_GUESSES_PER_TURN = 5     # Hard limit: 5 guesses
Setting MAX_GUESSES_PER_TURN=None allows hint_count + 1 guesses (standard Codenames rules).

LLM Configuration

Temperature Settings

from config import LLMConfig

llm_config = LLMConfig(
    DEFAULT_TEMPERATURE=0.7,  # Balance creativity/consistency
    MIN_TEMPERATURE=0.0,      # Deterministic
    MAX_TEMPERATURE=2.0,      # Maximum creativity
)
Temperature is configured per-model in model_config.py. Reasoning models (o-series) require temperature=1.0.

Response Settings

llm_config = LLMConfig(
    MAX_TOKENS=1024,           # Maximum response length
    RESPONSE_TIMEOUT=30,       # Timeout in seconds
)

Retry Logic

llm_config = LLMConfig(
    MAX_RETRIES=3,             # Number of retry attempts
    RETRY_DELAY=1.0,           # Base delay (exponential backoff)
)
Retry behavior (exponential backoff, doubling each attempt):
  1. First retry: 5 seconds
  2. Second retry: 10 seconds
  3. Third retry: 20 seconds
  4. Each retry adds 0–2 seconds of random jitter
NOTE(review): the 5/10/20-second example implies a 5-second base delay, which does not match RETRY_DELAY=1.0 in the snippet above (that base would yield delays of 1 s, 2 s, and 4 s) — confirm against the implementation and align the example.

Default Models

llm_config = LLMConfig(
    OPENAI_DEFAULT_MODEL="gpt-4o-mini",
    ANTHROPIC_DEFAULT_MODEL="claude-sonnet-4-5-20250929",
    GEMINI_DEFAULT_MODEL="gemini-2.5-flash",
    GROK_DEFAULT_MODEL="grok-beta",
    DEEPSEEK_DEFAULT_MODEL="deepseek-chat",
    LLAMA_DEFAULT_MODEL="llama-3.3-70b",
)

Orchestrator Configuration

Logging

from config import OrchestratorConfig

orchestrator_config = OrchestratorConfig(
    VERBOSE_DEFAULT=True,          # Print game progress
    LOG_TO_FILE=False,             # Save logs to file
    LOG_FILE_PATH="game_logs/",   # Log directory
)

Results Storage

orchestrator_config = OrchestratorConfig(
    SAVE_RESULTS=True,                # Save game results
    RESULTS_PATH="game_results/",    # Results directory
    RESULT_FORMAT="json",             # Format: json, csv, or both
)

Timing

orchestrator_config = OrchestratorConfig(
    TURN_DELAY=0.0,    # Delay between turns (for demos)
    GUESS_DELAY=0.0,   # Delay between guesses (for demos)
)

Validation

orchestrator_config = OrchestratorConfig(
    STRICT_VALIDATION=True,      # Enforce strict rules
    ALLOW_INVALID_HINTS=False,   # Reject hints matching board words
)
Disabling STRICT_VALIDATION can lead to invalid game states. Use only for testing.

Data Configuration

Word Lists

from config import DataConfig

data_config = DataConfig(
    WORD_LIST_PATH="utils/words.csv",
    MIN_WORDS_IN_POOL=100,        # Minimum words needed
    CACHE_WORD_LIST=True,         # Cache in memory
)

File Paths

data_config = DataConfig(
    ENV_FILE=".env",
    ENV_EXAMPLE_FILE=".env.example",
)

Reproducibility

data_config = DataConfig(
    DEMO_RANDOM_SEED=42,  # Set for reproducible demos
)

Using Configuration

In GameRunner

from config import Config
from orchestrator import GameRunner

config = Config.custom_game(board_size=49)

runner = GameRunner(
    board=board,
    blue_hint_giver=blue_hint,
    blue_guesser=blue_guess,
    red_hint_giver=red_hint,
    red_guesser=red_guess,
    max_turns=config.game.MAX_TURNS,
    verbose=config.orchestrator.VERBOSE_DEFAULT,
    config=config.orchestrator,
    llm_config=config.llm,
)

In Board Creation

from game import Board
from config import Config

config = Config.custom_game(board_size=49)
words = generate_word_list(config.game.BOARD_SIZE)
board = Board(words, config=config.game)

In Benchmarks

from benchmark import BenchmarkRunner
from config import Config

config = Config.default()
config.game.MAX_TURNS = 30  # Shorter games

runner = BenchmarkRunner(
    games_per_combination=5,
    verbose=True
)
runner.config = config

result = runner.run()

Complete Configuration Example

from config import Config, GameConfig, LLMConfig, OrchestratorConfig

# Create custom configuration
config = Config()

# Customize game settings
config.game = GameConfig(
    BOARD_SIZE=25,
    BLUE_WORDS=9,
    RED_WORDS=8,
    NEUTRAL_WORDS=7,
    BOMB_COUNT=1,
    MAX_TURNS=40,
    MAX_GUESSES_PER_TURN=None,
    STARTING_TEAM="BLUE",
    MIN_HINT_COUNT=1,
    MAX_HINT_COUNT=9,
)

# Customize LLM settings
config.llm = LLMConfig(
    DEFAULT_TEMPERATURE=0.7,
    MAX_TOKENS=1024,
    RESPONSE_TIMEOUT=30,
    MAX_RETRIES=3,
    RETRY_DELAY=1.0,
)

# Customize orchestrator
config.orchestrator = OrchestratorConfig(
    VERBOSE_DEFAULT=True,
    SAVE_RESULTS=True,
    RESULTS_PATH="my_results/",
    RESULT_FORMAT="json",
    STRICT_VALIDATION=True,
)

# Validate configuration
config.validate()

# Export to dict
config_dict = config.to_dict()

Configuration Validation

The system automatically validates configurations:
from config import GameConfig

# Valid configuration
valid_config = GameConfig(
    BOARD_SIZE=25,
    BLUE_WORDS=9,
    RED_WORDS=8,
    NEUTRAL_WORDS=7,
    BOMB_COUNT=1,
)
valid_config.validate()  # Returns True

# Invalid configuration
invalid_config = GameConfig(
    BOARD_SIZE=26,  # Even number!
    BLUE_WORDS=10,
    RED_WORDS=10,
    NEUTRAL_WORDS=5,
    BOMB_COUNT=1,
)
invalid_config.validate()  # Raises ValueError
Validation checks:
  • Board size is odd
  • Board size ≥ 9
  • Word counts sum to board size
  • Hint counts are valid

Model-Specific Configuration

Model settings are in model_config.py:
from model_config import MODEL_CONFIGS, get_model_config
from agents.llm import BAMLModel

# Get model configuration
config = get_model_config(BAMLModel.GPT5_MINI)
print(config)  # {'temperature': 0.7}

# O-series models have restrictions
config = get_model_config(BAMLModel.O3_MINI)
print(config)  # {'temperature': 1.0}
See Model Selection for details.

Environment Variables

Configuration also supports environment variables:
# API Keys
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
GOOGLE_API_KEY=AIza...

# Optional: Override defaults
MAX_TURNS=30
VERBOSE=true

Best Practices

Start from the defaults:
config = Config.default()
Only customize when you need non-standard behavior.

Validate any custom configuration before use:
config = Config()
config.game = GameConfig(...)
config.validate()  # Always validate!

Save the configuration alongside the results so runs are reproducible:
result = runner.run()
config_dict = config.to_dict()

# Save together
output = {
    'result': result.to_dict(),
    'config': config_dict,
}

Use Config.custom_game() instead of editing fields by hand:
# Better than manual configuration
mini = Config.custom_game(board_size=9)

# Instead of:
manual = Config()
manual.game.BOARD_SIZE = 9
manual.game.BLUE_WORDS = ...  # Error-prone!

Next Steps

Running Games

Apply your configuration to games

Model Selection

Configure model-specific settings

Benchmarking

Use configuration in benchmarks

Cost Management

Review cost configurations

Build docs developers (and LLMs) love