Skip to main content
Discover tasks from the dataset and query completed runs from logs.

discover_tasks()

Query available benchmark tasks from the dataset/ directory.

Function signature

from cooperbench import discover_tasks

tasks = discover_tasks(
    subset: str | None = None,
    repo_filter: str | None = None,
    task_filter: int | None = None,
    features_filter: list[int] | None = None,
) -> list[dict]

Parameters

subset
str | None
default:"None"
Use a predefined task subset (e.g., "lite"). Subsets are defined in dataset/subsets/.
repo_filter
str | None
default:"None"
Filter by repository name (e.g., "llama_index_task").
task_filter
int | None
default:"None"
Filter by specific task ID.
features_filter
list[int] | None
default:"None"
Filter to a specific feature pair (e.g., [1, 2]).

Returns

A list of task dictionaries, each containing the following keys:
repo
str
Repository name
task_id
int
Task identifier
features
list[int]
Feature pair (e.g., [1, 2])

Usage examples

List all tasks

from cooperbench import discover_tasks

tasks = discover_tasks()
print(f"Found {len(tasks)} tasks")

for task in tasks[:5]:
    print(f"{task['repo']}/task{task['task_id']}: features {task['features']}")
Example output:
Found 450 tasks
llama_index_task/task1: features [1, 2]
llama_index_task/task1: features [1, 3]
llama_index_task/task1: features [2, 3]
llama_index_task/task2: features [1, 2]
django_task/task1: features [1, 2]

Filter by subset

# Get tasks from the lite subset
lite_tasks = discover_tasks(subset="lite")
print(f"Lite subset has {len(lite_tasks)} tasks")

Filter by repository

# Get all tasks from a specific repository
llama_tasks = discover_tasks(repo_filter="llama_index_task")

for task in llama_tasks:
    print(f"Task {task['task_id']}: features {task['features']}")

Filter by task ID

# Get all feature pairs for a specific task
task1_pairs = discover_tasks(
    repo_filter="llama_index_task",
    task_filter=1,
)

print(f"Task 1 has {len(task1_pairs)} feature pairs:")
for pair in task1_pairs:
    print(f"  {pair['features']}")

Filter by feature pair

# Find all tasks with features [1, 2]
tasks_with_f1_f2 = discover_tasks(features_filter=[1, 2])

for task in tasks_with_f1_f2:
    print(f"{task['repo']}/task{task['task_id']}")

Combine filters

# Get a very specific task
specific = discover_tasks(
    repo_filter="llama_index_task",
    task_filter=1,
    features_filter=[1, 2],
)

if specific:
    task = specific[0]
    print(f"Found: {task['repo']}/task{task['task_id']} with features {task['features']}")

discover_runs()

Query completed runs from the logs/ directory.

Function signature

from cooperbench import discover_runs

runs = discover_runs(
    run_name: str,
    subset: str | None = None,
    repo_filter: str | None = None,
    task_filter: int | None = None,
    features_filter: list[int] | None = None,
) -> list[dict]

Parameters

run_name
str
required
Name of the run to query (corresponds to the run name used in run()).
subset
str | None
default:"None"
Filter to a specific subset (e.g., "lite").
repo_filter
str | None
default:"None"
Filter by repository name.
task_filter
int | None
default:"None"
Filter by task ID.
features_filter
list[int] | None
default:"None"
Filter to a specific feature pair.

Returns

A list of run dictionaries, each containing the following keys:
repo
str
Repository name
task_id
int
Task identifier
features
list[int]
Feature pair (e.g., [1, 2])
log_dir
str
Path to the run’s log directory
setting
str
Execution mode ("coop" or "solo")

Usage examples

List all completed runs

from cooperbench import discover_runs

runs = discover_runs(run_name="my_experiment")
print(f"Found {len(runs)} completed runs")

for run in runs[:5]:
    print(f"{run['repo']}/task{run['task_id']}: features {run['features']} ({run['setting']} mode)")

Check which tasks are complete

from cooperbench import discover_tasks, discover_runs

# Compare available vs completed
all_tasks = discover_tasks(subset="lite")
completed = discover_runs(run_name="my_experiment", subset="lite")

print(f"Progress: {len(completed)}/{len(all_tasks)} tasks completed")

Find runs to evaluate

from pathlib import Path
from cooperbench import discover_runs

# Find runs that haven't been evaluated yet
runs = discover_runs(run_name="my_experiment")
unevaluated = [
    run for run in runs
    if not (Path(run["log_dir"]) / "eval.json").exists()
]

print(f"{len(unevaluated)} runs need evaluation")

Filter by repository and setting

# Find all cooperative runs from a specific repo
coop_runs = discover_runs(
    run_name="my_experiment",
    repo_filter="llama_index_task",
)

coop_runs = [r for r in coop_runs if r["setting"] == "coop"]
print(f"Found {len(coop_runs)} cooperative runs")

Access run results

import json
from pathlib import Path
from cooperbench import discover_runs

runs = discover_runs(run_name="my_experiment")

for run in runs:
    result_path = Path(run["log_dir"]) / "result.json"
    if result_path.exists():
        with open(result_path) as f:
            result = json.load(f)
        print(f"{run['repo']}/task{run['task_id']}: ${result.get('total_cost', 0):.2f}")

Working with subsets

Subsets are predefined collections of tasks stored in dataset/subsets/. They’re useful for quick testing and benchmarking.

Subset file format

{
  "name": "lite",
  "description": "Lightweight subset for quick testing",
  "tasks": [
    {
      "repo": "llama_index_task",
      "task_id": 1,
      "pairs": [[1, 2], [1, 3]]
    },
    {
      "repo": "django_task",
      "task_id": 5
    }
  ]
}
  • If "pairs" is specified for a task entry, only those feature pairs are included for that task.
  • If "pairs" is omitted for a task entry, all pairwise feature combinations of that task are used.

Load subset programmatically

from cooperbench.runner.tasks import load_subset

subset_data = load_subset("lite")

print(f"Tasks: {subset_data['tasks']}")
print(f"Specific pairs: {subset_data['pairs']}")