Environments
CooperBench uses Docker-based environments to provide isolated, reproducible task execution. Each task runs in a containerized environment with the repository code and test infrastructure.

Environment structure

Each task has a corresponding Docker image that includes:
  • Base repository code (before features are implemented)
  • Python/Node.js/other runtime dependencies
  • Test runner script (/usr/local/bin/runner.sh)
  • Git repository setup

Image naming convention

Images follow the pattern:
cooperbench/{repo_name}:task{task_id}
Examples:
  • cooperbench/llama_index_task:task1
  • cooperbench/django_task:task5
  • cooperbench/flask_task:task3

Getting image names

from cooperbench.utils import get_image_name

# Resolve the Docker image tag for task 1 of the llama_index repo.
image_name = get_image_name("llama_index_task", 1)
print(image_name)  # -> "cooperbench/llama_index_task:task1"

Working with environments

Create a sandbox

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

# Resolve the task image, then spin up a sandbox on the Modal backend.
task_image = get_image_name("llama_index_task", 1)
modal_backend = get_backend("modal")

sandbox = modal_backend.create_sandbox(
    image=task_image,
    timeout=600,
    workdir="/workspace",
)

try:
    # The sandbox is live at this point; run a quick directory listing.
    listing = sandbox.exec("ls", "-la")
    print(listing.stdout_read())
finally:
    # Always release the sandbox, even if the command above raised.
    sandbox.terminate()

Execute commands

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

docker_backend = get_backend("docker")
sandbox = docker_backend.create_sandbox(
    get_image_name("llama_index_task", 1), timeout=300
)

try:
    # Report the interpreter version inside the container.
    version = sandbox.exec("python", "--version")
    print(version.stdout_read())

    # Enumerate every Python source file under the workspace.
    found = sandbox.exec("find", "/workspace", "-type", "f", "-name", "*.py")
    files = found.stdout_read().strip().split("\n")
    print(f"Found {len(files)} Python files")

    # Confirm the test runner is installed.
    pytest_info = sandbox.exec("pytest", "--version")
    print(pytest_info.stdout_read())
finally:
    sandbox.terminate()

Apply patches

import base64
from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

def write_patch(sandbox, filename: str, content: str):
    """Write *content* to *filename* inside the sandbox.

    The content is base64-encoded locally and decoded inside the
    container, so arbitrary patch text survives shell quoting.

    Args:
        sandbox: Sandbox object exposing ``exec(*argv)``.
        filename: Destination path inside the sandbox.
        content: File contents (e.g. a unified diff) to write.
    """
    encoded = base64.b64encode(content.encode()).decode()
    # Fix: redirect into the requested filename (the target path was
    # previously garbled, so the patch was never written where expected).
    sandbox.exec("bash", "-c", f"echo {encoded} | base64 -d > {filename}")

modal_backend = get_backend("modal")
sandbox = modal_backend.create_sandbox(
    get_image_name("llama_index_task", 1), timeout=600
)

try:
    # The patch to upload: adds a print statement to main().
    patch_content = """
diff --git a/src/main.py b/src/main.py
index 123456..789abc 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,3 +1,4 @@
 def main():
+    print("Hello, world!")
     pass
"""
    write_patch(sandbox, "feature.patch", patch_content)

    # Apply the patch inside the sandbox and report the outcome.
    outcome = sandbox.exec("git", "apply", "feature.patch")
    if outcome.returncode != 0:
        print(f"Patch failed: {outcome.stderr_read()}")
    else:
        print("Patch applied successfully")
finally:
    sandbox.terminate()

Run tests

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

modal_backend = get_backend("modal")
sandbox = modal_backend.create_sandbox(
    get_image_name("llama_index_task", 1), timeout=600
)

try:
    # runner.sh is baked into every task image: it applies the given
    # patches in order and then runs the test suite.
    run = sandbox.exec(
        "bash", "/usr/local/bin/runner.sh", "tests.patch", "feature.patch"
    )

    output = run.stdout_read() + run.stderr_read()
    print(output)

    # Crude pass/fail check on the combined runner output.
    print("Tests passed!" if "passed" in output.lower() else "Tests failed")
finally:
    sandbox.terminate()

Test runner script

Each environment includes /usr/local/bin/runner.sh which:
  1. Applies patches in order (tests first, then features)
  2. Runs the test suite
  3. Reports results
Usage:
# Apply tests patch only
/usr/local/bin/runner.sh tests.patch

# Apply tests and feature patches
/usr/local/bin/runner.sh tests.patch feature.patch

# Apply tests and multiple feature patches
/usr/local/bin/runner.sh tests.patch feature1.patch feature2.patch

Parsing test output

import re

def parse_test_results(output: str) -> dict:
    """Parse pytest terminal output for pass/fail/error counts.

    Scans for the pytest summary line, e.g.
    ``"5 passed, 2 failed in 1.23s"``.

    Args:
        output: Combined stdout/stderr text from a pytest run.

    Returns:
        Dict with ``passed``, ``failed``, ``errors``, ``total``
        (passed + failed, as before) and ``success`` — true only when
        at least one test passed and nothing failed or errored.
    """
    def _count(label: str) -> int:
        # pytest prints e.g. "2 failed", "1 error", "3 errors".
        match = re.search(rf"(\d+)\s+{label}", output)
        return int(match.group(1)) if match else 0

    passed = _count("passed")
    failed = _count("failed")
    # Collection/setup errors were previously ignored, which let a
    # broken run with "N errors" be reported as a success.
    errors = _count("error")

    return {
        "passed": passed,
        "failed": failed,
        "errors": errors,
        "total": passed + failed,
        "success": failed == 0 and errors == 0 and passed > 0,
    }

# Use it
from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

modal_backend = get_backend("modal")
sandbox = modal_backend.create_sandbox(
    get_image_name("llama_index_task", 1), timeout=600
)

try:
    test_run = sandbox.exec("pytest", "tests/", "-v")
    combined = test_run.stdout_read() + test_run.stderr_read()

    # Summarize the run via the parser defined above.
    results = parse_test_results(combined)
    print(f"Results: {results['passed']}/{results['total']} tests passed")
finally:
    sandbox.terminate()

Environment variables

Environments can be configured with environment variables:
# Note: This is backend-specific
# For Docker backend:
from cooperbench.eval.backends.docker import DockerBackend
import docker

docker_client = docker.from_env()
# Environment variables injected into the container process.
env_vars = {
    "PYTHONPATH": "/workspace/src",
    "DEBUG": "1",
}
# With detach=False the call blocks until the command exits and
# returns its logs; remove=True cleans the container up afterwards.
container = docker_client.containers.run(
    image="cooperbench/llama_index_task:task1",
    command="python --version",
    environment=env_vars,
    remove=True,
    detach=False,
)

Building custom environments

Dockerfile structure

Create a Dockerfile for your custom task:
# Slim Python base keeps the task image small.
FROM python:3.11-slim

# Install dependencies
# git is required so runner.sh can `git apply` patches.
RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set up workspace
WORKDIR /workspace

# Copy repository code
# `repo/` holds the base repository state (before features are implemented).
COPY repo/ /workspace/

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Set up git repository
# A committed baseline lets patches apply cleanly against a known state.
RUN git init && \
    git config user.name "CooperBench" && \
    git config user.email "test@cooperbench" && \
    git add . && \
    git commit -m "Initial commit"

# Copy test runner script
# Installed at the path the backends expect: /usr/local/bin/runner.sh
COPY runner.sh /usr/local/bin/runner.sh
RUN chmod +x /usr/local/bin/runner.sh

# Default command
CMD ["/bin/bash"]

Test runner script

Create runner.sh:
#!/bin/bash
# runner.sh — apply patches in order (tests first, then features),
# then run the test suite.
# Usage: runner.sh tests.patch [feature1.patch feature2.patch ...]
set -euo pipefail

# Apply patches in order
for patch in "$@"; do
    if [ ! -f "$patch" ]; then
        # Fail loudly instead of silently skipping a missing patch,
        # which would let a broken invocation look like a clean run.
        echo "Error: patch file not found: $patch" >&2
        exit 1
    fi
    echo "Applying $patch..."
    git apply "$patch" || exit 1
done

# Run tests
echo "Running tests..."
pytest tests/ -v --tb=short

Build and push image

# Build the image (run from the directory containing the Dockerfile)
docker build -t cooperbench/my_task:task1 .

# Smoke-test locally before pushing
docker run --rm cooperbench/my_task:task1 python --version

# Push to the registry so remote backends (e.g. Modal) can pull it
docker push cooperbench/my_task:task1

Use custom image

from cooperbench.eval.backends import get_backend

docker_backend = get_backend("docker")
# Custom images are referenced by tag, exactly like the built-in ones.
sandbox = docker_backend.create_sandbox(
    image="cooperbench/my_task:task1",
    timeout=600,
)

try:
    version = sandbox.exec("python", "--version")
    print(version.stdout_read())
finally:
    sandbox.terminate()

Advanced usage

Multi-step execution

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

modal_backend = get_backend("modal")
sandbox = modal_backend.create_sandbox(
    get_image_name("llama_index_task", 1), timeout=900
)

try:
    # Extra tooling not baked into the task image.
    sandbox.exec("pip", "install", "black", "mypy")

    # Step 1: format the sources in place.
    sandbox.exec("black", "/workspace/src")
    print("Formatted code")

    # Step 2: type-check; only report on a clean run.
    mypy_run = sandbox.exec("mypy", "/workspace/src")
    if mypy_run.returncode == 0:
        print("Type checking passed")

    # Step 3: run the test suite and show its output.
    test_run = sandbox.exec("pytest", "tests/", "-v")
    print(test_run.stdout_read())
finally:
    sandbox.terminate()

Copy files from sandbox

# Note: This is backend-specific
# For Docker backend, you can copy files out:
import docker
import tarfile
import io

client = docker.from_env()
# Keep the container alive long enough to copy files out of it.
container = client.containers.run(
    image="cooperbench/llama_index_task:task1",
    command="sleep 300",
    detach=True,
)

try:
    # get_archive streams a tar archive containing the requested path.
    bits, stat = container.get_archive("/workspace/output.txt")
    stream = io.BytesIO(b"".join(bits))
    # Fix: close the tar handle deterministically (it was leaked before),
    # and fail with a clear error if the member is missing instead of an
    # AttributeError on None.
    with tarfile.open(fileobj=stream) as tar:
        member = tar.extractfile("output.txt")
        if member is None:
            raise FileNotFoundError("output.txt not found in archive")
        content = member.read()
    print(content.decode())
finally:
    container.stop()
    container.remove()

Environment debugging

from cooperbench.eval.backends import get_backend
from cooperbench.utils import get_image_name

def debug_environment(repo: str, task_id: int):
    """Print environment info for a task."""
    backend = get_backend("docker")
    sandbox = backend.create_sandbox(get_image_name(repo, task_id), timeout=300)

    # Each probe is (output prefix, command argv); run them in order and
    # print the prefix followed by the command's stdout.
    probes = [
        ("System: ", ("uname", "-a")),
        ("Python: ", ("python", "--version")),
        ("Packages:\n", ("pip", "list")),
        ("Git status:\n", ("git", "status")),
        ("Files:\n", ("tree", "/workspace", "-L", "2")),
    ]
    try:
        for prefix, argv in probes:
            result = sandbox.exec(*argv)
            print(prefix + result.stdout_read())
    finally:
        sandbox.terminate()

# Use it
debug_environment("llama_index_task", 1)

Best practices

Always cleanup sandboxes

from cooperbench.eval.backends import get_backend

modal_backend = get_backend("modal")
sandbox = modal_backend.create_sandbox("cooperbench/llama_index_task:task1")

try:
    # ... do work with the sandbox here ...
    pass
finally:
    # terminate() runs whether the body succeeded or raised,
    # so the sandbox is never leaked.
    sandbox.terminate()

Handle timeouts gracefully

from cooperbench.eval.backends import get_backend

modal_backend = get_backend("modal")

# Give the sandbox a generous lifetime budget: 600 s = 10 minutes.
sandbox = modal_backend.create_sandbox(
    image="cooperbench/llama_index_task:task1",
    timeout=600,
)

try:
    # The per-test timeout is stricter than the sandbox-level one.
    result = sandbox.exec("pytest", "tests/", "--timeout=300")
    # ... handle result ...
except Exception as e:
    print(f"Execution timed out or failed: {e}")
finally:
    sandbox.terminate()

Reuse images

from cooperbench.utils import get_image_name
from cooperbench.eval.backends import get_backend

# Resolve the image tag once and reuse it for every sandbox.
cached_image = get_image_name("llama_index_task", 1)
modal_backend = get_backend("modal")

# Spin up several short-lived sandboxes from the same image.
for _ in range(5):
    sandbox = modal_backend.create_sandbox(cached_image, timeout=300)
    try:
        # ... use the sandbox ...
        pass
    finally:
        sandbox.terminate()