Skip to main content
This example demonstrates running a reinforcement learning training loop (CartPole + DQN) inside an isolated OpenSandbox container. The sandbox installs RL dependencies, trains a policy, saves checkpoints, and returns training summaries.

Overview

OpenSandbox provides an ideal environment for RL training:
  • Isolated Execution: Each agent trains in a clean, isolated environment
  • Reproducible Results: Consistent environment across training runs
  • Scalable: Run hundreds of parallel training jobs using BatchSandbox
  • Safe: Contained execution prevents system interference
  • Portable: Train locally or in Kubernetes clusters

Prerequisites

1. Install OpenSandbox

   uv pip install opensandbox opensandbox-server

2. Initialize Server Config

   opensandbox-server init-config ~/.sandbox.toml --example docker

3. Start OpenSandbox Server

   opensandbox-server

Basic RL Training Example

Training Script

Create the training script that will run inside the sandbox:
train.py
"""Train a DQN agent on CartPole-v1 and write a JSON training summary.

Configuration comes from environment variables:
  RL_TIMESTEPS        total training timesteps (default 5000)
  LEARNING_RATE       DQN learning rate (default 1e-3)
  RL_TENSORBOARD_LOG  TensorBoard log directory (default "runs")
"""
import json
import os

import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

# Configuration (all overridable via environment variables).
timesteps = int(os.getenv("RL_TIMESTEPS", "5000"))
# LEARNING_RATE lets batch/heterogeneous runs sweep the learning rate per
# sandbox (the hyperparameter-search manifest sets it per shard).
learning_rate = float(os.getenv("LEARNING_RATE", "1e-3"))
tensorboard_log = os.getenv("RL_TENSORBOARD_LOG", "runs")

# Create environment
env = gym.make("CartPole-v1")

# Initialize DQN agent
model = DQN(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
    learning_rate=learning_rate,
    buffer_size=10000,     # replay buffer capacity
    learning_starts=1000,  # steps collected before learning begins
    batch_size=32,
    train_freq=4,          # gradient update every 4 environment steps
    gradient_steps=1,
)

# Train the agent
model.learn(total_timesteps=timesteps)

# Save checkpoint (stable-baselines3 appends .zip to the path)
os.makedirs("checkpoints", exist_ok=True)
checkpoint_path = "checkpoints/cartpole_dqn"
model.save(checkpoint_path)

# Evaluate the trained policy over a few episodes
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)

# Save summary as JSON so the client can read it back out of the sandbox
summary = {
    "timesteps": timesteps,
    "learning_rate": learning_rate,
    "mean_reward": float(mean_reward),
    "std_reward": float(std_reward),
    "checkpoint_path": f"{checkpoint_path}.zip",
}
with open("training_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print("Training summary:", summary)
env.close()

Requirements File

requirements.txt
gymnasium==0.29.1
stable-baselines3==2.3.2
tensorboard==2.16.2
torch==2.9.1

Python Client

main.py
import asyncio
import os
import textwrap
from datetime import timedelta
from pathlib import Path
from opensandbox import Sandbox
from opensandbox.config import ConnectionConfig

def _load_requirements() -> str:
    requirements_path = Path(__file__).with_name("requirements.txt")
    return requirements_path.read_text(encoding="utf-8")

def _training_script() -> str:
    # Load train.py content
    script_path = Path(__file__).with_name("train.py")
    return script_path.read_text(encoding="utf-8")

def _with_python_env(command: str) -> str:
    return (
        "bash -lc '"
        "source /opt/opensandbox/code-interpreter-env.sh "
        "python ${PYTHON_VERSION:-3.14} >/dev/null "
        "&& "
        f"{command}"
        "'"
    )

async def _print_execution_logs(execution) -> None:
    for msg in execution.logs.stdout:
        print(f"[stdout] {msg.text}")
    for msg in execution.logs.stderr:
        print(f"[stderr] {msg.text}")
    if execution.error:
        print(f"[error] {execution.error.name}: {execution.error.value}")

async def _run_command(sandbox: Sandbox, command: str) -> bool:
    """Run *command* in the sandbox, echo its logs, and report success."""
    result = await sandbox.commands.run(command)
    await _print_execution_logs(result)
    succeeded = result.error is None
    return succeeded

async def main() -> None:
    """Create a sandbox, install RL dependencies, run train.py inside it,
    and print the resulting training summary.
    """
    # Connection/runtime settings, all overridable via environment variables.
    domain = os.getenv("SANDBOX_DOMAIN", "localhost:8080")
    api_key = os.getenv("SANDBOX_API_KEY")
    image = os.getenv(
        "SANDBOX_IMAGE",
        "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/code-interpreter:v1.0.1"
    )
    timesteps = os.getenv("RL_TIMESTEPS", "5000")

    # Generous request timeout: installing torch and training can take minutes.
    config = ConnectionConfig(
        domain=domain,
        api_key=api_key,
        request_timeout=timedelta(minutes=10),
    )

    # Create sandbox with RL environment variables
    # (train.py reads RL_TIMESTEPS inside the container).
    sandbox = await Sandbox.create(
        image,
        connection_config=config,
        env={"RL_TIMESTEPS": timesteps},
    )

    async with sandbox:
        try:
            # Upload requirements
            await sandbox.files.write_file("requirements.txt", _load_requirements())

            # Install dependencies
            print("Installing RL dependencies...")
            install_cmd = _with_python_env(
                "python3 -m pip install --no-cache-dir --break-system-packages -r requirements.txt"
            )
            if not await _run_command(sandbox, install_cmd):
                print("Failed to install RL dependencies.")
                return

            # Upload and run training script
            await sandbox.files.write_file("train.py", _training_script())
            print("\nStarting RL training...")
            train_exec = await sandbox.commands.run(_with_python_env("python3 train.py"))
            await _print_execution_logs(train_exec)

            if train_exec.error:
                print("Training failed inside the sandbox.")
                return

            # Read the summary train.py wrote, before the sandbox is torn down.
            try:
                summary = await sandbox.files.read_file("training_summary.json")
                print("\n=== Training Summary ===")
                print(summary)
            except Exception as exc:
                print(f"\nFailed to read training summary: {exc}")
        finally:
            # NOTE(review): explicit kill() inside `async with` — presumably the
            # context manager's own cleanup tolerates an already-killed sandbox;
            # confirm against the SDK before removing either.
            await sandbox.kill()

if __name__ == "__main__":
    asyncio.run(main())

Run the Example

# Set environment variables (optional)
export SANDBOX_DOMAIN="localhost:8080"
export RL_TIMESTEPS="10000"

# Run the training
uv run python main.py
Expected output:
Installing RL dependencies...
[stdout] Collecting gymnasium==0.29.1
[stdout] Collecting stable-baselines3==2.3.2
...

Starting RL training...
[stdout] ---------------------------------
[stdout] | rollout/           |          |
[stdout] |    ep_len_mean     | 22.5     |
[stdout] |    ep_rew_mean     | 22.5     |
[stdout] | time/              |          |
[stdout] |    total_timesteps | 10000    |
[stdout] ---------------------------------

=== Training Summary ===
{
  "timesteps": 10000,
  "mean_reward": 195.4,
  "std_reward": 12.8,
  "checkpoint_path": "checkpoints/cartpole_dqn.zip"
}

Advanced: Batch RL Training

Scale up to hundreds of parallel training runs using BatchSandbox:

Step 1: Deploy Kubernetes Controller

See Kubernetes Deployment for full setup.

Step 2: Create RL Training Pool

rl-pool.yaml
# Pool of pre-provisioned sandbox pods for RL training workloads.
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: Pool
metadata:
  name: rl-training-pool
  namespace: opensandbox
spec:
  template:
    spec:
      containers:
      - name: sandbox
        image: opensandbox/code-interpreter:v1.0.1
        resources:
          requests:
            memory: "2Gi"    # baseline for CartPole-scale training
            cpu: "1000m"
          limits:
            memory: "4Gi"    # headroom for replay buffers / larger models
            cpu: "2000m"
  # Capacity bounds — presumably bufferMin/Max control the warm-idle sandbox
  # count and poolMin/Max cap the total; confirm against the controller docs.
  capacitySpec:
    bufferMax: 50
    bufferMin: 10
    poolMax: 200
    poolMin: 20
kubectl apply -f rl-pool.yaml

Step 3: Launch Batch Training

rl-batch.yaml
# Launches 100 identical RL training tasks, drawing sandboxes from the pool.
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: BatchSandbox
metadata:
  name: rl-training-batch
  namespace: opensandbox
spec:
  replicas: 100  # Train 100 agents in parallel
  poolRef: rl-training-pool
  taskTemplate:
    spec:
      process:
        command: ["bash"]
        args:
        - "-c"
        # Activate the interpreter env, install RL deps, then train.
        - |
          source /opt/opensandbox/code-interpreter-env.sh &&
          python3 -m pip install gymnasium stable-baselines3 &&
          python3 /workspace/train.py
        env:
        - name: RL_TIMESTEPS  # read by train.py inside the container
          value: "50000"
kubectl apply -f rl-batch.yaml

# Monitor training
kubectl get batchsandbox rl-training-batch -w

Heterogeneous Training

Train different agents or hyperparameters across sandboxes:
heterogeneous-rl.yaml
# Hyperparameter search: four replicas share one task template, and each
# shard patch overrides the environment so every sandbox trains with a
# different learning rate.
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: BatchSandbox
metadata:
  name: hyperparameter-search
  namespace: opensandbox
spec:
  replicas: 4  # one sandbox per LEARNING_RATE value below
  poolRef: rl-training-pool
  taskTemplate:
    spec:
      process:
        # NOTE(review): unlike the batch example above, this command installs
        # no dependencies — assumes the pool image ships with them pre-built.
        # LEARNING_RATE also only takes effect if train.py reads it from env.
        command: ["python3"]
        args: ["/workspace/train.py"]
  # Per-shard overrides — presumably matched to replicas by index; verify
  # against the controller documentation.
  shardTaskPatches:
  - spec:
      process:
        env:
        - name: LEARNING_RATE
          value: "1e-3"
        - name: RL_TIMESTEPS
          value: "50000"
  - spec:
      process:
        env:
        - name: LEARNING_RATE
          value: "1e-4"
        - name: RL_TIMESTEPS
          value: "50000"
  - spec:
      process:
        env:
        - name: LEARNING_RATE
          value: "5e-4"
        - name: RL_TIMESTEPS
          value: "50000"
  - spec:
      process:
        env:
        - name: LEARNING_RATE
          value: "1e-5"
        - name: RL_TIMESTEPS
          value: "50000"

TensorBoard Integration

Visualize training metrics with TensorBoard:
tensorboard_example.py
async def setup_tensorboard(sandbox: Sandbox) -> None:
    """Run training, then expose its TensorBoard logs on port 6006 in the sandbox."""
    # Training writes TensorBoard event files under runs/ (train.py's default
    # RL_TENSORBOARD_LOG directory).
    await sandbox.commands.run(
        _with_python_env("python3 train.py")
    )

    # Start TensorBoard server
    # NOTE(review): `nohup ... &` plus background=True looks redundant — confirm
    # which mechanism the SDK expects for long-running processes.
    await sandbox.commands.run(
        "nohup tensorboard --logdir runs --host 0.0.0.0 --port 6006 &",
        background=True
    )

    print("TensorBoard available at http://<sandbox-ip>:6006")
Use Kubernetes port-forwarding to access TensorBoard:
kubectl port-forward pod/<sandbox-pod> 6006:6006
Then open http://localhost:6006

Checkpoint Management

Save and retrieve trained models:
checkpoint_management.py
async def save_checkpoint(sandbox: Sandbox, local_path: str) -> None:
    """Download the CartPole DQN checkpoint from the sandbox to *local_path*."""
    # Pull the zipped model out of the sandbox filesystem as raw bytes.
    payload = await sandbox.files.read_file(
        "checkpoints/cartpole_dqn.zip",
        binary=True
    )

    # Persist it on the local machine.
    with open(local_path, "wb") as out:
        out.write(payload)

    print(f"Checkpoint saved to {local_path}")

async def load_checkpoint(sandbox: Sandbox, local_path: str) -> None:
    """Upload a locally stored checkpoint into the sandbox."""
    # Slurp the local checkpoint into memory.
    with open(local_path, "rb") as src:
        payload = src.read()

    # Place it where the training/evaluation code expects checkpoints.
    await sandbox.files.write_file(
        "checkpoints/cartpole_dqn.zip",
        payload,
        binary=True
    )

    print(f"Checkpoint loaded from {local_path}")

Environment Variables

| Variable | Description | Default |
| --- | --- | --- |
| SANDBOX_DOMAIN | Sandbox service address | localhost:8080 |
| SANDBOX_API_KEY | API key for authentication | None |
| SANDBOX_IMAGE | Docker image to use | opensandbox/code-interpreter:v1.0.1 |
| RL_TIMESTEPS | Training timesteps | 5000 |
| RL_TENSORBOARD_LOG | TensorBoard log directory | runs |
| LEARNING_RATE | Learning rate | 1e-3 |

Performance Tips

  • Use pooled sandboxes for faster startup
  • Pre-install dependencies in custom images
  • Increase train_freq and gradient_steps for faster learning
  • Use GPU-enabled sandbox images for deep RL
  • Use BatchSandbox for 100+ parallel agents
  • Set appropriate pool buffer sizes
  • Monitor cluster resources and autoscale
  • Use heterogeneous tasks for hyperparameter search
  • Save checkpoints periodically during training
  • Use sandbox file system for intermediate results
  • Download final checkpoints to persistent storage
  • Implement checkpoint rotation for long training runs
  • Use TensorBoard for real-time metrics
  • Log training summaries to JSON files
  • Track reward curves and loss values
  • Set up alerts for failed training runs

Common Patterns

Population-Based Training

pbt_example.py
async def population_based_training(
    population_size: int = 10,
    generations: int = 5
) -> None:
    """Train a population of agents with evolutionary selection.

    Illustrative pseudo-code: `create_batch_sandbox`, `generate_hyperparameters`,
    `wait_for_completion`, `collect_results`, `select_top_performers`, and
    `delete_batch_sandbox` are placeholders for the reader to implement.
    """
    for generation in range(generations):
        # Create batch of sandboxes, one per population member, each with its
        # own sampled hyperparameters.
        batch = await create_batch_sandbox(
            replicas=population_size,
            task_patches=generate_hyperparameters()
        )

        # Wait for training completion
        await wait_for_completion(batch)

        # Evaluate and select best agents
        # NOTE(review): `best_agents` is never used here — a full PBT loop would
        # seed the next generation's hyperparameters/checkpoints from it.
        results = await collect_results(batch)
        best_agents = select_top_performers(results, top_k=5)

        # Clean up batch
        await delete_batch_sandbox(batch)

Distributed PPO

distributed_ppo.py
async def distributed_ppo(
    num_workers: int = 16,
    timesteps_per_worker: int = 10000
) -> None:
    """Run distributed PPO with multiple worker sandboxes.

    Illustrative pseudo-code: `create_batch_sandbox`,
    `gather_worker_experiences`, and `update_policy` are placeholders for the
    reader to implement, as is the `ppo_worker.py` script run in each sandbox.
    """
    # Create batch of worker sandboxes, each running the worker script with
    # its per-worker timestep budget passed via the environment.
    workers = await create_batch_sandbox(
        replicas=num_workers,
        task_template={
            "command": ["python3"],
            "args": ["ppo_worker.py"],
            "env": {"TIMESTEPS": str(timesteps_per_worker)}
        }
    )

    # Collect experiences from all workers
    experiences = await gather_worker_experiences(workers)

    # Update policy
    await update_policy(experiences)

Troubleshooting

Problem: pip install fails inside the sandbox. Solution:
  • Use --break-system-packages flag
  • Try alternative installation methods (apt, apk)
  • Pre-build custom image with dependencies
Problem: Sandbox crashes during training. Solution:
  • Increase memory limits in pool spec
  • Reduce buffer size or batch size
  • Use smaller models or environments
  • Monitor memory usage during training
Problem: Cannot find checkpoint files. Solution:
  • Verify checkpoint directory exists
  • Check file permissions in sandbox
  • Use absolute paths for checkpoint saving
  • Read files before sandbox termination

Next Steps

Batch Sandboxes

Learn batch sandbox patterns

Kubernetes Deployment

Deploy on Kubernetes

Python SDK

Python SDK reference

API Reference

Complete API documentation

Build docs developers (and LLMs) love