PufferLib Atari environments

PufferLib provides optimized support for Atari environments, making it easy to train agents on classic Atari games like Breakout, Pong, and Space Invaders. The library includes configurations and wrappers compatible with CleanRL and other popular RL frameworks.

Quick start

Training an Atari agent with PufferLib is straightforward using the built-in configurations:
from pufferlib import pufferl

# Train using the default Atari configuration
# One-call entry point: per the docs below, this handles environment
# creation, policy initialization, and the training loop for the preset.
pufferl.train('puffer_breakout')
This single line handles environment creation, policy initialization, and training loop execution.

Atari configuration

PufferLib includes a comprehensive configuration file for Atari environments at pufferlib/config/atari.ini. Here are the key settings:

Environment settings

[base]
package = atari
env_name = breakout  # Can be any Atari game
policy_name = Policy
rnn_name = Recurrent

[env]
frameskip = 1
repeat_action_probability = 0.0
The configuration supports all classic Atari games including: adventure, alien, asteroids, breakout, enduro, freeway, pacman, pong, qbert, space_invaders, and many more.

Vectorization settings

[vec]
num_envs = 128      # Number of parallel environments
num_workers = 16    # Number of worker processes
batch_size = 64     # Environments returned per async step (the policy-update batch size is set separately in [train])

Training hyperparameters

[train]
batch_size = 8192
minibatch_size = 2048
update_epochs = 1
bptt_horizon = 64
total_timesteps = 10_000_000
anneal_lr = False

CleanRL integration

PufferLib’s vectorization makes CleanRL approximately 65% faster on Atari benchmarks. Here’s how to integrate PufferLib into a CleanRL PPO implementation:
Step 1: Set up the environment

import pufferlib.vector
import pufferlib.environments.atari

# Vectorized Atari: 8 copies of Breakout with 4-frame stacking, stepped in
# parallel worker processes via the Multiprocessing backend.
envs = pufferlib.vector.make(
    pufferlib.environments.atari.env_creator('breakout'),
    env_kwargs=dict(framestack=4),
    backend=pufferlib.vector.Multiprocessing,
    num_envs=8,
)
Step 2: Create the policy network

import torch
import torch.nn as nn
import numpy as np

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize *layer* in place: orthogonal weights with gain ``std``,
    constant bias ``bias_const``. Returns the layer for inline chaining."""
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

class Agent(nn.Module):
    """CNN actor-critic for Atari observations.

    Expects image batches laid out as (B, H, C, W) with C = 4 stacked
    frames; ``_features`` permutes them to NCHW and scales pixel values
    to [0, 1] before the conv trunk (the classic Nature-DQN stack).
    """

    def __init__(self, envs):
        super().__init__()
        self.network = nn.Sequential(
            layer_init(nn.Conv2d(4, 32, 8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            # 64 * 6 * 9 is the conv output size for this env's frame
            # dimensions (consistent with 76x100 inputs -- TODO confirm
            # against the wrapper's observation shape).
            layer_init(nn.Linear(64 * 6 * 9, 512)),
            nn.ReLU(),
        )
        # Small std keeps initial action logits near uniform.
        self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01)
        self.critic = layer_init(nn.Linear(512, 1), std=1)

    def _features(self, x):
        """Shared preprocessing + trunk: (B,H,C,W) -> (B,C,H,W), scale, embed."""
        x = x.permute(0, 2, 1, 3)
        return self.network(x / 255.0)

    def get_value(self, x):
        """Return the critic's value estimate, shape (B, 1)."""
        return self.critic(self._features(x))

    def get_action_and_value(self, x, action=None):
        """Sample an action (or score the given one).

        Returns (action, log_prob, entropy, value); gradients flow through
        all four for PPO-style updates.
        """
        from torch.distributions.categorical import Categorical
        hidden = self._features(x)
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
Step 3: Run the training loop

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(envs).to(device)
optimizer = torch.optim.Adam(agent.parameters(), lr=2.5e-4, eps=1e-5)

next_obs, _ = envs.reset()
# Use torch.as_tensor for every conversion so the observation dtype is the
# same on the first step and inside the loop (the torch.Tensor() constructor
# silently casts to float32, while from_numpy preserves the env's dtype --
# mixing the two made the policy see different dtypes across iterations).
next_obs = torch.as_tensor(next_obs).to(device)

# Training loop (num_steps is assumed defined by the surrounding script).
for step in range(num_steps):
    with torch.no_grad():
        action, logprob, _, value = agent.get_action_and_value(next_obs)

    next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
    next_obs = torch.as_tensor(next_obs).to(device)

    # Process episode statistics -- assumes infos is an iterable of per-env
    # dicts (TODO confirm against the vectorization backend); "episode"
    # appears only when an episode has just finished.
    for item in infos:
        if "episode" in item:
            print(f"episodic_return={item['episode']['r']}")

Custom Atari training

You can customize the Atari training configuration programmatically:
from pufferlib import pufferl
import pufferlib.vector
import pufferlib.ocean

def custom_atari_trainer(env_name='puffer_breakout'):
    """Train an Atari policy with hand-tuned overrides on the stock config.

    Loads the named PufferLib preset, bumps vectorization / model / training
    settings in the returned config dict, then runs the evaluate-train loop
    until the trainer reports completion.
    """
    args = pufferl.load_config(env_name)

    # Customize the configuration
    # NOTE(review): 'vec.num_envs' and 'env.num_envs' appear to control the
    # outer vectorization and the per-process env count respectively --
    # confirm against the pufferlib config schema.
    args['vec']['num_envs'] = 64
    args['env']['num_envs'] = 2048
    # RNN input/hidden sizes are kept in lockstep with the policy hidden size.
    args['policy']['hidden_size'] = 256
    args['rnn']['input_size'] = 256
    args['rnn']['hidden_size'] = 256
    args['train']['total_timesteps'] = 10_000_000
    args['train']['learning_rate'] = 0.03

    # Create environment and policy
    vecenv = pufferl.load_env(env_name, args)
    policy = pufferl.load_policy(args, vecenv, env_name)

    # Initialize trainer
    trainer = pufferl.PuffeRL(args['train'], vecenv, policy)

    # Training loop: one evaluate/train cycle per epoch until done.
    while trainer.epoch < trainer.total_epochs:
        trainer.evaluate()
        logs = trainer.train()

    trainer.print_dashboard()
    trainer.close()

custom_atari_trainer()

Using your own policy

You can define a custom policy for Atari environments:
import torch
import pufferlib.pytorch
from pufferlib import pufferl

class CustomAtariPolicy(torch.nn.Module):
    """Minimal MLP actor-critic: a shared two-layer trunk feeding separate
    action-logit and value heads."""

    def __init__(self, env):
        super().__init__()
        obs_dim = env.single_observation_space.shape[0]
        trunk = [
            pufferlib.pytorch.layer_init(torch.nn.Linear(obs_dim, 128)),
            torch.nn.ReLU(),
            pufferlib.pytorch.layer_init(torch.nn.Linear(128, 128)),
        ]
        self.net = torch.nn.Sequential(*trunk)
        self.action_head = torch.nn.Linear(128, env.single_action_space.n)
        self.value_head = torch.nn.Linear(128, 1)

    def forward_eval(self, observations, state=None):
        """Return (logits, values) for a batch of observations."""
        features = self.net(observations)
        return self.action_head(features), self.value_head(features)

    def forward(self, observations, state=None):
        # Training-time forward is identical to eval for this policy.
        return self.forward_eval(observations, state)

# Use the custom policy
env_name = 'puffer_breakout'
env_creator = pufferlib.ocean.env_creator(env_name)
# NOTE(review): here num_envs=2 is the outer vectorization while
# env_kwargs num_envs=4096 looks like the per-process env count --
# confirm against pufferlib.vector.make's documentation.
vecenv = pufferlib.vector.make(
    env_creator, 
    num_envs=2, 
    num_workers=2, 
    batch_size=1,
    backend=pufferlib.vector.Multiprocessing, 
    env_kwargs={'num_envs': 4096}
)

# driver_env exposes the spaces the policy needs; .cuda() requires a GPU.
policy = CustomAtariPolicy(vecenv.driver_env).cuda()
args = pufferl.load_config('default')
args['train']['env'] = env_name

trainer = pufferl.PuffeRL(args['train'], vecenv, policy)

# Short demo run: 10 evaluate/train cycles.
for epoch in range(10):
    trainer.evaluate()
    logs = trainer.train()

trainer.print_dashboard()
trainer.close()

Performance tips

Use multiprocessing vectorization for maximum throughput:
vecenv = pufferlib.vector.make(
    env_creator,
    backend=pufferlib.vector.Multiprocessing,
    num_envs=128,      # More environments = better GPU utilization
    num_workers=16,    # Match to CPU cores
    batch_size=64,     # Balance between latency and throughput
)
PufferLib’s optimized vectorization can make your Atari training up to 65% faster compared to standard implementations. The multiprocessing backend is recommended for production training runs.

Build docs developers (and LLMs) love