Skip to main content

Overview

Agones is built on Kubernetes custom resources and controllers. You can extend Agones by creating custom controllers that react to GameServer, Fleet, or GameServerAllocation events to implement custom logic.

Understanding Agones Controllers

Controller Architecture

Agones uses the Kubernetes controller pattern with informers and work queues:
// From pkg/gameservers/controller.go:78-108
// Controller is the built-in Agones GameServer controller (quoted from
// pkg/gameservers/controller.go). It follows the standard Kubernetes
// controller pattern: informers/listers feed a rate-limited work queue,
// and sub-controllers handle health checking and node migration.
type Controller struct {
    baseLogger               *logrus.Entry
    controllerHooks          cloudproduct.ControllerHooksInterface
    sidecarImage             string
    // Pod accessors: the controller creates and inspects the Pods that back
    // each GameServer.
    podGetter                typedcorev1.PodsGetter
    podLister                corelisterv1.PodLister
    podSynced                cache.InformerSynced
    // GameServer accessors: Getter writes to the API server, Lister reads
    // from the local informer cache, Synced reports cache readiness.
    gameServerGetter         getterv1.GameServersGetter
    gameServerLister         listerv1.GameServerLister
    gameServerSynced         cache.InformerSynced
    nodeLister               corelisterv1.NodeLister
    nodeSynced               cache.InformerSynced
    // portAllocator hands out host ports for GameServers.
    portAllocator            portallocator.Interface
    healthController         *HealthController
    migrationController      *MigrationController
    // workerqueue processes reconcile keys asynchronously with rate limiting.
    workerqueue              *workerqueue.WorkerQueue
    recorder                 record.EventRecorder
}

Key Components

  1. Informers: Watch Kubernetes resources and cache them locally
  2. Listers: Efficiently query cached resources
  3. Work Queues: Process events asynchronously
  4. Reconcilers: Implement the core business logic

Building a Custom Controller

Example: Auto-Restart Controller

Let’s build a controller that automatically restarts unhealthy game servers after a cooldown period.

Step 1: Define the Controller Structure

package customcontrollers

import (
    "context"
    "fmt"
    "time"

    agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    "agones.dev/agones/pkg/client/clientset/versioned"
    "agones.dev/agones/pkg/client/informers/externalversions"
    listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
    "agones.dev/agones/pkg/util/runtime"
    "github.com/sirupsen/logrus"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/cache"
    "k8s.io/client-go/util/workqueue"
)

// AutoRestartController deletes GameServers that are still Unhealthy after a
// cooldown period, relying on the owning Fleet/GameServerSet to recreate them.
type AutoRestartController struct {
    logger           *logrus.Entry
    agonesClient     versioned.Interface       // used to issue Delete calls against the API server
    gameServerLister listerv1.GameServerLister // read path: queries the local informer cache
    gameServerSynced cache.InformerSynced      // reports whether the cache has completed its initial sync
    workqueue        workqueue.RateLimitingInterface
    cooldownPeriod   time.Duration // delay between observing Unhealthy and acting on it
}

// NewAutoRestartController wires an AutoRestartController to the shared Agones
// informer factory and registers its event handlers. The factory must be
// started (and its caches synced) separately before Run is called.
func NewAutoRestartController(
    agonesClient versioned.Interface,
    agonesInformerFactory externalversions.SharedInformerFactory,
    cooldownPeriod time.Duration,
) *AutoRestartController {
    informer := agonesInformerFactory.Agones().V1().GameServers()

    // A rate-limited queue makes repeated failures back off instead of
    // hammering the API server.
    queue := workqueue.NewNamedRateLimitingQueue(
        workqueue.DefaultControllerRateLimiter(),
        "AutoRestart",
    )

    c := &AutoRestartController{
        logger:           runtime.NewLoggerWithType(&AutoRestartController{}),
        agonesClient:     agonesClient,
        gameServerLister: informer.Lister(),
        gameServerSynced: informer.Informer().HasSynced,
        workqueue:        queue,
        cooldownPeriod:   cooldownPeriod,
    }

    // Only state *transitions* matter to this controller, so an UpdateFunc
    // is the only handler we need.
    informer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        UpdateFunc: c.handleGameServerUpdate,
    })

    return c
}

Step 2: Implement Event Handlers

// handleGameServerUpdate is the informer UpdateFunc. It enqueues a GameServer
// (after the cooldown) exactly once per transition into the Unhealthy state.
func (c *AutoRestartController) handleGameServerUpdate(oldObj, newObj interface{}) {
    // Informer handlers can receive unexpected types (e.g. tombstone
    // objects); a bare type assertion would panic and crash the whole
    // controller, so guard with comma-ok assertions.
    oldGs, ok := oldObj.(*agonesv1.GameServer)
    if !ok {
        c.logger.Errorf("expected *GameServer in update handler, got %T", oldObj)
        return
    }
    newGs, ok := newObj.(*agonesv1.GameServer)
    if !ok {
        c.logger.Errorf("expected *GameServer in update handler, got %T", newObj)
        return
    }

    // Act only on the edge transition into Unhealthy, not on every resync of
    // an already-unhealthy GameServer (that would re-enqueue endlessly).
    if oldGs.Status.State != agonesv1.GameServerStateUnhealthy &&
        newGs.Status.State == agonesv1.GameServerStateUnhealthy {
        c.logger.WithField("gameserver", newGs.Name).Info("GameServer became unhealthy")
        c.enqueue(newGs)
    }
}

// enqueue schedules the GameServer's namespace/name key for reconciliation
// once the cooldown period has elapsed.
func (c *AutoRestartController) enqueue(gs *agonesv1.GameServer) {
    key, err := cache.MetaNamespaceKeyFunc(gs)
    if err != nil {
        // A GameServer without valid object meta cannot be keyed; skip it.
        c.logger.WithError(err).Error("Failed to get key for GameServer")
        return
    }
    // AddAfter implements the cooldown: the key only becomes visible to
    // workers after the delay.
    c.workqueue.AddAfter(key, c.cooldownPeriod)
}

Step 3: Implement the Reconciliation Logic

// Run starts the controller: it blocks until the informer cache has synced,
// spawns the worker goroutines, and then waits for ctx to be cancelled.
// The deferred ShutDown causes the workers to drain and exit.
func (c *AutoRestartController) Run(ctx context.Context, workers int) error {
    defer c.workqueue.ShutDown()

    c.logger.Info("Starting AutoRestart controller")

    // Never reconcile against an unsynced cache: the lister would report
    // objects as missing that actually exist.
    if ok := cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced); !ok {
        return fmt.Errorf("failed to wait for caches to sync")
    }

    for w := 0; w < workers; w++ {
        go c.runWorker(ctx)
    }
    c.logger.Info("Started workers")

    // Block until shutdown is requested.
    <-ctx.Done()
    c.logger.Info("Shutting down workers")
    return nil
}

// runWorker is a long-running loop that drains the workqueue until the queue
// reports shutdown.
func (c *AutoRestartController) runWorker(ctx context.Context) {
    for {
        if !c.processNextWorkItem(ctx) {
            return
        }
    }
}

// processNextWorkItem pops one key off the queue and reconciles it. It
// returns false only when the queue has been shut down.
func (c *AutoRestartController) processNextWorkItem(ctx context.Context) bool {
    obj, shutdown := c.workqueue.Get()
    if shutdown {
        return false
    }
    // Done must always be called so the queue stops tracking this item.
    defer c.workqueue.Done(obj)

    key, ok := obj.(string)
    if !ok {
        // A non-string item can never be processed; drop it from the rate
        // limiter entirely rather than retrying forever.
        c.workqueue.Forget(obj)
        c.logger.Error(fmt.Errorf("expected string in workqueue but got %#v", obj))
        return true
    }

    if err := c.syncHandler(ctx, key); err != nil {
        // Requeue with backoff so transient failures are retried.
        c.workqueue.AddRateLimited(key)
        c.logger.Error(fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()))
        return true
    }

    // Success: reset this key's rate-limiting history.
    c.workqueue.Forget(obj)
    c.logger.Infof("Successfully synced '%s'", key)
    return true
}

// syncHandler reconciles a single queued key: if the named GameServer still
// exists and is still Unhealthy after the cooldown, it is deleted so that its
// owning Fleet/GameServerSet replaces it with a fresh instance.
func (c *AutoRestartController) syncHandler(ctx context.Context, key string) error {
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        return fmt.Errorf("invalid resource key: %s", key)
    }

    // Read from the lister (local informer cache), not the API server.
    gs, err := c.gameServerLister.GameServers(namespace).Get(name)
    if err != nil {
        if errors.IsNotFound(err) {
            c.logger.Infof("GameServer %s no longer exists", key)
            return nil
        }
        return err
    }

    // The GameServer may have recovered while the key sat in the cooldown
    // queue; in that case there is nothing to do.
    if gs.Status.State != agonesv1.GameServerStateUnhealthy {
        c.logger.WithField("gameserver", name).Info("GameServer recovered, skipping restart")
        return nil
    }

    // Delete to trigger recreation by the Fleet/GameServerSet. Pin the delete
    // to the cached object's UID: if the lister is stale and a *new*
    // GameServer with the same name has already replaced this one, the
    // precondition fails instead of deleting the healthy replacement.
    c.logger.WithField("gameserver", name).Info("Deleting unhealthy GameServer")
    uid := gs.ObjectMeta.UID
    err = c.agonesClient.AgonesV1().GameServers(namespace).Delete(
        ctx,
        name,
        metav1.DeleteOptions{Preconditions: &metav1.Preconditions{UID: &uid}},
    )
    if err != nil {
        // Already gone, or replaced by an object with a different UID:
        // either way the unhealthy instance no longer needs deleting.
        if errors.IsNotFound(err) || errors.IsConflict(err) {
            return nil
        }
        return fmt.Errorf("failed to delete GameServer: %w", err)
    }

    return nil
}

Step 4: Deploy the Controller

package main

import (
    "context"
    "flag"
    "os"
    "os/signal"
    "syscall"
    "time"

    "agones.dev/agones/pkg/client/clientset/versioned"
    "agones.dev/agones/pkg/client/informers/externalversions"
    "k8s.io/client-go/tools/clientcmd"
)

// main wires together configuration, the Agones client, the shared informer
// factory, and the AutoRestartController, then runs until SIGINT/SIGTERM.
func main() {
    var (
        kubeconfig string
        cooldown   time.Duration
    )
    flag.StringVar(&kubeconfig, "kubeconfig", "", "path to kubeconfig")
    flag.DurationVar(&cooldown, "cooldown", 5*time.Minute, "cooldown before restart")
    flag.Parse()

    // An empty kubeconfig path makes client-go fall back to in-cluster config.
    cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
    if err != nil {
        panic(err)
    }

    client, err := versioned.NewForConfig(cfg)
    if err != nil {
        panic(err)
    }

    // The 30s resync period periodically re-delivers cached objects as
    // update events.
    factory := externalversions.NewSharedInformerFactory(client, time.Second*30)

    // Create the controller BEFORE starting the factory so its event
    // handlers are registered ahead of the initial List/Watch.
    controller := NewAutoRestartController(client, factory, cooldown)

    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    factory.Start(ctx.Done())

    // Translate SIGINT/SIGTERM into context cancellation so Run returns and
    // the controller shuts down cleanly.
    sigCh := make(chan os.Signal, 1)
    signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
    go func() {
        <-sigCh
        cancel()
    }()

    if err := controller.Run(ctx, 2); err != nil {
        panic(err)
    }
}

Deployment Manifest

apiVersion: apps/v1
kind: Deployment
metadata:
  name: autorestart-controller
  namespace: agones-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: autorestart-controller
  template:
    metadata:
      labels:
        app: autorestart-controller
    spec:
      serviceAccountName: autorestart-controller
      containers:
      - name: controller
        image: my-registry/autorestart-controller:latest
        args:
        - --cooldown=5m
        resources:
          requests:
            cpu: 50m
            memory: 64Mi
          limits:
            cpu: 100m
            memory: 128Mi
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: autorestart-controller
  namespace: agones-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: autorestart-controller
rules:
- apiGroups: ["agones.dev"]
  resources: ["gameservers"]
  verbs: ["get", "list", "watch", "delete"]
- apiGroups: [""]
  resources: ["events"]
  verbs: ["create", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: autorestart-controller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: autorestart-controller
subjects:
- kind: ServiceAccount
  name: autorestart-controller
  namespace: agones-system

Advanced Patterns

Watching Multiple Resources

// Watch both GameServers and Fleets
// Watch both GameServers and Fleets: each informer gets its own handler, and
// both handlers typically funnel keys into the same workqueue so one set of
// workers reconciles events from either resource.
gameServerInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    AddFunc: controller.handleGameServerAdd,
})

fleetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    UpdateFunc: controller.handleFleetUpdate,
})

Using Owner References

import "k8s.io/apimachinery/pkg/apis/meta/v1"

// Create a resource owned by a GameServer
obj := &CustomResource{
    ObjectMeta: metav1.ObjectMeta{
        Name:      "custom-" + gs.Name,
        Namespace: gs.Namespace,
        OwnerReferences: []metav1.OwnerReference{
            *metav1.NewControllerRef(gs, agonesv1.SchemeGroupVersion.WithKind("GameServer")),
        },
    },
}

Metrics and Monitoring

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // restartsTotal counts restarts performed by this controller, labelled by
    // namespace and fleet. promauto registers it with the default Prometheus
    // registry at package init.
    restartsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "autorestart_gameservers_total",
            Help: "Total number of GameServers restarted",
        },
        []string{"namespace", "fleet"},
    )
)

// recordRestart increments the restart counter for the GameServer's namespace
// and owning fleet (the fleet label is empty for fleet-less GameServers).
func (c *AutoRestartController) recordRestart(gs *agonesv1.GameServer) {
    restartsTotal.WithLabelValues(gs.Namespace, gs.Labels["agones.dev/fleet"]).Inc()
}

Testing Custom Controllers

Unit Testing

package customcontrollers

import (
    "context"
    "testing"
    "time"

    agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    "agones.dev/agones/pkg/client/clientset/versioned/fake"
    "agones.dev/agones/pkg/client/informers/externalversions"
    "github.com/stretchr/testify/assert"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// TestAutoRestartController verifies that syncHandler deletes a GameServer
// that is still Unhealthy when the cooldown fires.
func TestAutoRestartController(t *testing.T) {
    // Fake client backed by an in-memory object tracker.
    agonesClient := fake.NewSimpleClientset()

    // Zero resync: the informer only does its initial List/Watch.
    informerFactory := externalversions.NewSharedInformerFactory(
        agonesClient,
        0,
    )

    controller := NewAutoRestartController(
        agonesClient,
        informerFactory,
        1*time.Second,
    )

    // Create the unhealthy GameServer BEFORE starting the informers so it
    // appears in the informer's initial List.
    gs := &agonesv1.GameServer{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "test-gs",
            Namespace: "default",
        },
        Status: agonesv1.GameServerStatus{
            State: agonesv1.GameServerStateUnhealthy,
        },
    }

    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    _, err := agonesClient.AgonesV1().GameServers("default").Create(
        ctx,
        gs,
        metav1.CreateOptions{},
    )
    assert.NoError(t, err)

    // Start the informers and wait for the cache to sync: syncHandler reads
    // through the lister, which is empty until the informer has populated it.
    // (Without this, the lister reports NotFound and nothing is deleted.)
    informerFactory.Start(ctx.Done())
    deadline := time.Now().Add(5 * time.Second)
    for !controller.gameServerSynced() {
        if time.Now().After(deadline) {
            t.Fatal("timed out waiting for informer cache to sync")
        }
        time.Sleep(10 * time.Millisecond)
    }

    // Reconcile the key directly (no workers/cooldown in this unit test).
    err = controller.syncHandler(ctx, "default/test-gs")
    assert.NoError(t, err)

    // The GameServer should now be gone from the fake API server.
    _, err = agonesClient.AgonesV1().GameServers("default").Get(
        ctx,
        "test-gs",
        metav1.GetOptions{},
    )
    assert.Error(t, err)
}

Best Practices

1

Use Rate Limiting

Always use rate-limiting work queues to prevent overwhelming the API server
2

Handle Deletion

Check if resources exist before operating on them (they may be deleted)
3

Use Finalizers Carefully

Only add finalizers if you need to perform cleanup before deletion
4

Implement Metrics

Expose Prometheus metrics for observability
5

Graceful Shutdown

Properly handle SIGTERM and drain work queues
6

Set Resource Limits

Controllers should have CPU/memory limits like any other workload

Common Use Cases

  • Custom Autoscaling: Scale based on custom metrics (player count, queue length)
  • Integration: Sync game server state to external systems
  • Compliance: Enforce organizational policies on game servers
  • Cost Optimization: Shut down idle servers after a grace period
  • Multi-Region: Coordinate game servers across multiple clusters

Build docs developers (and LLMs) love