Preemptive multitasking scheduler implementation in Aeolos
Aeolos implements a preemptive round-robin task scheduler that supports multiple CPU cores. Tasks are switched via timer interrupts, providing fair time-sharing among all runnable tasks.
Tasks are represented by the task_t structure from kernel/proc/task.h:63:
kernel/proc/task.h
// A schedulable task. Instances are stored in tasks_all indexed by tid
// (see sched_init / sched_kill) and are linked into the asleep/dead
// queues through the next/prev pointers.
typedef struct task_t {
    void* kstack_top;     // Kernel stack top (saved state)
    uint64_t cr3;         // Virtual address space (page table)
    tid_t tid;            // Task ID
    priority_t priority;  // Task priority (currently unused)
    tstatus_t status;     // Current status flags
    timeval_t wakeuptime; // Time at which task should wake up
    tmode_t mode;         // Kernel mode or usermode
    void* kstack_limit;   // Kernel stack base address
    vec_struct(vfs_node_desc_t*) openfiles; // Open files
    struct task_t* next;  // Next task in queue
    struct task_t* prev;  // Previous task in queue
} task_t;
The scheduler maintains several data structures at kernel/proc/sched/sched.c:15:
kernel/proc/sched/sched.c
// Global scheduler state. Every structure below is guarded by sched_lock;
// take it before reading or writing any of them (SMP safety).
static lock_t sched_lock;
// an idle task for each cpu (runs only when nothing else is runnable;
// kept out of tasks_all so the scheduler never picks it directly)
static task_t* tasks_idle[CPU_MAX];
// currently running tasks on each cpu
static task_t* tasks_running[CPU_MAX];
// list of all tasks, indexed by tid (NULL slots = idle or killed tasks)
vec_new_static(task_t*, tasks_all);
// asleep and dead tasks respectively
static tqueue_t tasks_asleep, tasks_dead;
All scheduler operations are protected by sched_lock to ensure thread safety in the SMP environment.
The scheduler is initialized on each CPU core at kernel/proc/sched/sched.c:256:
kernel/proc/sched/sched.c
// Per-core scheduler bring-up: creates this core's idle task, optionally
// (on the BSP, signalled by first_task != NULL) seeds the first kernel
// task plus the janitor, then arms the local APIC timer so preemption
// can begin. Must run once on every core.
void sched_init(void (*first_task)(tid_t))
{
    uint16_t cpu_id = smp_get_current_info()->cpu_id;

    /* create idle task for current core and set it to running
       idle tasks are removed from task list to prevent the scheduler
       from attempting to schedule them */
    tid_t tid_idle = task_add(idle, 0, TASK_KERNEL_MODE, NULL, 0);
    tasks_idle[cpu_id] = tasks_all.data[tid_idle];
    tasks_running[cpu_id] = tasks_all.data[tid_idle];
    // NULL the slot so _do_context_switch's scan skips the idle task;
    // it remains reachable through tasks_idle[cpu_id]
    tasks_all.data[tid_idle] = NULL;

    // scheduler has been started on the bsp, do some initialization
    if (first_task) {
        task_add(first_task, PRIORITY_MID, TASK_KERNEL_MODE, NULL, 0);
        // janitor frees dead tasks from a safe context (a task cannot
        // free its own stack while still running on it)
        task_add(sched_janitor, PRIORITY_MIN, TASK_KERNEL_MODE, NULL, 0);
        klog_ok("started on bsp\n");
    }

    // configure and start the lapic timer; one-shot mode means each
    // context switch re-arms the next tick itself (see apic_timer_set_period
    // in _do_context_switch)
    apic_timer_set_period(TIMESLICE_DEFAULT);
    apic_timer_set_mode(APIC_TIMER_MODE_ONESHOT);
    apic_timer_set_handler(init_context_switch);
    apic_timer_start();
}
Key initialization steps:
Creates an idle task for the CPU (runs when nothing else is runnable)
On the BSP (first_task != NULL), creates the initial kernel task and janitor
Configures the APIC timer for preemptive scheduling
Task memory cannot be freed immediately when a task exits because it may still be using its own stack. The janitor ensures cleanup happens from a different context.
Context switches occur via the APIC timer interrupt. The core switching logic is in _do_context_switch() at kernel/proc/sched/sched.c:76:
kernel/proc/sched/sched.c
// Core of the preemptive scheduler, invoked from the APIC timer interrupt
// path with the interrupted task's saved register frame in `state`.
// Saves the current task, wakes any due sleepers, picks the next runnable
// task round-robin, re-arms the one-shot timer, and switches to it.
// Does not return to its caller: finish_context_switch resumes `next`.
void _do_context_switch(task_state_t* state)
{
    lock_wait(&sched_lock);

    uint16_t cpu = smp_get_current_info()->cpu_id;

    // save state of current task
    task_t* curr = tasks_running[cpu];
    curr->kstack_top = state;
    curr->status &= ~TASK_RUNNING;

    // wake up tasks which need to be woken up
    // tasks_asleep is sorted with the earliest wakeuptime at the back
    // (see add_sleeping_sorted), so popping from the back until the
    // deadline is in the future wakes exactly the due tasks
    while (tasks_asleep.back && tasks_asleep.back->wakeuptime < hpet_get_nanos()) {
        task_t* t = tq_pop_back(&tasks_asleep);
        t->status &= ~TASK_ASLEEP;
    }

    // choose next task: scan tids after the current one, then wrap
    // around to the start, which yields round-robin fairness.
    // NOTE(review): the `status == TASK_READY` equality (rather than a
    // flag test) appears to require that no other status bit is set --
    // confirm against the tstatus_t flag definitions.
    task_t* next = NULL;
    for (tid_t i = curr->tid + 1; i < tasks_all.len; i++) {
        task_t* t = tasks_all.data[i];
        if (t != NULL && t->status == TASK_READY) {
            next = t;
            goto task_chosen;
        }
    }
    for (tid_t i = 0; i <= curr->tid; i++) {
        task_t* t = tasks_all.data[i];
        if (t != NULL && t->status == TASK_READY) {
            next = t;
            goto task_chosen;
        }
    }
task_chosen:
    // could not find a runnable task, so idle
    if (!next) next = tasks_idle[cpu];

    // we've chosen next task
    next->status |= TASK_RUNNING;
    tasks_running[cpu] = next;

    // set rsp0 in the tss so the next interrupt from usermode lands on
    // the chosen task's kernel stack
    smp_get_current_info()->tss.rsp0 = (uint64_t)(next->kstack_limit + KSTACK_SIZE);

    // re-arm the one-shot timer for the next timeslice and acknowledge
    // the interrupt before switching away
    apic_timer_set_period(TIMESLICE_DEFAULT);
    apic_send_eoi();

    // release sched_lock and switch to next task
    lock_release(&sched_lock);
    finish_context_switch(next);
}
Tasks can sleep for a specified duration using sched_sleep() at kernel/proc/sched/sched.c:130:
kernel/proc/sched/sched.c
// Puts the calling task to sleep for at least `nanos` nanoseconds.
// Durations at or below one timeslice are busy-waited on the HPET, since
// a context switch would cost more than the sleep itself; longer sleeps
// enqueue the task on tasks_asleep and yield the CPU until the scheduler
// wakes it (see the wake-up sweep in _do_context_switch).
void sched_sleep(timeval_t nanos)
{
    // if sleep time is too little, busy sleep
    if (nanos <= TIMESLICE_DEFAULT) {
        hpet_nanosleep(nanos);
        return;
    }

    lock_wait(&sched_lock);

    task_t* curr = tasks_running[smp_get_current_info()->cpu_id];
    curr->wakeuptime = hpet_get_nanos() + nanos;
    curr->status |= TASK_ASLEEP;
    add_sleeping_sorted(curr);

    lock_release(&sched_lock);

    // Wait until the scheduler has switched us out and later woken us.
    // A single hlt is NOT sufficient: hlt resumes on *any* enabled
    // interrupt, so a non-timer interrupt would return here while we
    // are still marked TASK_ASLEEP and still linked in tasks_asleep,
    // making sched_sleep return early. Loop until the scheduler has
    // actually cleared the flag. The "memory" clobber forces the
    // compiler to re-read curr->status on every iteration.
    while (curr->status & TASK_ASLEEP)
        asm volatile("hlt" ::: "memory");
}
Sleeping tasks are kept in a sorted queue (earliest wakeup time at the back) at kernel/proc/sched/sched.c:59:
kernel/proc/sched/sched.c
// Inserts t into tasks_asleep, keeping the queue sorted by wakeuptime in
// descending order from front to back: the task with the EARLIEST wakeup
// sits at the back, where the scheduler's wake-up sweep pops from.
// Caller must hold sched_lock.
static void add_sleeping_sorted(task_t* t)
{
    if (!tasks_asleep.front)
        // queue is empty: t becomes the sole element
        tq_push_front(&tasks_asleep, t);
    else if (tasks_asleep.back->wakeuptime > t->wakeuptime)
        // t wakes earlier than everything queued: append past the back,
        // making it the new back (next to be woken)
        tq_insert_after(&tasks_asleep, tasks_asleep.back, t);
    else {
        // walk from the front (latest wakeup) toward the back until we
        // find the first task waking no later than t, and insert t just
        // before it. The loop is guaranteed to break: the back already
        // satisfies the condition (ruled out by the branch above).
        for (task_t* i = tasks_asleep.front; i; i = i->next) {
            if (i->wakeuptime <= t->wakeuptime) {
                // NOTE(review): when i is the front, i->prev is NULL --
                // this relies on tq_insert_after treating a NULL anchor
                // as push-front; confirm against the tqueue implementation.
                tq_insert_after(&tasks_asleep, i->prev, t);
                break;
            }
        }
    }
}
For sleep durations shorter than a timeslice, the function busy-waits using the HPET instead of yielding the CPU, avoiding unnecessary context switches.
Tasks can be terminated with sched_kill() at kernel/proc/sched/sched.c:149:
kernel/proc/sched/sched.c
// Marks the task identified by `tid` as dead and queues it for cleanup.
// Returns 0 on success, -1 if tid does not name a live task.
// Memory is not freed here -- the task may still be executing on its own
// kernel stack, so sched_janitor reclaims it later from another context.
// If a task kills itself, this function never returns.
int64_t sched_kill(tid_t tid)
{
    lock_wait(&sched_lock);

    // check if tid is valid
    if (!IS_TID_VALID(tid)) {
        klog_err("there's no task with tid %d\n", tid);
        lock_release(&sched_lock);
        return -1;
    }

    // mark the task as dead, it will be cleaned up later; NULLing the
    // tasks_all slot keeps the scheduler from ever picking it again
    task_t* t = tasks_all.data[tid];
    t->status |= TASK_DEAD;
    tasks_all.data[tid] = NULL;
    tq_push_front(&tasks_dead, t);

    // if it was sleeping, remove it from the sleeping list
    if (t->status & TASK_ASLEEP) {
        tq_remove(t, &tasks_asleep);
        t->status &= ~TASK_ASLEEP;
    }

    // was it the currently running task?
    // NOTE(review): this only detects self-kill on the calling cpu; a
    // task running on a different cpu is marked dead while still
    // executing there until its next tick -- verify the janitor cannot
    // free it before that core switches away.
    bool is_current = (t == tasks_running[smp_get_current_info()->cpu_id]);

    lock_release(&sched_lock);

    // A dead task must never resume execution. A single hlt is NOT
    // sufficient: hlt resumes on *any* enabled interrupt, so a non-timer
    // interrupt would let the dead task run on and return into its
    // caller on a stack already queued for the janitor. Halt in a loop
    // until the scheduling tick switches this cpu away for good.
    if (is_current)
        for (;;)
            asm volatile("hlt");

    return 0;
}
If a task kills itself, it halts and waits for a timer interrupt to schedule another task.