The Machine class simulates a custom Very Large Instruction Word (VLIW) architecture with SIMD capabilities. It executes programs built by KernelBuilder and provides tracing and debugging features.
Executes the program until all cores halt or pause.
problem.py:197-217
def run(self):
    """
    Drive every core until none of them is in the RUNNING state.

    Cores that are PAUSED on entry are switched back to RUNNING first.
    Each pass of the outer loop gives every running core one instruction
    bundle; a core whose pc has moved past the end of the program is marked
    STOPPED instead. ``self.cycle`` advances by at most one per pass, and
    only if some bundle executed in that pass had an engine other than
    "debug".
    """
    # Wake up cores left paused by a previous call.
    for resumed in self.cores:
        if resumed.state == CoreState.PAUSED:
            resumed.state = CoreState.RUNNING

    while any(c.state == CoreState.RUNNING for c in self.cores):
        did_real_work = False
        for core in self.cores:
            if core.state != CoreState.RUNNING:
                continue
            if core.pc < len(self.program):
                bundle = self.program[core.pc]
                if self.prints:
                    self.print_step(bundle, core)
                # The pc is bumped before executing the bundle.
                core.pc += 1
                self.step(bundle, core)
                # Debug-only bundles do not cost a cycle.
                if not did_real_work:
                    did_real_work = any(
                        engine != "debug" for engine in bundle.keys()
                    )
            else:
                # Ran off the end of the program.
                core.state = CoreState.STOPPED
        if did_real_work:
            self.cycle += 1
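The cycle counter increments only on iterations in which at least one running core executes an instruction bundle containing a non-debug engine; bundles that contain only debug operations are free. This counter is what the performance test measures.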
def step(self, instr: Instruction, core):
    """
    Execute all the slots in each engine for a single instruction bundle.

    Engine handlers do not touch scratch or memory directly; they record
    their results in ``self.scratch_write`` / ``self.mem_write``, which are
    created fresh here and applied only after every slot has run. Slots in
    the same bundle therefore all read the pre-instruction state.

    "debug" slots are checked only when ``self.enable_debug`` is set: they
    compare scratch contents against ``self.value_trace`` and fail with an
    AssertionError on mismatch. They are never dispatched to an engine.
    """
    dispatch = {
        "alu": self.alu,
        "valu": self.valu,
        "load": self.load,
        "store": self.store,
        "flow": self.flow,
    }
    self.scratch_write = {}
    self.mem_write = {}

    for name, slots in instr.items():
        if name != "debug":
            # Regular engine: enforce the per-engine slot budget, then run.
            assert len(slots) <= SLOT_LIMITS[name]
            for i, slot in enumerate(slots):
                if self.trace is not None:
                    self.trace_slot(core, slot, name, i)
                dispatch[name](core, *slot)
            continue

        if not self.enable_debug:
            continue
        for slot in slots:
            kind = slot[0]
            if kind == "compare":
                # Scalar check: one scratch word against one traced value.
                loc, key = slot[1], slot[2]
                expected = self.value_trace[key]
                actual = core.scratch[loc]
                assert actual == expected, (
                    f"{actual} != {expected} for {key} at pc={core.pc}"
                )
            elif kind == "vcompare":
                # Vector check: VLEN consecutive scratch words.
                loc, keys = slot[1], slot[2]
                expected = [self.value_trace[key] for key in keys]
                actual = core.scratch[loc : loc + VLEN]
                assert actual == expected, (
                    f"{actual} != {expected} for {keys} at pc={core.pc} loc={loc}"
                )

    # Commit the buffered writes now that every slot has executed.
    for scratch_addr, value in self.scratch_write.items():
        core.scratch[scratch_addr] = value
    for mem_addr, value in self.mem_write.items():
        self.mem[mem_addr] = value

    if self.trace:
        self.trace_post_step(instr, core)

    del self.scratch_write
    del self.mem_write
All writes (scratch and memory) are buffered and take effect at the end of the cycle. An instruction that both reads and writes the same location therefore reads the old value.
Initializes Perfetto trace generation for visualization.
problem.py:151-196
def setup_trace(self): """ The simulator generates traces in Chrome's Trace Event Format for visualization in Perfetto (or chrome://tracing if you prefer it). See the bottom of the file for info about how to use this. See the format docs in case you want to add more info to the trace: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview """ self.trace = open("trace.json", "w") self.trace.write("[") # ... trace initialization code ...
Run `python perf_takehome.py Tests.test_kernel_trace`, then `python watch_trace.py`, to view execution traces in Perfetto. This is the recommended debugging workflow.
def alu(self, core, op, dest, a1, a2):
    """
    Execute one scalar ALU slot.

    ``a1`` and ``a2`` are scratch addresses; their current contents are the
    operands. The result is reduced modulo 2**32 and recorded in
    ``self.scratch_write[dest]`` rather than written to scratch directly.

    Raises NotImplementedError for an unrecognised ``op``.
    """
    # Both operands are fetched before the opcode is examined.
    lhs = core.scratch[a1]
    rhs = core.scratch[a2]

    if op == "+":
        value = lhs + rhs
    elif op == "-":
        value = lhs - rhs
    elif op == "*":
        value = lhs * rhs
    elif op == "//":
        value = lhs // rhs
    elif op == "cdiv":
        # cdiv is a helper defined elsewhere in the file.
        value = cdiv(lhs, rhs)
    elif op == "^":
        value = lhs ^ rhs
    elif op == "&":
        value = lhs & rhs
    elif op == "|":
        value = lhs | rhs
    elif op == "<<":
        value = lhs << rhs
    elif op == ">>":
        value = lhs >> rhs
    elif op == "%":
        value = lhs % rhs
    elif op == "<":
        value = int(lhs < rhs)
    elif op == "==":
        value = int(lhs == rhs)
    else:
        raise NotImplementedError(f"Unknown alu op {op}")

    # Wrap to an unsigned 32-bit value.
    self.scratch_write[dest] = value % (2**32)
def valu(self, core, *slot):
    """
    Execute one vector ALU slot over VLEN consecutive scratch words.

    Recognised slot shapes:
      ("vbroadcast", dest, src)      copy the scalar at src into each lane
      ("multiply_add", dest, a, b, c) lane-wise (a * b + c), wrapped to 32 bits
      (op, dest, a1, a2)             lane-wise scalar ALU op via self.alu

    Results are recorded in ``self.scratch_write``. Any other slot shape
    raises NotImplementedError.
    """
    arity = len(slot)

    if arity == 3 and slot[0] == "vbroadcast":
        _, dest, src = slot
        for lane in range(VLEN):
            self.scratch_write[dest + lane] = core.scratch[src]
    elif arity == 5 and slot[0] == "multiply_add":
        _, dest, a, b, c = slot
        for lane in range(VLEN):
            # The product is wrapped before the addend is applied.
            product = (core.scratch[a + lane] * core.scratch[b + lane]) % (2**32)
            self.scratch_write[dest + lane] = (product + core.scratch[c + lane]) % (2**32)
    elif arity == 4:
        # Any four-element slot is treated as a lane-wise scalar op;
        # unknown opcodes are rejected by self.alu.
        op, dest, a1, a2 = slot
        for lane in range(VLEN):
            self.alu(core, op, dest + lane, a1 + lane, a2 + lane)
    else:
        raise NotImplementedError(f"Unknown valu op {slot}")
def store(self, core, *slot):
    """
    Execute one store slot.

    Recognised slot shapes:
      ("store", addr, src)   write the scratch word at src to memory
      ("vstore", addr, src)  write VLEN consecutive scratch words from src

    In both forms ``addr`` is a scratch address holding the (scalar) memory
    address. Writes are recorded in ``self.mem_write`` rather than applied
    to memory directly. Any other slot shape raises NotImplementedError.
    """
    if len(slot) == 3:
        kind, addr_reg, src_reg = slot
        if kind == "store":
            target = core.scratch[addr_reg]
            self.mem_write[target] = core.scratch[src_reg]
            return
        if kind == "vstore":
            # The base address is a scalar; lanes go to consecutive words.
            base = core.scratch[addr_reg]
            for lane in range(VLEN):
                self.mem_write[base + lane] = core.scratch[src_reg + lane]
            return
    raise NotImplementedError(f"Unknown store op {slot}")
2 loads + 2 stores per cycle (or 16 loads + 16 stores with vectors)
Control Flow
1 flow operation per cycle
Maximize instruction-level parallelism by packing multiple independent operations into each cycle. Use vector instructions to process 8x more data per cycle.