Atoms are the fundamental building blocks in CuTe DSL that define hardware-level operations. They encapsulate the metadata and behavior of copy instructions and MMA (Matrix Multiply-Accumulate) operations.
Overview
An Atom consists of:
- An Operation (pure Python class modeling a specific instruction)
- A Trait (wraps an IR value with metadata expressed as CuTe Layouts)
Atoms are composed into tiled operations that map the atom across the threads and data elements of a thread block.
Copy Atoms
make_copy_atom
Creates a copy atom from a copy operation.
copy_atom = cute.make_copy_atom(copy_op, element_type)
Copy operation (e.g., cute.nvgpu.CopyUniversalOp())
Element data type (e.g., cute.Float32, cute.BFloat16)
Common Copy Operations:
# Universal copy (works across memory spaces)
copy_atom = cute.make_copy_atom(
cute.nvgpu.CopyUniversalOp(),
cute.Float32
)
# CP.ASYNC for global -> shared memory
copy_atom = cute.make_copy_atom(
cute.nvgpu.cpasync.CopyG2SAsync(),
cute.BFloat16
)
# TMA (Tensor Memory Accelerator) for Hopper+
# NOTE: TMA copy ops live under the cpasync namespace; tcgen05 is the
# Blackwell-specific namespace (see the Blackwell section) — verify the exact
# op name against your CuTe DSL version.
tma_atom = cute.make_copy_atom(
cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(),
cute.Float16
)
make_tiled_copy
Creates a tiled copy operation with explicit tiling and thread-value layout.
tiled_copy = cute.make_tiled_copy(
copy_atom=copy_atom,
thr_layout=thread_layout,
val_layout=value_layout
)
Thread layout mapping thread IDs to coordinates
Value layout defining vectorization per thread
Example:
# Thread layout: 4x32 threads, row-major (stride (32, 1) makes mode-1 contiguous)
thr_layout = cute.make_layout((4, 32), stride=(32, 1))
# Value layout: 4x4 values per thread, row-major for vectorization
val_layout = cute.make_layout((4, 4), stride=(4, 1))
# Create tiled copy
copy_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cute.Float32)
tiled_copy = cute.make_tiled_copy(copy_atom, thr_layout, val_layout)
make_tiled_copy_tv
Creates a tiled copy with automatic TV (Thread-Value) layout inference.
tiled_copy = cute.make_tiled_copy_tv(
copy_atom,
thr_layout,
val_layout
)
Automatically infers the tiler and TV layout from thread and value layouts.
Specialized Tiled Copy Constructors
# Source-specialized copy: retiles an existing tiled copy for the source tensor
tiled_copy_S = cute.make_tiled_copy_S(copy_atom, tiled_copy)
# Destination-specialized copy: retiles an existing tiled copy for the destination
tiled_copy_D = cute.make_tiled_copy_D(copy_atom, tiled_copy)
# Specialized for tensor A/B/C in GEMM, partitioned to match a tiled MMA
tiled_copy_A = cute.make_tiled_copy_A(copy_atom, tiled_mma)
tiled_copy_B = cute.make_tiled_copy_B(copy_atom, tiled_mma)
tiled_copy_C = cute.make_tiled_copy_C(copy_atom, tiled_mma)
TiledCopy Operations
# Get per-thread slice
thr_copy = tiled_copy.get_slice(thread_idx)
# Partition source tensor
thrA_src = thr_copy.partition_S(block_tensor)
# Partition destination tensor
thrA_dst = thr_copy.partition_D(fragment)
# Execute copy
cute.copy(copy_atom, thrA_src, thrA_dst)
MMA Atoms
make_mma_atom
Creates an MMA atom from an MMA operation.
mma_atom = cute.make_mma_atom(mma_op)
MMA operation (e.g., warp or warpgroup MMA)
Ampere Warp-Level MMA:
# FP32 accumulator, FP32 inputs
mma_atom = cute.make_mma_atom(
cute.nvgpu.warp.MMA_F32F32F32_M16N8K8_TN()
)
# FP32 accumulator, BF16 inputs
mma_atom = cute.make_mma_atom(
cute.nvgpu.warp.MMA_F32BF16BF16_M16N8K16_TN()
)
Hopper Warpgroup MMA:
mma_atom = cute.make_mma_atom(
cute.nvgpu.warpgroup.MMA_F32F16F16_M64N256K32_TN()
)
Blackwell tcgen05 MMA:
mma_atom = cute.make_mma_atom(
cute.nvgpu.tcgen05.MMA_F32F16F16_M64N256K32_SS()
)
make_tiled_mma
Creates a tiled MMA operation.
tiled_mma = cute.make_tiled_mma(
mma_atom,
thr_layout=None # Optional: custom thread layout
)
Optional custom thread layout (auto-derived if None)
Example:
# Create MMA atom
mma_atom = cute.make_mma_atom(
cute.nvgpu.warp.MMA_F32F32F32_M16N8K8_TN()
)
# Create tiled MMA
tiled_mma = cute.make_tiled_mma(mma_atom)
# Get per-thread slice
thr_mma = tiled_mma.get_slice(thread_idx)
TiledMma Operations
# Partition A tensor
thrA = thr_mma.partition_A(block_A)
# Partition B tensor
thrB = thr_mma.partition_B(block_B)
# Partition C (accumulator) tensor
thrC = thr_mma.partition_C(block_C)
# Execute GEMM
cute.gemm(tiled_mma.atom, thrC, thrA, thrB, thrC)
Runtime State Management
Some atoms have runtime state (e.g., tcgen05 MMA accumulate flag).
set
Sets runtime fields of an atom.
tiled_mma.set(field, value)
Example:
# Set accumulate mode for tcgen05 MMA
tiled_mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True)
get
Gets runtime fields of an atom.
value = tiled_mma.get(field)
Example:
# Get accumulate mode
accum = tiled_mma.get(cute.nvgpu.tcgen05.Field.ACCUMULATE)
with_
Returns a new atom with modified runtime state.
new_atom = atom.with_(field=value)
Architecture-Specific Atoms
Ampere (SM80)
# Warp-level tensor core MMA
mma = cute.make_mma_atom(cute.nvgpu.warp.MMA_F32F16F16_M16N8K16_TN())
# Async copy (cp.async)
copy = cute.make_copy_atom(cute.nvgpu.cpasync.CopyG2SAsync(), cute.Float16)
Hopper (SM90)
# Warpgroup MMA
mma = cute.make_mma_atom(cute.nvgpu.warpgroup.MMA_F32F16F16_M64N256K32_TN())
# TMA load (bulk tensor copy, global -> shared; TMA ops live under cpasync,
# not the Blackwell-specific tcgen05 namespace)
tma_load = cute.make_copy_atom(cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(), cute.Float16)
# TMA store (bulk tensor copy, shared -> global)
tma_store = cute.make_copy_atom(cute.nvgpu.cpasync.CopyBulkTensorTileS2GOp(), cute.Float16)
Blackwell (SM100)
# tcgen05 MMA with scaled inputs
mma = cute.make_mma_atom(cute.nvgpu.tcgen05.MMA_F32F16F16_M64N256K32_SS())
# Configure accumulate mode
mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True)
Complete Example: Tiled Copy
import cutlass.cute as cute
@cute.kernel
def copy_kernel(
gSrc: cute.Tensor,
gDst: cute.Tensor,
thr_layout: cute.Layout,
val_layout: cute.Layout,
):
tidx, _, _ = cute.arch.thread_idx()
bidx, _, _ = cute.arch.block_idx()
# Create copy atom
copy_atom = cute.make_copy_atom(
cute.nvgpu.CopyUniversalOp(),
gSrc.element_type
)
# Create tiled copy
tiled_copy = cute.make_tiled_copy_tv(
copy_atom,
thr_layout,
val_layout
)
# Get thread slice
thr_copy = tiled_copy.get_slice(tidx)
# Partition tensors
blkSrc = gSrc[((None, None), bidx)]
blkDst = gDst[((None, None), bidx)]
thrSrc = thr_copy.partition_S(blkSrc)
thrDst = thr_copy.partition_D(blkDst)
# Allocate fragment
frag = cute.make_fragment_like(thrSrc)
# Execute copy
cute.copy(copy_atom, thrSrc, frag)
cute.copy(copy_atom, frag, thrDst)
Complete Example: Tiled MMA
import cutlass.cute as cute
@cute.kernel
def mma_kernel(
mA: cute.Tensor,
mB: cute.Tensor,
mC: cute.Tensor,
):
tidx, _, _ = cute.arch.thread_idx()
bidx, _, _ = cute.arch.block_idx()
# Create MMA atom
mma_atom = cute.make_mma_atom(
cute.nvgpu.warp.MMA_F32F16F16_M16N8K16_TN()
)
# Create tiled MMA
tiled_mma = cute.make_tiled_mma(mma_atom)
# Get thread slice
thr_mma = tiled_mma.get_slice(tidx)
# Partition block tensors
blkA = mA[((None, None, None), bidx)]
blkB = mB[((None, None, None), bidx)]
blkC = mC[((None, None), bidx)]
thrA = thr_mma.partition_A(blkA)
thrB = thr_mma.partition_B(blkB)
thrC = thr_mma.partition_C(blkC)
# Allocate fragments
fragA = cute.make_fragment_like(thrA[None, None, 0])
fragB = cute.make_fragment_like(thrB[None, None, 0])
fragC = cute.make_fragment_like(thrC)
# Initialize accumulator
cute.basic_copy(thrC, fragC)
# GEMM loop
K_tiles = cute.size(thrA.shape[2])
for k in range(K_tiles):
# Load tiles
cute.basic_copy(thrA[None, None, k], fragA)
cute.basic_copy(thrB[None, None, k], fragB)
# Compute
cute.gemm(tiled_mma.atom, fragC, fragA, fragB, fragC)
# Store result
cute.basic_copy(fragC, thrC)
See Also