Atoms
Atoms are the fundamental building blocks in CuTe DSL that define hardware-level operations. They encapsulate the metadata and behavior of copy instructions and MMA (Matrix Multiply-Accumulate) operations.

Overview

An Atom consists of:
  • An Operation (pure Python class modeling a specific instruction)
  • A Trait (wraps IR value with metadata using CuTe Layouts)
Atoms are composed into tiled operations (TiledCopy, TiledMma) that distribute the atom's work across the threads of a thread block.

Copy Atoms

make_copy_atom

Creates a copy atom from a copy operation.
copy_atom = cute.make_copy_atom(copy_op, element_type)
Parameters:
  • copy_op (CopyOp, required) — Copy operation (e.g., cute.nvgpu.CopyUniversalOp())
  • element_type (Type[Numeric], required) — Element data type (e.g., cute.Float32, cute.BFloat16)
Common Copy Operations:
# Universal copy (works across memory spaces)
copy_atom = cute.make_copy_atom(
    cute.nvgpu.CopyUniversalOp(),
    cute.Float32
)

# CP.ASYNC for global -> shared memory
copy_atom = cute.make_copy_atom(
    cute.nvgpu.cpasync.CopyG2SAsync(),
    cute.BFloat16
)

# TMA (Tensor Memory Accelerator) for Hopper+
# NOTE(review): the op below is taken from the tcgen05 (Blackwell) namespace —
# verify the correct TMA op path for your target architecture and DSL version
tma_atom = cute.make_copy_atom(
    cute.nvgpu.tcgen05.TMA_LOAD(),
    cute.Float16
)

make_tiled_copy

Creates a tiled copy operation with explicit tiling and thread-value layout.
tiled_copy = cute.make_tiled_copy(
    copy_atom=copy_atom,
    thr_layout=thread_layout,
    val_layout=value_layout
)
Parameters:
  • copy_atom (CopyAtom, required) — Base copy atom
  • thr_layout (Layout, required) — Thread layout mapping thread IDs to coordinates
  • val_layout (Layout, required) — Value layout defining vectorization per thread
Example:
# Thread layout: 4x32 threads, column-major
thr_layout = cute.make_layout((4, 32), stride=(32, 1))

# Value layout: 4x4 values per thread, row-major for vectorization
val_layout = cute.make_layout((4, 4), stride=(4, 1))

# Create tiled copy
copy_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cute.Float32)
tiled_copy = cute.make_tiled_copy(copy_atom, thr_layout, val_layout)

make_tiled_copy_tv

Creates a tiled copy with automatic TV (Thread-Value) layout inference.
tiled_copy = cute.make_tiled_copy_tv(
    copy_atom,
    thr_layout,
    val_layout
)
Automatically infers the tiler and TV layout from thread and value layouts.

Specialized Tiled Copy Constructors

# Source-specialized copy
tiled_copy_S = cute.make_tiled_copy_S(copy_atom, thr_layout, val_layout)

# Destination-specialized copy  
tiled_copy_D = cute.make_tiled_copy_D(copy_atom, thr_layout, val_layout)

# Specialized for tensor A/B/C in GEMM
tiled_copy_A = cute.make_tiled_copy_A(copy_atom, ...)
tiled_copy_B = cute.make_tiled_copy_B(copy_atom, ...)
tiled_copy_C = cute.make_tiled_copy_C(copy_atom, ...)

TiledCopy Operations

# Get per-thread slice
thr_copy = tiled_copy.get_slice(thread_idx)

# Partition source tensor
thrA_src = thr_copy.partition_S(block_tensor)

# Partition destination tensor
thrA_dst = thr_copy.partition_D(fragment)

# Execute copy
cute.copy(copy_atom, thrA_src, thrA_dst)

MMA Atoms

make_mma_atom

Creates an MMA atom from an MMA operation.
mma_atom = cute.make_mma_atom(mma_op)
Parameters:
  • mma_op (MmaOp, required) — MMA operation (e.g., warp or warpgroup MMA)
Ampere Warp-Level MMA:
# FP32 accumulator, FP32 inputs
mma_atom = cute.make_mma_atom(
    cute.nvgpu.warp.MMA_F32F32F32_M16N8K8_TN()
)

# FP32 accumulator, BF16 inputs
mma_atom = cute.make_mma_atom(
    cute.nvgpu.warp.MMA_F32BF16BF16_M16N8K16_TN()
)
Hopper Warpgroup MMA:
mma_atom = cute.make_mma_atom(
    cute.nvgpu.warpgroup.MMA_F32F16F16_M64N256K32_TN()
)
Blackwell tcgen05 MMA:
mma_atom = cute.make_mma_atom(
    cute.nvgpu.tcgen05.MMA_F32F16F16_M64N256K32_SS()
)

make_tiled_mma

Creates a tiled MMA operation.
tiled_mma = cute.make_tiled_mma(
    mma_atom,
    thr_layout=None  # Optional: custom thread layout
)
Parameters:
  • mma_atom (MmaAtom, required) — Base MMA atom
  • thr_layout (Layout, default: None) — Optional custom thread layout (auto-derived if None)
Example:
# Create MMA atom
mma_atom = cute.make_mma_atom(
    cute.nvgpu.warp.MMA_F32F32F32_M16N8K8_TN()
)

# Create tiled MMA
tiled_mma = cute.make_tiled_mma(mma_atom)

# Get per-thread slice
thr_mma = tiled_mma.get_slice(thread_idx)

TiledMma Operations

# Partition A tensor
thrA = thr_mma.partition_A(block_A)

# Partition B tensor
thrB = thr_mma.partition_B(block_B)

# Partition C (accumulator) tensor
thrC = thr_mma.partition_C(block_C)

# Execute GEMM
cute.gemm(tiled_mma.atom, thrC, thrA, thrB, thrC)

Runtime State Management

Some atoms have runtime state (e.g., tcgen05 MMA accumulate flag).

set

Sets runtime fields of an atom.
tiled_mma.set(field, value)
Example:
# Set accumulate mode for tcgen05 MMA
tiled_mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True)

get

Gets runtime fields of an atom.
value = tiled_mma.get(field)
Example:
# Get accumulate mode
accum = tiled_mma.get(cute.nvgpu.tcgen05.Field.ACCUMULATE)

with_

Returns a new atom with modified runtime state.
new_atom = atom.with_(field=value)

Architecture-Specific Atoms

Ampere (SM80)

# Warp-level tensor core MMA
mma = cute.make_mma_atom(cute.nvgpu.warp.MMA_F32F16F16_M16N8K16_TN())

# Async copy (cp.async)
copy = cute.make_copy_atom(cute.nvgpu.cpasync.CopyG2SAsync(), cute.Float16)

Hopper (SM90)

# Warpgroup MMA
mma = cute.make_mma_atom(cute.nvgpu.warpgroup.MMA_F32F16F16_M64N256K32_TN())

# TMA load
tma_load = cute.make_copy_atom(cute.nvgpu.tcgen05.TMA_LOAD(), cute.Float16)

# TMA store
tma_store = cute.make_copy_atom(cute.nvgpu.tcgen05.TMA_STORE(), cute.Float16)

Blackwell (SM100)

# tcgen05 MMA with scaled inputs
mma = cute.make_mma_atom(cute.nvgpu.tcgen05.MMA_F32F16F16_M64N256K32_SS())

# Configure accumulate mode
mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True)

Complete Example: Tiled Copy

import cutlass.cute as cute

@cute.kernel
def copy_kernel(
    gSrc: cute.Tensor,
    gDst: cute.Tensor,
    thr_layout: cute.Layout,
    val_layout: cute.Layout,
):
    """Copy one block-sized tile of gSrc into gDst with a tiled copy.

    Each CTA handles the tile selected by its block index; the tiled copy
    spreads that tile over the threads described by thr_layout, with
    val_layout values handled per thread.
    """
    tid, _, _ = cute.arch.thread_idx()
    bid, _, _ = cute.arch.block_idx()

    # Universal copy atom matching the source element type, tiled over the
    # given thread/value layouts (TV layout inferred by make_tiled_copy_tv).
    atom = cute.make_copy_atom(
        cute.nvgpu.CopyUniversalOp(),
        gSrc.element_type,
    )
    tiled = cute.make_tiled_copy_tv(atom, thr_layout, val_layout)
    this_thread = tiled.get_slice(tid)

    # Select this CTA's tile of source/destination and partition it across
    # the threads of the tiled copy.
    src_part = this_thread.partition_S(gSrc[((None, None), bid)])
    dst_part = this_thread.partition_D(gDst[((None, None), bid)])

    # Stage through a register fragment: global -> registers -> global.
    staging = cute.make_fragment_like(src_part)
    cute.copy(atom, src_part, staging)
    cute.copy(atom, staging, dst_part)

Complete Example: Tiled MMA

import cutlass.cute as cute

@cute.kernel
def mma_kernel(
    mA: cute.Tensor,
    mB: cute.Tensor,
    mC: cute.Tensor,
):
    """Accumulate one block tile of C += A @ B using a warp-level MMA.

    Each CTA processes the A/B/C tiles selected by its block index, looping
    over the K tiles and feeding register fragments to the tensor-core MMA.
    """
    tid, _, _ = cute.arch.thread_idx()
    bid, _, _ = cute.arch.block_idx()

    # Warp-level MMA: f32 accumulator, f16 inputs, 16x8x16 shape, TN layout.
    tiled = cute.make_tiled_mma(
        cute.make_mma_atom(
            cute.nvgpu.warp.MMA_F32F16F16_M16N8K16_TN()
        )
    )
    me = tiled.get_slice(tid)

    # This CTA's tiles of A, B and C, partitioned for this thread.
    partA = me.partition_A(mA[((None, None, None), bid)])
    partB = me.partition_B(mB[((None, None, None), bid)])
    partC = me.partition_C(mC[((None, None), bid)])

    # Register fragments: one K-slice each for A and B, the full accumulator
    # for C.
    fragA = cute.make_fragment_like(partA[None, None, 0])
    fragB = cute.make_fragment_like(partB[None, None, 0])
    acc = cute.make_fragment_like(partC)

    # Seed the accumulator from C.
    cute.basic_copy(partC, acc)

    # March over the K tiles, loading one slice of A and B per iteration.
    num_k = cute.size(partA.shape[2])
    for kk in range(num_k):
        cute.basic_copy(partA[None, None, kk], fragA)
        cute.basic_copy(partB[None, None, kk], fragB)
        cute.gemm(tiled.atom, acc, fragA, fragB, acc)

    # Write the accumulated tile back to C.
    cute.basic_copy(acc, partC)

See Also

Build docs developers (and LLMs) love