CUTLASS Python Quick Start

Get up and running with the CUTLASS Python packages in minutes.

Installation

1

Install CUDA Toolkit

Ensure you have CUDA 11.8+ or 12.x installed:
nvcc --version
2

Install CUTLASS Python Package

pip install nvidia-cutlass
3

Verify Installation

Test your installation:
# Smoke test: importing both packages and reading __version__ confirms
# the wheel is installed and importable.
import cutlass
import cutlass.cute as cute
print(f"CUTLASS version: {cutlass.__version__}")
print(f"CuTe DSL available: {cute is not None}")

CUTLASS Python Interface (cutlass_cppgen)

Basic GEMM Example

The simplest way to run a GEMM with CUTLASS:
import cutlass
import numpy as np

# Problem size: a square 1024 x 1024 x 1024 GEMM.
m, n, k = 1024, 1024, 1024

# fp16 operands: A and B filled with ones, C and D zero-initialized.
A = np.ones((m, k), dtype=np.float16)
B = np.ones((k, n), dtype=np.float16)
C = np.zeros((m, n), dtype=np.float16)
D = np.zeros((m, n), dtype=np.float16)

# Build a row-major fp16 GEMM plan, then execute D = A @ B + C in place.
gemm_plan = cutlass.op.Gemm(
    element=np.float16,
    layout=cutlass.LayoutType.RowMajor
)
gemm_plan.run(A, B, C, D)

print(f"Result shape: {D.shape}")
print(f"Result sample: {D[:3, :3]}")

With PyTorch Tensors

import cutlass
import torch

# Build the fp16 row-major GEMM plan up front.
gemm_plan = cutlass.op.Gemm(
    element=torch.float16,
    layout=cutlass.LayoutType.RowMajor
)

# Allocate half-precision operands directly on the GPU.
M, N, K = 2048, 2048, 2048
dev = 'cuda'
A = torch.randn(M, K, dtype=torch.float16, device=dev)
B = torch.randn(K, N, dtype=torch.float16, device=dev)
C = torch.zeros(M, N, dtype=torch.float16, device=dev)
D = torch.zeros(M, N, dtype=torch.float16, device=dev)

# Execute D = A @ B + C on the device.
gemm_plan.run(A, B, C, D)

# Cross-check against PyTorch's own matmul (fp16-appropriate tolerances).
ref = torch.matmul(A, B) + C
torch.testing.assert_close(D, ref, rtol=1e-3, atol=1e-3)
print("Results match!")

Configuration Options

import cutlass
import numpy as np

# A GEMM plan exposes the basic dtype/layout choice plus tuning knobs;
# the keyword arguments below mirror the cutlass.op.Gemm constructor.
plan = cutlass.op.Gemm(
    element=np.float16,              # Data type
    layout=cutlass.LayoutType.RowMajor,  # Memory layout
    element_accumulator=np.float32,   # Accumulator type (optional)
    element_epilogue=np.float16,      # Epilogue type (optional)
    # Advanced options
    alignment=8,                      # Memory alignment
    kernel_schedule="auto"            # Kernel schedule
)

# Enumerate available configurations
# NOTE(review): assumes plan.configurations() returns a sequence of
# kernel-configuration descriptors — confirm against the installed API.
configs = plan.configurations()
print(f"Found {len(configs)} configurations")
for config in configs[:3]:
    print(config)

CuTe DSL

Hello World Kernel

Your first CUDA kernel with CuTe DSL:
import cutlass
import cutlass.cute as cute
import torch

# Define kernel using @cute.kernel decorator
# Define kernel using @cute.kernel decorator
@cute.kernel
def hello_world_kernel(tensor: cute.Tensor):
    """Double every element of `tensor` in place (one element per thread)."""
    # Only the x components of the thread/block coordinates are used.
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    
    # Global linear element index for this thread in a 1-D launch.
    idx = bidx * cute.arch.block_dim()[0] + tidx
    # Bounds guard: the grid may overshoot the tensor length.
    if idx < cute.size(tensor):
        tensor[idx] = tensor[idx] * 2.0

# Wrapper function with @cute.jit
@cute.jit
def hello_world(tensor):
    """Launch hello_world_kernel over every element of `tensor`."""
    num_threads = 256
    # Ceiling division so the grid covers all elements.
    num_blocks = (cute.size(tensor) + num_threads - 1) // num_threads
    
    hello_world_kernel(tensor).launch(
        grid=[num_blocks, 1, 1],
        block=[num_threads, 1, 1]
    )

# Create tensor and run
data = torch.ones(1024, dtype=torch.float32, device='cuda')
# Wrap the torch tensor as a CuTe tensor via the DLPack protocol.
tensor = cute.from_dlpack(data)

# Compile and execute
compiled = cute.compile(hello_world, tensor)
compiled(tensor)

# The kernel wrote through the DLPack view, so `data` holds the result.
print(f"Result: {data[:10]}")  # Should be all 2.0

Elementwise Addition

A more complete example showing CuTe’s tiled copy pattern:
import cutlass
import cutlass.cute as cute
import torch
from cutlass.cute.runtime import from_dlpack

@cute.kernel
def elementwise_add_kernel(
    gA: cute.Tensor,
    gB: cute.Tensor,
    gC: cute.Tensor,
    thr_layout: cute.Layout,
    val_layout: cute.Layout
):
    """Compute one tile of gC = gA + gB per thread block.

    gA/gB/gC come from `cute.zipped_divide` in the host wrapper, so
    mode 0 is the tile extent and mode 1 indexes the tile; block
    `bidx` owns tile `bidx`.
    """
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    
    # Slice for this thread block
    # ((None, None), bidx): keep the full tile extent, pick tile bidx.
    blkA = gA[((None, None), bidx)]
    blkB = gB[((None, None), bidx)]
    blkC = gC[((None, None), bidx)]
    
    # Create tiled copy operations
    copy_atom = cute.make_copy_atom(
        cute.nvgpu.CopyUniversalOp(),
        gA.element_type
    )
    tiled_copy = cute.make_tiled_copy_tv(
        copy_atom, thr_layout, val_layout
    )
    
    # Get this thread's slice
    thr_copy = tiled_copy.get_slice(tidx)
    thrA = thr_copy.partition_S(blkA)  # source partitions (loads)
    thrB = thr_copy.partition_S(blkB)
    thrC = thr_copy.partition_D(blkC)  # destination partition (store)
    
    # Allocate register fragments
    frgA = cute.make_fragment_like(thrA)
    frgB = cute.make_fragment_like(thrB)
    frgC = cute.make_fragment_like(thrC)
    
    # Copy data: gmem -> rmem
    cute.copy(copy_atom, thrA, frgA)
    cute.copy(copy_atom, thrB, frgB)
    
    # Compute
    # load()/store() operate on whole fragments at once.
    result = frgA.load() + frgB.load()
    frgC.store(result)
    
    # Copy result: rmem -> gmem
    cute.copy(copy_atom, frgC, thrC)

@cute.jit
def elementwise_add(mA, mB, mC):
    """Host wrapper: tile mA/mB/mC and launch the add kernel over mC's tiles."""
    # Define thread and value layouts
    # 4x32 threads, each owning a 4x4 block of values; order=(1, 0)
    # makes the second mode fastest-varying.
    thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0))
    val_layout = cute.make_ordered_layout((4, 4), order=(1, 0))
    tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
    
    # Tile tensors
    # zipped_divide regroups each tensor into (tile-extent, tile-index) modes.
    gA = cute.zipped_divide(mA, tiler_mn)
    gB = cute.zipped_divide(mB, tiler_mn)
    gC = cute.zipped_divide(mC, tiler_mn)
    
    # Launch kernel
    # One block per tile; one thread per entry of the TV layout's mode 0.
    elementwise_add_kernel(gA, gB, gC, thr_layout, val_layout).launch(
        grid=[cute.size(gC, mode=[1]), 1, 1],
        block=[cute.size(tv_layout, mode=[0]), 1, 1]
    )

# Usage
M, N = 1024, 512
A = torch.randn(M, N, dtype=torch.float32, device='cuda')
B = torch.randn(M, N, dtype=torch.float32, device='cuda')
C = torch.zeros(M, N, dtype=torch.float32, device='cuda')

# Wrap the torch tensors for CuTe; mark_layout_dynamic keeps the layout
# a runtime value so one compiled kernel serves differently-strided inputs.
mA = from_dlpack(A).mark_layout_dynamic()
mB = from_dlpack(B).mark_layout_dynamic()
mC = from_dlpack(C).mark_layout_dynamic()

# Compile and run
compiled = cute.compile(elementwise_add, mA, mB, mC)
compiled(mA, mB, mC)

# Verify
# The kernel wrote through mC into C, so compare the torch tensor directly.
expected = A + B
torch.testing.assert_close(C, expected)
print("Success!")

Simple GEMM

A basic GEMM kernel using CuTe DSL:
import cutlass
import cutlass.cute as cute
import torch
from cutlass.cute.runtime import from_dlpack

class SimpleGemm:
    """Minimal tiled GEMM written with the CuTe DSL.

    Each thread block owns one (tile_m x tile_n) tile of the output and
    iterates over K in steps of tile_k.  B is expected K-major (see the
    usage below, where B is allocated (N, K) and verified against B.t()).
    """

    def __init__(self, tile_m=128, tile_n=128, tile_k=8):
        # Thread-block tile sizes along M, N and K.
        self.tile_m = tile_m
        self.tile_n = tile_n
        self.tile_k = tile_k
    
    @cute.kernel
    def kernel(self, mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor,
               tiled_mma: cute.TiledMma):
        """Device kernel: block (bidx, bidy) computes one tile of mC."""
        tidx, _, _ = cute.arch.thread_idx()
        bidx, bidy, _ = cute.arch.block_idx()
        
        # Get thread slice of MMA
        thr_mma = tiled_mma.get_slice(tidx)
        
        # Tile tensors to thread blocks
        # `None` in tiler/coord leaves that mode whole; the remaining
        # K-tile mode is walked by the loop below.
        gA = cute.local_tile(mA, tiler=(self.tile_m, self.tile_k, None),
                           coord=(bidx, None, None))
        gB = cute.local_tile(mB, tiler=(self.tile_n, self.tile_k, None),
                           coord=(bidy, None, None))
        gC = cute.local_tile(mC, tiler=(self.tile_m, self.tile_n, None),
                           coord=(bidx, bidy, None))
        
        # Allocate accumulator
        # Registers are zero-filled once and accumulated across all K tiles.
        tCgC = thr_mma.partition_C(gC)
        acc = tiled_mma.make_fragment_C(tCgC)
        acc.fill(0.0)
        
        # Main loop over K
        for k in range(cute.size(gA, mode=[1])):
            # Load A and B tiles
            tCsA = thr_mma.partition_A(gA[..., k])
            tCsB = thr_mma.partition_B(gB[..., k])
            
            # Perform MMA
            # acc appears as both accumulator input and output.
            cute.gemm(tiled_mma, acc, tCsA, tCsB, acc)
        
        # Store result
        atom = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), mC.element_type
        )
        cute.copy(atom, acc, tCgC)
    
    @cute.jit
    def __call__(self, mA, mB, mC):
        """Host entry: build the tiled MMA, size the grid, launch the kernel."""
        # Create MMA operation
        # 16x16x1 atoms -> 256 threads per block (see `block=` below).
        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
        atoms_layout = cute.make_layout((16, 16, 1), stride=(16, 1, 0))
        tiled_mma = cute.make_tiled_mma(op, atoms_layout)
        
        # Calculate grid dimensions
        # Ceiling division so partial edge tiles are still covered.
        grid_m = (cute.size(mA, mode=[0]) + self.tile_m - 1) // self.tile_m
        grid_n = (cute.size(mB, mode=[0]) + self.tile_n - 1) // self.tile_n
        
        # Launch kernel
        self.kernel(mA, mB, mC, tiled_mma).launch(
            grid=[grid_m, grid_n, 1],
            block=[cute.size(atoms_layout), 1, 1]
        )

# Usage
M, N, K = 512, 512, 256
A = torch.randn(M, K, dtype=torch.float32, device='cuda')
# B is allocated (N, K) — K-contiguous — which is the layout the kernel reads.
B = torch.randn(N, K, dtype=torch.float32, device='cuda')
C = torch.zeros(M, N, dtype=torch.float32, device='cuda')

mA = from_dlpack(A)
mB = from_dlpack(B)
mC = from_dlpack(C)

gemm = SimpleGemm()
compiled = cute.compile(gemm, mA, mB, mC)
compiled(mA, mB, mC)

# Reference transposes B back, so this checks C = A @ B^T.
ref = torch.matmul(A, B.t())
torch.testing.assert_close(C, ref, rtol=1e-3, atol=1e-3)
print("GEMM Success!")

Key Concepts

CuTe uses layout algebra to express memory access patterns:
# Create a layout: (shape, stride)
layout = cute.make_layout((16, 8), stride=(8, 1))  # Row-major 16x8

# Create tensor with layout
# NOTE(review): `ptr` is a free placeholder here — a real pointer/iterator
# must be supplied by the caller; this fragment is illustrative only.
tensor = cute.make_tensor(ptr, layout)

# Compose layouts
# Nested layouts build hierarchical (tiled) shapes; the outer strides
# (1, 32) place the two sub-layouts relative to each other.
tiled = cute.make_layout(
    (cute.make_layout((4, 8)), cute.make_layout((2, 4))),
    stride=(1, 32)
)
Partition data across threads for efficient copying:
# Create tiled copy from thread/value layouts
# (copy_atom, thr_layout, val_layout as defined in the earlier examples.)
tiled_copy = cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)

# Get slice for current thread
thr_copy = tiled_copy.get_slice(tidx)

# Partition source and destination
# partition_S yields this thread's read portion, partition_D its write portion.
src = thr_copy.partition_S(source_tensor)
dst = thr_copy.partition_D(dest_tensor)
Matrix multiply-accumulate using Tensor Cores:
# Create tiled MMA
# (dtype, atoms_layout, tidx and the tensors are illustrative free names.)
op = cute.nvgpu.MmaUniversalOp(dtype)
tiled_mma = cute.make_tiled_mma(op, atoms_layout)

# Get thread slice
thr_mma = tiled_mma.get_slice(tidx)

# Partition A, B, C
tA = thr_mma.partition_A(A_tensor)
tB = thr_mma.partition_B(B_tensor)
tC = thr_mma.partition_C(C_tensor)

# Perform GEMM
# tC is both accumulator input and output: tC += tA @ tB.
cute.gemm(tiled_mma, tC, tA, tB, tC)
Compile kernels to optimized CUDA code:
# Method 1: cute.compile
compiled = cute.compile(jit_function, *args)
compiled(*args)

# Method 2: With options
# NOTE(review): option names (GenerateLineInfo, OptimizationLevel) should
# be confirmed against the installed DSL version.
compiled = cute.compile[
    cute.GenerateLineInfo,
    cute.OptimizationLevel(3)
](jit_function, *args)

# Method 3: Inline compilation
result = jit_function(*args)  # Auto-compiles on first call

Next Steps

CuTe DSL Deep Dive

Learn advanced CuTe DSL features and patterns

Examples

Explore more complex examples and benchmarks

PyTorch Integration

Build PyTorch CUDA extensions with CUTLASS

API Reference

Browse the complete API documentation

Troubleshooting

# Quick environment checks for a broken or mismatched install.

# Ensure CUDA is available
python -c "import torch; print(torch.cuda.is_available())"

# Check CUDA version matches
python -c "from cuda import cuda; print(cuda.cuDriverGetVersion())"

# Reinstall with matching CUDA version
pip uninstall nvidia-cutlass nvidia-cutlass-dsl
pip install nvidia-cutlass nvidia-cutlass-dsl
Set CUTLASS_PATH and CUDA_INSTALL_PATH:
# Point builds at the CUTLASS sources and the CUDA toolkit root.
export CUTLASS_PATH=/path/to/cutlass
export CUDA_INSTALL_PATH=/usr/local/cuda
Enable debug output:
import os
# NOTE(review): assumes CUTLASS_DEBUG is read at compile time —
# set it before compiling any kernels.
os.environ['CUTLASS_DEBUG'] = '1'

# Compile with debug info
compiled = cute.compile[
    cute.GenerateLineInfo
](function, *args, options="--generate-line-info -g")

Build docs developers (and LLMs) love