Installation
Install CUTLASS Python Package
- cutlass_cppgen: pip install nvidia-cutlass
- CuTe DSL: pip install nvidia-cutlass-dsl
- Both: pip install nvidia-cutlass nvidia-cutlass-dsl
CUTLASS Python Interface (cutlass_cppgen)
Basic GEMM Example
The simplest way to run a GEMM with CUTLASS:
import cutlass
import numpy as np
# Create a GEMM plan: one dtype/layout is applied to all operands here.
plan = cutlass.op.Gemm(
    element=np.float16,
    layout=cutlass.LayoutType.RowMajor
)

# Create input matrices (all-ones so the result is easy to predict)
M, N, K = 1024, 1024, 1024
A = np.ones((M, K), dtype=np.float16)
B = np.ones((K, N), dtype=np.float16)
C = np.zeros((M, N), dtype=np.float16)
D = np.zeros((M, N), dtype=np.float16)

# Run the GEMM: D = A @ B + C
plan.run(A, B, C, D)
print(f"Result shape: {D.shape}")
# With ones(A) @ ones(B) and C = 0, every entry of D should equal K (1024).
print(f"Result sample: {D[:3, :3]}")
With PyTorch Tensors
import cutlass
import torch
# Create PyTorch tensors on GPU (requires a CUDA device)
M, N, K = 2048, 2048, 2048
A = torch.randn(M, K, dtype=torch.float16, device='cuda')
B = torch.randn(K, N, dtype=torch.float16, device='cuda')
C = torch.zeros(M, N, dtype=torch.float16, device='cuda')
D = torch.zeros(M, N, dtype=torch.float16, device='cuda')

# Create and run GEMM plan -- note `element` accepts a torch dtype here,
# where the NumPy example above passed a NumPy dtype.
plan = cutlass.op.Gemm(
    element=torch.float16,
    layout=cutlass.LayoutType.RowMajor
)
plan.run(A, B, C, D)

# Verify against PyTorch: D should equal A @ B + C within fp16 tolerance
ref = torch.matmul(A, B) + C
torch.testing.assert_close(D, ref, rtol=1e-3, atol=1e-3)
print("Results match!")
Configuration Options
import cutlass
import numpy as np
# Construct a plan with explicit type/layout choices; the commented
# parameters below are optional overrides of the defaults.
plan = cutlass.op.Gemm(
    element=np.float16,                    # Data type
    layout=cutlass.LayoutType.RowMajor,    # Memory layout
    element_accumulator=np.float32,        # Accumulator type (optional)
    element_epilogue=np.float16,           # Epilogue type (optional)
    # Advanced options
    alignment=8,                           # Memory alignment
    kernel_schedule="auto"                 # Kernel schedule
)

# Enumerate available configurations for this plan and print a sample
configs = plan.configurations()
print(f"Found {len(configs)} configurations")
for config in configs[:3]:
    print(config)
CuTe DSL
Hello World Kernel
Your first CUDA kernel with CuTe DSL:
import cutlass
import cutlass.cute as cute
import torch
# Define kernel using @cute.kernel decorator
# Doubles every element of `tensor` in place, one element per thread.
@cute.kernel
def hello_world_kernel(tensor: cute.Tensor):
    # Per-thread coordinates within the launch grid (x components only)
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    # Global linear index of this thread
    idx = bidx * cute.arch.block_dim()[0] + tidx
    # Guard: the last block may have threads past the end of the tensor
    if idx < cute.size(tensor):
        tensor[idx] = tensor[idx] * 2.0
# Wrapper function with @cute.jit
# Host-side wrapper: computes the launch shape and launches the kernel.
@cute.jit
def hello_world(tensor):
    num_threads = 256
    # Ceiling division so every element is covered by some thread
    num_blocks = (cute.size(tensor) + num_threads - 1) // num_threads
    hello_world_kernel(tensor).launch(
        grid=[num_blocks, 1, 1],
        block=[num_threads, 1, 1]
    )
# Create tensor and run
data = torch.ones(1024, dtype=torch.float32, device='cuda')
# Wrap the torch tensor as a CuTe tensor via DLPack (no copy implied by
# the API name -- NOTE(review): confirm zero-copy against the DSL docs)
tensor = cute.from_dlpack(data)

# Compile once, then execute the compiled kernel
compiled = cute.compile(hello_world, tensor)
compiled(tensor)
print(f"Result: {data[:10]}")  # Should be all 2.0
Elementwise Addition
A more complete example showing CuTe’s tiled copy pattern:
import cutlass
import cutlass.cute as cute
import torch
from cutlass.cute.runtime import from_dlpack
# Computes gC = gA + gB, one tile per thread block, staging data through
# registers with a tiled copy.
@cute.kernel
def elementwise_add_kernel(
    gA: cute.Tensor,
    gB: cute.Tensor,
    gC: cute.Tensor,
    thr_layout: cute.Layout,
    val_layout: cute.Layout
):
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    # Slice for this thread block: the inputs were zipped_divide'd into
    # ((tile), (rest)) form, so keep the tile mode and select tile bidx.
    blkA = gA[((None, None), bidx)]
    blkB = gB[((None, None), bidx)]
    blkC = gC[((None, None), bidx)]
    # Create tiled copy operations from a universal (generic) copy atom
    copy_atom = cute.make_copy_atom(
        cute.nvgpu.CopyUniversalOp(),
        gA.element_type
    )
    tiled_copy = cute.make_tiled_copy_tv(
        copy_atom, thr_layout, val_layout
    )
    # Get this thread's slice and partition source (S) / destination (D)
    thr_copy = tiled_copy.get_slice(tidx)
    thrA = thr_copy.partition_S(blkA)
    thrB = thr_copy.partition_S(blkB)
    thrC = thr_copy.partition_D(blkC)
    # Allocate register fragments shaped like the per-thread partitions
    frgA = cute.make_fragment_like(thrA)
    frgB = cute.make_fragment_like(thrB)
    frgC = cute.make_fragment_like(thrC)
    # Copy data: gmem -> rmem
    cute.copy(copy_atom, thrA, frgA)
    cute.copy(copy_atom, thrB, frgB)
    # Compute in registers, then store into the C fragment
    result = frgA.load() + frgB.load()
    frgC.store(result)
    # Copy result: rmem -> gmem
    cute.copy(copy_atom, frgC, thrC)
# Host-side wrapper: tiles the inputs and launches one block per tile.
@cute.jit
def elementwise_add(mA, mB, mC):
    # Define thread and value layouts: 4x32 threads, each handling a
    # 4x4 patch of values
    thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0))
    val_layout = cute.make_ordered_layout((4, 4), order=(1, 0))
    tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
    # Tile tensors into ((tile), (rest)) form expected by the kernel
    gA = cute.zipped_divide(mA, tiler_mn)
    gB = cute.zipped_divide(mB, tiler_mn)
    gC = cute.zipped_divide(mC, tiler_mn)
    # Launch kernel: grid = number of tiles (mode 1 of the zipped tensor),
    # block = number of threads in the TV layout (mode 0)
    elementwise_add_kernel(gA, gB, gC, thr_layout, val_layout).launch(
        grid=[cute.size(gC, mode=[1]), 1, 1],
        block=[cute.size(tv_layout, mode=[0]), 1, 1]
    )
# Usage
M, N = 1024, 512
A = torch.randn(M, N, dtype=torch.float32, device='cuda')
B = torch.randn(M, N, dtype=torch.float32, device='cuda')
C = torch.zeros(M, N, dtype=torch.float32, device='cuda')
# Wrap torch tensors for the DSL; mark_layout_dynamic lets one compiled
# kernel accept tensors whose strides are not baked in at compile time
# (NOTE(review): presumed semantics -- confirm against the CuTe DSL docs).
mA = from_dlpack(A).mark_layout_dynamic()
mB = from_dlpack(B).mark_layout_dynamic()
mC = from_dlpack(C).mark_layout_dynamic()

# Compile once, then run
compiled = cute.compile(elementwise_add, mA, mB, mC)
compiled(mA, mB, mC)

# Verify
expected = A + B
torch.testing.assert_close(C, expected)
print("Success!")
Simple GEMM
A basic GEMM kernel using CuTe DSL:
import cutlass
import cutlass.cute as cute
import torch
from cutlass.cute.runtime import from_dlpack
class SimpleGemm:
    # Minimal CuTe-DSL GEMM. As used below, A is (M, K) and B is stored
    # (N, K), so the computed result matches A @ B^T. Tile sizes define
    # the per-thread-block work partition.
    def __init__(self, tile_m=128, tile_n=128, tile_k=8):
        self.tile_m = tile_m
        self.tile_n = tile_n
        self.tile_k = tile_k

    @cute.kernel
    def kernel(self, mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor,
               tiled_mma: cute.TiledMma):
        tidx, _, _ = cute.arch.thread_idx()
        bidx, bidy, _ = cute.arch.block_idx()
        # Get this thread's slice of the tiled MMA
        thr_mma = tiled_mma.get_slice(tidx)
        # Tile tensors to thread blocks: block (bidx, bidy) owns one
        # (tile_m, tile_n) tile of C and iterates over the K tiles.
        gA = cute.local_tile(mA, tiler=(self.tile_m, self.tile_k, None),
                             coord=(bidx, None, None))
        gB = cute.local_tile(mB, tiler=(self.tile_n, self.tile_k, None),
                             coord=(bidy, None, None))
        gC = cute.local_tile(mC, tiler=(self.tile_m, self.tile_n, None),
                             coord=(bidx, bidy, None))
        # Allocate accumulator registers shaped like this thread's C slice
        tCgC = thr_mma.partition_C(gC)
        acc = tiled_mma.make_fragment_C(tCgC)
        acc.fill(0.0)
        # Main loop over K tiles
        for k in range(cute.size(gA, mode=[1])):
            # Partition the k-th A and B tiles for this thread
            tCsA = thr_mma.partition_A(gA[..., k])
            tCsB = thr_mma.partition_B(gB[..., k])
            # Perform MMA: acc += A_tile * B_tile
            cute.gemm(tiled_mma, acc, tCsA, tCsB, acc)
        # Store result from the accumulator back to global memory
        atom = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), mC.element_type
        )
        cute.copy(atom, acc, tCgC)

    @cute.jit
    def __call__(self, mA, mB, mC):
        # Create MMA operation with FP32 accumulation
        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
        # 16 x 16 x 1 atoms -> cute.size(atoms_layout) = 256 threads/block
        atoms_layout = cute.make_layout((16, 16, 1), stride=(16, 1, 0))
        tiled_mma = cute.make_tiled_mma(op, atoms_layout)
        # Calculate grid dimensions (ceiling division over M and N)
        grid_m = (cute.size(mA, mode=[0]) + self.tile_m - 1) // self.tile_m
        grid_n = (cute.size(mB, mode=[0]) + self.tile_n - 1) // self.tile_n
        # Launch kernel
        self.kernel(mA, mB, mC, tiled_mma).launch(
            grid=[grid_m, grid_n, 1],
            block=[cute.size(atoms_layout), 1, 1]
        )
# Usage
M, N, K = 512, 512, 256
A = torch.randn(M, K, dtype=torch.float32, device='cuda')
# B is stored (N, K), so the kernel effectively computes A @ B^T
B = torch.randn(N, K, dtype=torch.float32, device='cuda')
C = torch.zeros(M, N, dtype=torch.float32, device='cuda')
mA = from_dlpack(A)
mB = from_dlpack(B)
mC = from_dlpack(C)
gemm = SimpleGemm()
compiled = cute.compile(gemm, mA, mB, mC)
compiled(mA, mB, mC)
# Reference uses B transposed, matching the (N, K) storage above
ref = torch.matmul(A, B.t())
torch.testing.assert_close(C, ref, rtol=1e-3, atol=1e-3)
print("GEMM Success!")
Key Concepts
Layouts and Tensors
CuTe uses layout algebra to express memory access patterns:
# Create a layout: (shape, stride)
layout = cute.make_layout((16, 8), stride=(8, 1)) # Row-major 16x8
# Create tensor with layout
tensor = cute.make_tensor(ptr, layout)
# Compose layouts
tiled = cute.make_layout(
(cute.make_layout((4, 8)), cute.make_layout((2, 4))),
stride=(1, 32)
)
Tiled Copy
Partition data across threads for efficient copying:
# Create tiled copy from thread/value layouts
tiled_copy = cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
# Get slice for current thread
thr_copy = tiled_copy.get_slice(tidx)
# Partition source and destination
src = thr_copy.partition_S(source_tensor)
dst = thr_copy.partition_D(dest_tensor)
MMA Operations
Matrix multiply-accumulate using Tensor Cores:
# Create tiled MMA
op = cute.nvgpu.MmaUniversalOp(dtype)
tiled_mma = cute.make_tiled_mma(op, atoms_layout)
# Get thread slice
thr_mma = tiled_mma.get_slice(tidx)
# Partition A, B, C
tA = thr_mma.partition_A(A_tensor)
tB = thr_mma.partition_B(B_tensor)
tC = thr_mma.partition_C(C_tensor)
# Perform GEMM
cute.gemm(tiled_mma, tC, tA, tB, tC)
JIT Compilation
Compile kernels to optimized CUDA code:
# Method 1: cute.compile
compiled = cute.compile(jit_function, *args)
compiled(*args)
# Method 2: With options
compiled = cute.compile[
cute.GenerateLineInfo,
cute.OptimizationLevel(3)
](jit_function, *args)
# Method 3: Inline compilation
result = jit_function(*args) # Auto-compiles on first call
Next Steps
CuTe DSL Deep Dive
Learn advanced CuTe DSL features and patterns
Examples
Explore more complex examples and benchmarks
PyTorch Integration
Build PyTorch CUDA extensions with CUTLASS
API Reference
Browse the complete API documentation
Troubleshooting
Import Errors
# Ensure CUDA is available
python -c "import torch; print(torch.cuda.is_available())"
# Check CUDA version matches
python -c "from cuda import cuda; print(cuda.cuDriverGetVersion())"
# Reinstall with matching CUDA version
pip uninstall nvidia-cutlass nvidia-cutlass-dsl
pip install nvidia-cutlass nvidia-cutlass-dsl
Compilation Errors
Set CUTLASS_PATH and CUDA_INSTALL_PATH:
export CUTLASS_PATH=/path/to/cutlass
export CUDA_INSTALL_PATH=/usr/local/cuda
Runtime Errors
Enable debug output:
import os
os.environ['CUTLASS_DEBUG'] = '1'
# Compile with debug info
compiled = cute.compile[
cute.GenerateLineInfo
](function, *args, options="--generate-line-info -g")