CuTe DSL Memory Management API
The CuTe DSL provides comprehensive memory management APIs for creating and manipulating tensors across different memory spaces (global, shared, register).

Tensor Creation

make_tensor

Creates a tensor from a pointer and layout.
tensor = cute.make_tensor(ptr, layout)
ptr
Pointer
required
Pointer to memory (global, shared, or register)
layout
Layout
required
Layout defining shape and stride
Example:
# Create tensor from global memory pointer
ptr = cute.make_ptr(data, element_type=cute.Float32)
layout = cute.make_layout(
    shape=(M, N),
    stride=(N, 1)  # Row-major
)
tensor = cute.make_tensor(ptr, layout)

make_fragment

Allocates a register memory tensor (fragment).
fragment = cute.make_fragment(element_type, layout)
element_type
Type[Numeric]
required
Element data type (e.g., cute.Float32, cute.BFloat16)
layout
Layout
required
Layout of the fragment
Example:
# Allocate 16x8 fragment in registers
layout = cute.make_layout((16, 8))
fragment = cute.make_fragment(cute.Float32, layout)

make_fragment_like

Allocates a fragment with the same shape as another tensor.
fragment = cute.make_fragment_like(tensor)
tensor
Tensor
required
Template tensor
Example:
# Create fragment matching partitioned tensor
thrA = thr_copy.partition_S(blockA)
fragA = cute.make_fragment_like(thrA)

make_rmem_tensor

Allocates register memory tensor.
tensor = cute.make_rmem_tensor(element_type, layout)
Alias for make_fragment.

make_rmem_tensor_like

Allocates register memory tensor matching another tensor.
tensor = cute.make_rmem_tensor_like(template_tensor)
Alias for make_fragment_like.

Tensor Initialization

zeros_like

Creates a tensor initialized to zero.
zero_tensor = cute.zeros_like(tensor)
Example:
# Initialize accumulator to zero
accum = cute.zeros_like(fragC)

ones_like

Creates a tensor initialized to one.
one_tensor = cute.ones_like(tensor)

full

Creates a tensor filled with a specific value.
filled_tensor = cute.full(layout, value, element_type)
layout
Layout
required
Tensor layout
value
Numeric
required
Fill value
element_type
Type[Numeric]
required
Element data type

full_like

Creates a tensor filled with a value, matching another tensor’s shape.
filled_tensor = cute.full_like(tensor, value)
Example:
# Initialize accumulator to identity (for debugging)
identity = cute.full_like(fragC, 1.0)

empty_like

Allocates an uninitialized tensor matching another tensor’s shape.
empty_tensor = cute.empty_like(tensor)

Pointer Operations

make_ptr

Creates a typed pointer from raw memory address.
ptr = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.GLOBAL
)
address
Union[int, Tensor]
required
Memory address or tensor
element_type
Type[Numeric]
required
Element data type
address_space
AddressSpace
default:"GLOBAL"
Memory address space
Example:
# Create pointer from DLPack tensor
from cutlass.cute.runtime import from_dlpack

tensor_desc = from_dlpack(torch_tensor)
ptr = cute.make_ptr(
    tensor_desc,
    element_type=cute.Float16,
    address_space=cute.AddressSpace.GLOBAL
)

recast_ptr

Recasts a pointer to a different element type.
new_ptr = cute.recast_ptr(ptr, new_element_type)
ptr
Pointer
required
Original pointer
new_element_type
Type[Numeric]
required
New element data type
Example:
# Recast FP16 pointer to INT8 for quantization
f16_ptr = cute.make_ptr(data, element_type=cute.Float16)
i8_ptr = cute.recast_ptr(f16_ptr, cute.Int8)

Layout Operations

make_layout

Creates a layout from shape and stride.
layout = cute.make_layout(
    shape=(M, N),
    stride=(N, 1)  # Row-major
)
shape
Shape
required
Shape tuple (can be nested)
stride
Stride
default:"compact"
Stride tuple (defaults to compact row-major)
Examples:
# Row-major 2D
layout = cute.make_layout((M, N), stride=(N, 1))

# Column-major 2D
layout = cute.make_layout((M, N), stride=(1, M))

# Hierarchical layout
layout = cute.make_layout(
    shape=((4, 8), (2, 16)),
    stride=((128, 1), (64, 8))
)

make_identity_layout

Creates an identity layout (coordinate layout).
layout = cute.make_identity_layout(shape)
Example:
# Create coordinate layout for predication
coord_layout = cute.make_identity_layout((M, N))

make_ordered_layout

Creates a layout with specified dimension ordering.
layout = cute.make_ordered_layout(
    shape=(M, N),
    order=(1, 0)  # Column-major
)

make_layout_like

Creates a layout matching another layout’s shape.
new_layout = cute.make_layout_like(template_layout)

recast_layout

Recasts a layout to a different element type.
new_layout = cute.recast_layout(layout, old_type, new_type)
Example:
# Recast layout from FP32 to FP16 (doubles logical size)
f16_layout = cute.recast_layout(f32_layout, cute.Float32, cute.Float16)

Identity and Coordinate Tensors

make_identity_tensor

Creates a coordinate tensor for bounds checking.
coord_tensor = cute.make_identity_tensor(shape)
shape
Shape
required
Tensor shape
Example:
# Create coordinate tensor for predication
shape = (M, N)
coord = cute.make_identity_tensor(shape)

# Partition to threads
thr_coord = thr_copy.partition_S(coord)

# Create predicate for out-of-bounds
pred = thr_coord < (M, N)

# Predicated copy
cute.basic_copy_if(pred, src, dst)

Tensor Recasting

recast_tensor

Recasts a tensor to a different element type.
new_tensor = cute.recast_tensor(tensor, new_element_type)
tensor
Tensor
required
Original tensor
new_element_type
Type[Numeric]
required
New element data type
Example:
# Recast for type conversion
f16_tensor = cute.recast_tensor(f32_tensor, cute.Float16)

Memory Space Management

Address Spaces

# Global memory
ptr_global = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.GLOBAL
)

# Shared memory
ptr_shared = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.SHARED
)

# Register memory (fragments)
fragment = cute.make_fragment(cute.Float32, layout)

Tensor Partitioning

Partition by Tiled Copy

# Partition source tensor
thrA_src = thr_copy.partition_S(block_tensor)

# Partition destination tensor
thrA_dst = thr_copy.partition_D(fragment)

Partition by Tiled MMA

# Partition A operand
thrA = thr_mma.partition_A(block_A)

# Partition B operand
thrB = thr_mma.partition_B(block_B)

# Partition C (accumulator)
thrC = thr_mma.partition_C(block_C)

Complete Example: Memory Management

import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack

@cute.kernel
def memory_example_kernel(
    gA: cute.Tensor,  # Global memory input
    gB: cute.Tensor,  # Global memory output
    sA_layout: cute.Layout,  # Shared memory layout
):
    """Stage a block of gA through registers and shared memory, then write to gB.

    Demonstrates the full memory hierarchy round-trip:
    global -> register -> shared -> register -> global.
    """
    thread_id, _, _ = cute.arch.thread_idx()
    block_id, _, _ = cute.arch.block_idx()

    # View shared memory (offset 0) as a tensor with the given layout.
    smem_ptr = cute.make_ptr(
        cute.arch.smem_ptr(0),
        element_type=gA.element_type,
        address_space=cute.AddressSpace.SHARED
    )
    smem_A = cute.make_tensor(smem_ptr, sA_layout)

    # Select this block's tile of the global input/output tensors.
    gA_blk = gA[((None, None), block_id)]
    gB_blk = gB[((None, None), block_id)]

    # Build a tiled copy: 4x32 threads, each moving 1x4 values.
    atom = cute.make_copy_atom(
        cute.nvgpu.CopyUniversalOp(),
        gA.element_type
    )
    layout_thr = cute.make_layout((4, 32), stride=(32, 1))
    layout_val = cute.make_layout((1, 4), stride=(4, 1))
    copy_tv = cute.make_tiled_copy_tv(atom, layout_thr, layout_val)
    slice_copy = copy_tv.get_slice(thread_id)

    # Per-thread partitions of the global tile (source) and shared tile (dest).
    src_gmem = slice_copy.partition_S(gA_blk)
    dst_smem = slice_copy.partition_D(smem_A)

    # Register fragment shaped to match this thread's partition.
    reg_A = cute.make_fragment_like(src_gmem)

    # Global -> registers.
    cute.copy(atom, src_gmem, reg_A)

    # Registers -> shared.
    cute.copy(atom, reg_A, dst_smem)

    # All threads must finish writing shared memory before any thread reads it.
    cute.arch.syncthreads()

    # Per-thread partition of the output tile.
    dst_gmem = slice_copy.partition_D(gB_blk)

    # Shared -> registers (example "processing" stage is a plain copy).
    reg_B = cute.make_fragment_like(dst_gmem)
    cute.basic_copy(dst_smem, reg_B)

    # Registers -> global.
    cute.copy(atom, reg_B, dst_gmem)


@cute.jit
def memory_example(
    torch_A,
    torch_B,
    stream,
):
    """Wrap two torch tensors as CuTe global tensors and launch the copy kernel.

    Assumes torch_A and torch_B are 2-D, contiguous, float32 tensors of the
    same shape; `stream` is the CUDA stream to launch on.
    """
    # Bridge torch tensors into CuTe via the DLPack protocol.
    desc_a = from_dlpack(torch_A)
    desc_b = from_dlpack(torch_B)

    rows, cols = torch_A.shape
    row_major = cute.make_layout((rows, cols), stride=(cols, 1))

    gA = cute.make_tensor(
        cute.make_ptr(desc_a, element_type=cute.Float32),
        row_major
    )
    gB = cute.make_tensor(
        cute.make_ptr(desc_b, element_type=cute.Float32),
        row_major
    )

    # 128x128 shared-memory tile (compact default layout).
    tile_m, tile_n = 128, 128
    smem_layout = cute.make_layout((tile_m, tile_n))

    # One block per row-tile; ceiling division covers a ragged final tile.
    grid_x = (rows + tile_m - 1) // tile_m
    memory_example_kernel(gA, gB, smem_layout).launch(
        grid=(grid_x, 1, 1),
        block=(128, 1, 1),
        stream=stream
    )

See Also

Build docs developers (and LLMs) love