cute::Tensor - CUTLASS

Overview

cute::Tensor is CuTe’s core data structure that combines an Engine (data storage) with a Layout (coordinate-to-index mapping). Tensors can be owning (allocating their own storage) or non-owning (viewing existing memory).

Class Template

template <class Engine, class Layout>
struct Tensor
{
  using iterator     = typename Engine::iterator;
  using value_type   = typename Engine::value_type;
  using element_type = typename Engine::element_type;
  using reference    = typename Engine::reference;
  
  using engine_type  = Engine;
  using layout_type  = Layout;
  
  static constexpr int rank = Layout::rank;
  
  CUTE_HOST_DEVICE constexpr
  Tensor(Engine const& engine, Layout const& layout);
};

Template Parameters

Engine

The data storage engine. Can be:

ViewEngine<T*> for non-owning views
ArrayEngine<T, N> for owning static arrays
ConstViewEngine<const T*> for const views

Layout

Layout<Shape, Stride>

The layout describing the coordinate-to-index mapping.

Source Location

include/cute/tensor_impl.hpp:135-341

Member Functions

Data Access

data()

CUTE_HOST_DEVICE constexpr decltype(auto)
data() const;

CUTE_HOST_DEVICE constexpr decltype(auto)
data();

Returns a pointer/iterator to the underlying data. Example:

auto tensor = make_tensor<float>(make_shape(Int<16>{}, Int<32>{}));  // owning tensors need a static shape
float* ptr = tensor.data();

layout()

CUTE_HOST_DEVICE constexpr decltype(auto)
layout() const;

Returns the layout of the tensor.

shape()

CUTE_HOST_DEVICE constexpr decltype(auto)
shape() const;

Returns the shape of the tensor.

stride()

CUTE_HOST_DEVICE constexpr decltype(auto)
stride() const;

Returns the stride of the tensor.

size()

CUTE_HOST_DEVICE constexpr auto
size() const;

Returns the total number of elements in the tensor.

Indexing

operator[]

template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator[](Coord const& coord);

template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator[](Coord const& coord) const;

Indexes into the tensor using array-style indexing. Returns a reference to the element. Example:

auto tensor = make_tensor(ptr, make_shape(16, 32));  // float* ptr; non-owning view, element type comes from the pointer
tensor[make_coord(3, 5)] = 42.0f;

operator()

template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator()(Coord const& coord);

template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator()(Coord const& coord) const;

Indexes or slices the tensor. If coord contains _ (underscore), returns a sliced subtensor. Otherwise, returns a reference to the element. Example:

auto tensor = make_tensor(ptr, make_shape(16, 32));  // float* ptr; non-owning view, element type comes from the pointer

// Element access
float& elem = tensor(3, 5);

// Slicing
auto row = tensor(3, _);      // Get row 3
auto col = tensor(_, 5);      // Get column 5
auto block = local_tile(tensor, make_shape(4, 8), make_coord(0, 0));  // 4x8 block at tile coordinate (0,0); operator() slices only via _

Transformation

compose()

template <class... Layouts>
CUTE_HOST_DEVICE constexpr auto
compose(Layouts const&... layouts) const;

Composes the tensor’s layout with other layouts, returning a new tensor view. Example:

auto tensor = make_tensor(ptr, make_shape(64));  // float* ptr; non-owning view
auto reshaped = tensor.compose(make_layout(make_shape(8, 8)));
// Now has shape (8, 8)

tile()

template <class... Layouts>
CUTE_HOST_DEVICE constexpr auto
tile(Layouts const&... layouts) const;

Tiles the tensor into blocks. Example:

auto tensor = make_tensor(ptr, make_shape(64, 64));  // float* ptr; non-owning view
auto tiled = tensor.tile(make_shape(8, 8));
// Shape: ((8,8), (8,8))

Coordinate Conversion

get_1d_coord()

template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_1d_coord(Int const& linear_idx) const;

Converts a linear index to a 1D logical coordinate.

get_hier_coord()

template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_hier_coord(Int const& linear_idx) const;

Converts a linear index to hierarchical coordinates matching the tensor’s shape structure.

get_flat_coord()

template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_flat_coord(Int const& linear_idx) const;

Converts a linear index to flat N-dimensional coordinates.

Factory Functions

make_tensor() - Owning

template <class T, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Args const&... args);

Creates an owning tensor that allocates static storage. Parameters:

T: Element type
args: Layout arguments (shape, stride, or Layout)

Example:

// Allocate a 16x32 register tensor of floats
auto tensor = make_tensor<float>(make_shape(Int<16>{}, Int<32>{}));

// Allocate with explicit layout
auto layout = make_layout(make_shape(Int<8>{}, Int<8>{}));
auto tensor2 = make_tensor<double>(layout);

make_tensor() - Non-owning

template <class Iterator, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Iterator const& iter, Args const&... args);

Creates a non-owning tensor view over existing memory. Parameters:

iter: Pointer or iterator to the data
args: Layout arguments

Example:

float* gmem_ptr = ...;
auto tensor = make_tensor(gmem_ptr, make_shape(64, 64));

// With explicit stride
auto tensor2 = make_tensor(gmem_ptr, 
                          make_shape(32, 32),
                          make_stride(1, 64));  // 32x32 with pitch 64

make_tensor_like()

template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_tensor_like(Tensor<Engine,Layout> const& tensor);

Creates a register tensor with the same shape and layout as another tensor. Example:

auto src = make_tensor(ptr, make_shape(Int<16>{}, Int<32>{}));  // static shape: the owning copy needs a compile-time size
auto fragment = make_tensor_like<float>(src);
// fragment has same shape (16, 32) but owns its data

make_fragment_like()

template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_fragment_like(Tensor<Engine,Layout> const& tensor);

Creates a register tensor optimized for fragments, with special handling for the first mode (typically used for MMA/Copy atom values). Example:

auto partitioned = tiled_mma.partition_A(gmem_A);
auto fragment = make_fragment_like<half_t>(partitioned);

make_identity_tensor()

template <class Shape>
CUTE_HOST_DEVICE constexpr auto
make_identity_tensor(Shape const& shape);

Creates a tensor that maps coordinates to themselves (useful for tracking transformations).

Engine Types

ViewEngine

Non-owning view of existing memory:

ViewEngine<float*>          // Mutable view
ViewEngine<half_t*>         // Custom type view
ConstViewEngine<const T*>   // Const view

ArrayEngine

Owning static array (compile-time size):

ArrayEngine<float, 256>     // 256 floats
ArrayEngine<half_t, 1024>   // 1024 half-precision values

Common Operations

Tensor Partitioning

// Partition tensor for threads
auto tensor = make_tensor(gmem_ptr, make_shape(128, 128));
auto tiled = tensor.tile(make_shape(16, 16));  // ((16,16), (8,8))

// Per-thread view
int tid = threadIdx.x;
auto thread_data = tiled(_, make_coord(tid % 8, tid / 8));

Tensor Slicing

auto tensor = make_tensor(ptr, make_shape(64, 64, 32));

// Get a 2D slice
auto slice = tensor(_, _, 5);  // Shape: (64, 64)

// Get a 1D slice
auto row = tensor(10, _, 5);   // Shape: (64)

// Extracting a sub-block (there is no range-slice syntax; use local_tile)
auto block = local_tile(tensor, make_shape(16, 16, 32), make_coord(0, 0, 0));
// Shape: (16, 16, 32)

Tensor Reshaping

// Original 1D tensor
auto vec = make_tensor(ptr, make_shape(256));

// Reshape to 2D
auto mat = vec.compose(make_layout(make_shape(16, 16)));

// Reshape to 3D
auto vol = vec.compose(make_layout(make_shape(8, 8, 4)));

Usage Examples

Basic Tensor Creation and Access

#include <cute/tensor.hpp>

using namespace cute;

__global__ void kernel(float* gmem, int M, int N) {
  // Create a view of global memory
  auto tensor = make_tensor(gmem, make_shape(M, N));
  
  // Access elements
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  
  if (i < M && j < N) {
    tensor(i, j) = i * N + j;
  }
}

Register Tensors (Fragments)

__device__ void compute() {
  // Allocate a small tensor in registers
  auto fragment = make_tensor<float>(make_shape(Int<8>{}, Int<8>{}));
  
  // Initialize
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      fragment(i, j) = 0.0f;
    }
  }
  
  // Use in computation...
}

Tiled Tensor Processing

__device__ void tiled_copy(float* src, float* dst, int M, int N) {
  auto src_tensor = make_tensor(src, make_shape(M, N));
  auto dst_tensor = make_tensor(dst, make_shape(M, N));
  
  // Tile into 16x16 blocks
  auto src_tiled = src_tensor.tile(make_shape(16, 16));
  auto dst_tiled = dst_tensor.tile(make_shape(16, 16));
  
  // Process each tile
  int tile_x = blockIdx.x;
  int tile_y = blockIdx.y;
  
  auto src_tile = src_tiled(_, make_coord(tile_y, tile_x));
  auto dst_tile = dst_tiled(_, make_coord(tile_y, tile_x));
  
  // Copy tile...
}

Hierarchical Tensor Layouts

// Create a hierarchical tensor for blocked matrix
auto shape = make_shape(make_shape(Int<4>{},  Int<8>{}),   // Block of 4x8
                        make_shape(Int<16>{}, Int<8>{}));  // 16x8 blocks
auto tensor = make_tensor<float>(shape);  // owning tensors need a static shape
// Represents 64x64 matrix organized in 4x8 atom blocks

// Access specific block
auto block = tensor(_, make_coord(5, 2));
// Shape: (4, 8)

C++ Templates

CuTe Library

Python API

CuTe DSL

​Overview

​Class Template

​Template Parameters

​Source Location

​Member Functions

​Data Access

​data()

​layout()

​shape()

​stride()

​size()

​Indexing

​operator[]

​operator()

​Transformation

​compose()

​tile()

​Coordinate Conversion

​get_1d_coord()

​get_hier_coord()

​get_flat_coord()

​Factory Functions

​make_tensor() - Owning

​make_tensor() - Non-owning

​make_tensor_like()

​make_fragment_like()

​make_identity_tensor()

​Engine Types

​ViewEngine

​ArrayEngine

​Common Operations

​Tensor Partitioning

​Tensor Slicing

​Tensor Reshaping

​Usage Examples

​Basic Tensor Creation and Access

​Register Tensors (Fragments)

​Tiled Tensor Processing

​Hierarchical Tensor Layouts

​See Also

Build docs developers (and LLMs) love