Overview
cute::Tensor is CuTe’s core data structure that combines an Engine (data storage) with a Layout (coordinate-to-index mapping). Tensors can be owning (allocating their own storage) or non-owning (viewing existing memory).
Class Template
template <class Engine, class Layout>
struct Tensor
{
using iterator = typename Engine::iterator;
using value_type = typename Engine::value_type;
using element_type = typename Engine::element_type;
using reference = typename Engine::reference;
using engine_type = Engine;
using layout_type = Layout;
static constexpr int rank = Layout::rank;
CUTE_HOST_DEVICE constexpr
Tensor(Engine const& engine, Layout const& layout);
};
Template Parameters
Engine: The data storage engine. Can be:
ViewEngine<T*> for non-owning views
ArrayEngine<T, N> for owning static arrays
ConstViewEngine<const T*> for const views
Layout: The layout describing the coordinate-to-index mapping.
Source Location
include/cute/tensor_impl.hpp:135-341
Member Functions
Data Access
data()
CUTE_HOST_DEVICE constexpr decltype(auto)
data() const;
CUTE_HOST_DEVICE constexpr decltype(auto)
data();
Returns a pointer/iterator to the underlying data.
Example:
auto tensor = make_tensor<float>(make_shape(16, 32));
float* ptr = tensor.data();
layout()
CUTE_HOST_DEVICE constexpr decltype(auto)
layout() const;
Returns the layout of the tensor.
shape()
CUTE_HOST_DEVICE constexpr decltype(auto)
shape() const;
Returns the shape of the tensor.
stride()
CUTE_HOST_DEVICE constexpr decltype(auto)
stride() const;
Returns the stride of the tensor.
size()
CUTE_HOST_DEVICE constexpr auto
size() const;
Returns the total number of elements in the tensor.
Indexing
operator[]
template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator[](Coord const& coord);
template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator[](Coord const& coord) const;
Indexes into the tensor using array-style indexing. Returns a reference to the element.
Example:
auto tensor = make_tensor<float>(ptr, make_shape(16, 32));
tensor[make_coord(3, 5)] = 42.0f;
operator()
template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator()(Coord const& coord);
template <class Coord>
CUTE_HOST_DEVICE constexpr decltype(auto)
operator()(Coord const& coord) const;
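Indexes or slices the tensor. The coordinate may be passed as a single coord object or as separate arguments, one per mode (for example, tensor(3, 5)). If the coordinate contains _ (underscore), returns a sliced subtensor that views the same data. Otherwise, returns a reference to the element.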
Example:
auto tensor = make_tensor<float>(ptr, make_shape(16, 32));
// Element access
float& elem = tensor(3, 5);
// Slicing
auto row = tensor(3, _); // Get row 3
auto col = tensor(_, 5); // Get column 5
auto block = tensor(make_coord(0,4), make_coord(0,8)); // 4x8 block
compose()
template <class... Layouts>
CUTE_HOST_DEVICE constexpr auto
compose(Layouts const&... layouts) const;
Composes the tensor’s layout with other layouts, returning a new tensor view.
Example:
auto tensor = make_tensor<float>(ptr, make_shape(64));
auto reshaped = tensor.compose(make_layout(make_shape(8, 8)));
// Now has shape (8, 8)
tile()
template <class... Layouts>
CUTE_HOST_DEVICE constexpr auto
tile(Layouts const&... layouts) const;
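Tiles the tensor’s layout with the given tile layouts, returning a new tensor view over the same data whose modes separate the coordinate within a tile from the coordinate of the tile.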
Example:
auto tensor = make_tensor<float>(ptr, make_shape(64, 64));
auto tiled = tensor.tile(make_shape(8, 8));
// Shape: ((8,8), (8,8))
Coordinate Conversion
get_1d_coord()
template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_1d_coord(Int const& linear_idx) const;
Converts a linear index to a 1D logical coordinate.
get_hier_coord()
template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_hier_coord(Int const& linear_idx) const;
Converts a linear index to hierarchical coordinates matching the tensor’s shape structure.
get_flat_coord()
template <class Int>
CUTE_HOST_DEVICE constexpr auto
get_flat_coord(Int const& linear_idx) const;
Converts a linear index to flat N-dimensional coordinates.
Factory Functions
make_tensor() - Owning
template <class T, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Args const&... args);
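Creates an owning tensor that allocates static storage. Because the storage size is fixed at compile time, the layout must be fully static (for example, built from Int<N>{} extents rather than runtime integers).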
Parameters:
T: Element type
args: Layout arguments (shape, stride, or Layout)
Example:
// Allocate a 16x32 register tensor of floats
auto tensor = make_tensor<float>(make_shape(Int<16>{}, Int<32>{}));
// Allocate with explicit layout
auto layout = make_layout(make_shape(Int<8>{}, Int<8>{}));
auto tensor2 = make_tensor<double>(layout);
make_tensor() - Non-owning
template <class Iterator, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Iterator const& iter, Args const&... args);
Creates a non-owning tensor view over existing memory.
Parameters:
iter: Pointer or iterator to the data
args: Layout arguments
Example:
float* gmem_ptr = ...;
auto tensor = make_tensor(gmem_ptr, make_shape(64, 64));
// With explicit stride
auto tensor2 = make_tensor(gmem_ptr,
make_shape(32, 32),
make_stride(1, 64)); // 32x32 with pitch 64
make_tensor_like()
template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_tensor_like(Tensor<Engine,Layout> const& tensor);
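Creates an owning (register) tensor with element type NewT and the same shape as another tensor. The new tensor uses a compact layout that follows the stride order of the source layout, so its strides may differ from the source’s when the source is not compact.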
Example:
auto src = make_tensor(ptr, make_shape(16, 32));
auto fragment = make_tensor_like<float>(src);
// fragment has same shape (16, 32) but owns its data
make_fragment_like()
template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_fragment_like(Tensor<Engine,Layout> const& tensor);
Creates a register tensor optimized for fragments, with special handling for the first mode (typically used for MMA/Copy atom values).
Example:
auto partitioned = tiled_mma.partition_A(gmem_A);
auto fragment = make_fragment_like<half_t>(partitioned);
make_identity_tensor()
template <class Shape>
CUTE_HOST_DEVICE constexpr auto
make_identity_tensor(Shape const& shape);
Creates a tensor that maps coordinates to themselves (useful for tracking transformations).
Engine Types
ViewEngine
Non-owning view of existing memory:
ViewEngine<float*> // Mutable view
ViewEngine<half_t*> // Custom type view
ConstViewEngine<const T*> // Const view
ArrayEngine
Owning static array (compile-time size):
ArrayEngine<float, 256> // 256 floats
ArrayEngine<half_t, 1024> // 1024 half-precision values
Common Operations
Tensor Partitioning
// Partition tensor for threads
auto tensor = make_tensor(gmem_ptr, make_shape(128, 128));
auto tiled = tensor.tile(make_shape(16, 16)); // ((16,16), (8,8))
// Per-thread view
int tid = threadIdx.x;
auto thread_data = tiled(_, make_coord(tid % 8, tid / 8));
Tensor Slicing
auto tensor = make_tensor(ptr, make_shape(64, 64, 32));
// Get a 2D slice
auto slice = tensor(_, _, 5); // Shape: (64, 64)
// Get a 1D slice
auto row = tensor(10, _, 5); // Shape: (64,)
// Range slicing
auto block = tensor(make_coord(0, 16), make_coord(0, 16), _);
// Shape: (16, 16, 32)
Tensor Reshaping
// Original 1D tensor
auto vec = make_tensor(ptr, make_shape(256));
// Reshape to 2D
auto mat = vec.compose(make_layout(make_shape(16, 16)));
// Reshape to 3D
auto vol = vec.compose(make_layout(make_shape(8, 8, 4)));
Usage Examples
Basic Tensor Creation and Access
#include <cute/tensor.hpp>
using namespace cute;
__global__ void kernel(float* gmem, int M, int N) {
// Create a view of global memory
auto tensor = make_tensor(gmem, make_shape(M, N));
// Access elements
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < M && j < N) {
tensor(i, j) = i * N + j;
}
}
Register Tensors (Fragments)
__device__ void compute() {
// Allocate a small tensor in registers
auto fragment = make_tensor<float>(make_shape(Int<8>{}, Int<8>{}));
// Initialize
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
fragment(i, j) = 0.0f;
}
}
// Use in computation...
}
Tiled Tensor Processing
__device__ void tiled_copy(float* src, float* dst, int M, int N) {
auto src_tensor = make_tensor(src, make_shape(M, N));
auto dst_tensor = make_tensor(dst, make_shape(M, N));
// Tile into 16x16 blocks
auto src_tiled = src_tensor.tile(make_shape(16, 16));
auto dst_tiled = dst_tensor.tile(make_shape(16, 16));
// Process each tile
int tile_x = blockIdx.x;
int tile_y = blockIdx.y;
auto src_tile = src_tiled(_, make_coord(tile_y, tile_x));
auto dst_tile = dst_tiled(_, make_coord(tile_y, tile_x));
// Copy tile...
}
Hierarchical Tensor Layouts
// Create a hierarchical tensor for blocked matrix
auto shape = make_shape(make_shape(4, 8), // Block of 4x8
make_shape(16, 8)); // 16x8 blocks
auto tensor = make_tensor<float>(shape);
// Represents 64x64 matrix organized in 4x8 atom blocks
// Access specific block
auto block = tensor(_, make_coord(5, 2));
// Shape: (4, 8)
See Also