Overview
cute::TiledCopy represents a tiled copy operation that partitions data movement across multiple threads. It builds on Copy_Atom (hardware-level copy instructions such as cp.async — LDGSTS at the SASS level — and TMA) by tiling them across threads and values.
Class Template
template <class Copy_Atom,
class LayoutCopy_TV, // (tid,vid) -> coord
class ShapeTiler_MN> // coord space
struct TiledCopy : Copy_Atom
{
// Layout information from the CopyAtom
using AtomThrID = typename Copy_Atom::ThrID;
using AtomLayoutSrc = typename Copy_Atom::ValLayoutSrc;
using AtomLayoutDst = typename Copy_Atom::ValLayoutDst;
using AtomLayoutRef = typename Copy_Atom::ValLayoutRef;
// Layout information for the TiledCopy
using Tiler_MN = ShapeTiler_MN;
using TiledLayout_TV = LayoutCopy_TV;
using TiledNumThr = decltype(size<0>(TiledLayout_TV{}));
using TiledNumVal = decltype(size<1>(TiledLayout_TV{}));
};
Template Parameters
Copy_Atom
Copy_Atom<CopyOperation, InternalType>
The atomic copy operation (e.g., SM80_CP_ASYNC_CACHEALWAYS for async copy).
LayoutCopy_TV
The (thread_id, value_id) -> coordinate layout specifying how threads and values map to coordinates.
ShapeTiler_MN
The shape of the coordinate space being tiled (typically 2D for matrix operations).
Source Location
include/cute/atom/copy_atom.hpp:182-355
Member Types
Value Type
using ValType = CopyInternalType; // The value type for copy operations
Layout Types
using AtomLayoutSrc = /* ... */; // (thr,val) -> src offset
using AtomLayoutDst = /* ... */; // (thr,val) -> dst offset
using AtomLayoutRef = /* ... */; // (thr,val) -> ref offset
Thread and Value Counts
using AtomNumThr = decltype(size<0>(AtomLayoutRef{}));
using AtomNumVal = decltype(size<1>(AtomLayoutRef{}));
using TiledNumThr = decltype(size<0>(TiledLayout_TV{}));
using TiledNumVal = decltype(size<1>(TiledLayout_TV{}));
Member Functions
Tensor Partitioning
tidfrg_S()
template <class STensor>
CUTE_HOST_DEVICE constexpr static auto
tidfrg_S(STensor&& stensor);
Tiles a source tensor from shape (M,N,...) to shape (Thr,(FrgV,FrgX),(RestM,RestN,...)).
Parameters:
stensor: The source tensor to partition
Returns: Partitioned tensor with thread and fragment modes
Example:
auto gmem_src = make_tensor(src_ptr, make_shape(128, 128));
auto thr_layout = tiled_copy.tidfrg_S(gmem_src);
// Shape: (Thr, (FrgV,FrgX), (RestM,RestN))
tidfrg_D()
template <class DTensor>
CUTE_HOST_DEVICE constexpr static auto
tidfrg_D(DTensor&& dtensor);
Tiles a destination tensor from shape (M,N,...) to shape (Thr,(FrgV,FrgX),(RestM,RestN,...)).
Parameters:
dtensor: The destination tensor to partition
Returns: Partitioned tensor with thread and fragment modes
Thread Slicing
get_slice()
template <class ThrIdx>
CUTE_HOST_DEVICE static auto
get_slice(ThrIdx const& thr_idx);
Returns a ThrCopy object for a specific thread index.
Parameters:
thr_idx: Thread index within the TiledCopy
Returns: ThrCopy<TiledCopy, ThrIdx> for per-thread copy operations
Example:
int tid = threadIdx.x;
auto thr_copy = tiled_copy.get_slice(tid);
get_thread_slice()
template <class ThrIdx>
CUTE_HOST_DEVICE static auto
get_thread_slice(ThrIdx const& thr_idx);
Alias for get_slice(). Returns per-thread copy object.
Layout Accessors
get_layoutS_TV()
CUTE_HOST_DEVICE constexpr static auto
get_layoutS_TV();
Returns the (thread_idx, value_idx) -> (M, N) layout for the source tensor.
get_layoutD_TV()
CUTE_HOST_DEVICE constexpr static auto
get_layoutD_TV();
Returns the (thread_idx, value_idx) -> (M, N) layout for the destination tensor.
Retiling
retile()
template <class Tensor>
CUTE_HOST_DEVICE constexpr static auto
retile(Tensor&& tensor);
Retiles a tensor according to the TiledCopy's value layout.
ThrCopy - Per-Thread Copy
template <class TiledCopy, class ThrIdx>
struct ThrCopy
{
ThrIdx thr_idx_;
CUTE_HOST_DEVICE
ThrCopy(ThrIdx const& thr_idx);
};
Represents the copy operation for a single thread.
ThrCopy Methods
partition_S()
template <class STensor>
CUTE_HOST_DEVICE auto
partition_S(STensor&& stensor) const;
Partitions the source tensor for this thread.
Returns: Per-thread view of source with shape ((FrgV,FrgX),(RestM,RestN,...))
Example:
auto thr_copy = tiled_copy.get_slice(threadIdx.x);
auto gmem = make_tensor(ptr, make_shape(128, 128));
auto thr_gmem = thr_copy.partition_S(gmem);
partition_D()
template <class DTensor>
CUTE_HOST_DEVICE auto
partition_D(DTensor&& dtensor) const;
Partitions the destination tensor for this thread.
Returns: Per-thread view of destination with shape ((FrgV,FrgX),(RestM,RestN,...))
retile_S()
template <class STensor>
CUTE_HOST_DEVICE static auto
retile_S(STensor&& stensor);
Retiles the source tensor according to the copy pattern.
retile_D()
template <class DTensor>
CUTE_HOST_DEVICE static auto
retile_D(DTensor&& dtensor);
Retiles the destination tensor according to the copy pattern.
Common Copy Atoms
Ampere (SM80)
// Async global to shared copy
SM80_CP_ASYNC_CACHEALWAYS<uint128_t> // 16-byte async copy, cached at all levels (L1 and L2)
SM80_CP_ASYNC_CACHEGLOBAL<uint128_t> // 16-byte async copy, cached at global (L2) level only
// LDSM (ldmatrix): shared memory to registers (SM75/Turing+, also usable on Ampere)
SM75_U32x1_LDSM_N // 1x 32-bit-per-thread matrix load
SM75_U32x2_LDSM_N // 2x 32-bit-per-thread matrix load
SM75_U32x4_LDSM_N // 4x 32-bit-per-thread (16-byte) matrix load
Hopper (SM90)
// Tensor Memory Accelerator (TMA)
SM90_TMA_LOAD // TMA load
SM90_TMA_LOAD_MULTICAST // TMA multicast load
SM90_TMA_STORE // TMA store
// Bulk async copy
SM90_BULK_COPY_AUTO // Automatic bulk copy
SM90_BULK_COPY_G2S // Global to shared
Factory Functions
make_tiled_copy()
template <class Copy_Atom,
class LayoutCopy_TV,
class ShapeTiler_MN>
CUTE_HOST_DEVICE constexpr auto
make_tiled_copy(Copy_Atom const& copy_atom,
LayoutCopy_TV const& layout_tv,
ShapeTiler_MN const& tiler_mn);
Creates a TiledCopy from a copy atom, thread-value layout, and tiler shape.
Parameters:
copy_atom: The atomic copy operation
layout_tv: (thread, value) layout
tiler_mn: Shape to tile in (M, N) dimensions
Example:
using namespace cute;
// 16-byte vectorized copy (alias named CopyAtom to avoid shadowing cute::Copy_Atom)
using CopyAtom = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, half_t>;
// 128 threads, each copies 8 values
auto layout_tv = make_layout(
make_shape(Int<128>{}, Int<8>{}),
make_stride(Int<8>{}, Int<1>{})
);
// Tile a 128x128 region
auto tiled_copy = make_tiled_copy(
CopyAtom{},
layout_tv,
make_shape(Int<128>{}, Int<128>{})
);
Usage Examples
Basic TiledCopy Setup
#include <cute/tensor.hpp>
#include <cute/atom/copy_atom.hpp>
using namespace cute;
// Define copy atom for vectorized loads
using CopyAtom = Copy_Atom<
SM80_CP_ASYNC_CACHEALWAYS<uint128_t>,
half_t
>;
// Create thread-value layout
auto layout_tv = make_layout(
make_shape(Int<128>{}, Int<8>{})
);
// Create tiled copy
auto tiled_copy = make_tiled_copy(
CopyAtom{},
layout_tv,
make_shape(Int<128>{}, Int<128>{})
);
// Get per-thread copy
int tid = threadIdx.x;
auto thr_copy = tiled_copy.get_slice(tid);
Global to Shared Memory Copy
__device__ void g2s_copy_example(
half_t* gmem_ptr,
half_t* smem_ptr,
int M, int K)
{
// Create tensors
auto gmem = make_tensor(gmem_ptr, make_shape(M, K));
auto smem = make_tensor(make_smem_ptr(smem_ptr), make_shape(M, K));
// Setup TiledCopy
using CopyAtom = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, half_t>;
auto tiled_copy = make_tiled_copy(
CopyAtom{},
make_layout(make_shape(Int<128>{}, Int<8>{})),
make_shape(M, K)
);
// Get per-thread copy
auto thr_copy = tiled_copy.get_slice(threadIdx.x);
// Partition tensors
auto thr_gmem = thr_copy.partition_S(gmem);
auto thr_smem = thr_copy.partition_D(smem);
// Perform async copy
copy(tiled_copy, thr_gmem, thr_smem);
// Wait for completion
cp_async_fence();
cp_async_wait<0>();
}
Shared to Register Copy
__device__ void s2r_copy_example(
half_t* smem_ptr,
int M, int K)
{
// Shared memory tensor
auto smem = make_tensor(make_smem_ptr(smem_ptr), make_shape(M, K));
// Setup TiledCopy for LDSM (ldmatrix: shared memory to registers)
using CopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
auto tiled_copy = make_tiled_copy(
CopyAtom{},
make_layout(make_shape(Int<32>{}, Int<8>{})),
make_shape(M, K)
);
// Get per-thread copy
auto thr_copy = tiled_copy.get_slice(threadIdx.x);
auto thr_smem = thr_copy.partition_S(smem);
// Create register fragment
auto fragment = make_fragment_like(thr_smem);
// Copy to registers
copy(tiled_copy, thr_smem, fragment);
}
Multi-Stage Pipeline with TiledCopy
template <int NUM_STAGES>
__device__ void pipeline_copy(
half_t* gmem_A, half_t* gmem_B,
half_t* smem_A, half_t* smem_B,
int M, int N, int K)
{
// Setup tensors
auto gA = make_tensor(gmem_A, make_shape(M, K));
auto gB = make_tensor(gmem_B, make_shape(N, K));
// Multi-stage shared memory
auto sA_stages = make_tensor(
make_smem_ptr(smem_A),
make_shape(M, K, Int<NUM_STAGES>{})
);
auto sB_stages = make_tensor(
make_smem_ptr(smem_B),
make_shape(N, K, Int<NUM_STAGES>{})
);
// Setup TiledCopy
using CopyAtom = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, half_t>;
auto tiled_copy = make_tiled_copy(
CopyAtom{},
make_layout(make_shape(Int<128>{}, Int<8>{})),
make_shape(M, K)
);
auto thr_copy = tiled_copy.get_slice(threadIdx.x);
// Partition
auto thr_gA = thr_copy.partition_S(gA);
auto thr_gB = thr_copy.partition_S(gB);
auto thr_sA = thr_copy.partition_D(sA_stages);
auto thr_sB = thr_copy.partition_D(sB_stages);
// Prefetch first NUM_STAGES-1 tiles
for (int stage = 0; stage < NUM_STAGES-1; ++stage) {
copy(tiled_copy, thr_gA(_, _, stage), thr_sA(_, _, stage));
copy(tiled_copy, thr_gB(_, _, stage), thr_sB(_, _, stage));
cp_async_fence();
}
// Main loop with rotating stages
// (TILE_K is a compile-time K-tile size, assumed defined elsewhere in the kernel)
int num_tiles = K / TILE_K;
for (int tile = 0; tile < num_tiles; ++tile) {
int read_stage = tile % NUM_STAGES;
int write_stage = (tile + NUM_STAGES - 1) % NUM_STAGES;
// Wait for current stage
cp_async_wait<NUM_STAGES-2>();
__syncthreads();
// Compute with read_stage data
// ...
// Issue next copy into write_stage
if (tile + NUM_STAGES - 1 < num_tiles) {
copy(tiled_copy, thr_gA(_, _, write_stage), thr_sA(_, _, write_stage));
copy(tiled_copy, thr_gB(_, _, write_stage), thr_sB(_, _, write_stage));
cp_async_fence();
}
}
}
TMA Copy (Hopper SM90)
__device__ void tma_copy_example(
half_t* gmem_ptr,
half_t* smem_ptr,
int M, int K)
{
// Setup tensors
auto gmem = make_tensor(gmem_ptr, make_shape(M, K));
auto smem = make_tensor(make_smem_ptr(smem_ptr), make_shape(M, K));
// NOTE: real TMA code builds the copy with make_tma_copy() and a TmaDescriptor
// created on the host; the make_tiled_copy call below only sketches the shape.
// TMA copy atom
using CopyAtom = Copy_Atom<SM90_TMA_LOAD, half_t>;
// TiledCopy with TMA
auto tiled_copy = make_tiled_copy(
CopyAtom{},
make_layout(make_shape(Int<1>{}, Int<128>{})), // TMA is issued by a single thread
make_shape(M, K)
);
// Only one thread issues TMA
if (threadIdx.x == 0) {
auto thr_copy = tiled_copy.get_slice(0);
auto thr_gmem = thr_copy.partition_S(gmem);
auto thr_smem = thr_copy.partition_D(smem);
// TMA copy (async)
copy(tiled_copy, thr_gmem, thr_smem);
}
// NOTE: TMA completion is tracked with an mbarrier (arrive/wait) in real code;
// __syncthreads() alone does not wait for the asynchronous TMA transfer.
__syncthreads();
}
Custom Copy Pattern
// Create a custom interleaved copy pattern
auto custom_layout = make_layout(
make_shape(Int<128>{}, Int<8>{}),
make_stride(Int<8>{}, Int<1>{}) // Threads interleaved
);
using CopyAtom = Copy_Atom<UniversalCopy<half_t>, half_t>;
auto tiled_copy = make_tiled_copy(
CopyAtom{},
custom_layout,
make_shape(Int<128>{}, Int<128>{})
);
See Also