The NVIDIA Hopper architecture (compute capability 9.0) represents a major leap in GPU computing with Warpgroup Matrix Multiply-Accumulate (WGMMA), Tensor Memory Accelerator (TMA), and native FP8 support.
Supported GPUs
| GPU Model | Compute Capability | Min CUDA Toolkit |
|---|---|---|
| NVIDIA H100 Tensor Core GPU | 9.0 | 11.8 |
| NVIDIA H200 Tensor Core GPU | 9.0 | 11.8 |
Hopper architecture-accelerated features require compiling with the sm_90a target (note the “a” suffix) to enable WGMMA and TMA instructions:cmake .. -DCUTLASS_NVCC_ARCHS="90a"
Key Features
1. Warpgroup Matrix Multiply-Accumulate (WGMMA)
WGMMA operates on 4 warps (128 threads) simultaneously, dramatically improving instruction throughput:
// From include/cute/arch/mma_sm90_gmma.hpp
namespace cute::SM90::GMMA {

// Shared-memory layout tag encoded into the wgmma matrix descriptor.
enum class Major {
K = 0, // K-major layout (contiguous in K)
MN = 1 // MN-major layout (contiguous in M/N)
};

// Controls whether the accumulator is zeroed or accumulated into.
enum class ScaleOut {
Zero = 0, // D  = A*B  (discard previous accumulator)
One = 1   // D += A*B  (accumulate)
};

// GMMA 64x8x16 F16+=F16*F16, both operands sourced from shared memory ("SS").
// tnspA/tnspB (layout) and scaleA/scaleB (optional operand negation) are
// compile-time immediates in the PTX instruction, hence template parameters.
template <
GMMA::Major tnspA,
GMMA::Major tnspB,
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
>
struct MMA_64x8x16_F16F16F16_SS {
using DRegisters = void;        // D aliases C (register accumulator)
using ARegisters = uint64_t[1]; // shared-memory matrix descriptor for A
using BRegisters = uint64_t[1]; // shared-memory matrix descriptor for B
using CRegisters = uint32_t[2]; // 4 f16 accumulator values per thread

// Issues one asynchronous warpgroup MMA. Must be ordered with the
// warpgroup_arrive/commit/wait primitives.
CUTE_HOST_DEVICE static void
fma(uint64_t const& desc_a,
    uint64_t const& desc_b,
    uint32_t & d0, uint32_t & d1,
    GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
// scale_D is a RUNTIME value, so it cannot be bound with the "n"
// (compile-time immediate) constraint; materialize it as a predicate
// via setp and feed the predicate to wgmma. The template parameters
// (scaleA/scaleB/tnspA/tnspB) are genuine constants and are passed as
// "n" immediates, as the instruction requires.
asm volatile(
    "{\n"
    ".reg .pred p;\n"
    "setp.ne.b32 p, %4, 0;\n"
    "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
    "{%0, %1}, %2, %3, p, %5, %6, %7, %8;\n"
    "}\n"
    : "+r"(d0), "+r"(d1)
    : "l"(desc_a), "l"(desc_b),
      "r"(int32_t(scale_D)),
      "n"(int32_t(scaleA)), "n"(int32_t(scaleB)),
      "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
#endif
}
};
} // namespace cute::SM90::GMMA
WGMMA Synchronization Primitives:
// From include/cute/arch/mma_sm90_gmma.hpp
// Fence: orders this warpgroup's prior register/shared-memory writes
// before any subsequently issued wgmma.mma_async operations. Call once
// before issuing a batch of WGMMA instructions. No-op when SM90A
// instructions are not enabled at compile time.
CUTE_HOST_DEVICE void warpgroup_arrive() {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");
#endif
}
// Blocks until at most N previously committed wgmma batches are still
// pending; N == 0 waits for all outstanding batches to complete.
// N must be a compile-time constant in [0, 7] because the PTX
// wait_group instruction takes an immediate operand ("n" constraint).
template <int N>
CUTE_HOST_DEVICE void warpgroup_wait() {
static_assert(N >= 0 && N <= 7, "WGMMA wait: N must be in range [0, 7]");
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");
#endif
}
// Groups all wgmma.mma_async operations issued since the previous commit
// into one batch, which warpgroup_wait<N>() can then wait on.
CUTE_HOST_DEVICE void warpgroup_commit_batch() {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
#endif
}
Usage Pattern:
// Issue WGMMA operations
warpgroup_arrive();                    // fence: prior writes visible to wgmma
mma_operation(desc_a, desc_b, accum);  // one or more async wgmma issues
warpgroup_commit_batch();              // close the batch of issued wgmma ops
// Continue with other work — the MMAs run asynchronously in the background
// ...
// Wait for completion (0 = wait for all)
warpgroup_wait<0>();                   // accumulators are safe to read after this
2. Tensor Memory Accelerator (TMA)
TMA provides hardware-accelerated bulk data transfer between global and shared memory:
// From include/cute/arch/copy_sm90_tma.hpp
//
// Issues one asynchronous 2-D TMA load: the tile at (crd0, crd1) of the
// global tensor described by the TMA descriptor is copied into shared
// memory, and completion is signaled on an mbarrier via transaction-byte
// accounting. No-op unless TMA instructions are enabled at compile time.
struct SM90_TMA_LOAD_2D {
CUTE_HOST_DEVICE static void
copy(void const* desc_ptr,
     uint64_t* mbar_ptr,
     uint64_t cache_hint,
     void* smem_ptr,
     int32_t const& crd0,
     int32_t const& crd1) {
#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
    // The PTX instruction needs integer views of the pointers: the TMA
    // descriptor as a 64-bit global address, and the destination buffer
    // plus barrier as 32-bit shared-memory offsets.
    uint32_t const dst_smem  = cast_smem_ptr_to_uint(smem_ptr);
    uint32_t const mbar_smem = cast_smem_ptr_to_uint(mbar_ptr);
    uint64_t const tma_desc  = reinterpret_cast<uint64_t>(desc_ptr);
    asm volatile (
        "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
        ".L2::cache_hint [%0], [%1, {%3, %4}], [%2], %5;\n"
        :
        : "r"(dst_smem), "l"(tma_desc), "r"(mbar_smem),
          "r"(crd0), "r"(crd1), "l"(cache_hint)
        : "memory");
#endif
}
};
TMA Benefits:
- Hardware-managed address generation
- Automatic boundary checking
- Integrated with async barriers
- Support for up to 5D tensors
- Prefetch capabilities
3. Enhanced FP64 Tensor Cores
Hopper provides improved FP64 tensor core shapes:
// From include/cutlass/arch/mma_sm90.h
template <>
struct Mma<
gemm::GemmShape<16,8,4>,
32,
double,
layout::RowMajor,
double,
layout::ColumnMajor,
double,
layout::RowMajor,
OpMultiplyAdd> {
using Shape = gemm::GemmShape<16,8,4>;
using ArchTag = arch::Sm90;

// D = A*B + C for one 16x8x4 FP64 tensor-core tile (one warp, 32 threads).
// Per thread the asm binds 2 doubles of A, 1 of B, and 4 of C/D, matching
// the operand counts in the mma.sync instruction below.
CUTLASS_HOST_DEVICE
void operator()(FragmentC &d, FragmentA const &a,
                FragmentB const &b, FragmentC const &c) const {
#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
// View the fragments as raw double arrays so the inline asm can bind
// individual elements. (The previous version referenced D/A/B/C
// identifiers that were never declared; the parameters are d/a/b/c.)
double       *D = reinterpret_cast<double       *>(&d);
double const *A = reinterpret_cast<double const *>(&a);
double const *B = reinterpret_cast<double const *>(&b);
double const *C = reinterpret_cast<double const *>(&c);
asm volatile(
    "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn "
    "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
    : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
    : "d"(A[0]), "d"(A[1]), "d"(B[0]),
      "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
#endif
}
};
Supported FP64 Shapes:
- 16x8x4 (new in Hopper)
- 16x8x8 (new in Hopper)
- 16x8x16 (new in Hopper)
4. FP8 Tensor Core Support
Hopper introduces native FP8 support with two formats:
- E4M3: 4 exponent bits, 3 mantissa bits (better for forward pass)
- E5M2: 5 exponent bits, 2 mantissa bits (better for gradients)
// FP8 E4M3 x E4M3 -> FP32
using ElementA = cutlass::float_e4m3_t; // 4 exponent bits, 3 mantissa bits
using ElementB = cutlass::float_e4m3_t;
using ElementAccumulator = float;       // FP8 WGMMA accumulates in FP32 (see table below)
// GMMA instruction shape for FP8
// 64x8x32 (much larger K dimension than FP16)
Instruction Shapes
| Data Type | WGMMA Shape (MxNxK) | Accumulator |
|---|---|---|
| FP8 (E4M3/E5M2) | 64x8x32, 64x16x32, 64x32x32 | FP32 |
| FP16 | 64x8x16, 64x16x16, 64x32x16 | FP16/FP32 |
| BF16 | 64x8x16, 64x16x16, 64x32x16 | FP32 |
| TF32 | 64x8x8, 64x16x8, 64x32x8 | FP32 |
| INT8 | 64x8x32, 64x16x32, 64x32x32 | INT32 |
| FP64 | 16x8x4, 16x8x8, 16x8x16 | FP64 |
Notice the significantly larger M dimension — 64 rows per instruction versus Ampere’s 16x8xK shapes (for FP16 the K dimension is unchanged at 16). This enables more computation per instruction and amortizes instruction-issue overhead.
Thread Block Clusters
Hopper introduces thread block clusters for multi-CTA cooperation:
// From include/cute/arch/cluster_sm90.hpp
namespace cute {

// Query the cluster shape: number of CTAs per cluster in each dimension.
// On the host, or on pre-Hopper device code, falls back to a 1x1x1 cluster.
CUTE_HOST_DEVICE dim3 cluster_shape() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    unsigned int nx, ny, nz;
    asm volatile("mov.u32 %0, %%cluster_nctaid.x;\n" : "=r"(nx));
    asm volatile("mov.u32 %0, %%cluster_nctaid.y;\n" : "=r"(ny));
    asm volatile("mov.u32 %0, %%cluster_nctaid.z;\n" : "=r"(nz));
    return dim3(nx, ny, nz);
#else
    return {1, 1, 1};
#endif
}

// Split-phase cluster barrier: each CTA calls cluster_arrive(), may do
// independent work, then calls cluster_wait() before consuming data
// produced by its cluster peers. Both are no-ops off-device / pre-SM90.
CUTE_HOST_DEVICE void cluster_arrive() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    asm volatile("barrier.cluster.arrive;\n" ::: "memory");
#endif
}

CUTE_HOST_DEVICE void cluster_wait() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    asm volatile("barrier.cluster.wait;\n" ::: "memory");
#endif
}

} // namespace cute
Cluster Configuration:
// Zero-initialize: cudaLaunchConfig_t also carries stream, dynamicSmemBytes,
// etc., which must not be left indeterminate when passed to cudaLaunchKernelEx.
cudaLaunchConfig_t config = {};
config.gridDim = dim3(num_blocks_x, num_blocks_y, num_blocks_z);
config.blockDim = dim3(128, 1, 1); // Must be multiple of 128 (one warpgroup) for WGMMA
// Configure the thread block cluster via a launch attribute
cudaLaunchAttribute cluster_attr;
cluster_attr.id = cudaLaunchAttributeClusterDimension;
cluster_attr.val.clusterDim.x = 2; // 2x1x1 cluster
cluster_attr.val.clusterDim.y = 1;
cluster_attr.val.clusterDim.z = 1;
config.attrs = &cluster_attr;      // attribute array must outlive the launch call
config.numAttrs = 1;
cudaLaunchKernelEx(&config, kernel, args...);
Asynchronous Barriers
Hopper enhances barrier functionality for TMA integration:
// Barrier initialization for TMA
__shared__ __align__(16) uint64_t barrier[1];
// Initialize the mbarrier. The second operand is the expected ARRIVAL
// COUNT (number of arriving threads) — NOT a byte count. Transaction
// bytes are registered separately via mbarrier.arrive.expect_tx below.
if (threadIdx.x == 0) {
asm volatile(
    "mbarrier.init.shared.b64 [%0], %1;\n"
    :: "r"(cast_smem_ptr_to_uint(barrier)),
       "r"(1));
}
__syncthreads();
// Register the transfer size and issue the TMA load. expect_tx must be
// performed BEFORE the transfer can complete, so the barrier knows how
// many bytes to count toward phase completion.
if (threadIdx.x == 0) {
uint32_t tx_bytes = tile_size_bytes;
asm volatile(
    "mbarrier.arrive.expect_tx.shared::cta.b64 _, [%0], %1;\n"
    :: "r"(cast_smem_ptr_to_uint(barrier)),
       "r"(tx_bytes));
SM90_TMA_LOAD_2D::copy(
    desc_ptr, barrier, cache_hint,
    smem_ptr, coord0, coord1
);
}
// Wait for TMA completion: poll the barrier's phase-parity bit.
// `phase` starts at 0 and would flip on each reuse of the barrier.
uint32_t phase = 0;
asm volatile(
    "{\n"
    ".reg .pred p;\n"
    "WAIT:\n"
    "mbarrier.try_wait.parity.shared::cta.b64 p, [%0], %1;\n"
    "@!p bra WAIT;\n"
    "}\n"
    :: "r"(cast_smem_ptr_to_uint(barrier)), "r"(phase)
    : "memory");
Complete WGMMA+TMA Example
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal.h"
// NOTE(review): schematic example — the template argument list is
// abbreviated (e.g. DefaultEpilogue<...>) and does not compile verbatim;
// see the CUTLASS 3.x collective-builder examples for complete argument
// lists. Shown here to illustrate which policies select WGMMA + TMA.
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<cute::_128, cute::_128, cute::_64>, // TileShape MxNxK
cutlass::gemm::collective::CollectiveMma<
cutlass::gemm::TileScheduler,
cute::Shape<cute::_128, cute::_128, cute::_64>,
cutlass::half_t, // ElementA
cutlass::layout::RowMajor,
cutlass::half_t, // ElementB
cutlass::layout::ColumnMajor,
float, // ElementAccumulator
cute::Shape<cute::_2, cute::_1, cute::_1>, // ClusterShape
cutlass::gemm::collective::StageCount<3>, // pipeline depth (smem stages)
cutlass::gemm::collective::KernelTmaWarpSpecialized // Use WGMMA+TMA
>,
cutlass::epilogue::collective::DefaultEpilogue<...>
>;
// Device-side adapter that wraps the kernel in a host-callable operator
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Schematic driver: M, N, K, the device pointers A/B/C/D, the leading
// dimensions, and alpha/beta are assumed to be defined/allocated elsewhere.
int main() {
Gemm gemm_op;
typename Gemm::Arguments args{
cutlass::gemm::GemmUniversalMode::kGemm,
{M, N, K},               // problem size
{A, lda, B, ldb},        // operand pointers + leading dimensions
{{alpha, beta}, C, ldc, D, ldd} // epilogue: D = alpha*AB + beta*C
};
// Kernel uses WGMMA instructions automatically
cutlass::Status status = gemm_op(args);
return status == cutlass::Status::kSuccess ? 0 : -1;
}
Optimal Tile Sizes for WGMMA
// Recommended threadblock tile shapes (M x N x K, in elements) for Hopper
using TileShape_128x128x64 = cute::Shape<cute::_128, cute::_128, cute::_64>;
using TileShape_128x256x64 = cute::Shape<cute::_128, cute::_256, cute::_64>;
using TileShape_256x128x64 = cute::Shape<cute::_256, cute::_128, cute::_64>;
Cluster Configuration
- Small problems: 1x1x1 cluster (single CTA)
- Medium problems: 2x1x1 or 1x2x1 cluster
- Large problems: 2x2x1 cluster (maximum practical)
Pipeline Stages
- WGMMA kernels: 3-4 stages typical
- Balance: TMA latency vs shared memory capacity
- Hopper H100: up to 228 KB shared memory per SM (up to 227 KB usable per thread block)
Compilation
# Required: sm_90a for WGMMA and TMA
nvcc -arch=sm_90a -std=c++17 example.cu -o example
# CMake
cmake .. -DCUTLASS_NVCC_ARCHS="90a"
# Verify architecture-accelerated features
nvcc --ptx -arch=sm_90a example.cu -o example.ptx
grep "wgmma" example.ptx # Should find wgmma instructions
Using sm_90 (without “a”) will not enable WGMMA or TMA. Always use sm_90a for Hopper architecture-accelerated features.
Examples
Hopper-specific examples in CUTLASS:
examples/48_hopper_warp_specialized_gemm/ - WGMMA-based GEMM
examples/57_hopper_warp_specialized_gemm_with_epilogue_visitor/ - Epilogue fusion
examples/58_hopper_persistent_gemm/ - Persistent kernel design
examples/111_hopper_ssd/ - State Space Decomposition
See Also