Skip to main content
The NVIDIA Hopper architecture (compute capability 9.0) represents a major leap in GPU computing with Warpgroup Matrix Multiply-Accumulate (WGMMA), Tensor Memory Accelerator (TMA), and native FP8 support.

Supported GPUs

| GPU Model | Compute Capability | Min CUDA Toolkit |
|---|---|---|
| NVIDIA H100 Tensor Core GPU | 9.0 | 11.8 |
| NVIDIA H200 Tensor Core GPU | 9.0 | 11.8 |
Hopper architecture-accelerated features require compiling with the sm_90a target (note the “a” suffix) to enable WGMMA and TMA instructions:
cmake .. -DCUTLASS_NVCC_ARCHS="90a"

Key Features

1. Warpgroup Matrix Multiply-Accumulate (WGMMA)

WGMMA operates on 4 warps (128 threads) simultaneously, dramatically improving instruction throughput:
// From include/cute/arch/mma_sm90_gmma.hpp
namespace cute::SM90::GMMA {

// Shared-memory operand layout selector encoded in the matrix descriptor.
enum class Major {
  K  = 0,  // K-major layout (contiguous in K)
  MN = 1   // MN-major layout (contiguous in M or N)
};

// Runtime accumulator control for wgmma: clear D or accumulate into it.
enum class ScaleOut {
  Zero = 0,  // Zero out accumulator
  One  = 1   // Accumulate
};

// Compile-time sign applied to the A/B operands.
// NOTE(review): this enum was referenced but never defined in the original
// excerpt; values follow include/cute/arch/mma_sm90_gmma.hpp.
enum class ScaleIn {
  Neg = -1,  // Negate operand
  One =  1   // Use operand unchanged
};

// GMMA 64x8x16 F16+=F16*F16, both operands in shared memory ("SS").
// One warpgroup (4 warps / 128 threads) cooperatively issues the
// asynchronous instruction; desc_a/desc_b are 64-bit shared-memory matrix
// descriptors, and each thread holds 4 packed f16 accumulators in d0/d1.
template <
  GMMA::Major tnspA,
  GMMA::Major tnspB,
  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
>
struct MMA_64x8x16_F16F16F16_SS {
  using DRegisters = void;
  using ARegisters = uint64_t[1];
  using BRegisters = uint64_t[1];
  using CRegisters = uint32_t[2];

  // Issues one async wgmma; completion is observed via
  // warpgroup_commit_batch() / warpgroup_wait<N>().
  CUTE_HOST_DEVICE static void
  fma(uint64_t const& desc_a,
      uint64_t const& desc_b,
      uint32_t      & d0, uint32_t      & d1,
      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
    // scale_D is a runtime value, so it cannot be an "n" (immediate)
    // constraint as in the original excerpt; the PTX ISA takes it as a
    // predicate. Operand order: {D}, a-desc, b-desc, scale-D,
    // imm-scale-A, imm-scale-B, imm-trans-A, imm-trans-B.
    asm volatile(
      "{\n"
      ".reg .pred p;\n"
      "setp.ne.b32 p, %4, 0;\n"
      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
      "{%0, %1}, %2, %3, p, %5, %6, %7, %8;\n"
      "}\n"
      : "+r"(d0), "+r"(d1)
      : "l"(desc_a), "l"(desc_b),
        "r"(int32_t(scale_D)),
        "n"(int32_t(scaleA)), "n"(int32_t(scaleB)),
        "n"(int32_t(tnspA)),  "n"(int32_t(tnspB)));
#endif
  }
};

} // namespace cute::SM90::GMMA
WGMMA Synchronization Primitives:
// From include/cute/arch/mma_sm90_gmma.hpp
// Fence making the warpgroup's prior register/shared-memory writes visible
// to subsequently issued wgmma.mma_async operations. Must be executed
// before issuing WGMMA instructions.
CUTE_HOST_DEVICE void warpgroup_arrive() {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
  asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");
#endif
}

// Blocks until at most N previously committed WGMMA batches are still
// in flight; N == 0 waits for all outstanding batches to complete.
// N must be a compile-time constant in [0, 7] (immediate PTX operand).
template <int N>
CUTE_HOST_DEVICE void warpgroup_wait() {
  static_assert(N >= 0 && N <= 7, "WGMMA wait: N must be in range [0, 7]");
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
  asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");
#endif
}

// Groups all wgmma.mma_async operations issued since the previous commit
// into a single batch that warpgroup_wait<N>() can later wait on.
CUTE_HOST_DEVICE void warpgroup_commit_batch() {
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
#endif
}
Usage Pattern:
// Issue WGMMA operations: fence first so prior operand writes are visible
warpgroup_arrive();
mma_operation(desc_a, desc_b, accum);
warpgroup_commit_batch();  // close the batch so it can be waited on

// Continue with other work
// ...

// Wait for completion (template argument 0 = wait for all batches)
warpgroup_wait<0>();

2. Tensor Memory Accelerator (TMA)

TMA provides hardware-accelerated bulk data transfer between global and shared memory:
// From include/cute/arch/copy_sm90_tma.hpp
// 2-D TMA load: the hardware bulk-copies one tile from global memory
// (described by a host-created TMA descriptor) into shared memory and
// signals completion through the mbarrier's transaction-byte count.
struct SM90_TMA_LOAD_2D {
  // desc_ptr   : pointer to the TMA descriptor (tensor map)
  // mbar_ptr   : shared-memory mbarrier notified on completion
  // cache_hint : L2 cache-policy hint forwarded to the instruction
  // smem_ptr   : destination tile in shared memory
  // crd0, crd1 : element coordinates of the tile origin in the tensor
  CUTE_HOST_DEVICE static void
  copy(void const* desc_ptr, 
       uint64_t* mbar_ptr, 
       uint64_t cache_hint,
       void* smem_ptr,
       int32_t const& crd0, 
       int32_t const& crd1) {
#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
    // Shared-memory pointers must be converted to 32-bit smem addresses
    // for the PTX operands.
    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
    
    // %0 = smem dst, %1 = descriptor, %2 = mbarrier,
    // %3/%4 = tile coordinates, %5 = cache hint
    asm volatile (
      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
      ".L2::cache_hint [%0], [%1, {%3, %4}], [%2], %5;\n"
      :
      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
        "r"(crd0), "r"(crd1), "l"(cache_hint)
      : "memory");
#endif
  }
};
TMA Benefits:
  • Hardware-managed address generation
  • Automatic boundary checking
  • Integrated with async barriers
  • Support for up to 5D tensors
  • Prefetch capabilities

3. Enhanced FP64 Tensor Cores

Hopper provides improved FP64 tensor core shapes:
// From include/cutlass/arch/mma_sm90.h
// Hopper FP64 tensor-core MMA: D = A*B + C with shape m16n8k4,
// A row-major, B column-major, executed by one warp (32 threads).
template <>
struct Mma<
  gemm::GemmShape<16,8,4>,
  32,
  double,
  layout::RowMajor,
  double,
  layout::ColumnMajor,
  double,
  layout::RowMajor,
  OpMultiplyAdd> {
  
  using Shape = gemm::GemmShape<16,8,4>;
  using ArchTag = arch::Sm90;
  
  // Per-thread fragments for this shape: A holds 2 doubles, B holds 1,
  // C/D hold 4 (see the asm operand lists below).
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, 
                  FragmentB const &b, FragmentC const &c) const {
#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
    // Alias the fragments as raw double arrays for the inline PTX.
    // (The original excerpt used A/B/C/D without defining them, which
    // does not compile; CUTLASS performs these casts.)
    double       *D = reinterpret_cast<double *>(&d);
    double const *A = reinterpret_cast<double const *>(&a);
    double const *B = reinterpret_cast<double const *>(&b);
    double const *C = reinterpret_cast<double const *>(&c);

    asm volatile(
      "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn "
      "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
      : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
      : "d"(A[0]), "d"(A[1]), "d"(B[0]),
        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
#endif
  }
};
Supported FP64 Shapes:
  • 16x8x4 (new in Hopper)
  • 16x8x8 (new in Hopper)
  • 16x8x16 (new in Hopper)

4. FP8 Tensor Core Support

Hopper introduces native FP8 support with two formats:
  • E4M3: 4 exponent bits, 3 mantissa bits (better for forward pass)
  • E5M2: 5 exponent bits, 2 mantissa bits (better for gradients)
// FP8 E4M3 x E4M3 -> FP32: typical type configuration for FP8 GEMM
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementAccumulator = float;  // FP8 WGMMA accumulates in FP32

// GMMA instruction shape for FP8
// 64x8x32 (twice the K depth of the FP16 shape, since elements are 8-bit)

Instruction Shapes

| Data Type | WGMMA Shape (MxNxK) | Accumulator |
|---|---|---|
| FP8 (E4M3/E5M2) | 64x8x32, 64x16x32, 64x32x32 | FP32 |
| FP16 | 64x8x16, 64x16x16, 64x32x16 | FP16/FP32 |
| BF16 | 64x8x16, 64x16x16, 64x32x16 | FP32 |
| TF32 | 64x8x8, 64x16x8, 64x32x8 | FP32 |
| INT8 | 64x8x32, 64x16x32, 64x32x32 | INT32 |
| FP64 | 16x8x4, 16x8x8, 16x8x16 | FP64 |
Notice the significantly larger M dimension (64 vs 16) compared to Ampere's 16x8xK shapes — the K depth per data type is unchanged, but each instruction covers four times the rows. This enables more efficient computation per instruction.

Thread Block Clusters

Hopper introduces thread block clusters for multi-CTA cooperation:
// From include/cute/arch/cluster_sm90.hpp
namespace cute {

// Get cluster dimensions: number of CTAs in the cluster along x/y/z.
// Falls back to a 1x1x1 cluster on the host and on pre-Hopper devices.
CUTE_HOST_DEVICE dim3 cluster_shape() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  dim3 shape;
  // %cluster_nctaid is the special register holding the cluster extent.
  asm volatile("mov.u32 %0, %%cluster_nctaid.x;\n" : "=r"(shape.x));
  asm volatile("mov.u32 %0, %%cluster_nctaid.y;\n" : "=r"(shape.y));
  asm volatile("mov.u32 %0, %%cluster_nctaid.z;\n" : "=r"(shape.z));
  return shape;
#else
  return {1, 1, 1};
#endif
}

// Signal that this CTA has reached the cluster barrier (non-blocking).
CUTE_HOST_DEVICE void cluster_arrive() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  asm volatile("barrier.cluster.arrive;\n" ::: "memory");
#endif
}

// Block until every CTA in the cluster has called cluster_arrive().
CUTE_HOST_DEVICE void cluster_wait() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  asm volatile("barrier.cluster.wait;\n" ::: "memory");
#endif
}

} // namespace cute
Cluster Configuration:
// Zero-initialize so fields we do not set explicitly (dynamicSmemBytes,
// stream, ...) are not left indeterminate when passed to cudaLaunchKernelEx.
cudaLaunchConfig_t config = {};
config.gridDim = dim3(num_blocks_x, num_blocks_y, num_blocks_z);
config.blockDim = dim3(128, 1, 1);  // Must be multiple of 128 for WGMMA

// Configure cluster (gridDim must be divisible by the cluster dimensions)
cudaLaunchAttribute cluster_attr = {};
cluster_attr.id = cudaLaunchAttributeClusterDimension;
cluster_attr.val.clusterDim.x = 2;  // 2x1x1 cluster
cluster_attr.val.clusterDim.y = 1;
cluster_attr.val.clusterDim.z = 1;

config.attrs = &cluster_attr;
config.numAttrs = 1;

cudaLaunchKernelEx(&config, kernel, args...);

Asynchronous Barriers

Hopper enhances barrier functionality for TMA integration:
// Barrier initialization for TMA
__shared__ __align__(16) uint64_t barrier[1];

// Initialize barrier
if (threadIdx.x == 0) {
  uint32_t expected_bytes = tile_size_bytes;
  asm volatile(
    "mbarrier.init.shared.b64 [%0], %1;\n"
    :: "r"(cast_smem_ptr_to_uint(barrier)),
       "r"(expected_bytes));
}
__syncthreads();

// Issue TMA load with barrier
if (threadIdx.x == 0) {
  SM90_TMA_LOAD_2D::copy(
    desc_ptr, barrier, cache_hint,
    smem_ptr, coord0, coord1
  );
}

// Wait for TMA completion
asm volatile(
  "mbarrier.arrive_expect_tx.shared.b64 _, [%0], %1;\n"
  :: "r"(cast_smem_ptr_to_uint(barrier)),
     "r"(tile_size_bytes));

Complete WGMMA+TMA Example

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal.h"

// Illustrative configuration only -- this is not the exact CUTLASS 3.x
// template signature; production code assembles the collective with
// cutlass::gemm::collective::CollectiveBuilder. TODO confirm against the
// CUTLASS version in use.
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
  cute::Shape<cute::_128, cute::_128, cute::_64>,  // TileShape MxNxK
  cutlass::gemm::collective::CollectiveMma<
    cutlass::gemm::TileScheduler,
    cute::Shape<cute::_128, cute::_128, cute::_64>,
    cutlass::half_t,    // ElementA
    cutlass::layout::RowMajor,
    cutlass::half_t,    // ElementB  
    cutlass::layout::ColumnMajor,
    float,              // ElementAccumulator
    cute::Shape<cute::_2, cute::_1, cute::_1>,  // ClusterShape
    cutlass::gemm::collective::StageCount<3>,   // Pipeline depth
    cutlass::gemm::collective::KernelTmaWarpSpecialized  // Use WGMMA+TMA
  >,
  cutlass::epilogue::collective::DefaultEpilogue<...>  // placeholder args
>;

// Host-side adapter that owns workspace allocation and kernel launch.
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

int main() {
  Gemm gemm_op;
  
  // Problem size {M, N, K}, operand pointers with leading dimensions,
  // and epilogue scalars (D = alpha * A*B + beta * C).
  typename Gemm::Arguments args{
    cutlass::gemm::GemmUniversalMode::kGemm,
    {M, N, K},
    {A, lda, B, ldb},
    {{alpha, beta}, C, ldc, D, ldd}
  };
  
  // Kernel uses WGMMA instructions automatically
  cutlass::Status status = gemm_op(args);
  
  return status == cutlass::Status::kSuccess ? 0 : -1;
}

Performance Optimization

Optimal Tile Sizes for WGMMA

// Recommended threadblock shapes for Hopper; large M/N tiles with K = 64
// keep the WGMMA pipeline and TMA transfers well utilized.
using TileShape_128x128x64  = cute::Shape<cute::_128, cute::_128, cute::_64>;
using TileShape_128x256x64  = cute::Shape<cute::_128, cute::_256, cute::_64>;
using TileShape_256x128x64  = cute::Shape<cute::_256, cute::_128, cute::_64>;

Cluster Configuration

  • Small problems: 1x1x1 cluster (single CTA)
  • Medium problems: 2x1x1 or 1x2x1 cluster
  • Large problems: 2x2x1 cluster (maximum practical)

Pipeline Stages

  • WGMMA kernels: 3-4 stages typical
  • Balance: TMA latency vs shared memory capacity
  • Hopper H100: 228 KB shared memory per SM (up to 227 KB usable per thread block)

Compilation

# Required: sm_90a for WGMMA and TMA
nvcc -arch=sm_90a -std=c++17 example.cu -o example

# CMake
cmake .. -DCUTLASS_NVCC_ARCHS="90a"

# Verify architecture-accelerated features
nvcc --ptx -arch=sm_90a example.cu -o example.ptx
grep "wgmma" example.ptx  # Should find wgmma instructions
Using sm_90 (without “a”) will not enable WGMMA or TMA. Always use sm_90a for Hopper architecture-accelerated features.

Examples

Hopper-specific examples in CUTLASS:
  • examples/48_hopper_warp_specialized_gemm/ - WGMMA-based GEMM
  • examples/57_hopper_warp_specialized_gemm_with_epilogue_visitor/ - Epilogue fusion
  • examples/58_hopper_persistent_gemm/ - Persistent kernel design
  • examples/111_hopper_ssd/ - State Space Decomposition

See Also

Build docs developers (and LLMs) love