Skip to main content

Overview

cutlass::Array is a statically sized array template that accommodates all CUTLASS-supported numeric types and is safe to use in a union. It provides STL-compatible interfaces including iterators, element access, and various operations. Header: cutlass/array.h

Template Signature

template <
  typename T,
  int N,
  bool RegisterSized = sizeof_bits<T>::value >= 32
>
struct Array;

Template Parameters

T
typename
Element type. Can be any CUTLASS numeric type including float, half_t, int8_t, etc.
N
int
Number of elements in the array
RegisterSized
bool
default:"sizeof_bits<T>::value >= 32"
Internal optimization flag. Set to true for types with 32 or more bits.

Member Types

using Storage = T;
using Element = T;

static constexpr size_t kStorageElements = N;
static constexpr size_t kElements = N;

// C++ standard container types
typedef T value_type;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef value_type &reference;
typedef value_type const & const_reference;
typedef value_type *pointer;
typedef value_type const * const_pointer;

Data Members

Storage storage[kElements];
Internal storage for array elements.

Member Functions

Element Access

at

CUTLASS_HOST_DEVICE
reference at(size_type pos);

CUTLASS_HOST_DEVICE
const_reference at(size_type pos) const;
Accesses element at specified position with bounds checking.

operator[]

CUTLASS_HOST_DEVICE
reference operator[](size_type pos);

CUTLASS_HOST_DEVICE
const_reference operator[](size_type pos) const;
Accesses element at specified position without bounds checking.

front

CUTLASS_HOST_DEVICE
reference front();

CUTLASS_HOST_DEVICE
const_reference front() const;
Returns reference to the first element.

back

CUTLASS_HOST_DEVICE
reference back();

CUTLASS_HOST_DEVICE
const_reference back() const;
Returns reference to the last element.

data

CUTLASS_HOST_DEVICE
pointer data();

CUTLASS_HOST_DEVICE
const_pointer data() const;
Returns pointer to the underlying array.

raw_data

CUTLASS_HOST_DEVICE
pointer raw_data();

CUTLASS_HOST_DEVICE
const_pointer raw_data() const;
Returns pointer to the underlying storage.

Capacity

empty

CUTLASS_HOST_DEVICE
constexpr bool empty() const;
Returns true if the array is empty (N == 0).

size

CUTLASS_HOST_DEVICE
constexpr size_type size() const;
Returns the number of elements (N).

max_size

CUTLASS_HOST_DEVICE
constexpr size_type max_size() const;
Returns the maximum number of elements (N).

Operations

clear

CUTLASS_HOST_DEVICE
void clear();
Sets all elements to zero using fill(T(0)).

fill

CUTLASS_HOST_DEVICE
void fill(T const &value);
Fills all elements with the specified value. Implementation:
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < int(kElements); ++i) {
  storage[i] = static_cast<Storage>(value);
}

Iterators

begin / end

CUTLASS_HOST_DEVICE
iterator begin();

CUTLASS_HOST_DEVICE
const_iterator begin() const;

CUTLASS_HOST_DEVICE
const_iterator cbegin() const;

CUTLASS_HOST_DEVICE
iterator end();

CUTLASS_HOST_DEVICE
const_iterator end() const;

CUTLASS_HOST_DEVICE
const_iterator cend() const;
Returns iterators to the beginning and end of the array.

rbegin / rend

CUTLASS_HOST_DEVICE
reverse_iterator rbegin();

CUTLASS_HOST_DEVICE
const_reverse_iterator rbegin() const;

CUTLASS_HOST_DEVICE
const_reverse_iterator crbegin() const;

CUTLASS_HOST_DEVICE
reverse_iterator rend();

CUTLASS_HOST_DEVICE
const_reverse_iterator rend() const;

CUTLASS_HOST_DEVICE
const_reverse_iterator crend() const;
Returns reverse iterators.

Factory Functions

make_Array

template <typename Element>
CUTLASS_HOST_DEVICE
Array<Element, 1> make_Array(Element x);

template <typename Element>
CUTLASS_HOST_DEVICE
Array<Element, 2> make_Array(Element x, Element y);

template <typename Element>
CUTLASS_HOST_DEVICE
Array<Element, 3> make_Array(Element x, Element y, Element z);

template <typename Element>
CUTLASS_HOST_DEVICE
Array<Element, 4> make_Array(Element x, Element y, Element z, Element w);
Factory functions to construct arrays with 1-4 elements.

Numeric Specializations

Array provides specialized implementations of numeric operations for SIMD efficiency on supported types like half_t.

Arithmetic Operations

template <typename T, int N>
struct plus<Array<T, N>>;

template <typename T, int N>
struct minus<Array<T, N>>;

template <typename T, int N>
struct multiplies<Array<T, N>>;

template <typename T, int N>
struct divides<Array<T, N>>;

template <typename T, int N>
struct negate<Array<T, N>>;

Comparison Operations

template <typename T, int N, bool PropagateNaN>
struct maximum<Array<T, N>, PropagateNaN>;

template <typename T, int N, bool PropagateNaN>
struct minimum<Array<T, N>, PropagateNaN>;

Fused Operations

template <typename T, int N>
struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>>;

template <typename T, int N>
struct multiply_add_relu0<Array<T, N>, Array<T, N>, Array<T, N>>;

Mathematical Functions

template <typename T, int N>
struct absolute_value_op<Array<T, N>>;

template <typename T, int N>
struct inverse_square_root<Array<T, N>>;

template <typename T, int N>
struct reciprocal_approximate<Array<T, N>>;

Usage Examples

Basic Array Operations

// Create and initialize an array
cutlass::Array<float, 4> arr;
arr.fill(1.0f);

// Access elements
arr[0] = 2.0f;
float val = arr.at(1);

// Iterate over elements
for (auto it = arr.begin(); it != arr.end(); ++it) {
  *it *= 2.0f;
}

// Range-based for loop
for (float& elem : arr) {
  elem += 1.0f;
}

Factory Functions

// Create arrays with specific values
auto vec3 = cutlass::make_Array(1.0f, 2.0f, 3.0f);
auto vec4 = cutlass::make_Array(1.0f, 2.0f, 3.0f, 4.0f);

Numeric Operations

cutlass::Array<float, 4> a, b, c;
a.fill(2.0f);
b.fill(3.0f);

// Element-wise addition
cutlass::plus<cutlass::Array<float, 4>> add_op;
c = add_op(a, b);  // c = {5, 5, 5, 5}

// Multiply-add: c = a * b + c
cutlass::multiply_add<cutlass::Array<float, 4>> ma_op;
c = ma_op(a, b, c);

With Half-Precision Types

// Array of half-precision values
cutlass::Array<cutlass::half_t, 8> half_arr;
half_arr.fill(cutlass::half_t(1.5f));

// SIMD-optimized operations on half_t arrays
cutlass::multiplies<cutlass::Array<cutlass::half_t, 8>> mul_op;
auto result = mul_op(half_arr, cutlass::half_t(2.0f));

In Device Code

__global__ void kernel() {
  // Arrays work in both host and device code
  cutlass::Array<float, 4> fragment;
  
  // Initialize
  fragment.clear();
  
  // Compute
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < fragment.size(); ++i) {
    fragment[i] = threadIdx.x * i;
  }
}

SIMD Optimization

For half_t arrays, operations are automatically SIMD-optimized on SM_53 and later:
// From include/cutlass/array.h:1180-1212
template <int N>
struct plus<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, 
                              Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]);
    }

    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

    #else
    // Fallback for host code or older architectures
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] + rhs[i];
    }
    #endif

    return result;
  }
};

See Also

Build docs developers (and LLMs) love