Skip to main content
All operations take a struct ggml_context * as their first argument and return a struct ggml_tensor * representing the result. Operations do not perform any computation — they record a node in the computation graph. Computation only happens when ggml_graph_compute() or ggml_graph_compute_with_ctx() is called. Most operations have an _inplace variant that writes results back into the first tensor operand, returning a view of it.

ggml_add

struct ggml_tensor * ggml_add(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Element-wise addition a + b. b is broadcast to the shape of a when necessary.

ggml_add1

struct ggml_tensor * ggml_add1(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Adds the scalar value held in tensor b to every element of a.

ggml_sub

struct ggml_tensor * ggml_sub(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Element-wise subtraction a - b.

ggml_mul

struct ggml_tensor * ggml_mul(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Element-wise multiplication a * b (Hadamard product). b is broadcast to the shape of a.

ggml_div

struct ggml_tensor * ggml_div(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Element-wise division a / b.

ggml_sqr

struct ggml_tensor * ggml_sqr(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise square a².

ggml_sqrt

struct ggml_tensor * ggml_sqrt(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise square root √a.

ggml_abs

struct ggml_tensor * ggml_abs(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise absolute value |a|.

ggml_neg

struct ggml_tensor * ggml_neg(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise negation -a.

ggml_log

struct ggml_tensor * ggml_log(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise natural logarithm ln(a).

ggml_exp

struct ggml_tensor * ggml_exp(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Element-wise exponential eᵃ.

ggml_sin / ggml_cos

struct ggml_tensor * ggml_sin(struct ggml_context * ctx, struct ggml_tensor * a);
struct ggml_tensor * ggml_cos(struct ggml_context * ctx, struct ggml_tensor * a);
Element-wise trigonometric functions.

ggml_scale

struct ggml_tensor * ggml_scale(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 s);
Multiplies every element of a by the scalar s. Equivalent to a * s.

ggml_clamp

struct ggml_tensor * ggml_clamp(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 min,
    float                 max);
Clamps every element of a to [min, max]. Operates in-place and returns a view of a.

ggml_mul_mat

struct ggml_tensor * ggml_mul_mat(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Matrix multiplication. a is the weight matrix (k columns, n rows) and b is the input (k columns, m rows — transposed internally). The result is n columns by m rows.
  • a: [ne03, ne02, n, k]
  • b: [ne03*x, ne02*y, m, k]
  • result: [ne03*x, ne02*y, m, n]
a may be quantized; b must be F32 or F16.

ggml_mul_mat_set_prec

void ggml_mul_mat_set_prec(
    struct ggml_tensor * a,
    enum ggml_prec       prec);
Overrides the accumulation precision of a ggml_mul_mat result tensor. Set to GGML_PREC_F32 for higher-precision accumulation (useful for models like Phi-2).

ggml_mul_mat_id

struct ggml_tensor * ggml_mul_mat_id(
    struct ggml_context * ctx,
    struct ggml_tensor  * as,
    struct ggml_tensor  * b,
    struct ggml_tensor  * ids);
Indirect matrix multiplication. Selects one of the weight matrices from as using the row indices in ids, then multiplies by b. Used in mixture-of-experts routing.

ggml_out_prod

struct ggml_tensor * ggml_out_prod(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Outer product. a is [m, n], b is [p, n], result is [m, p].

ggml_relu

struct ggml_tensor * ggml_relu(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Rectified linear unit: max(0, a) element-wise.

ggml_leaky_relu

struct ggml_tensor * ggml_leaky_relu(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 negative_slope,
    bool                  inplace);
Leaky ReLU: a >= 0 ? a : negative_slope * a.

ggml_gelu

struct ggml_tensor * ggml_gelu(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Gaussian Error Linear Unit. Uses the standard approximation based on tanh.

ggml_gelu_erf

struct ggml_tensor * ggml_gelu_erf(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
GELU computed using the error function (erf) when available. Some backends may fall back to the Abramowitz and Stegun approximation.

ggml_gelu_quick

struct ggml_tensor * ggml_gelu_quick(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Faster GELU approximation.

ggml_silu

struct ggml_tensor * ggml_silu(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Sigmoid Linear Unit: a * sigmoid(a).

ggml_silu_back

struct ggml_tensor * ggml_silu_back(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,  // x (forward input)
    struct ggml_tensor  * b); // dy (upstream gradient)
Backward pass of SiLU. Returns dx given x and dy.

ggml_sigmoid

struct ggml_tensor * ggml_sigmoid(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Logistic sigmoid: 1 / (1 + exp(-a)).

ggml_tanh

struct ggml_tensor * ggml_tanh(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Hyperbolic tangent.

ggml_elu

struct ggml_tensor * ggml_elu(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Exponential Linear Unit: a >= 0 ? a : exp(a) - 1.

ggml_hardswish / ggml_hardsigmoid

struct ggml_tensor * ggml_hardswish  (struct ggml_context * ctx, struct ggml_tensor * a);
struct ggml_tensor * ggml_hardsigmoid(struct ggml_context * ctx, struct ggml_tensor * a);
  • hardswish(x) = x * relu6(x + 3) / 6
  • hardsigmoid(x) = relu6(x + 3) / 6

Gated linear units

ggml provides fused GLU variants that split or gate the activation in a single op:
// Single-tensor GLU (gate in second half of row)
struct ggml_tensor * ggml_reglu  (struct ggml_context * ctx, struct ggml_tensor * a);
struct ggml_tensor * ggml_geglu  (struct ggml_context * ctx, struct ggml_tensor * a);
struct ggml_tensor * ggml_swiglu (struct ggml_context * ctx, struct ggml_tensor * a);

// Split-tensor GLU (separate tensors for input and gate)
struct ggml_tensor * ggml_reglu_split (
    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b);
struct ggml_tensor * ggml_geglu_split (
    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b);
struct ggml_tensor * ggml_swiglu_split(
    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b);

ggml_norm

struct ggml_tensor * ggml_norm(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 eps);
Layer normalization along rows. Subtracts the row mean and divides by the row standard deviation. eps is added to the variance before taking the square root for numerical stability.

ggml_rms_norm

struct ggml_tensor * ggml_rms_norm(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 eps);
Root mean square normalization along rows. Divides each row by its RMS. Commonly used in LLaMA-style transformers.

ggml_l2_norm

struct ggml_tensor * ggml_l2_norm(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    float                 eps);
L2 normalization along rows. Divides each row by its L2 norm. Used in RWKV v7.

ggml_group_norm

struct ggml_tensor * ggml_group_norm(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int                   n_groups,
    float                 eps);
Group normalization along ne0 * ne1 / n_groups channels. Commonly used in image models such as Stable Diffusion.
n_groups
int
required
Number of channel groups to normalize over.
eps
float
required
Small constant added to the variance for numerical stability.

ggml_flash_attn_ext

struct ggml_tensor * ggml_flash_attn_ext(
    struct ggml_context * ctx,
    struct ggml_tensor  * q,
    struct ggml_tensor  * k,
    struct ggml_tensor  * v,
    struct ggml_tensor  * mask,
    float                 scale,
    float                 max_bias,
    float                 logit_softcap);
Fused scaled-dot-product attention with optional ALiBi bias and logit soft-capping. This is the primary attention kernel used by llama.cpp and related projects. Tensor layout:
  • q: [n_embd_k, n_batch, n_head, ne3]
  • k: [n_embd_k, n_kv, n_head_kv, ne3]
  • v: [n_embd_v, n_kv, n_head_kv, ne3] — not pre-transposed
  • mask: [n_kv, n_batch, ne32, ne33] — F16 or F32, optional
  • result: [n_embd_v, n_head, n_batch, ne3] — permuted
scale
float
required
Attention scaling factor applied before softmax. Typically 1/sqrt(head_dim).
max_bias
float
required
Maximum ALiBi slope. Set to 0.0 to disable ALiBi bias.
logit_softcap
float
required
Soft-cap applied to logits as tanh(logit / cap) * cap. Set to 0.0 to disable.
void ggml_flash_attn_ext_set_prec(
    struct ggml_tensor * a,
    enum ggml_prec       prec);
Overrides the precision of the flash attention accumulation (e.g. GGML_PREC_F32).

ggml_soft_max_ext

struct ggml_tensor * ggml_soft_max_ext(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * mask,
    float                 scale,
    float                 max_bias);
Fused softmax with optional attention mask and ALiBi bias. Computes softmax(a * scale + mask * alibi_slope).

ggml_reshape_1d / _2d / _3d / _4d

struct ggml_tensor * ggml_reshape_1d(
    struct ggml_context * ctx, struct ggml_tensor * a, int64_t ne0);

struct ggml_tensor * ggml_reshape_2d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1);

struct ggml_tensor * ggml_reshape_3d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1, int64_t ne2);

struct ggml_tensor * ggml_reshape_4d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
Returns a view of a with the specified shape. Total element count must match. a must be contiguous.

ggml_view_1d / _2d / _3d / _4d

struct ggml_tensor * ggml_view_1d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, size_t offset);

struct ggml_tensor * ggml_view_2d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1,
    size_t nb1,   // row stride in bytes
    size_t offset);

struct ggml_tensor * ggml_view_3d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1, int64_t ne2,
    size_t nb1, size_t nb2, size_t offset);

struct ggml_tensor * ggml_view_4d(
    struct ggml_context * ctx, struct ggml_tensor * a,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3,
    size_t nb1, size_t nb2, size_t nb3, size_t offset);
Creates a view into a starting at offset bytes. Strides can differ from a, enabling sub-matrix and strided views without copying.

ggml_transpose

struct ggml_tensor * ggml_transpose(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Swaps the first two dimensions of a. Equivalent to ggml_permute(ctx, a, 1, 0, 2, 3). Returns a view; no data is copied.

ggml_permute

struct ggml_tensor * ggml_permute(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int axis0, int axis1, int axis2, int axis3);
Arbitrarily reorders the four axes of a. For example, ggml_permute(ctx, a, 2, 1, 0, 3) moves dimension 2 to position 0. Returns a non-contiguous view; no data is copied.

ggml_cont

struct ggml_tensor * ggml_cont(struct ggml_context * ctx, struct ggml_tensor * a);
Makes a contiguous copy of a if it is not already contiguous. Variants ggml_cont_1d through ggml_cont_4d also reshape while making contiguous.

ggml_sum

struct ggml_tensor * ggml_sum(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Reduces all elements to a scalar by summing.

ggml_sum_rows

struct ggml_tensor * ggml_sum_rows(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Sums along dimension 0 (rows). Input shape [a, b, c, d] → output shape [1, b, c, d].

ggml_mean

struct ggml_tensor * ggml_mean(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Computes the mean along rows.

ggml_argmax

struct ggml_tensor * ggml_argmax(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Returns the index of the maximum element along each row.

ggml_top_k

struct ggml_tensor * ggml_top_k(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int                   k);
Returns the top-k elements per row. The returned indices are not in sorted order.
Use ggml_argsort if you need fully sorted rows.

ggml_argsort

struct ggml_tensor * ggml_argsort(
    struct ggml_context  * ctx,
    struct ggml_tensor   * a,
    enum ggml_sort_order   order); // GGML_SORT_ORDER_ASC or GGML_SORT_ORDER_DESC
Returns the indices that would sort each row in the given order.

ggml_cumsum

struct ggml_tensor * ggml_cumsum(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Cumulative sum along the row dimension.

ggml_conv_1d

struct ggml_tensor * ggml_conv_1d(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,  // convolution kernel
    struct ggml_tensor  * b,  // input data
    int                   s0, // stride
    int                   p0, // padding
    int                   d0);// dilation
1D convolution of data b with kernel a.
a
struct ggml_tensor *
required
Convolution kernel tensor.
b
struct ggml_tensor *
required
Input data tensor.
s0
int
required
Stride along dimension 0.
p0
int
required
Padding along dimension 0.
d0
int
required
Dilation along dimension 0.

ggml_conv_2d

struct ggml_tensor * ggml_conv_2d(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,  // convolution kernel
    struct ggml_tensor  * b,  // input data
    int                   s0, // stride dimension 0
    int                   s1, // stride dimension 1
    int                   p0, // padding dimension 0
    int                   p1, // padding dimension 1
    int                   d0, // dilation dimension 0
    int                   d1);// dilation dimension 1
2D convolution. Implemented via ggml_im2col + ggml_mul_mat.

ggml_get_rows

struct ggml_tensor * ggml_get_rows(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,  // data   [n_embd, ne1, ne2, ne3]
    struct ggml_tensor  * b); // row indices (I32) [n_rows, ne2, ne3, 1]
Gathers rows from a by the integer indices stored in b. Used for token embedding lookup. Result shape: [n_embd, n_rows, ne2, ne3].

ggml_rope

struct ggml_tensor * ggml_rope(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,     // query or key tensor
    struct ggml_tensor  * b,     // position indices (I32), size == a->ne[2]
    int                   n_dims,// number of dimensions to rotate
    int                   mode); // GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, etc.
Applies Rotary Position Embedding (RoPE) to a. b is a 1D tensor of position indices.

ggml_rope_ext

struct ggml_tensor * ggml_rope_ext(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,          // position indices
    struct ggml_tensor  * c,          // frequency factors (optional, e.g. Phi3-128k)
    int                   n_dims,
    int                   mode,
    int                   n_ctx_orig, // original context length for YaRN scaling
    float                 freq_base,
    float                 freq_scale,
    float                 ext_factor,
    float                 attn_factor,
    float                 beta_fast,
    float                 beta_slow);
Extended RoPE with support for YaRN-style context extension and custom frequency scaling. Use this instead of the deprecated ggml_rope_custom.
c
struct ggml_tensor *
Optional per-dimension frequency scaling factors. Pass NULL to use default RoPE frequencies.
n_ctx_orig
int
required
Original training context length. Used to compute YaRN correction dimensions.
freq_base
float
required
Base frequency for the sinusoidal position encoding (e.g. 10000.0).
ext_factor
float
required
YaRN extrapolation factor. Set to 0.0 to disable YaRN.

ggml_cross_entropy_loss

struct ggml_tensor * ggml_cross_entropy_loss(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,  // logits
    struct ggml_tensor  * b); // labels
Computes cross-entropy loss between logits a and ground-truth labels b. The result is a scalar tensor. Mark it with ggml_set_loss() to use it as the optimization objective.

ggml_concat

struct ggml_tensor * ggml_concat(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,
    int                   dim);
Concatenates a and b along dimension dim.

ggml_repeat

struct ggml_tensor * ggml_repeat(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b);
Repeats (tiles) a to match the shape of b. If a already has the same shape as b and is not a parameter tensor, returns a directly.

ggml_repeat_4d

struct ggml_tensor * ggml_repeat_4d(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
Repeats a to an explicit 4D target shape.

ggml_diag

struct ggml_tensor * ggml_diag(
    struct ggml_context * ctx,
    struct ggml_tensor  * a);
Constructs a diagonal matrix from vector a.

ggml_diag_mask_inf

struct ggml_tensor * ggml_diag_mask_inf(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int                   n_past);
Sets elements above the diagonal to -INF. Used to implement causal attention masks.
n_past
int
required
Number of past tokens. Columns at or before n_past are not masked.

ggml_diag_mask_zero

struct ggml_tensor * ggml_diag_mask_zero(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    int                   n_past);
Sets elements above the diagonal to 0.

Build docs developers (and LLMs) love