Install Python dependencies (optional)
Some examples require Python tooling to download model weights. Skip this
step if you only want to build the library.
python3.10 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
Build with CMake
mkdir build && cd build
cmake ..
cmake --build . --config Release -j 8
The example binaries are placed in build/bin/.

Working examples
The two simple examples demonstrate the two main APIs.
- simple-ctx (legacy CPU API)
- simple-backend (modern multi-backend API)
This example allocates a context that owns tensor data, builds a matrix
multiplication graph, and executes it on the CPU. Key points:
simple-ctx.cpp
#include "ggml.h"
#include "ggml-cpu.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>
// Holds the two matmul operands and the context that owns their storage.
struct simple_model {
    struct ggml_tensor * a;     // left operand (rows_A x cols_A, F32)
    struct ggml_tensor * b;     // right operand (rows_B x cols_B, F32)
    struct ggml_context * ctx;  // owns tensor metadata AND data (no_alloc = false)
};
// Create a context large enough for both matrices plus graph bookkeeping,
// then copy the caller's row-major float data into freshly allocated F32
// tensors. Note: ggml orders dimensions as (ne[0] = columns, ne[1] = rows),
// so the tensors are created with (cols, rows).
void load_model(simple_model & model, float * a, float * b,
                int rows_A, int cols_A, int rows_B, int cols_B) {
    // budget for the context's memory pool
    size_t ctx_size = 0;
    ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // data of a
    ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // data of b
    ctx_size += 2 * ggml_tensor_overhead(); // metadata for the two tensors
    ctx_size += ggml_graph_overhead();      // the cgraph built later lives in this ctx too
    ctx_size += 1024;                       // slack for alignment/padding
    struct ggml_init_params params {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,   // let ggml allocate the pool itself
        /*.no_alloc   =*/ false,  // tensor data is allocated inside the context
    };
    model.ctx = ggml_init(params);
    // dimensions passed as (columns, rows) -- see note above
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
    // copy host data into the context-owned buffers
    memcpy(model.a->data, a, ggml_nbytes(model.a));
    memcpy(model.b->data, b, ggml_nbytes(model.b));
}
// Record a forward graph computing a * b^T (ggml_mul_mat treats its
// second operand as transposed). Nothing is computed yet.
struct ggml_cgraph * build_graph(const simple_model & model) {
    struct ggml_cgraph * graph = ggml_new_graph(model.ctx);
    struct ggml_tensor * product = ggml_mul_mat(model.ctx, model.a, model.b);
    ggml_build_forward_expand(graph, product);
    return graph;
}
// Build the graph, execute it single-threaded on the CPU, and return the
// tensor holding the result (the last node of the graph).
struct ggml_tensor * compute(const simple_model & model) {
    struct ggml_cgraph * graph = build_graph(model);
    ggml_graph_compute_with_ctx(model.ctx, graph, /*n_threads=*/1);
    return ggml_graph_node(graph, -1);
}
// Entry point: build two small matrices, multiply them on the CPU, and
// print the (transposed) result.
int main(void) {
    ggml_time_init();

    // 4x2 and 3x2 inputs, row-major
    const int rows_A = 4, cols_A = 2;
    float matrix_A[rows_A * cols_A] = { 2, 8, 5, 1, 4, 2, 8, 6 };
    const int rows_B = 3, cols_B = 2;
    float matrix_B[rows_B * cols_B] = { 10, 5, 9, 9, 5, 4 };

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    struct ggml_tensor * result = compute(model);

    // copy the result out of the context before freeing it
    std::vector<float> values(ggml_nelements(result));
    memcpy(values.data(), result->data, ggml_nbytes(result));

    const int n_cols = (int) result->ne[0];
    const int n_rows = (int) result->ne[1];
    printf("mul mat (%d x %d) (transposed result):\n[", n_cols, n_rows);
    for (int r = 0; r < n_rows; r++) {
        if (r > 0) {
            printf("\n");
        }
        for (int c = 0; c < n_cols; c++) {
            printf(" %.2f", values[r * n_cols + c]);
        }
    }
    printf(" ]\n");

    // releases the context and every tensor allocated in it
    ggml_free(model.ctx);
    return 0;
}
- ggml_init() creates a context that owns tensor memory (no_alloc = false).
- ggml_new_tensor_2d() allocates a tensor inside the context.
- ggml_mul_mat() records the operation in the graph — no computation yet.
- ggml_graph_compute_with_ctx() executes the graph on the CPU.
- ggml_free() releases the entire context and all its tensors.
This example uses the backend scheduler to automatically dispatch work to
the best available device (GPU if available, otherwise CPU). Key differences from simple-ctx:
simple-backend.cpp
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <cstring>
#include <vector>
// Model state for the backend example: the graph inputs, the devices in
// use, the scheduler that assigns graph nodes to them, and the host
// buffer backing the graph's metadata.
struct simple_model {
    struct ggml_tensor * a {};      // left matmul operand (data allocated by the scheduler)
    struct ggml_tensor * b {};      // right matmul operand
    ggml_backend_t backend {};      // best available backend (GPU if present)
    ggml_backend_t cpu_backend {};  // CPU backend, also the fallback device
    ggml_backend_sched_t sched {};  // dispatches graph nodes across the backends
    std::vector<uint8_t> buf;       // host storage for graph/tensor metadata (see build_graph)
};
// Input data lives in plain host arrays; it is uploaded to the backend
// buffers in compute(), after the scheduler has allocated the tensors.
const int rows_A = 4, cols_A = 2;
float matrix_A[rows_A * cols_A] = { 2, 8, 5, 1, 4, 2, 8, 6 };
const int rows_B = 3, cols_B = 2;
float matrix_B[rows_B * cols_B] = { 10, 5, 9, 9, 5, 4 };
// Discover the available backends and build a scheduler over the best
// device plus the CPU, so ops the primary backend cannot run fall back
// to the CPU.
void init_model(simple_model & model) {
    ggml_backend_load_all();                  // register all compiled/loadable backends
    model.backend = ggml_backend_init_best(); // highest-priority available device
    model.cpu_backend = ggml_backend_init_by_type(
        GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
    // NOTE(review): if the best backend is itself the CPU, both entries
    // refer to CPU backends — presumably harmless, but worth confirming.
    ggml_backend_t backends[2] = { model.backend, model.cpu_backend };
    model.sched = ggml_backend_sched_new(
        backends, nullptr, 2,
        GGML_DEFAULT_GRAPH_SIZE,
        /* last two flags: parallel=false, op offload=true — confirm against
           the ggml-backend.h prototype for the version in use */
        false, true);
}
// Build the forward graph in a metadata-only context (no_alloc = true):
// tensor *descriptors* are written into model.buf, while the actual data
// buffers are allocated later by the scheduler on the chosen device.
struct ggml_cgraph * build_graph(simple_model & model) {
    // worst-case budget: one tensor descriptor per graph node, plus the graph itself
    size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE
        + ggml_graph_overhead();
    model.buf.resize(buf_size);
    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ model.buf.data(),  // caller-owned storage, outlives the ctx
        /*.no_alloc   =*/ true, // tensors allocated later by the scheduler
    };
    struct ggml_context * ctx = ggml_init(params0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    // dimensions passed as (columns, rows), per ggml convention
    model.a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_B, rows_B);
    struct ggml_tensor * result = ggml_mul_mat(ctx, model.a, model.b);
    ggml_build_forward_expand(gf, result);
    // freeing the context here is safe: everything it produced lives in
    // model.buf, which remains valid after this function returns
    ggml_free(ctx);
    return gf;
}
// Allocate the graph's tensors on their assigned backends, upload the
// input data, run the graph, and return the last (output) node.
struct ggml_tensor * compute(simple_model & model, struct ggml_cgraph * gf) {
    ggml_backend_sched_reset(model.sched);           // clear any previous assignment
    ggml_backend_sched_alloc_graph(model.sched, gf); // must happen before tensor_set below
    // upload data from CPU memory to backend buffer
    ggml_backend_tensor_set(model.a, matrix_A, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, matrix_B, 0, ggml_nbytes(model.b));
    ggml_backend_sched_graph_compute(model.sched, gf);
    return ggml_graph_node(gf, -1);
}
// Entry point: set up the backends, run the matmul through the
// scheduler, print the result, and release all backend resources.
int main(void) {
    ggml_time_init();

    simple_model model;
    init_model(model);

    struct ggml_cgraph * graph = build_graph(model);
    struct ggml_tensor * result = compute(model, graph);

    // download the result from device memory into a host vector
    std::vector<float> values(ggml_nelements(result));
    ggml_backend_tensor_get(result, values.data(), 0, ggml_nbytes(result));

    const int n_cols = (int) result->ne[0];
    const int n_rows = (int) result->ne[1];
    printf("mul mat (%d x %d) (transposed result):\n[", n_cols, n_rows);
    for (int r = 0; r < n_rows; r++) {
        if (r > 0) {
            printf("\n");
        }
        for (int c = 0; c < n_cols; c++) {
            printf(" %.2f", values[r * n_cols + c]);
        }
    }
    printf(" ]\n");

    // free the scheduler before the backends it references
    ggml_backend_sched_free(model.sched);
    ggml_backend_free(model.backend);
    ggml_backend_free(model.cpu_backend);
    return 0;
}
- ggml_backend_load_all() discovers all compiled backends at startup.
- ggml_backend_init_best() picks the highest-priority available device.
- The context is created with no_alloc = true; the scheduler allocates tensors on the appropriate device.
- ggml_backend_tensor_set/get transfer data between CPU and device memory.
