
Loading a Model

The primary function for loading models is llama_model_load_from_file:
LLAMA_API struct llama_model * llama_model_load_from_file(
    const char * path_model,
    struct llama_model_params params
);
path_model
const char *
Path to the GGUF model file. For split models, use the naming pattern: <name>-%05d-of-%05d.gguf
params
struct llama_model_params
Model loading parameters (see below)
return
llama_model *
Returns pointer to loaded model, or NULL on failure

Example

llama_model_params params = llama_model_default_params();
params.n_gpu_layers = 32;  // Offload 32 layers to GPU
params.use_mmap = true;

llama_model * model = llama_model_load_from_file("model.gguf", params);
if (model == NULL) {
    fprintf(stderr, "Failed to load model\n");
    return 1;
}

Model Parameters

Structure Definition

struct llama_model_params {
    // NULL-terminated list of devices for offloading
    // NOTE(review): presumably NULL means "use all available devices" — confirm against llama.h
    ggml_backend_dev_t * devices;
    
    // Buffer type overrides for tensors matching a pattern
    // (pins specific tensors to a chosen backend buffer type)
    const struct llama_model_tensor_buft_override * tensor_buft_overrides;
    
    // Number of layers to store in VRAM (-1 = all layers)
    int32_t n_gpu_layers;
    
    // How to split the model across multiple GPUs
    enum llama_split_mode split_mode;
    
    // GPU used for entire model when split_mode is LLAMA_SPLIT_MODE_NONE
    int32_t main_gpu;
    
    // Proportion of model to offload to each GPU
    // NOTE(review): presumably one entry per device — confirm expected array length
    const float * tensor_split;
    
    // Progress callback (return false to abort loading);
    // receives progress in [0.0, 1.0] (see typedef llama_progress_callback)
    llama_progress_callback progress_callback;
    // Opaque pointer forwarded unchanged to progress_callback
    void * progress_callback_user_data;
    
    // Override model metadata key-value pairs
    const struct llama_model_kv_override * kv_overrides;
    
    // Boolean flags
    bool vocab_only;       // Only load vocabulary, no weights
    bool use_mmap;         // Use mmap if possible
    bool use_direct_io;    // Use direct I/O (overrides use_mmap)
    bool use_mlock;        // Force system to keep model in RAM
    bool check_tensors;    // Validate model tensor data
    bool use_extra_bufts;  // Use extra buffer types for weight repacking
    bool no_host;          // Bypass host buffer
    bool no_alloc;         // Only load metadata, simulate allocations
};

Parameter Details

n_gpu_layers
int32_t
default:"0"
Number of model layers to offload to GPU. Use -1 to offload all layers. Set to 0 for CPU-only inference.
params.n_gpu_layers = 32;  // Offload 32 layers
params.n_gpu_layers = -1;  // Offload all layers
params.n_gpu_layers = 0;   // CPU only
split_mode
enum llama_split_mode
default:"LLAMA_SPLIT_MODE_LAYER"
How to distribute the model across multiple GPUs:
  • LLAMA_SPLIT_MODE_NONE: Single GPU
  • LLAMA_SPLIT_MODE_LAYER: Split layers and KV cache across GPUs
  • LLAMA_SPLIT_MODE_ROW: Split layers and KV cache, use tensor parallelism if supported
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
main_gpu
int32_t
default:"0"
The GPU device ID to use when split_mode is LLAMA_SPLIT_MODE_NONE.
params.main_gpu = 0;  // Use first GPU
vocab_only
bool
default:"false"
Load only the vocabulary without model weights. Useful for tokenization-only applications.
params.vocab_only = true;
use_mmap
bool
default:"true"
Use memory mapping to load the model. This can improve loading speed and reduce memory usage.
params.use_mmap = true;
use_mlock
bool
default:"false"
Force the system to keep the model in RAM, preventing swapping to disk. Requires sufficient RAM.
params.use_mlock = true;
check_tensors
bool
default:"false"
Validate model tensor data during loading. Useful for debugging corrupted models.
params.check_tensors = true;

Loading Split Models

For models split across multiple files with custom naming:
LLAMA_API struct llama_model * llama_model_load_from_splits(
    const char ** paths,
    size_t n_paths,
    struct llama_model_params params
);
paths
const char **
Array of file paths in the correct order
n_paths
size_t
Number of split files

Example

const char * paths[] = {
    "model-part1.gguf",
    "model-part2.gguf",
    "model-part3.gguf"
};

llama_model_params params = llama_model_default_params();
llama_model * model = llama_model_load_from_splits(paths, 3, params);

Progress Callback

Monitor model loading progress:
typedef bool (*llama_progress_callback)(float progress, void * user_data);
progress
float
Loading progress from 0.0 to 1.0
user_data
void *
User-provided context pointer
return
bool
Return true to continue loading, false to abort

Example

// Example progress callback: prints loading progress in place on one line.
// progress  - loading progress from 0.0 to 1.0
// user_data - user-provided context pointer (unused here)
// Returns true to continue loading; returning false aborts the load.
bool progress_callback(float progress, void * user_data) {
    (void) user_data;  // silence -Wunused-parameter; not needed in this example
    // '\r' returns the cursor to column 0 so each update overwrites the last
    printf("Loading: %.1f%%\r", progress * 100.0f);
    fflush(stdout);    // stdout is line-buffered; flush so the partial line shows
    return true;       // Continue loading
}

llama_model_params params = llama_model_default_params();
params.progress_callback = progress_callback;
params.progress_callback_user_data = NULL;

llama_model * model = llama_model_load_from_file("model.gguf", params);

Model Metadata

Access model metadata from GGUF files:
// Get metadata value by key
int32_t llama_model_meta_val_str(
    const struct llama_model * model,
    const char * key,
    char * buf,
    size_t buf_size
);

// Get number of metadata key/value pairs
int32_t llama_model_meta_count(const struct llama_model * model);

// Get metadata key by index
int32_t llama_model_meta_key_by_index(
    const struct llama_model * model,
    int32_t i,
    char * buf,
    size_t buf_size
);

// Get metadata value by index (used when iterating all pairs, see example below)
int32_t llama_model_meta_val_str_by_index(
    const struct llama_model * model,
    int32_t i,
    char * buf,
    size_t buf_size
);

// Get model description
int32_t llama_model_desc(
    const struct llama_model * model,
    char * buf,
    size_t buf_size
);

Example

char buf[256];

// Get model description
if (llama_model_desc(model, buf, sizeof(buf)) > 0) {
    printf("Model: %s\n", buf);
}

// Iterate metadata
int32_t n_meta = llama_model_meta_count(model);
for (int32_t i = 0; i < n_meta; i++) {
    char key[128], value[256];
    llama_model_meta_key_by_index(model, i, key, sizeof(key));
    llama_model_meta_val_str_by_index(model, i, value, sizeof(value));
    printf("%s = %s\n", key, value);
}

Model Properties

Query model architecture and capabilities:
// Query model architecture and capabilities
// (printing uint64_t portably requires the PRIu64 macro from <inttypes.h>)
int32_t n_layers = llama_model_n_layer(model);
int32_t n_embd = llama_model_n_embd(model);
int32_t n_head = llama_model_n_head(model);
int32_t n_ctx_train = llama_model_n_ctx_train(model);
uint64_t n_params = llama_model_n_params(model);
uint64_t size_bytes = llama_model_size(model);

printf("Layers: %d\n", n_layers);
printf("Embedding dim: %d\n", n_embd);
// %llu is wrong for uint64_t on LP64 platforms; PRIu64 is always correct
printf("Parameters: %" PRIu64 "\n", n_params);
printf("Size: %.2f GB\n", size_bytes / 1024.0 / 1024.0 / 1024.0);

Freeing Models

Free model memory when done:
void llama_model_free(struct llama_model * model);
Always free models before calling llama_backend_free(). All contexts created from the model must be freed before freeing the model.

Example

// Correct order:
llama_free(ctx);           // Free context first
llama_model_free(model);   // Then free model
llama_backend_free();      // Finally free backend

Saving Models

Save a loaded model back to a file:
void llama_model_save_to_file(
    const struct llama_model * model,
    const char * path_model
);

Fitting Parameters to Memory

Automatically adjust parameters to fit available device memory:
enum llama_params_fit_status llama_params_fit(
    const char * path_model,
    struct llama_model_params * mparams,
    struct llama_context_params * cparams,
    float * tensor_split,
    struct llama_model_tensor_buft_override * tensor_buft_overrides,
    size_t * margins,
    uint32_t n_ctx_min,
    enum ggml_log_level log_level
);
return
enum llama_params_fit_status
  • LLAMA_PARAMS_FIT_STATUS_SUCCESS: Parameters adjusted successfully
  • LLAMA_PARAMS_FIT_STATUS_FAILURE: Could not find fitting allocations
  • LLAMA_PARAMS_FIT_STATUS_ERROR: Hard error (e.g., model not found)
This function modifies the global logger state and is not thread-safe. Only parameters matching defaults are modified, except context size which is modified if equal to 0.

Complete Example

#include "llama.h"
#include <inttypes.h>  // PRIu64 for printing uint64_t portably
#include <stdio.h>

// Complete example: load a model, print its properties, and clean up.
// Returns 0 on success, 1 if the model fails to load.
int main(void) {
    // Initialize backend (must precede any other llama.cpp calls)
    llama_backend_init();
    
    // Configure model parameters
    llama_model_params params = llama_model_default_params();
    params.n_gpu_layers = 32;     // offload 32 layers to GPU
    params.use_mmap = true;       // memory-map the file where possible
    params.use_mlock = false;     // do not pin the model in RAM
    params.check_tensors = true;  // validate tensor data while loading
    
    // Load model
    const char * model_path = "models/llama-2-7b.Q4_K_M.gguf";
    llama_model * model = llama_model_load_from_file(model_path, params);
    
    if (model == NULL) {
        fprintf(stderr, "Failed to load model from %s\n", model_path);
        llama_backend_free();  // still release the backend on failure
        return 1;
    }
    
    // Print model information
    printf("Model loaded successfully\n");
    printf("  Layers: %d\n", llama_model_n_layer(model));
    printf("  Embedding dimension: %d\n", llama_model_n_embd(model));
    printf("  Context size (training): %d\n", llama_model_n_ctx_train(model));
    // llama_model_n_params returns uint64_t: use PRIu64, not %llu,
    // which mismatches on platforms where uint64_t is unsigned long
    printf("  Parameters: %" PRIu64 "\n", llama_model_n_params(model));
    printf("  Size: %.2f GB\n", 
           llama_model_size(model) / 1024.0 / 1024.0 / 1024.0);
    
    // Check capabilities
    if (llama_model_is_recurrent(model)) {
        printf("  Type: Recurrent\n");
    } else if (llama_model_has_encoder(model)) {
        printf("  Type: Encoder-decoder\n");
    } else {
        printf("  Type: Decoder-only\n");
    }
    
    // Cleanup: free the model before the backend (required order)
    llama_model_free(model);
    llama_backend_free();
    
    return 0;
}

Next Steps

Inference

Learn how to create contexts and run inference

Sampling

Configure token sampling strategies