Loading a Model
The primary function for loading models is llama_model_load_from_file:
LLAMA_API struct llama_model * llama_model_load_from_file(
    const char * path_model,
    struct llama_model_params params);
path_model — Path to the GGUF model file. For split models, use the naming pattern: <name>-%05d-of-%05d.gguf
params
struct llama_model_params
Model loading parameters (see below)
Returns pointer to loaded model, or NULL on failure
Example
llama_model_params params = llama_model_default_params();
params.n_gpu_layers = 32;   // Offload 32 layers to GPU
params.use_mmap = true;

llama_model * model = llama_model_load_from_file("model.gguf", params);
if (model == NULL) {
    fprintf(stderr, "Failed to load model\n");
    return 1;
}
Model Parameters
Structure Definition
struct llama_model_params {
// NULL-terminated list of devices for offloading
ggml_backend_dev_t * devices;
// Buffer type overrides for tensors matching a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
// Number of layers to store in VRAM (-1 = all layers)
int32_t n_gpu_layers;
// How to split the model across multiple GPUs
enum llama_split_mode split_mode;
// GPU used for entire model when split_mode is LLAMA_SPLIT_MODE_NONE
int32_t main_gpu;
// Proportion of model to offload to each GPU
const float * tensor_split;
// Progress callback (return false to abort loading)
llama_progress_callback progress_callback;
void * progress_callback_user_data;
// Override model metadata key-value pairs
const struct llama_model_kv_override * kv_overrides;
// Boolean flags
bool vocab_only; // Only load vocabulary, no weights
bool use_mmap; // Use mmap if possible
bool use_direct_io; // Use direct I/O (overrides use_mmap)
bool use_mlock; // Force system to keep model in RAM
bool check_tensors; // Validate model tensor data
bool use_extra_bufts; // Use extra buffer types for weight repacking
bool no_host; // Bypass host buffer
bool no_alloc; // Only load metadata, simulate allocations
};
Parameter Details
n_gpu_layers
Number of model layers to offload to GPU. Use -1 to offload all layers. Set to 0 for CPU-only inference.
params.n_gpu_layers = 32;   // Offload 32 layers
params.n_gpu_layers = -1;   // Offload all layers
params.n_gpu_layers = 0;    // CPU only
split_mode
enum llama_split_mode
default: "LLAMA_SPLIT_MODE_LAYER"
How to distribute the model across multiple GPUs:
LLAMA_SPLIT_MODE_NONE: Single GPU
LLAMA_SPLIT_MODE_LAYER: Split layers and KV cache across GPUs
LLAMA_SPLIT_MODE_ROW: Split layers and KV cache, use tensor parallelism if supported
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
main_gpu
The GPU device ID to use when split_mode is LLAMA_SPLIT_MODE_NONE.
params.main_gpu = 0; // Use first GPU
vocab_only
Load only the vocabulary without model weights. Useful for tokenization-only applications.
params.vocab_only = true;
Use memory mapping to load the model. This can improve loading speed and reduce memory usage.
Force the system to keep the model in RAM, preventing swapping to disk. Requires sufficient RAM.
check_tensors
Validate model tensor data during loading. Useful for debugging corrupted models.
params.check_tensors = true;
Loading Split Models
For models split across multiple files with custom naming:
LLAMA_API struct llama_model * llama_model_load_from_splits(
    const char ** paths,
    size_t n_paths,
    struct llama_model_params params);
paths — Array of file paths in the correct order
Example
const char * paths[] = {
    "model-part1.gguf",
    "model-part2.gguf",
    "model-part3.gguf"
};
llama_model_params params = llama_model_default_params();
llama_model * model = llama_model_load_from_splits(paths, 3, params);
Progress Callback
Monitor model loading progress:
typedef bool (*llama_progress_callback)(float progress, void * user_data);
progress — Loading progress from 0.0 to 1.0
user_data — User-provided context pointer
Return true to continue loading, false to abort
Example
bool progress_callback(float progress, void * user_data) {
    printf("Loading: %.1f%%\r", progress * 100.0f);
    fflush(stdout);
    return true; // Continue loading
}

llama_model_params params = llama_model_default_params();
params.progress_callback = progress_callback;
params.progress_callback_user_data = NULL;
llama_model * model = llama_model_load_from_file("model.gguf", params);
Access model metadata from GGUF files:
// Get metadata value by key
int32_t llama_model_meta_val_str(
    const struct llama_model * model,
    const char * key,
    char * buf,
    size_t buf_size);

// Get number of metadata key/value pairs
int32_t llama_model_meta_count(const struct llama_model * model);

// Get metadata key by index
int32_t llama_model_meta_key_by_index(
    const struct llama_model * model,
    int32_t i,
    char * buf,
    size_t buf_size);

// Get model description
int32_t llama_model_desc(
    const struct llama_model * model,
    char * buf,
    size_t buf_size);
Example
char buf[256];

// Get model description
if (llama_model_desc(model, buf, sizeof(buf)) > 0) {
    printf("Model: %s\n", buf);
}

// Iterate metadata
int32_t n_meta = llama_model_meta_count(model);
for (int32_t i = 0; i < n_meta; i++) {
    char key[128], value[256];
    llama_model_meta_key_by_index(model, i, key, sizeof(key));
    llama_model_meta_val_str_by_index(model, i, value, sizeof(value));
    printf("  %s = %s\n", key, value);
}
Model Properties
Query model architecture and capabilities:
Architecture
Capabilities
RoPE Configuration
int32_t n_layers = llama_model_n_layer(model);
int32_t n_embd = llama_model_n_embd(model);
int32_t n_head = llama_model_n_head(model);
int32_t n_ctx_train = llama_model_n_ctx_train(model);
uint64_t n_params = llama_model_n_params(model);
uint64_t size_bytes = llama_model_size(model);

printf("Layers: %d\n", n_layers);
printf("Embedding dim: %d\n", n_embd);
// Cast: %llu expects unsigned long long, which is not guaranteed to match uint64_t
printf("Parameters: %llu\n", (unsigned long long) n_params);
printf("Size: %.2f GB\n", size_bytes / 1024.0 / 1024.0 / 1024.0);
Freeing Models
Free model memory when done:
void llama_model_free(struct llama_model * model);
Always free models before calling llama_backend_free(). All contexts created from the model must be freed before freeing the model.
Example
// Correct order:
llama_free(ctx);          // Free context first
llama_model_free(model);  // Then free model
llama_backend_free();     // Finally free backend
Saving Models
Save a loaded model back to a file:
void llama_model_save_to_file(
    const struct llama_model * model,
    const char * path_model);
Fitting Parameters to Memory
Automatically adjust parameters to fit available device memory:
enum llama_params_fit_status llama_params_fit(
    const char * path_model,
    struct llama_model_params * mparams,
    struct llama_context_params * cparams,
    float * tensor_split,
    struct llama_model_tensor_buft_override * tensor_buft_overrides,
    size_t * margins,
    uint32_t n_ctx_min,
    enum ggml_log_level log_level);
return
enum llama_params_fit_status
LLAMA_PARAMS_FIT_STATUS_SUCCESS: Parameters adjusted successfully
LLAMA_PARAMS_FIT_STATUS_FAILURE: Could not find fitting allocations
LLAMA_PARAMS_FIT_STATUS_ERROR: Hard error (e.g., model not found)
This function modifies the global logger state and is not thread-safe . Only parameters matching defaults are modified, except context size which is modified if equal to 0.
Complete Example
#include "llama.h"
#include <stdio.h>

int main() {
    // Initialize backend
    llama_backend_init();

    // Configure model parameters
    llama_model_params params = llama_model_default_params();
    params.n_gpu_layers = 32;
    params.use_mmap = true;
    params.use_mlock = false;
    params.check_tensors = true;

    // Load model
    const char * model_path = "models/llama-2-7b.Q4_K_M.gguf";
    llama_model * model = llama_model_load_from_file(model_path, params);
    if (model == NULL) {
        fprintf(stderr, "Failed to load model from %s\n", model_path);
        llama_backend_free();
        return 1;
    }

    // Print model information
    printf("Model loaded successfully\n");
    printf("  Layers: %d\n", llama_model_n_layer(model));
    printf("  Embedding dimension: %d\n", llama_model_n_embd(model));
    printf("  Context size (training): %d\n", llama_model_n_ctx_train(model));
    // Cast: %llu expects unsigned long long, which is not guaranteed to match uint64_t
    printf("  Parameters: %llu\n", (unsigned long long) llama_model_n_params(model));
    printf("  Size: %.2f GB\n",
           llama_model_size(model) / 1024.0 / 1024.0 / 1024.0);

    // Check capabilities
    if (llama_model_is_recurrent(model)) {
        printf("  Type: Recurrent\n");
    } else if (llama_model_has_encoder(model)) {
        printf("  Type: Encoder-decoder\n");
    } else {
        printf("  Type: Decoder-only\n");
    }

    // Cleanup
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
Next Steps
Inference Learn how to create contexts and run inference
Sampling Configure token sampling strategies