
Context Creation

Before running inference, create a context from a loaded model:
LLAMA_API struct llama_context * llama_init_from_model(
    struct llama_model * model,
    struct llama_context_params params
);
model
llama_model *
Previously loaded model
params
struct llama_context_params
Context configuration parameters
return
llama_context *
Returns context pointer, or NULL on failure

Example

// Start from library defaults, then override only the fields we need.
llama_context_params params = llama_context_default_params();
params.n_ctx = 2048;    // context window of 2048 tokens
params.n_batch = 512;   // decode up to 512 tokens per llama_decode() call
params.n_threads = 8;   // threads used for single-token generation

// Create the context; returns NULL on failure.
llama_context * ctx = llama_init_from_model(model, params);
if (ctx == NULL) {
    fprintf(stderr, "Failed to create context\n");
    return 1;
}

Context Parameters

// Configuration for llama_init_from_model(). Obtain defaults via
// llama_context_default_params() and override fields as needed.
struct llama_context_params {
    uint32_t n_ctx;              // Text context size (0 = from model)
    uint32_t n_batch;            // Logical maximum batch size
    uint32_t n_ubatch;           // Physical maximum batch size
    uint32_t n_seq_max;          // Max number of sequences
    int32_t  n_threads;          // Threads for generation
    int32_t  n_threads_batch;    // Threads for batch processing
    
    // Attention / positional-encoding behavior
    enum llama_rope_scaling_type rope_scaling_type;  // RoPE scaling mode
    enum llama_pooling_type      pooling_type;       // Embedding pooling mode
    enum llama_attention_type    attention_type;     // Attention variant
    enum llama_flash_attn_type   flash_attn_type;    // Flash-attention setting
    
    // RoPE parameters
    float    rope_freq_base;     // RoPE base frequency
    float    rope_freq_scale;    // RoPE frequency scaling factor
    float    yarn_ext_factor;    // YaRN extrapolation mix factor
    float    yarn_attn_factor;   // YaRN magnitude scaling factor
    float    yarn_beta_fast;     // YaRN low correction dim
    float    yarn_beta_slow;     // YaRN high correction dim
    uint32_t yarn_orig_ctx;      // YaRN original context size
    
    // Callbacks
    ggml_backend_sched_eval_callback cb_eval;  // Invoked during graph evaluation
    void * cb_eval_user_data;                  // User data passed to cb_eval
    ggml_abort_callback abort_callback;        // Allows aborting an in-flight decode
    void * abort_callback_data;                // User data passed to abort_callback
    
    // KV cache types [EXPERIMENTAL]
    enum ggml_type type_k;       // Data type of the K cache
    enum ggml_type type_v;       // Data type of the V cache
    
    // Flags
    bool embeddings;   // Extract embeddings
    bool offload_kqv;  // Offload KQV ops to GPU
    bool no_perf;      // Disable performance timings
    bool op_offload;   // Offload host tensor operations
    bool swa_full;     // Use full-size SWA cache
    bool kv_unified;   // Use unified buffer for attention
};

Key Parameters

n_ctx
uint32_t
default:"from model"
Text context window size. Maximum number of tokens the model can attend to. Use 0 to use the model’s training context size.
params.n_ctx = 2048;  // 2K context
params.n_ctx = 4096;  // 4K context
params.n_ctx = 0;     // Use model default
n_batch
uint32_t
default:"2048"
Logical maximum batch size for llama_decode(). Controls how many tokens can be processed in a single call.
params.n_batch = 512;  // Process up to 512 tokens at once
n_ubatch
uint32_t
default:"512"
Physical maximum batch size. The logical batch is split into physical batches of this size for processing.
params.n_ubatch = 256;  // Process in chunks of 256 tokens
n_threads
int32_t
default:"auto"
Number of threads for single-token generation (autoregressive decoding).
params.n_threads = 8;
n_threads_batch
int32_t
default:"auto"
Number of threads for prompt processing and batch operations.
params.n_threads_batch = 16;  // More threads for parallel prompt processing
After creating a context, query the actual values using llama_n_ctx(), llama_n_batch(), etc., as they may differ from requested values.

The Batch Structure

// Input for llama_decode()/llama_encode(). Exactly one of `token` or
// `embd` is non-NULL. Per-token arrays that are NULL fall back to the
// defaults noted on each field.
typedef struct llama_batch {
    int32_t n_tokens;           // Number of tokens in this batch
    
    llama_token  * token;       // Token IDs (when embd is NULL)
    float        * embd;        // Token embeddings (when token is NULL)
    llama_pos    * pos;         // Token positions (NULL = auto-track)
    int32_t      * n_seq_id;    // Number of sequence IDs per token
    llama_seq_id ** seq_id;     // Sequence IDs per token (NULL = seq 0)
    int8_t       * logits;      // Output logits flag (NULL = last only)
} llama_batch;

Creating Batches

// Helper for single sequence with auto-tracked positions
llama_batch llama_batch_get_one(
    llama_token * tokens,
    int32_t n_tokens
);

// Usage
llama_token tokens[] = {1, 2, 3, 4, 5};
llama_batch batch = llama_batch_get_one(tokens, 5);

Decoding

llama_decode

Process a batch of tokens through the decoder:
LLAMA_API int32_t llama_decode(
    struct llama_context * ctx,
    struct llama_batch batch
);
ctx
llama_context *
Context with memory for KV cache
batch
llama_batch
Batch of tokens to process
return
int32_t
  • 0: Success
  • 1: No KV slot available (try smaller batch or larger context)
  • 2: Aborted by callback
  • -1: Invalid input batch
  • < -1: Fatal error
llama_decode() requires the context to have memory. For encoder-decoder models, this processes the batch using the decoder.

llama_encode

Process a batch using the encoder (for encoder-decoder models):
LLAMA_API int32_t llama_encode(
    struct llama_context * ctx,
    struct llama_batch batch
);
return
int32_t
  • 0: Success
  • < 0: Error (memory state restored)
llama_encode() does not use the KV cache. It stores encoder output internally for later use by decoder’s cross-attention.

Basic Inference Loop

// Tokenize prompt
std::vector<llama_token> tokens = /* ... tokenize prompt ... */;

// Process prompt
// llama_batch_get_one() builds a single-sequence batch with auto-tracked
// positions; only the last position's logits are produced (logits = NULL).
llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
if (llama_decode(ctx, batch) != 0) {
    fprintf(stderr, "Failed to decode prompt\n");
    return 1;
}

// Generate tokens
for (int i = 0; i < n_predict; i++) {
    // Sample next token from the logits of the last decoded position (-1)
    llama_token token = llama_sampler_sample(sampler, ctx, -1);
    
    // Check for end of generation
    if (llama_vocab_is_eog(vocab, token)) {
        break;
    }
    
    // Decode single token (feeding it back in for the next prediction)
    batch = llama_batch_get_one(&token, 1);
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "Failed to decode token\n");
        return 1;
    }
}

Getting Logits and Embeddings

Logits

// Get all logits (for tokens where batch.logits[i] != 0)
float * llama_get_logits(struct llama_context * ctx);

// Get logits for ith token (supports negative indexing)
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get logits for last token; the returned array holds one float per
// vocabulary entry (n_vocab values)
float * logits = llama_get_logits_ith(ctx, -1);
int32_t n_vocab = llama_vocab_n_tokens(vocab);

// Find token with highest probability (greedy argmax over the vocabulary)
llama_token max_token = 0;
float max_logit = logits[0];
for (int32_t i = 1; i < n_vocab; i++) {
    if (logits[i] > max_logit) {
        max_logit = logits[i];
        max_token = i;
    }
}

Embeddings

// Get all embeddings
float * llama_get_embeddings(struct llama_context * ctx);

// Get embedding for ith token
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

// Get embedding for sequence
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
Enable embeddings by setting ctx_params.embeddings = true during context creation.

Memory Management (KV Cache)

The KV cache stores key-value pairs for efficient attention computation:
// Get memory handle from context
llama_memory_t llama_get_memory(const struct llama_context * ctx);

// Clear all memory (data = true also clears data buffers)
void llama_memory_clear(llama_memory_t mem, bool data);

Sequence Operations

// Remove tokens in range [p0, p1) for sequence
bool llama_memory_seq_rm(
    llama_memory_t mem,
    llama_seq_id seq_id,  // -1 = all sequences
    llama_pos p0,         // -1 = 0
    llama_pos p1          // -1 = inf
);

// Example: Remove first 10 tokens from sequence 0
llama_memory_seq_rm(mem, 0, 0, 10);

// Remove all tokens from sequence 1
llama_memory_seq_rm(mem, 1, -1, -1);

Parallel Decoding Example

// Process multiple independent sequences in parallel
llama_batch batch = llama_batch_init(512, 0, 4);  // Max 4 sequences

batch.n_tokens = 4;
batch.token    = (llama_token[]){101, 102, 103, 104};
batch.pos      = (llama_pos[]){0, 0, 0, 0};
batch.n_seq_id = (int32_t[]){1, 1, 1, 1};

// Assign to different sequences
llama_seq_id seq_id_0[] = {0};
llama_seq_id seq_id_1[] = {1};
llama_seq_id seq_id_2[] = {2};
llama_seq_id seq_id_3[] = {3};

batch.seq_id = (llama_seq_id*[]){seq_id_0, seq_id_1, seq_id_2, seq_id_3};
batch.logits = (int8_t[]){1, 1, 1, 1};  // Get logits for all

if (llama_decode(ctx, batch) != 0) {
    fprintf(stderr, "Parallel decode failed\n");
}

// Sample for each sequence
for (int i = 0; i < 4; i++) {
    llama_token token = llama_get_logits_ith(ctx, i);
    // Process token for sequence i
}

State Persistence

Save and restore context state:
// Get state size
size_t llama_state_get_size(struct llama_context * ctx);

// Save state to buffer
size_t llama_state_get_data(
    struct llama_context * ctx,
    uint8_t * dst,
    size_t size
);

// Restore state from buffer
size_t llama_state_set_data(
    struct llama_context * ctx,
    const uint8_t * src,
    size_t size
);

// Save/load state from file
bool llama_state_save_file(
    struct llama_context * ctx,
    const char * path_session,
    const llama_token * tokens,
    size_t n_token_count
);

bool llama_state_load_file(
    struct llama_context * ctx,
    const char * path_session,
    llama_token * tokens_out,
    size_t n_token_capacity,
    size_t * n_token_count_out
);

Thread Control

// Set number of threads (can be changed during inference)
void llama_set_n_threads(
    struct llama_context * ctx,
    int32_t n_threads,        // For generation
    int32_t n_threads_batch   // For batch processing
);

// Query current thread counts
int32_t llama_n_threads(struct llama_context * ctx);
int32_t llama_n_threads_batch(struct llama_context * ctx);

Synchronization

// Wait for all computations to finish
void llama_synchronize(struct llama_context * ctx);
This is automatically called when getting logits/embeddings. Explicit calls are rarely needed.

Cleanup

// Free context
void llama_free(struct llama_context * ctx);
Always free contexts before freeing the associated model.

Complete Inference Example

#include "llama.h"
#include <stdio.h>
#include <string.h>  // strlen
#include <vector>

// Complete inference example: load a model, tokenize a prompt, and
// greedily generate up to 50 tokens. Returns 0 on success, 1 on failure.
int main() {
    // Load model (see model-loading.mdx)
    llama_backend_init();
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "Failed to load model\n");
        llama_backend_free();
        return 1;
    }
    
    // Create context
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048;
    cparams.n_batch = 512;
    cparams.n_threads = 8;
    
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "Failed to create context\n");
        llama_model_free(model);
        llama_backend_free();
        return 1;
    }
    const llama_vocab * vocab = llama_model_get_vocab(model);
    
    // Tokenize. A negative return means the buffer was too small
    // (-n_tokens is the required size) or tokenization failed.
    const char * prompt = "The capital of France is";
    std::vector<llama_token> tokens(256);
    int n_tokens = llama_tokenize(vocab, prompt, strlen(prompt),
                                   tokens.data(), tokens.size(), true, false);
    if (n_tokens < 0) {
        fprintf(stderr, "Failed to tokenize prompt\n");
        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 1;
    }
    tokens.resize(n_tokens);
    
    // Process prompt
    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "Failed to process prompt\n");
        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 1;
    }
    
    // Create sampler (greedy: always pick the highest-probability token)
    llama_sampler * sampler = llama_sampler_chain_init(
        llama_sampler_chain_default_params()
    );
    llama_sampler_chain_add(sampler, llama_sampler_init_greedy());
    
    // Generate up to 50 tokens, stopping early at end-of-generation
    int n_gen = 0;
    while (n_gen < 50) {
        llama_token token = llama_sampler_sample(sampler, ctx, -1);
        
        if (llama_vocab_is_eog(vocab, token)) {
            break;
        }
        
        // Print token text
        char buf[128];
        int n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
        printf("%.*s", n, buf);
        fflush(stdout);
        
        // Feed the sampled token back in for the next prediction
        batch = llama_batch_get_one(&token, 1);
        if (llama_decode(ctx, batch) != 0) {
            fprintf(stderr, "Failed to decode\n");
            break;
        }
        
        n_gen++;
    }
    
    printf("\n");
    
    // Cleanup: free the context before the model it was created from
    llama_sampler_free(sampler);
    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    
    return 0;
}

Next Steps

Sampling

Learn about token sampling strategies

libllama Overview

Return to API overview