Skip to main content

Overview

The glm4-mlx crate provides high-performance inference for GLM-4 (General Language Model) on Apple Silicon using MLX. GLM-4 features partial RoPE, extra layer normalization, and support for 4-bit quantization.

Key features

  • Partial RoPE - Rotary position embeddings applied to half of head dimensions
  • Fused gate_up_proj - Efficient MLP implementation
  • Extra LayerNorms - post_self_attn and post_mlp normalization
  • 4-bit quantization - Reduced memory footprint with minimal quality loss

Installation

Add to your Cargo.toml:
[dependencies]
glm4-mlx = "0.1"

Core functions

load_model

Loads a GLM-4 model from a directory containing weights and configuration.
pub fn load_model(model_dir: impl AsRef<Path>) -> Result<Model, Error>
model_dir
impl AsRef<Path>
required
Path to the model directory containing:
  • config.json - Model configuration
  • model.safetensors.index.json - Weight file index
  • model-*.safetensors - Model weights
Result<Model, Error>
Result
Returns a loaded Model ready for inference

load_tokenizer

Loads the tokenizer from the model directory.
pub fn load_tokenizer(model_dir: impl AsRef<Path>) -> Result<Tokenizer, Error>
model_dir
impl AsRef<Path>
required
Path to the model directory containing tokenizer.json
Result<Tokenizer, Error>
Result
Returns a HuggingFace Tokenizer instance

get_model_args

Parses model configuration from config.json.
pub fn get_model_args(model_dir: impl AsRef<Path>) -> Result<ModelArgs, Error>
model_dir
impl AsRef<Path>
required
Path to directory containing config.json
Result<ModelArgs, Error>
Result
Returns parsed ModelArgs with model hyperparameters

Types

Model

The main model struct for GLM-4 inference.
pub struct Model {
    pub args: ModelArgs,
    pub model: Glm4Model,
    pub lm_head: Option<MaybeQuantized<nn::Linear>>,
}
args
ModelArgs
Model configuration and hyperparameters
model
Glm4Model
The core GLM-4 transformer model
lm_head
Option<MaybeQuantized<nn::Linear>>
Language modeling head (None if tie_word_embeddings is true)

ModelArgs

GLM-4 model configuration.
pub struct ModelArgs {
    pub model_type: String,
    pub hidden_size: i32,
    pub num_hidden_layers: i32,
    pub intermediate_size: i32,
    pub num_attention_heads: i32,
    pub rms_norm_eps: f32,
    pub vocab_size: i32,
    pub num_key_value_heads: i32,
    pub max_position_embeddings: i32,
    pub rope_theta: f32,
    pub head_dim: i32,
    pub tie_word_embeddings: bool,
    pub partial_rotary_factor: f32,  // GLM-4 uses 0.5
    pub attention_bias: bool,
    pub quantization: Option<QuantizationConfig>,
}
partial_rotary_factor
f32
default: 0.5
Fraction of head dimensions to apply RoPE to (GLM-4 uses 0.5)
attention_bias
bool
default: true
Whether attention layers have bias terms (GLM-4 has QKV bias)

Generate

Iterator for autoregressive text generation.
pub struct Generate<'a, C: KeyValueCache> {
    model: &'a mut Model,
    cache: &'a mut Vec<Option<C>>,
    temp: f32,
    state: GenerateState<'a>,
    prefetched: Option<Array>,
    token_count: usize,
}

Constructor

pub fn new(
    model: &'a mut Model,
    cache: &'a mut Vec<Option<C>>,
    temp: f32,
    prompt_token: &'a Array,
) -> Self
model
&'a mut Model
required
Mutable reference to the loaded model
cache
&'a mut Vec<Option<C>>
required
KV cache for attention (initially empty)
temp
f32
required
Sampling temperature (0.0 = greedy, higher = more random)
prompt_token
&'a Array
required
Encoded prompt tokens as MLX array with shape [1, seq_len]

Example usage

Basic generation

use glm4_mlx::{load_model, load_tokenizer, Generate, KVCache};
use mlx_rs::ops::indexing::NewAxis;

let model_dir = "models/glm-4-9b-chat";

// Load model and tokenizer
let tokenizer = load_tokenizer(model_dir)?;
let mut model = load_model(model_dir)?;

// Encode prompt (GLM-4 is trained on Chinese and English)
let encoding = tokenizer.encode("你好,请介绍一下", true)?;
let prompt = mlx_rs::Array::from(encoding.get_ids()).index(NewAxis);

// Initialize cache
let mut cache = Vec::new();

// Generate tokens
let generator = Generate::<KVCache>::new(&mut model, &mut cache, 0.7, &prompt);

for token in generator.take(100) {
    let token = token?;
    let text = tokenizer.decode(&[token.item::<u32>()], true)?;
    print!("{}", text);
}

Chat mode with ChatGLM format

use glm4_mlx::{load_model, load_tokenizer, Generate, KVCache};
use mlx_rs::ops::indexing::NewAxis;

let model_dir = "models/glm-4-9b-chat";
let tokenizer = load_tokenizer(model_dir)?;
let mut model = load_model(model_dir)?;

// Format chat prompt using ChatGLM format
let system = "你是一个乐于助人的AI助手。";
let user_message = "什么是量子计算?";

let prompt_text = format!(
    "[gMASK]<sop><|system|>\n{system}<|user|>\n{user_message}<|assistant|>\n"
);

let encoding = tokenizer.encode(&prompt_text, true)?;
let prompt = mlx_rs::Array::from(encoding.get_ids()).index(NewAxis);

let mut cache = Vec::new();
let generator = Generate::<KVCache>::new(&mut model, &mut cache, 0.8, &prompt);

for token in generator.take(200) {
    let token = token?;
    let id = token.item::<u32>();
    
    // Stop at the EOS token. The id used here (2) may not match your model —
    // verify against eos_token_id in the model's config.json / generation_config.json,
    // as GLM-4 chat checkpoints commonly define different stop token ids.
    if id == 2 {
        break;
    }
    
    let text = tokenizer.decode(&[id], true)?;
    print!("{}", text);
}

Architecture components

Glm4Attention

Attention with partial RoPE (applied to half of head dimensions).
pub struct Glm4Attention {
    pub n_heads: i32,
    pub n_kv_heads: i32,
    pub head_dim: i32,
    pub rope_dim: i32,  // Dimensions to apply RoPE to
    pub scale: f32,
    pub q_proj: MaybeQuantized<nn::Linear>,
    pub k_proj: MaybeQuantized<nn::Linear>,
    pub v_proj: MaybeQuantized<nn::Linear>,
    pub o_proj: MaybeQuantized<nn::Linear>,
    pub rope: nn::Rope,
}
rope_dim
i32
Number of dimensions to apply RoPE to (typically head_dim / 2)

Glm4Mlp

Feed-forward network with fused gate and up projections.
pub struct Glm4Mlp {
    pub gate_up_proj: MaybeQuantized<nn::Linear>,
    pub down_proj: MaybeQuantized<nn::Linear>,
}

Glm4DecoderLayer

Transformer layer with extra normalization layers.
pub struct Glm4DecoderLayer {
    pub self_attn: Glm4Attention,
    pub mlp: Glm4Mlp,
    pub input_layernorm: nn::RmsNorm,
    pub post_attention_layernorm: nn::RmsNorm,
    pub post_self_attn: nn::RmsNorm,     // Extra norm
    pub post_mlp: nn::RmsNorm,           // Extra norm
}

Partial RoPE implementation

GLM-4 applies RoPE to only half of the head dimensions:
let rope_dim = (args.head_dim as f32 * args.partial_rotary_factor) as i32;

// Split Q and K into rotary and non-rotary parts
let (q_rot, q_pass) = q.split(&[rope_dim], -1)?;
let (k_rot, k_pass) = k.split(&[rope_dim], -1)?;

// Apply RoPE only to rotary parts
let q_rot = rope.forward(&q_rot, offset)?;
let k_rot = rope.forward(&k_rot, offset)?;

// Concatenate back
let q = mlx_rs::ops::concatenate(&[&q_rot, &q_pass], -1)?;
let k = mlx_rs::ops::concatenate(&[&k_rot, &k_pass], -1)?;

Performance notes

  • Partial RoPE reduces computation while maintaining position awareness
  • Fused projections minimize memory bandwidth usage
  • 4-bit quantization reduces memory by ~4x
  • Metal acceleration provides optimal performance on Apple Silicon

See also

Build docs developers (and LLMs) love