Skip to main content

Overview

Qwen-Image-2512 is a text-to-image generation model implemented in Rust using MLX bindings for Apple Silicon.

Key features

  • Qwen transformer: Large-scale diffusion transformer for high-quality image generation
  • Flow-matching Euler scheduler: Efficient sampling with time shifting
  • Qwen VAE: 16-channel latent space with 8x downsampling
  • Classifier-free guidance: Support for guided generation
  • Quantization support: 4-bit quantized models for reduced memory

Core types

QwenImagePipeline

End-to-end text-to-image generation pipeline.
pub struct QwenImagePipeline {
    pub transformer: QwenTransformer,
    pub vae: QwenVAE,
    pub scheduler: FlowMatchEulerScheduler,
}
transformer
QwenTransformer
The diffusion transformer model
vae
QwenVAE
VAE for encoding/decoding images
scheduler
FlowMatchEulerScheduler
Flow-matching scheduler for denoising

Methods

new
fn(transformer: QwenTransformer, vae: QwenVAE, num_inference_steps: i32, shift: f32) -> Self
Create a new generation pipeline.
transformer
QwenTransformer
Pre-loaded transformer model
vae
QwenVAE
Pre-loaded VAE model
num_inference_steps
i32
Number of denoising steps (typically 20-50)
shift
f32
Time shift parameter for scheduler (typically 3.0)
generate
fn(&mut self, encoder_hidden_states: &Array, height: i32, width: i32, num_frames: i32, seed: Option<u64>) -> Result<Array, Exception>
Generate image from text embeddings.
encoder_hidden_states
&Array
Text embeddings from text encoder [batch, seq_len, dim]
height
i32
Output image height (must be divisible by 16)
width
i32
Output image width (must be divisible by 16)
num_frames
i32
Number of frames (1 for single image)
seed
Option<u64>
Random seed for reproducibility (None for random)
returns
Array
Generated image [batch, height, width, 3] in RGB format
generate_cfg
fn(&mut self, encoder_hidden_states: &Array, null_encoder_hidden_states: &Array, height: i32, width: i32, num_frames: i32, guidance_scale: f32, seed: Option<u64>) -> Result<Array, Exception>
Generate with classifier-free guidance. The height, width, and num_frames parameters behave exactly as in generate.
encoder_hidden_states
&Array
Conditional text embeddings [batch, seq_len, dim]
null_encoder_hidden_states
&Array
Unconditional (empty prompt) embeddings [batch, seq_len, dim]
guidance_scale
f32
Guidance strength (typically 3.5-7.5, higher = more prompt adherence)
returns
Array
Generated image with CFG guidance

QwenTransformer

Diffusion transformer model.
pub struct QwenTransformer {
    pub config: QwenTransformerConfig,
    // ... internal layers
}
forward
fn(&mut self, latents: &Array, encoder_hidden_states: &Array, timestep: &Array) -> Result<Array, Exception>
Predict velocity for denoising step.
latents
&Array
Noisy latents [batch, channels, num_frames, height, width]
encoder_hidden_states
&Array
Text embeddings [batch, seq_len, dim]
timestep
&Array
Current timestep values [batch]
returns
Array
Predicted velocity for flow-matching

QwenTransformerConfig

Configuration for Qwen transformer.
pub struct QwenTransformerConfig {
    // Configuration fields
}

QwenVAE

Variational autoencoder for encoding/decoding images.
pub struct QwenVAE {
    // ... internal layers
}
encode
fn(&mut self, x: &Array) -> Result<Array, Exception>
Encode images to latent space.
x
&Array
Images [batch, channels, height, width]
returns
Array
Latents [batch, 16, height/8, width/8]. Note: decode expects an additional num_frames axis ([batch, 16, num_frames, height, width]), so insert a frame axis (e.g. of size 1) before passing encoded latents to decode.
decode
fn(&mut self, z: &Array) -> Result<Array, Exception>
Decode latents to images.
z
&Array
Latents [batch, 16, num_frames, height, width]
returns
Array
Decoded images [batch, 3, num_frames, height*8, width*8]
load_vae_from_dir
fn(dir_path: &Path) -> Result<QwenVAE, Exception>
Load VAE model from directory.
dir_path
&Path
Path to directory containing VAE weights

Scheduling

FlowMatchEulerScheduler

Flow-matching Euler discrete scheduler.
pub struct FlowMatchEulerScheduler {
    pub num_inference_steps: i32,
    pub shift: f32,
    // ... internal state
}
num_inference_steps
i32
Total number of denoising steps
shift
f32
Time shift parameter for warping timestep schedule
new
fn(num_inference_steps: i32, shift: f32) -> Self
Create a new scheduler.
num_inference_steps
i32
Number of denoising steps (typically 20-50)
shift
f32
Time shift parameter (typically 3.0)
timesteps
fn(&self) -> &[f32]
Get timestep schedule.
returns
&[f32]
Timestep values from 1.0 (pure noise) to 0.0 (clean)
sigmas
fn(&self) -> &[f32]
Get sigma schedule with time shifting.
returns
&[f32]
Shifted sigma values for each timestep
step
fn(&self, model_output: &Array, timestep_idx: usize, sample: &Array) -> Result<Array, Exception>
Perform one Euler step.
model_output
&Array
Model velocity prediction
timestep_idx
usize
Current timestep index
sample
&Array
Current noisy sample
returns
Array
Updated sample: x_{t-dt} = x_t + (sigma_{t-dt} - sigma_t) * v_pred
scale_noise
fn(&self, noise: &Array) -> Result<Array, Exception>
Scale initial noise by maximum sigma.
noise
&Array
Initial Gaussian noise
returns
Array
Scaled noise for starting the denoising loop

Quantization

QwenQuantizedTransformer

4-bit quantized transformer for reduced memory.
pub struct QwenQuantizedTransformer {
    pub config: QwenConfig,
    // ... quantized layers
}
load_transformer_weights
fn(dir_path: &Path, config: QwenConfig) -> Result<QwenQuantizedTransformer, Exception>
Load 4-bit quantized transformer from directory.
dir_path
&Path
Path to directory containing quantized weights
config
QwenConfig
Model configuration

QwenConfig

Configuration for quantized models.
pub struct QwenConfig {
    // Configuration fields
}

Text encoder

QwenTextEncoder

Text encoder for converting text to embeddings.
pub struct QwenTextEncoder {
    pub config: TextEncoderConfig,
    // ... internal layers
}
forward
fn(&mut self, input_ids: &Array) -> Result<Array, Exception>
Encode text tokens to embeddings.
input_ids
&Array
Token IDs [batch, seq_len]
returns
Array
Text embeddings [batch, seq_len, hidden_dim]
load_text_encoder
fn(dir_path: &Path, config: TextEncoderConfig) -> Result<QwenTextEncoder, Exception>
Load text encoder from directory.
dir_path
&Path
Path to directory containing encoder weights
config
TextEncoderConfig
Encoder configuration

TextEncoderConfig

Configuration for text encoder.
pub struct TextEncoderConfig {
    // Configuration fields
}

Utilities

build_attention_mask
fn(image_seq_len: i32, text_seq_len: i32, batch_size: i32) -> Result<Array, Exception>
Build attention mask for variable-length sequences.
image_seq_len
i32
Image sequence length
text_seq_len
i32
Text sequence length
batch_size
i32
Batch size
returns
Array
Attention mask [batch, 1, total_seq, total_seq]

Example usage

use qwen_image_mlx::{
    QwenImagePipeline, QwenTransformer, QwenVAE,
    QwenTextEncoder, load_text_encoder, load_vae_from_dir
};

// Load models
let text_encoder = load_text_encoder(encoder_path, config)?;
let transformer = QwenTransformer::new(transformer_config)?;
let vae = load_vae_from_dir(vae_path)?;

// Create pipeline
let mut pipeline = QwenImagePipeline::new(
    transformer,
    vae,
    20,    // num_inference_steps
    3.0,   // shift
);

// Encode text
let input_ids = tokenize("a beautiful sunset over mountains");
let encoder_hidden_states = text_encoder.forward(&input_ids)?;

// Generate image
let image = pipeline.generate(
    &encoder_hidden_states,
    512,   // height
    512,   // width
    1,     // num_frames
    Some(42), // seed
)?;

// Or with classifier-free guidance
let null_embeddings = text_encoder.forward(&tokenize(""))?;
let image = pipeline.generate_cfg(
    &encoder_hidden_states,
    &null_embeddings,
    512, 512, 1,
    7.5,   // guidance_scale
    Some(42),
)?;

Advanced: Custom denoising loop

// Manual control over denoising
let mut latents = Array::zeros(&[1, 16, 1, 64, 64])?;
let noise = mlx_rs::random::normal(&latents.shape(), None, None, None)?;
latents = pipeline.scheduler.scale_noise(&noise)?;

for (idx, &t) in pipeline.scheduler.timesteps().iter().enumerate() {
    let timestep = Array::from_slice(&[t], &[1]);
    
    // Predict velocity
    let v_pred = pipeline.transformer.forward(
        &latents,
        &encoder_hidden_states,
        &timestep,
    )?;
    
    // Euler step
    latents = pipeline.scheduler.step(&v_pred, idx, &latents)?;
}

// Decode
let image = pipeline.vae.decode(&latents)?;

Build docs developers (and LLMs) love