Skip to main content

Overview

Qwen-Image-2512 is a text-to-image generation model implemented in Rust using MLX bindings for Apple Silicon.

Key features

  • Qwen transformer: Large-scale diffusion transformer for high-quality image generation
  • Flow-matching Euler scheduler: Efficient sampling with time shifting
  • Qwen VAE: 16-channel latent space with 8x downsampling
  • Classifier-free guidance: Support for guided generation
  • Quantization support: 4-bit quantized models for reduced memory

Core types

QwenImagePipeline

End-to-end text-to-image generation pipeline.
pub struct QwenImagePipeline {
    pub transformer: QwenTransformer,
    pub vae: QwenVAE,
    pub scheduler: FlowMatchEulerScheduler,
}
transformer
QwenTransformer
The diffusion transformer model
vae
QwenVAE
VAE for encoding/decoding images
scheduler
FlowMatchEulerScheduler
Flow-matching scheduler for denoising

Methods

new
fn(transformer: QwenTransformer, vae: QwenVAE, num_inference_steps: i32, shift: f32) -> Self
Create a new generation pipeline.
transformer
QwenTransformer
Pre-loaded transformer model
vae
QwenVAE
Pre-loaded VAE model
num_inference_steps
i32
Number of denoising steps (typically 20-50)
shift
f32
Time shift parameter for scheduler (typically 3.0)
generate
fn(&mut self, encoder_hidden_states: &Array, height: i32, width: i32, num_frames: i32, seed: Option<u64>) -> Result<Array, Exception>
Generate image from text embeddings.
encoder_hidden_states
&Array
Text embeddings from text encoder [batch, seq_len, dim]
height
i32
Output image height (must be divisible by 16)
width
i32
Output image width (must be divisible by 16)
num_frames
i32
Number of frames (1 for single image)
seed
Option<u64>
Random seed for reproducibility (None for random)
returns
Array
Generated image [batch, height, width, 3] in RGB format
generate_cfg
fn(&mut self, encoder_hidden_states: &Array, null_encoder_hidden_states: &Array, height: i32, width: i32, num_frames: i32, guidance_scale: f32, seed: Option<u64>) -> Result<Array, Exception>
Generate with classifier-free guidance. The height, width, and num_frames parameters behave exactly as in generate.
encoder_hidden_states
&Array
Conditional text embeddings [batch, seq_len, dim]
null_encoder_hidden_states
&Array
Unconditional (empty prompt) embeddings [batch, seq_len, dim]
guidance_scale
f32
Guidance strength (typically 3.5-7.5, higher = more prompt adherence)
returns
Array
Generated image with CFG guidance

QwenTransformer

Diffusion transformer model.
pub struct QwenTransformer {
    pub config: QwenTransformerConfig,
    // ... internal layers
}
forward
fn(&mut self, latents: &Array, encoder_hidden_states: &Array, timestep: &Array) -> Result<Array, Exception>
Predict velocity for denoising step.
latents
&Array
Noisy latents [batch, channels, num_frames, height, width]
encoder_hidden_states
&Array
Text embeddings [batch, seq_len, dim]
timestep
&Array
Current timestep values [batch]
returns
Array
Predicted velocity for flow-matching

QwenTransformerConfig

Configuration for Qwen transformer.
pub struct QwenTransformerConfig {
    // Configuration fields
}

QwenVAE

Variational autoencoder for encoding/decoding images.
pub struct QwenVAE {
    // ... internal layers
}
encode
fn(&mut self, x: &Array) -> Result<Array, Exception>
Encode images to latent space.
x
&Array
Images [batch, channels, height, width]
returns
Array
Latents [batch, 16, height/8, width/8]. Note: decode expects an additional num_frames axis ([batch, 16, num_frames, height, width]), so insert a frame axis (e.g. of size 1) before passing encoded latents to decode.
decode
fn(&mut self, z: &Array) -> Result<Array, Exception>
Decode latents to images.
z
&Array
Latents [batch, 16, num_frames, height, width]
returns
Array
Decoded images [batch, 3, num_frames, height*8, width*8]
load_vae_from_dir
fn(dir_path: &Path) -> Result<QwenVAE, Exception>
Load VAE model from directory.
dir_path
&Path
Path to directory containing VAE weights

Scheduling

FlowMatchEulerScheduler

Flow-matching Euler discrete scheduler.
pub struct FlowMatchEulerScheduler {
    pub num_inference_steps: i32,
    pub shift: f32,
    // ... internal state
}
num_inference_steps
i32
Total number of denoising steps
shift
f32
Time shift parameter for warping timestep schedule
new
fn(num_inference_steps: i32, shift: f32) -> Self
Create a new scheduler.
num_inference_steps
i32
Number of denoising steps (typically 20-50)
shift
f32
Time shift parameter (typically 3.0)
timesteps
fn(&self) -> &[f32]
Get timestep schedule.
returns
&[f32]
Timestep values from 1.0 (pure noise) to 0.0 (clean)
sigmas
fn(&self) -> &[f32]
Get sigma schedule with time shifting.
returns
&[f32]
Shifted sigma values for each timestep
step
fn(&self, model_output: &Array, timestep_idx: usize, sample: &Array) -> Result<Array, Exception>
Perform one Euler step.
model_output
&Array
Model velocity prediction
timestep_idx
usize
Current timestep index
sample
&Array
Current noisy sample
returns
Array
Updated sample: x_{t-dt} = x_t + (sigma_{t-dt} - sigma_t) * v_pred
scale_noise
fn(&self, noise: &Array) -> Result<Array, Exception>
Scale initial noise by maximum sigma.
noise
&Array
Initial Gaussian noise
returns
Array
Scaled noise for starting the denoising loop

Quantization

QwenQuantizedTransformer

4-bit quantized transformer for reduced memory.
pub struct QwenQuantizedTransformer {
    pub config: QwenConfig,
    // ... quantized layers
}
load_transformer_weights
fn(dir_path: &Path, config: QwenConfig) -> Result<QwenQuantizedTransformer, Exception>
Load 4-bit quantized transformer from directory.
dir_path
&Path
Path to directory containing quantized weights
config
QwenConfig
Model configuration

QwenConfig

Configuration for quantized models.
pub struct QwenConfig {
    // Configuration fields
}

Text encoder

QwenTextEncoder

Text encoder for converting text to embeddings.
pub struct QwenTextEncoder {
    pub config: TextEncoderConfig,
    // ... internal layers
}
forward
fn(&mut self, input_ids: &Array) -> Result<Array, Exception>
Encode text tokens to embeddings.
input_ids
&Array
Token IDs [batch, seq_len]
returns
Array
Text embeddings [batch, seq_len, hidden_dim]
load_text_encoder
fn(dir_path: &Path, config: TextEncoderConfig) -> Result<QwenTextEncoder, Exception>
Load text encoder from directory.
dir_path
&Path
Path to directory containing encoder weights
config
TextEncoderConfig
Encoder configuration

TextEncoderConfig

Configuration for text encoder.
pub struct TextEncoderConfig {
    // Configuration fields
}

Utilities

build_attention_mask
fn(image_seq_len: i32, text_seq_len: i32, batch_size: i32) -> Result<Array, Exception>
Build attention mask for variable-length sequences.
image_seq_len
i32
Image sequence length
text_seq_len
i32
Text sequence length
batch_size
i32
Batch size
returns
Array
Attention mask [batch, 1, total_seq, total_seq]

Example usage

use qwen_image_mlx::{
    QwenImagePipeline, QwenTransformer, QwenVAE,
    QwenTextEncoder, load_text_encoder, load_vae_from_dir
};

// Load models
let text_encoder = load_text_encoder(encoder_path, config)?;
let transformer = QwenTransformer::new(transformer_config)?;
let vae = load_vae_from_dir(vae_path)?;

// Create pipeline
let mut pipeline = QwenImagePipeline::new(
    transformer,
    vae,
    20,    // num_inference_steps
    3.0,   // shift
);

// Encode text
let input_ids = tokenize("a beautiful sunset over mountains");
let encoder_hidden_states = text_encoder.forward(&input_ids)?;

// Generate image
let image = pipeline.generate(
    &encoder_hidden_states,
    512,   // height
    512,   // width
    1,     // num_frames
    Some(42), // seed
)?;

// Or with classifier-free guidance
let null_embeddings = text_encoder.forward(&tokenize(""))?;
let image = pipeline.generate_cfg(
    &encoder_hidden_states,
    &null_embeddings,
    512, 512, 1,
    7.5,   // guidance_scale
    Some(42),
)?;

Advanced: Custom denoising loop

// Manual control over denoising
let mut latents = Array::zeros(&[1, 16, 1, 64, 64])?;
let noise = mlx_rs::random::normal(&latents.shape(), None, None, None)?;
latents = pipeline.scheduler.scale_noise(&noise)?;

for (idx, &t) in pipeline.scheduler.timesteps().iter().enumerate() {
    let timestep = Array::from_slice(&[t], &[1]);
    
    // Predict velocity
    let v_pred = pipeline.transformer.forward(
        &latents,
        &encoder_hidden_states,
        &timestep,
    )?;
    
    // Euler step
    latents = pipeline.scheduler.step(&v_pred, idx, &latents)?;
}

// Decode
let image = pipeline.vae.decode(&latents)?;

Build docs developers (and LLMs) love