Skip to main content

load()

Load a model and processor from a local directory or Hugging Face Hub repo. Downloads the model automatically if it is not already cached locally.
from mlx_vlm import load

model, processor = load(path_or_hf_repo)

Signature

def load(
    path_or_hf_repo: str,
    adapter_path: Optional[str] = None,
    lazy: bool = False,
    revision: Optional[str] = None,
    **kwargs,
) -> Tuple[nn.Module, ProcessorMixin]:

Parameters

path_or_hf_repo
str
required
Local directory path or Hugging Face repository ID (e.g. "mlx-community/Qwen2-VL-2B-Instruct-4bit").
adapter_path
str
default:"None"
Path to LoRA adapter weights. When provided, LoRA layers are applied on top of the base model.
lazy
bool
default:"False"
When False, all model parameters are evaluated (loaded into memory) before the function returns. When True, parameters are loaded on first use. Set to True for large models when you want to defer memory allocation.
revision
str
default:"None"
A Hugging Face revision identifier: a branch name, tag, or commit hash. When None, the "main" revision is used.
quantize_activations
bool
default:"False"
Convert QuantizedLinear layers to QQLinear layers for activation quantization. Required when running mxfp8 or nvfp4 quantized models on NVIDIA CUDA. Has no effect on Apple Silicon (Metal).
force_download
bool
default:"False"
Force re-download from Hugging Face Hub even if the model is already cached.
trust_remote_code
bool
default:"False"
Allow execution of custom model code from the repository. Required for some models that ship non-standard architectures.

Returns

A tuple of (model, processor):
model
nn.Module
The loaded MLX model, ready for inference.
processor
ProcessorMixin
The model’s processor (tokenizer + image processor). Includes a detokenizer attribute added by MLX-VLM for streaming decoding.

Raises

  • FileNotFoundError — Config file or .safetensors weight files not found at the given path.
  • ValueError — Model class or model args class cannot be found or instantiated.

Examples

from mlx_vlm import load

model, processor = load("mlx-community/Qwen2-VL-2B-Instruct-4bit")
Note: quantize_activations=True is only needed for models quantized with mxfp8 or nvfp4 modes when running on CUDA. On Apple Silicon (Metal), those models work without the flag.

load_config()

Load the model configuration from a local directory or Hugging Face repo.

Signature

def load_config(
    model_path: Union[str, Path],
    **kwargs,
) -> dict:

Parameters

model_path
str | Path
required
Local path to a model directory or a Hugging Face repository ID. If a string is passed that is not a local path, the model is first downloaded.

Returns

config
dict
The parsed config.json for the model, with any eos_token_id overrides from generation_config.json applied.

Example

from mlx_vlm.utils import load_config

config = load_config("mlx-community/Qwen2-VL-2B-Instruct-4bit")
print(config["model_type"])  # qwen2_vl

prepare_inputs()

Tokenize prompts and preprocess images/audio into model-ready tensors.

Signature

def prepare_inputs(
    processor,
    images=None,
    audio=None,
    prompts=None,
    image_token_index=None,
    resize_shape=None,
    add_special_tokens=False,
    padding=True,
    padding_side="left",
    pad_to_uniform_size=False,
    return_tensors="mlx",
    **kwargs,
) -> dict:

Parameters

processor
ProcessorMixin
required
The model processor returned by load().
images
list | None
default:"None"
A list of image paths, URLs, or PIL.Image.Image objects.
audio
list | None
default:"None"
A list of audio file paths or URLs.
prompts
str | list[str] | None
default:"None"
The formatted prompt string or list of prompt strings.
image_token_index
int | None
default:"None"
Token index used to mark image positions in the input IDs. When None, the value is read from model.config.image_token_index.
resize_shape
tuple[int, int] | None
default:"None"
Resize all images to (height, width) before processing.
add_special_tokens
bool
default:"False"
Pass add_special_tokens to the tokenizer.
padding
bool
default:"True"
Pad tokenized inputs to the same length when processing a batch.
padding_side
str
default:"\"left\""
Side to apply padding ("left" or "right"). Left padding is standard for causal generation.
pad_to_uniform_size
bool
default:"False"
When True, resize all images to a uniform size derived from the image processor configuration. Used for batched image inputs.
return_tensors
str
default:"\"mlx\""
Tensor format to return. Keep the default "mlx" to receive MLX arrays.

Returns

A dictionary containing:
input_ids
mx.array
Token IDs of shape (batch, seq_len).
attention_mask
mx.array
Attention mask of shape (batch, seq_len).
pixel_values
mx.array | None
Preprocessed image pixels (present when images are provided).

process_image()

Load and optionally resize a single image.

Signature

def process_image(
    img: Union[str, PIL.Image.Image],
    resize_shape: Optional[Tuple[int, int]],
    image_processor,
) -> PIL.Image.Image:

Parameters

img
str | PIL.Image.Image
required
An image URL, local file path, or a PIL.Image.Image object.
resize_shape
tuple[int, int] | None
required
Target (max_width, max_height) to resize the image to. Pass None to skip resizing.
image_processor
BaseImageProcessor | None
required
The model’s image processor. When a custom BaseImageProcessor is provided, resizing is handled by that processor instead.

Returns

img
PIL.Image.Image
The loaded, RGB-converted, and optionally resized image.

Example

from mlx_vlm.utils import process_image

img = process_image("http://images.cocodataset.org/val2017/000000039769.jpg", None, None)
print(img.size)  # (640, 480)

Build docs developers (and LLMs) love