Skip to main content

generate()

Generate text from a loaded model given a formatted prompt, optional images, and optional audio. Returns the complete generation result after all tokens are produced.
from mlx_vlm import generate

# Quick-start: generate a caption for one local image with default settings.
# Assumes model/processor were returned by load() and prompt came from
# apply_chat_template() — see the full example further down this page.
result = generate(model, processor, prompt, image=["cat.jpg"])
print(result.text)  # complete decoded response

Signature

def generate(
    model: nn.Module,
    processor: PreTrainedTokenizer,
    prompt: str,
    image: Union[str, List[str]] = None,
    audio: Union[str, List[str]] = None,
    verbose: bool = False,
    **kwargs,
) -> GenerationResult:

Parameters

model
nn.Module
required
The loaded model returned by load().
processor
PreTrainedTokenizer
required
The processor returned by load().
prompt
str
required
The formatted prompt string. Produce this with apply_chat_template().
image
str | list[str]
default:"None"
Image path(s), URL(s), or PIL.Image.Image object(s). Pass a list for multi-image inputs.
audio
str | list[str]
default:"None"
Audio file path(s) or URL(s).
verbose
bool
default:"False"
When True, prints the prompt, generated tokens, and timing statistics to stdout.
max_tokens
int
default:"256"
Maximum number of tokens to generate.
temperature
float
default:"0.0"
Sampling temperature. Use 0.0 for greedy (deterministic) decoding.
top_p
float
default:"1.0"
Nucleus sampling probability mass. Only tokens within the top-p cumulative probability are considered.
top_k
int
default:"0"
Restrict sampling to the top-k tokens. 0 disables top-k filtering.
min_p
float
default:"0.0"
Minimum probability threshold relative to the highest-probability token.
repetition_penalty
float
default:"None"
Penalty factor applied to tokens that have already appeared. Values above 1.0 discourage repetition.
repetition_context_size
int
default:"20"
Number of recent tokens to consider when applying the repetition penalty.
resize_shape
tuple[int, int] | None
default:"None"
Resize input images to (height, width) before processing.
kv_bits
int | None
default:"None"
Quantize the KV cache to this many bits. Reduces memory for long contexts.
kv_group_size
int
default:"64"
Group size for KV cache quantization.
max_kv_size
int | None
default:"None"
Maximum number of tokens to keep in the KV cache. Older entries are evicted.
prefill_step_size
int
default:"2048"
Number of tokens to process per prefill step. Lower values reduce peak memory usage during long-context prefill.
eos_tokens
list[str] | None
default:"None"
Additional end-of-sequence token strings to stop generation.
skip_special_tokens
bool
default:"False"
Exclude special tokens from the decoded output text.
enable_thinking
bool
default:"False"
Enable thinking mode in the chat template (e.g. for Qwen3.5 reasoning models).
thinking_budget
int | None
default:"None"
Maximum number of tokens allowed inside a thinking block. When exceeded, the model is forced to emit the thinking-end token.

Returns

result
GenerationResult
A dataclass with the complete generation output. See GenerationResult below.

Example

from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template

# Load a 4-bit quantized Qwen2-VL model and its processor from the Hub.
model, processor = load("mlx-community/Qwen2-VL-2B-Instruct-4bit")
config = model.config

# One remote image; num_images=1 matches the single image supplied.
image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
prompt = apply_chat_template(processor, config, "Describe this image.", num_images=1)

# verbose=True prints the prompt, generated tokens, and timing stats to stdout.
result = generate(model, processor, prompt, image=image, max_tokens=200, verbose=True)
print(result.text)

stream_generate()

A generator that yields GenerationResult objects one token at a time. Use this for streaming output to a UI or terminal.

Signature

def stream_generate(
    model: nn.Module,
    processor: PreTrainedTokenizer,
    prompt: str,
    image: Union[str, List[str]] = None,
    audio: Union[str, List[str]] = None,
    **kwargs,
) -> Generator[GenerationResult, None, None]:

Parameters

stream_generate accepts the same parameters as generate() (minus verbose), passed as **kwargs. See the generate() parameters section above.

Yields

chunk
GenerationResult
One GenerationResult per generated token. The text field contains the decoded text segment for that token. Cumulative statistics (prompt_tokens, generation_tokens, prompt_tps, etc.) are updated on every yield.

Example

from mlx_vlm import load, stream_generate
from mlx_vlm.prompt_utils import apply_chat_template

model, processor = load("mlx-community/Qwen2.5-VL-3B-Instruct-8bit")
config = model.config

image = ["photo.jpg"]
prompt = apply_chat_template(processor, config, "What do you see?", num_images=1)

# Each chunk is a GenerationResult whose .text holds only the newly decoded
# segment for that token, so printing without a newline streams the output.
for chunk in stream_generate(model, processor, prompt, image=image, max_tokens=300):
    print(chunk.text, end="", flush=True)

print()  # newline after generation

batch_generate()

Generate responses for multiple prompts in a single batch. Groups same-sized images together to avoid spatial padding within each group, which improves accuracy and throughput.

Signature

def batch_generate(
    model,
    processor,
    images: Union[str, List[str]] = None,
    audios: Union[str, List[str]] = None,
    prompts: List[str] = None,
    max_tokens: Union[int, List[int]] = 128,
    verbose: bool = False,
    group_by_shape: bool = True,
    track_image_sizes: bool = True,
    **kwargs,
) -> BatchResponse:

Parameters

model
nn.Module
required
The loaded model returned by load().
processor
PreTrainedTokenizer
required
The processor returned by load().
images
str | list[str]
default:"None"
Image paths, URLs, or PIL.Image.Image objects. One image per prompt.
audios
str | list[str]
default:"None"
Audio file paths or URLs. Batched audio is not yet fully supported.
prompts
list[str]
required
List of formatted prompt strings (one per sample). Produce each with apply_chat_template().
max_tokens
int | list[int]
default:"128"
Maximum output tokens. Pass a single integer to apply the same limit to all prompts, or a list to set per-prompt limits.
verbose
bool
default:"False"
Print batch statistics to stdout after generation.
group_by_shape
bool
default:"True"
Group images with the same spatial dimensions into sub-batches. This eliminates padding within each group and improves accuracy. Disable only if all images are already the same size.
track_image_sizes
bool
default:"True"
Record the original (height, width) of each image in the BatchResponse.image_sizes field.

Returns

response
BatchResponse
A dataclass containing generated texts and aggregate statistics. See BatchResponse below.

Example

from mlx_vlm import load, batch_generate
from mlx_vlm.prompt_utils import apply_chat_template

model, processor = load("mlx-community/Qwen2-VL-2B-Instruct-4bit")
config = model.config

# One formatted prompt per image (same question for each sample).
images = ["cat.jpg", "dog.jpg", "bird.jpg"]
prompts = [
    apply_chat_template(processor, config, "Describe this animal.", num_images=1)
    for _ in images
]

response = batch_generate(
    model, processor, images=images, prompts=prompts, max_tokens=100, verbose=True
)

# response.texts preserves the order of the input prompts list.
for i, text in enumerate(response.texts):
    print(f"[{i}] {text}")

print(f"Generation TPS: {response.stats.generation_tps:.1f}")

Data classes

GenerationResult

Returned by generate() and yielded by stream_generate().
@dataclass
class GenerationResult:
    """One generation step (stream_generate) or a full run (generate)."""

    text: str = ""                          # decoded text: per-token segment when streaming, full response otherwise
    token: Optional[int] = None             # last generated token ID
    logprobs: Optional[List[float]] = None  # log-probabilities over the vocabulary for the last token
    prompt_tokens: int = 0                  # number of tokens in the input prompt
    generation_tokens: int = 0              # number of tokens generated so far
    total_tokens: int = 0                   # prompt_tokens + generation_tokens
    prompt_tps: float = 0.0                 # prompt processing speed (tokens/sec)
    generation_tps: float = 0.0             # generation speed (tokens/sec)
    peak_memory: float = 0.0                # peak memory usage in gigabytes
text
str
Decoded text for this generation step. In stream_generate, this is the newly decoded segment for the current token. In generate, this is the full generated response.
token
int | None
The last generated token ID.
logprobs
list[float] | None
Log-probabilities over the vocabulary for the last generated token.
prompt_tokens
int
Number of tokens in the input prompt.
generation_tokens
int
Number of tokens generated so far.
total_tokens
int
Total tokens processed (prompt_tokens + generation_tokens).
prompt_tps
float
Prompt processing speed in tokens per second.
generation_tps
float
Generation speed in tokens per second.
peak_memory
float
Peak memory usage in gigabytes.

BatchResponse

Returned by batch_generate().
@dataclass
class BatchResponse:
    """Output of batch_generate(): per-prompt texts plus aggregate stats."""

    texts: List[str]  # one generated text per prompt, in input-prompt order
    stats: BatchStats  # aggregate performance statistics for the batch
    image_sizes: Optional[List[Tuple[int, int]]] = None  # original (height, width) per image; present when track_image_sizes=True
texts
list[str]
Generated text for each prompt, in the same order as the input prompts list.
stats
BatchStats
Aggregate performance statistics for the batch. See BatchStats below.
image_sizes
list[tuple[int, int]] | None
Original (height, width) for each input image. Present when track_image_sizes=True.

BatchStats

@dataclass
class BatchStats:
    """Aggregate performance statistics for one batch_generate() call."""

    # Fix: float-annotated fields previously defaulted to the int literal 0,
    # inconsistent with their annotations and with GenerationResult's 0.0
    # defaults. 0 == 0.0, so this is backward compatible for callers.
    prompt_tokens: int = 0        # total prompt tokens processed across all samples
    prompt_tps: float = 0.0       # aggregate prompt processing speed (tokens/sec)
    prompt_time: float = 0.0      # total seconds spent processing prompts
    generation_tokens: int = 0    # total tokens generated across all samples
    generation_tps: float = 0.0   # aggregate generation speed (tokens/sec)
    generation_time: float = 0.0  # total seconds spent generating tokens
    peak_memory: float = 0.0      # peak memory usage in gigabytes
prompt_tokens
int
Total prompt tokens processed across all samples.
prompt_tps
float
Aggregate prompt processing speed in tokens per second.
prompt_time
float
Total time in seconds spent processing prompts.
generation_tokens
int
Total tokens generated across all samples.
generation_tps
float
Aggregate generation speed in tokens per second.
generation_time
float
Total time in seconds spent generating tokens.
peak_memory
float
Peak memory usage in gigabytes.

Build docs developers (and LLMs) love