create_model_and_transforms
Creates a contrastive vision-language model along with preprocessing transforms for training and validation. This is a convenience function that combines model creation with transform generation.

Signature

def create_model_and_transforms(
    model_name: str,
    pretrained: Optional[str] = None,
    load_weights: bool = True,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_context_length: Optional[int] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,
    aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
    pretrained_image: bool = False,
    pretrained_text: bool = True,
    pretrained_image_path: Optional[str] = None,
    pretrained_text_path: Optional[str] = None,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    weights_only: bool = True,
    **model_kwargs,
):
    """Create a contrastive vision-language model plus preprocessing transforms.

    Convenience factory combining model creation with generation of the
    matching image transforms. See the parameter table below for the meaning
    of each argument.

    Returns:
        Tuple of ``(model, preprocess_train, preprocess_val)`` where ``model``
        is a ``torch.nn.Module`` and the two preprocess entries are callables
        mapping a PIL image to a tensor (train includes augmentation, val is
        deterministic).
    """
    ...

Parameters

model_name
str
required
Model identifier, potentially with schema prefix:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source.
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path) ONLY if model_name has no schema.
load_weights
bool
default:"True"
If True, load the resolved pretrained weights; otherwise the model is randomly initialized (aside from any per-tower weight overrides).
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on.
jit
bool
default:"False"
If True, JIT compile the model.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture.
force_patch_dropout
Optional[float]
default:"None"
Override patch dropout value in model config.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config.
force_context_length
Optional[int]
default:"None"
Override context length in text config.
image_mean
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization mean values (per channel). Example: (0.48145466, 0.4578275, 0.40821073).
image_std
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization std values (per channel). Example: (0.26862954, 0.26130258, 0.27577711).
image_interpolation
Optional[str]
default:"None"
Override default interpolation method for image resizing. Options: 'bicubic', 'bilinear', 'nearest'.
image_resize_mode
Optional[str]
default:"None"
Override resize mode for preprocessing. Options:
  • 'squash': Resize to exact dimensions (may distort aspect ratio)
  • 'shortest': Resize shortest edge to target size, then crop
  • 'longest': Resize longest edge to target size, then pad the shorter side
aug_cfg
Optional[Union[Dict[str, Any], AugmentationCfg]]
default:"None"
Augmentation configuration for training transforms. Can be a dict or an AugmentationCfg object. Controls random crop, color jitter, etc. If None, uses model defaults. Example dict: {'scale': (0.9, 1.0), 'ratio': (1.0, 1.0), 'color_jitter': 0.4}
pretrained_image
bool
default:"False"
Load default base weights for image tower at creation if no CLIP weights loaded.
pretrained_text
bool
default:"True"
Load default base weights for text tower at creation if no CLIP weights loaded.
pretrained_image_path
Optional[str]
default:"None"
Path to load weights specifically into image tower after creation.
pretrained_text_path
Optional[str]
default:"None"
Path to load weights specifically into text tower after creation.
cache_dir
Optional[str]
default:"None"
Cache directory for downloads.
output_dict
Optional[bool]
default:"None"
If True and model supports it, return dict output.
weights_only
bool
default:"True"
Use weights_only=True for torch.load (safer).
**model_kwargs
Any
Additional keyword arguments for model constructor.

Returns

model
torch.nn.Module
The created model instance.
preprocess_train
Callable
Image preprocessing transform for training (includes augmentation like random crop, color jitter).
preprocess_val
Callable
Image preprocessing transform for validation/inference (no augmentation, deterministic).

Example

import open_clip
from PIL import Image
import torch

# Basic usage with built-in model.
# 'openai' selects the original CLIP weight release for this architecture.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='openai'
)

# With custom augmentation passed as a plain dict (see aug_cfg parameter).
aug_cfg = {
    'scale': (0.9, 1.0),
    'ratio': (1.0, 1.0),
    'color_jitter': 0.4
}
# fp16 precision here assumes a CUDA device, as requested via device='cuda'.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    aug_cfg=aug_cfg,
    device='cuda',
    precision='fp16'
)

# From Hugging Face Hub: the 'hf-hub:' schema resolves both config and
# weights from the repo, so no separate `pretrained` argument is needed.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'
)

# Use the transforms: each maps a PIL image to a model-ready tensor.
image = Image.open('example.jpg')
train_image = train_transform(image)  # Augmented
val_image = val_transform(image)      # Clean preprocessing

# Get tokenizer — must match the model architecture used above.
tokenizer = open_clip.get_tokenizer('ViT-B-32')
text = tokenizer(["a photo of a cat", "a photo of a dog"])

# Forward pass; unsqueeze(0) adds the batch dimension for a single image.
with torch.no_grad():
    image_features = model.encode_image(val_image.unsqueeze(0))
    text_features = model.encode_text(text)

Build docs developers (and LLMs) love