create_model_and_transforms
Creates a contrastive vision-language model along with preprocessing transforms for training and validation. This is a convenience function that combines model creation with transform generation.

Signature

def create_model_and_transforms(
    model_name: str,
    pretrained: Optional[str] = None,
    load_weights: bool = True,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_context_length: Optional[int] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,
    aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
    pretrained_image: bool = False,
    pretrained_text: bool = True,
    pretrained_image_path: Optional[str] = None,
    pretrained_text_path: Optional[str] = None,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    weights_only: bool = True,
    **model_kwargs,
):
    """Create a contrastive vision-language model plus preprocessing transforms.

    Convenience factory combining model creation with generation of the
    matching image transforms. See the parameter table below for the meaning
    of each argument.

    Returns:
        Tuple of ``(model, preprocess_train, preprocess_val)`` where ``model``
        is a ``torch.nn.Module`` and the two preprocess entries are callables
        mapping a PIL image to a tensor (train includes augmentation, val is
        deterministic).
    """
    ...

Parameters

model_name
str
required
Model identifier, potentially with schema prefix:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source.
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path) ONLY if model_name has no schema.
load_weights
bool
default:"True"
If True, load the resolved pretrained weights; otherwise the model is randomly initialized (aside from any per-tower weight overrides).
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on.
jit
bool
default:"False"
If True, JIT compile the model.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture.
force_patch_dropout
Optional[float]
default:"None"
Override patch dropout value in model config.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config.
force_context_length
Optional[int]
default:"None"
Override context length in text config.
image_mean
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization mean values (per channel). Example: (0.48145466, 0.4578275, 0.40821073).
image_std
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization std values (per channel). Example: (0.26862954, 0.26130258, 0.27577711).
image_interpolation
Optional[str]
default:"None"
Override default interpolation method for image resizing. Options: 'bicubic', 'bilinear', 'nearest'.
image_resize_mode
Optional[str]
default:"None"
Override resize mode for preprocessing. Options:
  • 'squash': Resize to exact dimensions (may distort aspect ratio)
  • 'shortest': Resize shortest edge to target size, then crop
  • 'longest': Resize longest edge to target size, then pad the shorter side
aug_cfg
Optional[Union[Dict[str, Any], AugmentationCfg]]
default:"None"
Augmentation configuration for training transforms. Can be a dict or an AugmentationCfg object. Controls random crop, color jitter, etc. If None, uses model defaults. Example dict: {'scale': (0.9, 1.0), 'ratio': (1.0, 1.0), 'color_jitter': 0.4}
pretrained_image
bool
default:"False"
Load default base weights for image tower at creation if no CLIP weights loaded.
pretrained_text
bool
default:"True"
Load default base weights for text tower at creation if no CLIP weights loaded.
pretrained_image_path
Optional[str]
default:"None"
Path to load weights specifically into image tower after creation.
pretrained_text_path
Optional[str]
default:"None"
Path to load weights specifically into text tower after creation.
cache_dir
Optional[str]
default:"None"
Cache directory for downloads.
output_dict
Optional[bool]
default:"None"
If True and model supports it, return dict output.
weights_only
bool
default:"True"
Use weights_only=True for torch.load (safer).
**model_kwargs
Any
Additional keyword arguments for model constructor.

Returns

model
torch.nn.Module
The created model instance.
preprocess_train
Callable
Image preprocessing transform for training (includes augmentation like random crop, color jitter).
preprocess_val
Callable
Image preprocessing transform for validation/inference (no augmentation, deterministic).

Example

import open_clip
from PIL import Image
import torch

# Basic usage with built-in model.
# 'openai' selects the original CLIP weight release for this architecture.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='openai'
)

# With custom augmentation passed as a plain dict (see aug_cfg parameter).
aug_cfg = {
    'scale': (0.9, 1.0),
    'ratio': (1.0, 1.0),
    'color_jitter': 0.4
}
# fp16 precision here assumes a CUDA device, as requested via device='cuda'.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    aug_cfg=aug_cfg,
    device='cuda',
    precision='fp16'
)

# From Hugging Face Hub: the 'hf-hub:' schema resolves both config and
# weights from the repo, so no separate `pretrained` argument is needed.
model, train_transform, val_transform = open_clip.create_model_and_transforms(
    'hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'
)

# Use the transforms: each maps a PIL image to a model-ready tensor.
image = Image.open('example.jpg')
train_image = train_transform(image)  # Augmented
val_image = val_transform(image)      # Clean preprocessing

# Get tokenizer — must match the model architecture used above.
tokenizer = open_clip.get_tokenizer('ViT-B-32')
text = tokenizer(["a photo of a cat", "a photo of a dog"])

# Forward pass; unsqueeze(0) adds the batch dimension for a single image.
with torch.no_grad():
    image_features = model.encode_image(val_image.unsqueeze(0))
    text_features = model.encode_text(text)

Build docs developers (and LLMs) love