open_clip.create_model_from_pretrained
Creates a contrastive vision-language model from pretrained weights with optional preprocessing transform. This function enforces loading of pretrained weights and is designed for inference use cases.

Signature

def create_model_from_pretrained(
    model_name: str,
    pretrained: Optional[str] = None,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_context_length: Optional[int] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,
    return_transform: bool = True,
    cache_dir: Optional[str] = None,
    weights_only: bool = True,
    **model_kwargs,
):
    ...

Parameters

model_name
str
required
Model identifier, potentially with schema prefix:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source (required).
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub. pretrained is IGNORED.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory. pretrained is IGNORED.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path), used ONLY when model_name has no schema prefix. Required in that case: if model_name has no schema prefix and pretrained is None, an error is raised. Examples: 'openai', 'laion400m_e32', or a path to a checkpoint file.
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on. Can be 'cpu', 'cuda', or a torch.device object.
jit
bool
default:"False"
If True, JIT compile the model using torch.jit.script.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config. Useful for using models at different resolutions than they were trained at.
force_context_length
Optional[int]
default:"None"
Override context length in text config.
image_mean
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization mean values (per channel). Example: (0.48145466, 0.4578275, 0.40821073).
image_std
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization std values (per channel). Example: (0.26862954, 0.26130258, 0.27577711).
image_interpolation
Optional[str]
default:"None"
Override default interpolation method for image resizing. Options: 'bicubic', 'bilinear', 'nearest'.
image_resize_mode
Optional[str]
default:"None"
Override resize mode for inference preprocessing. Options:
  • 'squash': Resize to exact dimensions (may distort aspect ratio)
  • 'shortest': Resize shortest edge to target size, then crop
  • 'longest': Resize longest edge to target size, then pad the shorter edge to reach the target dimensions
Only affects the returned preprocessing transform, not training.
return_transform
bool
default:"True"
If True, returns (model, preprocess) tuple. If False, returns only the model.
cache_dir
Optional[str]
default:"None"
Cache directory for downloads. Defaults to ~/.cache/clip.
weights_only
bool
default:"True"
Use weights_only=True for torch.load (safer, prevents arbitrary code execution).
**model_kwargs
Any
Additional keyword arguments for model constructor (highest override priority).

Returns

model
torch.nn.Module
The created model instance with pretrained weights loaded.
preprocess
Callable
Inference preprocessing transform (only returned if return_transform=True). This is a deterministic transform without augmentation, suitable for validation and inference.

Example

import open_clip
from PIL import Image
import torch

# Load model with preprocessing
model, preprocess = open_clip.create_model_from_pretrained(
    'ViT-B-32',
    pretrained='openai'
)

# Load model without preprocessing (e.g., when using custom preprocessing)
model = open_clip.create_model_from_pretrained(
    'ViT-B-32',
    pretrained='openai',
    return_transform=False
)

# Load from Hugging Face Hub
model, preprocess = open_clip.create_model_from_pretrained(
    'hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'
)

# Load with custom image size
model, preprocess = open_clip.create_model_from_pretrained(
    'ViT-L-14',
    pretrained='openai',
    force_image_size=336,
    device='cuda',
    precision='fp16'
)

# Use for inference
image = Image.open('example.jpg')
image_input = preprocess(image).unsqueeze(0)

tokenizer = open_clip.get_tokenizer('ViT-B-32')
text = tokenizer(["a photo of a cat", "a photo of a dog"])

with torch.no_grad():
    model.eval()
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text)
    
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    print(f"Probability cat: {similarity[0, 0]:.3f}")
    print(f"Probability dog: {similarity[0, 1]:.3f}")

See also: open_clip.create_model_and_transforms, which additionally returns a training (augmentation) transform alongside the inference transform.