open_clip.create_model_from_pretrained
Creates a contrastive vision-language model from pretrained weights with optional preprocessing transform. This function enforces loading of pretrained weights and is designed for inference use cases.

Signature

def create_model_from_pretrained(
    model_name: str,
    pretrained: Optional[str] = None,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_context_length: Optional[int] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,
    return_transform: bool = True,
    cache_dir: Optional[str] = None,
    weights_only: bool = True,
    **model_kwargs,
):
    ...

Parameters

model_name
str
required
Model identifier, potentially with schema prefix:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source (required).
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub. pretrained is IGNORED.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory. pretrained is IGNORED.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path), used ONLY when model_name has no schema prefix. Required in that case: if model_name has no schema prefix and pretrained is None, an error is raised. Examples: 'openai', 'laion400m_e32', or a path to a checkpoint file.
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on. Can be 'cpu', 'cuda', or a torch.device object.
jit
bool
default:"False"
If True, JIT compile the model using torch.jit.script.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config. Useful for using models at different resolutions than they were trained at.
force_context_length
Optional[int]
default:"None"
Override context length in text config.
image_mean
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization mean values (per channel). Example: (0.48145466, 0.4578275, 0.40821073).
image_std
Optional[Tuple[float, ...]]
default:"None"
Override default image normalization std values (per channel). Example: (0.26862954, 0.26130258, 0.27577711).
image_interpolation
Optional[str]
default:"None"
Override default interpolation method for image resizing. Options: 'bicubic', 'bilinear', 'nearest'.
image_resize_mode
Optional[str]
default:"None"
Override resize mode for inference preprocessing. Options:
  • 'squash': Resize to exact dimensions (may distort aspect ratio)
  • 'shortest': Resize shortest edge to target size, then crop
  • 'longest': Resize longest edge to target size, then pad the shorter edge to reach the target dimensions
Only affects the returned preprocessing transform, not training.
return_transform
bool
default:"True"
If True, returns (model, preprocess) tuple. If False, returns only the model.
cache_dir
Optional[str]
default:"None"
Cache directory for downloads. Defaults to ~/.cache/clip.
weights_only
bool
default:"True"
Use weights_only=True for torch.load (safer, prevents arbitrary code execution).
**model_kwargs
Any
Additional keyword arguments for model constructor (highest override priority).

Returns

model
torch.nn.Module
The created model instance with pretrained weights loaded.
preprocess
Callable
Inference preprocessing transform (only returned if return_transform=True). This is a deterministic transform without augmentation, suitable for validation and inference.

Example

import open_clip
from PIL import Image
import torch

# Load model with preprocessing
model, preprocess = open_clip.create_model_from_pretrained(
    'ViT-B-32',
    pretrained='openai'
)

# Load model without preprocessing (e.g., when using custom preprocessing)
model = open_clip.create_model_from_pretrained(
    'ViT-B-32',
    pretrained='openai',
    return_transform=False
)

# Load from Hugging Face Hub
model, preprocess = open_clip.create_model_from_pretrained(
    'hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'
)

# Load with custom image size
model, preprocess = open_clip.create_model_from_pretrained(
    'ViT-L-14',
    pretrained='openai',
    force_image_size=336,
    device='cuda',
    precision='fp16'
)

# Use for inference
image = Image.open('example.jpg')
image_input = preprocess(image).unsqueeze(0)

tokenizer = open_clip.get_tokenizer('ViT-B-32')
text = tokenizer(["a photo of a cat", "a photo of a dog"])

with torch.no_grad():
    model.eval()
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text)
    
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    print(f"Probability cat: {similarity[0, 0]:.3f}")
    print(f"Probability dog: {similarity[0, 1]:.3f}")

See also: open_clip.create_model_and_transforms, which additionally returns a training (augmentation) transform alongside the inference transform.