Image Preprocessing
OpenCLIP provides flexible image preprocessing with automatic configuration based on model requirements and customizable augmentation strategies.

Quick Start

Preprocessing is automatically configured when loading models:
import open_clip
from PIL import Image

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k'
)

# Use for training
image_train = preprocess_train(Image.open('train.jpg'))

# Use for inference/validation
image_val = preprocess_val(Image.open('val.jpg'))

Preprocessing Configuration

PreprocessCfg

The PreprocessCfg dataclass defines all preprocessing parameters:
from open_clip import PreprocessCfg

config = PreprocessCfg(
    size=224,                          # Image size
    mode='RGB',                        # Color mode
    mean=(0.48145466, 0.4578275, 0.40821073),  # Normalization mean
    std=(0.26862954, 0.26130258, 0.27577711),  # Normalization std
    interpolation='bicubic',           # Resize interpolation
    resize_mode='shortest',            # Resize strategy
    fill_color=0                       # Padding fill color
)
size
int | Tuple[int, int]
default:"224"
Target image size. Can be int for square images or (height, width) tuple.
mean
Tuple[float, float, float]
RGB mean values for normalization. Defaults to OpenAI CLIP values: (0.48145466, 0.4578275, 0.40821073)
std
Tuple[float, float, float]
RGB standard deviation for normalization. Defaults to: (0.26862954, 0.26130258, 0.27577711)
interpolation
str
default:"bicubic"
Resize interpolation method: 'bicubic', 'bilinear', or 'nearest'
resize_mode
str
default:"shortest"
Resize strategy:
  • 'shortest': Resize shortest edge, then center crop
  • 'longest': Resize longest edge, then center crop/pad
  • 'squash': Direct resize to target size (may distort)

Creating Transforms

image_transform_v2()

Create preprocessing transforms from configuration:
from open_clip import image_transform_v2, PreprocessCfg, AugmentationCfg

# Create config
preprocess_cfg = PreprocessCfg(
    size=224,
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
    interpolation='bicubic',
    resize_mode='shortest'
)

# Training transform with augmentation
train_transform = image_transform_v2(
    preprocess_cfg,
    is_train=True,
    aug_cfg={'scale': (0.9, 1.0), 'color_jitter': (0.4, 0.4, 0.4, 0.1)}
)

# Validation transform (no augmentation)
val_transform = image_transform_v2(
    preprocess_cfg,
    is_train=False
)

Resize Modes

Shortest Edge (Default)

Resize shortest edge to target, then center crop:
from open_clip import image_transform_v2, PreprocessCfg

config = PreprocessCfg(size=224, resize_mode='shortest')
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize shortest edge to 224 (~299x224) -> center crop to 224x224
This is the default for most CLIP models and preserves aspect ratio before cropping.

Longest Edge

Resize longest edge, pad to square:
config = PreprocessCfg(size=224, resize_mode='longest', fill_color=0)
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize to 224x168 -> pad to 224x224
Useful when preserving all image content is important.

Squash Mode

Direct resize (may distort aspect ratio):
config = PreprocessCfg(size=224, resize_mode='squash')
transform = image_transform_v2(config, is_train=False)

# Example: 800x600 image -> resize to 224x224 (distorted)
Used by SigLIP models and some other architectures.

Augmentation Configuration

AugmentationCfg

Configure training data augmentation:
from open_clip import AugmentationCfg

aug_cfg = AugmentationCfg(
    scale=(0.9, 1.0),                    # Random crop scale range
    ratio=(0.75, 1.33),                  # Random crop aspect ratio
    color_jitter=(0.4, 0.4, 0.4, 0.1),  # (brightness, contrast, saturation, hue)
    color_jitter_prob=0.8,               # Probability of applying color jitter
    gray_scale_prob=0.2,                 # Probability of grayscale conversion
    use_timm=False                       # Use timm augmentation library
)
scale
Tuple[float, float]
default:"(0.9, 1.0)"
Scale range for RandomResizedCrop. Values are fractions of original image size.
ratio
Tuple[float, float]
Aspect ratio range for RandomResizedCrop
color_jitter
Tuple[float, ...]
Color jitter parameters: (brightness, contrast, saturation, hue)
color_jitter_prob
float
Probability of applying color jitter (0.0 to 1.0)
gray_scale_prob
float
Probability of converting to grayscale (0.0 to 1.0)
use_timm
bool
default:"False"
Use timm library's augmentation (RandAugment, etc.)

Example Augmentation Configs

# Light augmentation
light_aug = AugmentationCfg(
    scale=(0.95, 1.0),
    color_jitter=(0.2, 0.2, 0.2, 0.05),
    color_jitter_prob=0.5
)

# Standard augmentation (default)
standard_aug = AugmentationCfg(
    scale=(0.9, 1.0),
    color_jitter=(0.4, 0.4, 0.4, 0.1),
    color_jitter_prob=0.8,
    gray_scale_prob=0.2
)

# Strong augmentation
strong_aug = AugmentationCfg(
    scale=(0.8, 1.0),
    color_jitter=(0.6, 0.6, 0.6, 0.2),
    color_jitter_prob=1.0,
    gray_scale_prob=0.3,
    use_timm=True,
    re_prob=0.25  # Random erasing
)

Normalization

Standard Normalization Values

Different model families use different normalization:
from open_clip.constants import (
    OPENAI_DATASET_MEAN,  # (0.48145466, 0.4578275, 0.40821073)
    OPENAI_DATASET_STD,   # (0.26862954, 0.26130258, 0.27577711)
    IMAGENET_MEAN,        # (0.485, 0.456, 0.406)
    IMAGENET_STD,         # (0.229, 0.224, 0.225)
    INCEPTION_MEAN,       # (0.5, 0.5, 0.5)
    INCEPTION_STD         # (0.5, 0.5, 0.5)
)

# OpenAI CLIP models (default)
config = PreprocessCfg(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD)

# SigLIP models
config = PreprocessCfg(mean=INCEPTION_MEAN, std=INCEPTION_STD)

# CLIPA models
config = PreprocessCfg(mean=IMAGENET_MEAN, std=IMAGENET_STD)
Using incorrect normalization values will significantly degrade model performance. Always use the values the model was trained with.

Custom Preprocessing

Override Model Defaults

import open_clip

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
    # Override defaults
    image_mean=(0.5, 0.5, 0.5),
    image_std=(0.5, 0.5, 0.5),
    image_interpolation='bilinear',
    image_resize_mode='squash'
)

Manual Transform Pipeline

from torchvision import transforms
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

# Custom preprocessing pipeline
custom_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=OPENAI_DATASET_MEAN,
        std=OPENAI_DATASET_STD
    )
])

# Use with model
from PIL import Image
image = Image.open('example.jpg')
image_tensor = custom_transform(image).unsqueeze(0)

Advanced Features

Non-Square Images

Some models support non-square inputs:
# Create model with custom image size
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    force_image_size=(384, 256)  # (height, width), matching the size tuple convention
)

Multiple Resolutions

Use different resolutions at inference:
from open_clip import create_model_from_pretrained, image_transform_v2, PreprocessCfg

# Load base model
model, base_preprocess = create_model_from_pretrained(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k'
)

# Create higher resolution transform
high_res_cfg = PreprocessCfg(
    size=336,  # Instead of default 224
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
    interpolation='bicubic'
)
high_res_preprocess = image_transform_v2(high_res_cfg, is_train=False)

Batch Preprocessing

import torch
from PIL import Image

images = [Image.open(f'img{i}.jpg') for i in range(10)]

# Preprocess batch
batch = torch.stack([preprocess(img) for img in images])
print(batch.shape)  # [10, 3, 224, 224]

Complete Example

import torch
import open_clip
from open_clip import PreprocessCfg, AugmentationCfg, image_transform_v2
from PIL import Image

# Configure preprocessing
preprocess_cfg = PreprocessCfg(
    size=224,
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
    interpolation='bicubic',
    resize_mode='shortest'
)

# Configure augmentation for training
aug_cfg = AugmentationCfg(
    scale=(0.9, 1.0),
    color_jitter=(0.4, 0.4, 0.4, 0.1),
    color_jitter_prob=0.8,
    gray_scale_prob=0.2
)

# Create transforms
train_transform = image_transform_v2(preprocess_cfg, is_train=True, aug_cfg=aug_cfg)
val_transform = image_transform_v2(preprocess_cfg, is_train=False)

# Load model
model, _, _ = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k'
)
model.eval()

# Process images
train_img = train_transform(Image.open('train.jpg'))
val_img = val_transform(Image.open('val.jpg'))

# Inference
with torch.no_grad():
    features = model.encode_image(val_img.unsqueeze(0))
    print("Features shape:", features.shape)

Build docs developers (and LLMs) love