OpenCLIP provides flexible image preprocessing with automatic configuration based on model requirements and customizable augmentation strategies.
Quick Start
Preprocessing is automatically configured when loading models:
import open_clip
from PIL import Image
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
'ViT-B-32',
pretrained='laion2b_s34b_b79k'
)
# Use for training
image_train = preprocess_train(Image.open('train.jpg'))
# Use for inference/validation
image_val = preprocess_val(Image.open('val.jpg'))
Preprocessing Configuration
PreprocessCfg
The PreprocessCfg dataclass defines all preprocessing parameters:
from open_clip import PreprocessCfg
config = PreprocessCfg(
size=224, # Image size
mode='RGB', # Color mode
mean=(0.48145466, 0.4578275, 0.40821073), # Normalization mean
std=(0.26862954, 0.26130258, 0.27577711), # Normalization std
interpolation='bicubic', # Resize interpolation
resize_mode='shortest', # Resize strategy
fill_color=0 # Padding fill color
)
size
int | Tuple[int, int]
default:"224"
Target image size. Can be int for square images or (height, width) tuple.
mean
Tuple[float, float, float]
RGB mean values for normalization. Defaults to OpenAI CLIP values:
(0.48145466, 0.4578275, 0.40821073)
std
Tuple[float, float, float]
RGB standard deviation for normalization. Defaults to:
(0.26862954, 0.26130258, 0.27577711)
Resize interpolation method: 'bicubic', 'bilinear', or 'nearest'
Resize strategy:
'shortest': Resize shortest edge, then center crop
'longest': Resize longest edge, then pad to square
'squash': Direct resize to target size (may distort)
Create preprocessing transforms from configuration:
from open_clip import image_transform_v2, PreprocessCfg, AugmentationCfg
# Create config
preprocess_cfg = PreprocessCfg(
size=224,
mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225),
interpolation='bicubic',
resize_mode='shortest'
)
# Training transform with augmentation
train_transform = image_transform_v2(
preprocess_cfg,
is_train=True,
aug_cfg={'scale': (0.9, 1.0), 'color_jitter': (0.4, 0.4, 0.4, 0.1)}
)
# Validation transform (no augmentation)
val_transform = image_transform_v2(
preprocess_cfg,
is_train=False
)
Resize Modes
Shortest Edge (Default)
Resize shortest edge to target, then center crop:
from open_clip import image_transform_v2, PreprocessCfg
config = PreprocessCfg(size=224, resize_mode='shortest')
transform = image_transform_v2(config, is_train=False)
# Example: 800x600 image -> resize to 299x224 -> center crop to 224x224
This is the default for most CLIP models and preserves aspect ratio before cropping.
Longest Edge
Resize longest edge, pad to square:
config = PreprocessCfg(size=224, resize_mode='longest', fill_color=0)
transform = image_transform_v2(config, is_train=False)
# Example: 800x600 image -> resize to 224x168 -> pad to 224x224
Useful when preserving all image content is important.
Squash Mode
Direct resize (may distort aspect ratio):
config = PreprocessCfg(size=224, resize_mode='squash')
transform = image_transform_v2(config, is_train=False)
# Example: 800x600 image -> resize to 224x224 (distorted)
Used by SigLIP models and some other architectures.
Augmentation Configuration
AugmentationCfg
Configure training data augmentation:
from open_clip import AugmentationCfg
aug_cfg = AugmentationCfg(
scale=(0.9, 1.0), # Random crop scale range
ratio=(0.75, 1.33), # Random crop aspect ratio
color_jitter=(0.4, 0.4, 0.4, 0.1), # (brightness, contrast, saturation, hue)
color_jitter_prob=0.8, # Probability of applying color jitter
gray_scale_prob=0.2, # Probability of grayscale conversion
use_timm=False # Use timm augmentation library
)
scale
Tuple[float, float]
default:"(0.9, 1.0)"
Scale range for RandomResizedCrop. Values are fractions of the original image area.
Aspect ratio range for RandomResizedCrop
Color jitter parameters: (brightness, contrast, saturation, hue)
Probability of applying color jitter (0.0 to 1.0)
Probability of converting to grayscale (0.0 to 1.0)
Use timm library's augmentation (RandAugment, etc.)
Example Augmentation Configs
# Light augmentation
light_aug = AugmentationCfg(
scale=(0.95, 1.0),
color_jitter=(0.2, 0.2, 0.2, 0.05),
color_jitter_prob=0.5
)
# Standard augmentation (default)
standard_aug = AugmentationCfg(
scale=(0.9, 1.0),
color_jitter=(0.4, 0.4, 0.4, 0.1),
color_jitter_prob=0.8,
gray_scale_prob=0.2
)
# Strong augmentation
strong_aug = AugmentationCfg(
scale=(0.8, 1.0),
color_jitter=(0.6, 0.6, 0.6, 0.2),
color_jitter_prob=1.0,
gray_scale_prob=0.3,
use_timm=True,
re_prob=0.25 # Random erasing
)
Normalization
Standard Normalization Values
Different model families use different normalization:
from open_clip.constants import (
OPENAI_DATASET_MEAN, # (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD, # (0.26862954, 0.26130258, 0.27577711)
IMAGENET_MEAN, # (0.485, 0.456, 0.406)
IMAGENET_STD, # (0.229, 0.224, 0.225)
INCEPTION_MEAN, # (0.5, 0.5, 0.5)
INCEPTION_STD # (0.5, 0.5, 0.5)
)
# OpenAI CLIP models (default)
config = PreprocessCfg(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD)
# SigLIP models
config = PreprocessCfg(mean=INCEPTION_MEAN, std=INCEPTION_STD)
# CLIPA models
config = PreprocessCfg(mean=IMAGENET_MEAN, std=IMAGENET_STD)
Using incorrect normalization values will significantly degrade model performance. Always use the values the model was trained with.
Custom Preprocessing
Override Model Defaults
import open_clip
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
'ViT-B-32',
pretrained='laion2b_s34b_b79k',
# Override defaults
image_mean=(0.5, 0.5, 0.5),
image_std=(0.5, 0.5, 0.5),
image_interpolation='bilinear',
image_resize_mode='squash'
)
from torchvision import transforms
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
# Custom preprocessing pipeline
custom_transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=OPENAI_DATASET_MEAN,
std=OPENAI_DATASET_STD
)
])
# Use with model
from PIL import Image
image = Image.open('example.jpg')
image_tensor = custom_transform(image).unsqueeze(0)
Advanced Features
Non-Square Images
Some models support non-square inputs:
# Create model with custom image size
model, _, preprocess = open_clip.create_model_and_transforms(
'ViT-L-14',
pretrained='datacomp_xl_s13b_b90k',
force_image_size=(384, 256) # (height, width)
)
Multiple Resolutions
Use different resolutions at inference:
from open_clip import create_model_from_pretrained, image_transform_v2, PreprocessCfg
# Load base model
model, base_preprocess = create_model_from_pretrained(
'ViT-L-14',
pretrained='datacomp_xl_s13b_b90k'
)
# Create higher resolution transform
high_res_cfg = PreprocessCfg(
size=336, # Instead of default 224
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
interpolation='bicubic'
)
high_res_preprocess = image_transform_v2(high_res_cfg, is_train=False)
Batch Preprocessing
import torch
from PIL import Image
images = [Image.open(f'img{i}.jpg') for i in range(10)]
# Preprocess batch
batch = torch.stack([preprocess(img) for img in images])
print(batch.shape) # [10, 3, 224, 224]
Complete Example
import torch
import open_clip
from open_clip import PreprocessCfg, AugmentationCfg, image_transform_v2
from PIL import Image
# Configure preprocessing
preprocess_cfg = PreprocessCfg(
size=224,
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
interpolation='bicubic',
resize_mode='shortest'
)
# Configure augmentation for training
aug_cfg = AugmentationCfg(
scale=(0.9, 1.0),
color_jitter=(0.4, 0.4, 0.4, 0.1),
color_jitter_prob=0.8,
gray_scale_prob=0.2
)
# Create transforms
train_transform = image_transform_v2(preprocess_cfg, is_train=True, aug_cfg=aug_cfg)
val_transform = image_transform_v2(preprocess_cfg, is_train=False)
# Load model
model, _, _ = open_clip.create_model_and_transforms(
'ViT-B-32',
pretrained='laion2b_s34b_b79k'
)
model.eval()
# Process images
train_img = train_transform(Image.open('train.jpg'))
val_img = val_transform(Image.open('val.jpg'))
# Inference
with torch.no_grad():
features = model.encode_image(val_img.unsqueeze(0))
print("Features shape:", features.shape)