# Tokenizers
OpenCLIP supports multiple tokenization strategies to handle different text encoders and languages.

Quick Start

import open_clip

# Get tokenizer for model
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Tokenize text
texts = ["a photo of a cat", "a photo of a dog"]
tokens = tokenizer(texts)

print(tokens.shape)  # [2, 77] - batch_size x context_length

get_tokenizer()

Automatically selects the appropriate tokenizer based on model configuration:
import open_clip

tokenizer = open_clip.get_tokenizer(
    model_name='ViT-B-32',
    context_length=77,  # Override default
    cache_dir='/path/to/cache'  # For HuggingFace tokenizers
)
model_name
str
required
Model identifier. Can use schemas:
  • Built-in: 'ViT-B-32'
  • HuggingFace: 'hf-hub:org/repo'
  • Local: 'local-dir:/path/to/model'
context_length
int
Maximum sequence length. Defaults to model’s configured length (usually 77).
cache_dir
str
Cache directory for downloading HuggingFace tokenizers

Tokenizer Types

SimpleTokenizer

Default BPE tokenizer used by most CLIP models:
from open_clip.tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer(
    context_length=77,
    clean='lower'  # Text cleaning: 'lower', 'whitespace', or 'canonicalize'
)

# Tokenize
tokens = tokenizer(["Hello world", "OpenCLIP"])
print(tokens.shape)  # [2, 77]

# Decode
text = tokenizer.decode(tokens[0])
print(text)
context_length
int
default:"77"
Maximum sequence length (including special tokens)
clean
str
default:"lower"
Text preprocessing:
  • 'lower': Lowercase + whitespace cleaning
  • 'whitespace': Whitespace cleaning only
  • 'canonicalize': Remove punctuation + lowercase
reduction_mask
str
Token reduction strategy when exceeding context length:
  • 'simple': Random contiguous block
  • 'random': Random tokens (preserve order)
  • 'shuffle': Random tokens (shuffle)
  • 'syntax': Priority based on POS tags

HFTokenizer

HuggingFace Transformers tokenizer wrapper for models using pretrained LMs:
from open_clip.tokenizer import HFTokenizer

# Use RoBERTa tokenizer
tokenizer = HFTokenizer(
    'roberta-base',
    context_length=77,
    clean='whitespace',
    cache_dir='/path/to/cache'
)

tokens = tokenizer(["Example text"])
tokenizer_name
str
required
HuggingFace tokenizer identifier (e.g., 'roberta-base', 'xlm-roberta-large')
context_length
int
default:"77"
Maximum sequence length
clean
str
default:"whitespace"
Text cleaning mode
strip_sep_token
bool
default:"False"
Remove separator tokens from output
language
str
Language code for multilingual tokenizers (e.g., 'en', 'fr', 'de')

SigLipTokenizer

SentencePiece tokenizer for SigLIP models:
from open_clip.tokenizer import SigLipTokenizer

# Different variants
tokenizer = SigLipTokenizer(
    'c4-en',      # Options: 'c4-en', 'mc4', 'gemma'
    context_length=64  # SigLIP uses 64 by default
)

tokens = tokenizer(["Sample text"])
Variants:
  • 'c4-en': English only (vocab_size=32,000)
  • 'mc4': Multilingual (vocab_size=250,000)
  • 'gemma': SigLIP2 models (vocab_size=256,000)

Context Length

Default Context Lengths

from open_clip.tokenizer import DEFAULT_CONTEXT_LENGTH

print(DEFAULT_CONTEXT_LENGTH)  # 77 for CLIP models
Different models use different context lengths:
  • CLIP models: 77 tokens
  • SigLIP models: 64 tokens
  • CoCa models: 76 tokens (+ 1 for generation)

Handling Long Text

tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Text longer than 77 tokens will be truncated
long_text = "This is a very long description that exceeds 77 tokens..." * 10
tokens = tokenizer([long_text])

print(tokens.shape)  # [1, 77] - truncated to context_length

Custom Context Length

# Create model with custom context length
model = open_clip.create_model(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
    force_context_length=128  # Increase from default 77
)

# Get matching tokenizer
tokenizer = open_clip.get_tokenizer('ViT-B-32', context_length=128)
Changing context length requires model weights trained with that length. For pretrained models, use the original context length.

Text Preprocessing

Cleaning Modes

from open_clip.tokenizer import SimpleTokenizer

# Lowercase + whitespace cleaning (default)
lower_tokenizer = SimpleTokenizer(clean='lower')
tokens = lower_tokenizer(["Hello World!"])  # -> "hello world!"

# Whitespace only
whitespace_tokenizer = SimpleTokenizer(clean='whitespace')
tokens = whitespace_tokenizer(["Hello  World!"])  # -> "Hello World!"

# Canonicalize (remove punctuation + lowercase)
canon_tokenizer = SimpleTokenizer(clean='canonicalize')
tokens = canon_tokenizer(["Hello, World!"])  # -> "hello world"

Special Tokens

SimpleTokenizer uses special tokens:
  • <start_of_text> (token_id: 49406)
  • <end_of_text> (token_id: 49407)
tokenizer = SimpleTokenizer()

print(tokenizer.sot_token_id)  # 49406
print(tokenizer.eot_token_id)  # 49407
print(tokenizer.vocab_size)    # 49408

# Manual encoding
tokens = tokenizer.encode("hello")
print(tokens)  # [...] token IDs without special tokens

# Full tokenization (with special tokens)
tokens = tokenizer(["hello"])
print(tokens[0])  # [49406, ..., 49407, 0, 0, ...]  # SOT, tokens, EOT, padding

Multilingual Tokenization

XLM-RoBERTa Models

import open_clip

# Load multilingual model
model, _, preprocess = open_clip.create_model_and_transforms(
    'xlm-roberta-base-ViT-B-32',
    pretrained='laion5b_s13b_b90k'
)

tokenizer = open_clip.get_tokenizer('xlm-roberta-base-ViT-B-32')

# Supports 100+ languages
texts = [
    "a photo of a cat",          # English
    "une photo d'un chat",        # French
    "ein Foto von einer Katze",   # German
    "猫の写真"                      # Japanese
]

tokens = tokenizer(texts)

SigLIP Multilingual

import open_clip

model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-16-SigLIP-i18n-256',
    pretrained='webli'
)

tokenizer = open_clip.get_tokenizer('ViT-B-16-SigLIP-i18n-256')

# Multilingual SigLIP tokenizer
tokens = tokenizer(["text in various languages"])

Advanced Usage

Batch Tokenization

import open_clip

tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Large batch
texts = [f"description {i}" for i in range(1000)]

# Process in batches for memory efficiency
batch_size = 256
all_tokens = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    tokens = tokenizer(batch)
    all_tokens.append(tokens)

import torch
all_tokens = torch.cat(all_tokens, dim=0)
print(all_tokens.shape)  # [1000, 77]

Custom Vocabulary

from open_clip.tokenizer import SimpleTokenizer

# Add custom special tokens
tokenizer = SimpleTokenizer(
    additional_special_tokens=['<custom_token>']
)

# Access custom token ID
custom_id = tokenizer.encoder['<custom_token>']

Decoding Tokens

tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Tokenize
original = ["a photo of a cat"]
tokens = tokenizer(original)

# Decode
from open_clip import decode
decoded = decode(tokens[0])
print(decoded)  # "<start_of_text>a photo of a cat<end_of_text>"

# Clean decoded text
cleaned = decoded.replace("<start_of_text>", "").replace("<end_of_text>", "").strip()
print(cleaned)  # "a photo of a cat"

Complete Example

import torch
import open_clip
from PIL import Image

# Load model and tokenizer
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    device='cuda'
)
model.eval()

tokenizer = open_clip.get_tokenizer('ViT-L-14')

# Prepare inputs
image = preprocess(Image.open('cat.jpg')).unsqueeze(0).cuda()

# Create text descriptions with templates
labels = ['cat', 'dog', 'bird']
texts = [f"a photo of a {label}" for label in labels]
tokens = tokenizer(texts).cuda()

print("Token shape:", tokens.shape)  # [3, 77]
print("Context length:", tokenizer.context_length)  # 77

# Inference (torch.cuda.amp.autocast is deprecated; use the torch.amp API)
with torch.no_grad(), torch.amp.autocast('cuda'):
    image_features = model.encode_image(image)
    text_features = model.encode_text(tokens)
    
    # Normalize
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    
    # Compute similarity
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

predicted_idx = similarity.argmax().item()
print(f"\nPredicted: {labels[predicted_idx]}")
print(f"Confidence: {similarity[0, predicted_idx]:.2%}")
print(f"\nAll probabilities:")
for label, prob in zip(labels, similarity[0]):
    print(f"  {label}: {prob:.2%}")
The tokenizer is automatically selected based on model configuration. For most CLIP models, this will be SimpleTokenizer. Models using HuggingFace text encoders will use HFTokenizer.

Build docs developers (and LLMs) love