Skip to main content
Creates a cached content resource that stores context (like documents, conversation history, or system instructions) for reuse across multiple requests. This reduces latency and costs for repeated queries against the same content.

Method Signature

client.caches.create(
    model: str,
    config: Optional[CreateCachedContentConfigOrDict] = None
) -> CachedContent

Parameters

model
string
required
The model to use for this cached content. Example: 'gemini-2.0-flash' or 'gemini-1.5-pro'
config
CreateCachedContentConfig
Configuration for the cached content. Available options:
  • contents: The content to cache (documents, conversation history, etc.)
  • system_instruction: System instructions to cache
  • tools: Tools configuration to cache
  • tool_config: Tool configuration settings
  • ttl: Time-to-live as a duration string (e.g., "3600s" for 1 hour)
  • expire_time: Specific expiration timestamp
  • display_name: Human-readable name for the cache
  • kms_key_name: KMS encryption key (Vertex AI only)

Returns

cached_content
CachedContent
A CachedContent object containing:
  • name: The resource name (e.g., "cachedContents/abc123")
  • model: The model name
  • display_name: Human-readable name
  • create_time: When the cache was created
  • update_time: Last update time
  • expire_time: When the cache will expire
  • usage_metadata: Token usage information

Examples

Basic Cache Creation

from google import genai
from google.genai import types

client = genai.Client(api_key='your-api-key')

# Build the context to cache, then create the cache with a typed config
# object (the SDK accepts either a dict or CreateCachedContentConfig).
context = types.Content(
    role='user',
    parts=[types.Part(text='This is important context to cache.')],
)

cached = client.caches.create(
    model='gemini-2.0-flash',
    config=types.CreateCachedContentConfig(
        contents=[context],
        ttl='3600s',  # Cache for 1 hour
        display_name='My Context Cache',
    ),
)

print(f"Cache created: {cached.name}")
print(f"Expires: {cached.expire_time}")

Cache with Document

# Upload the source document
file = client.files.upload(file='long_document.pdf')

# Poll until the file leaves the PROCESSING state
import time
while file.state == 'PROCESSING':
    time.sleep(1)
    file = client.files.get(name=file.name)

# Cache the processed document by referencing its file URI
doc_part = types.Part(file_data=types.FileData(file_uri=file.uri))

cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': [types.Content(role='user', parts=[doc_part])],
        'ttl': '7200s',  # 2 hours
        'display_name': 'Document Analysis Cache',
    },
)

print(f"Document cached: {cached.name}")

Cache with System Instructions

# System instructions and user context can be cached together in one call.
style_guide = types.Content(
    role='user',
    parts=[types.Part(text='Here is the company coding style guide...')],
)

cached = client.caches.create(
    model='gemini-1.5-pro',
    config={
        'system_instruction': 'You are a helpful coding assistant specialized in Python.',
        'contents': [style_guide],
        'ttl': '86400s',  # 24 hours
        'display_name': 'Coding Assistant Context',
    },
)

print(f"Cache with system instructions: {cached.name}")

Use Cached Content in Generation

# Create cache
cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': [
            types.Content(
                role='user',
                parts=[types.Part(text='Large context document here...')]
            )
        ],
        'ttl': '3600s'
    }
)

# Use the cache in a generation request.
# The cache is referenced through GenerateContentConfig.cached_content —
# NOT by passing the cache resource name as the model. The model here must
# match the model the cache was created with.
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents='What are the key points from the cached document?',
    config=types.GenerateContentConfig(cached_content=cached.name),
)

print(response.text)
print(f"Tokens from cache: {response.usage_metadata.cached_content_token_count}")

Cache Multiple Documents

# Upload multiple files
files = [
    client.files.upload(file='doc1.pdf'),
    client.files.upload(file='doc2.pdf'),
    client.files.upload(file='doc3.pdf')
]

# Wait for all to process. Write the refreshed object back into the list:
# rebinding only the bare loop variable would leave `files` holding the
# stale pre-processing snapshots returned by upload().
for i, file in enumerate(files):
    while file.state == 'PROCESSING':
        time.sleep(1)
        file = client.files.get(name=file.name)
    files[i] = file

# Create one cache containing all documents
parts = [types.Part(file_data=types.FileData(file_uri=f.uri)) for f in files]

cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': [types.Content(role='user', parts=parts)],
        'ttl': '7200s',
        'display_name': 'Multi-Document Cache'
    }
)

print(f"Cached {len(files)} documents")

Set Specific Expiration Time

from datetime import datetime, timedelta, timezone

# expire_time is an absolute timestamp; use a timezone-aware datetime so
# isoformat() includes a UTC offset (a naive datetime.now() would produce
# an offset-less string whose meaning is ambiguous to the service).
expire_at = datetime.now(timezone.utc) + timedelta(hours=2)

cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': [
            types.Content(
                role='user',
                parts=[types.Part(text='Context to cache')]
            )
        ],
        'expire_time': expire_at.isoformat(),
        'display_name': 'Timed Cache'
    }
)

print(f"Expires at: {cached.expire_time}")

Cache with Tools

# Tool declarations can be cached alongside the context they operate on.
query_tool = types.Tool(
    function_declarations=[
        types.FunctionDeclaration(
            name='query_database',
            description='Query the database',
            parameters={
                'type': 'object',
                'properties': {
                    'query': {'type': 'string'},
                },
            },
        )
    ]
)

schema_context = types.Content(
    role='user',
    parts=[types.Part(text='Database schema and context...')],
)

cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': [schema_context],
        'tools': [query_tool],
        'ttl': '3600s',
        'display_name': 'Database Context Cache',
    },
)

print(f"Cache with tools: {cached.name}")

Async Cache Creation

import asyncio

async def create_cache():
    """Create a cache via the async client surface (client.aio)."""
    config = {
        'contents': [
            types.Content(
                role='user',
                parts=[types.Part(text='Context to cache')],
            )
        ],
        'ttl': '3600s',
    }
    cached = await client.aio.caches.create(
        model='gemini-2.0-flash',
        config=config,
    )
    print(f"Cache created: {cached.name}")
    return cached

asyncio.run(create_cache())

Cache Conversation History

# Cache a long conversation history.
def _turn(role, text):
    """Build a single conversation turn as a Content object."""
    return types.Content(role=role, parts=[types.Part(text=text)])

conversation = [
    _turn('user', 'First message'),
    _turn('model', 'First response'),
    _turn('user', 'Second message'),
    _turn('model', 'Second response'),
    # ... many more turns
]

cached = client.caches.create(
    model='gemini-2.0-flash',
    config={
        'contents': conversation,
        'ttl': '3600s',
        'display_name': 'Conversation History Cache',
    },
)

print(f"Cached {len(conversation)} conversation turns")

Cost Optimization

Caching is cost-effective when:
  • You make multiple requests with the same large context
  • The cached content is reused more than once
  • The context is large (thousands of tokens)
Cached tokens are typically cheaper than regular input tokens, and subsequent requests using the cache only pay for new tokens.

TTL vs Expire Time

You can specify expiration using either:
  • ttl: Duration from now (e.g., "3600s" for 1 hour)
  • expire_time: Specific timestamp
Use ttl for relative expiration, expire_time for absolute expiration. Don’t specify both.

Error Handling

# Keep the try body limited to the call that can raise; report success
# in the else clause so unrelated errors aren't swallowed by the handler.
cache_config = {
    'contents': [types.Content(role='user', parts=[types.Part(text='Context')])],
    'ttl': '3600s',
}
try:
    cached = client.caches.create(model='gemini-2.0-flash', config=cache_config)
except Exception as e:
    print(f"Failed to create cache: {e}")
else:
    print(f"Success: {cached.name}")

API Availability

This method is available in both the Gemini API and Vertex AI. Vertex AI differences:
  • Supports kms_key_name for encryption
  • Some tool options may differ

Build docs developers (and LLMs) love