autogen_ext Package Reference
The autogen_ext package provides model clients, integrations, and extensions for AutoGen.

Model Clients

Client for OpenAI chat completion models.
from autogen_ext.models.openai import OpenAIChatCompletionClient

client = OpenAIChatCompletionClient(
    model="gpt-4o",
    api_key="sk-...",
    temperature=0.7,
    max_tokens=1000
)

# Use with agents
from autogen_agentchat.agents import AssistantAgent

agent = AssistantAgent(
    name="assistant",
    model_client=client,
    description="GPT-4 powered assistant"
)
model
str
required
Model identifier (e.g., “gpt-4o”, “gpt-4-turbo”, “gpt-3.5-turbo”)
api_key
str | None
OpenAI API key (defaults to OPENAI_API_KEY env var)
base_url
str | None
Custom API base URL
temperature
float | None
Sampling temperature (0.0 to 2.0)
max_tokens
int | None
Maximum tokens in response
top_p
float | None
Nucleus sampling parameter
timeout
float | None
Request timeout in seconds
organization
str | None
OpenAI organization ID

Methods

create
async method
Generate a chat completion
from autogen_core.models import UserMessage

result = await client.create(
    messages=[UserMessage(content="Hello!", source="user")],
    temperature=0.8
)
print(result.content)
Returns: CreateResult
create_stream
async generator
Stream chat completion chunks
async for chunk in client.create_stream(messages=messages):
    if chunk.content:
        print(chunk.content, end="", flush=True)
Client for Azure OpenAI Service.
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient

client = AzureOpenAIChatCompletionClient(
    model="gpt-4",
    azure_endpoint="https://your-resource.openai.azure.com",
    api_key="your-api-key",
    api_version="2024-02-15-preview",
    azure_deployment="gpt-4-deployment"
)
model
str
required
Model identifier
azure_endpoint
str
required
Azure OpenAI resource endpoint
api_key
str | None
Azure OpenAI API key (or use Azure AD auth)
api_version
str
required
Azure OpenAI API version
azure_deployment
str
required
Deployment name in Azure
azure_ad_token
str | None
Azure Active Directory token
azure_ad_token_provider
Callable | None
Function to provide Azure AD tokens
Client for Anthropic Claude models.
from autogen_ext.models.anthropic import AnthropicChatCompletionClient

client = AnthropicChatCompletionClient(
    model="claude-3-5-sonnet-20241022",
    api_key="sk-ant-...",
    max_tokens=4096
)
model
str
required
Model name (e.g., “claude-3-5-sonnet-20241022”, “claude-3-opus-20240229”)
api_key
str
required
Anthropic API key
max_tokens
int
required
Maximum tokens in response (required for Anthropic)
temperature
float | None
Sampling temperature
Client for local Ollama models.
from autogen_ext.models.ollama import OllamaChatCompletionClient

client = OllamaChatCompletionClient(
    model="llama3.1:8b",
    base_url="http://localhost:11434"
)
model
str
required
Ollama model name
base_url
str
Ollama server URL (default: “http://localhost:11434”)
temperature
float | None
Sampling temperature
Client using Semantic Kernel integration.
from autogen_ext.models.semantic_kernel import SemanticKernelChatCompletionClient
from semantic_kernel import Kernel

kernel = Kernel()
# Configure kernel...

client = SemanticKernelChatCompletionClient(
    kernel=kernel,
    service_id="chat-gpt"
)
kernel
Kernel
required
Semantic Kernel instance
service_id
str
required
Service identifier in the kernel
Client for llama.cpp models.
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient

client = LlamaCppChatCompletionClient(
    model_path="/path/to/model.gguf",
    n_ctx=4096,
    n_gpu_layers=35
)
model_path
str
required
Path to GGUF model file
n_ctx
int
Context window size
n_gpu_layers
int
Number of layers to offload to GPU
Client that replays recorded responses for testing.
from autogen_ext.models.replay import ReplayChatCompletionClient

client = ReplayChatCompletionClient(
    responses=[
        "First response",
        "Second response",
        "Third response"
    ]
)

# Useful for testing and deterministic behavior
responses
List[str]
required
Pre-recorded responses to replay in order

Model Configuration

Configuration dataclass for OpenAI clients.
from autogen_ext.models.openai import OpenAIClientConfiguration

config = OpenAIClientConfiguration(
    model="gpt-4o",
    api_key="sk-...",
    temperature=0.7,
    max_tokens=2000
)

# Use with component system
client = OpenAIChatCompletionClient.from_config(config)
Configuration for Azure OpenAI clients.
from autogen_ext.models.openai import AzureOpenAIClientConfiguration

config = AzureOpenAIClientConfiguration(
    model="gpt-4",
    azure_endpoint="https://your-resource.openai.azure.com",
    api_version="2024-02-15-preview",
    azure_deployment="gpt-4-deployment"
)

Caching

Wrapper that caches model responses.
from autogen_ext.models.cache import CachedChatCompletionClient
from autogen_ext.models.openai import OpenAIChatCompletionClient

base_client = OpenAIChatCompletionClient(model="gpt-4o")

cached_client = CachedChatCompletionClient(
    client=base_client,
    cache_dir=".autogen_cache"
)

# First call hits the API
result1 = await cached_client.create(messages=messages)

# Second identical call uses cache
result2 = await cached_client.create(messages=messages)
client
ChatCompletionClient
required
Underlying model client to cache
cache_dir
str
Directory for cache storage (default: “.autogen_cache”)
cache_seed
int | None
Seed for cache key generation

Code Execution

Execute code in Docker containers.
from autogen_ext.code_executors import DockerCommandLineCodeExecutor

executor = DockerCommandLineCodeExecutor(
    image="python:3.11",
    timeout=60,
    work_dir="/workspace"
)

result = await executor.execute_code_blocks(
    code_blocks=[
        ("python", "print('Hello from Docker!')")
    ]
)
image
str
Docker image to use (default: “python:3.11-slim”)
timeout
int
Execution timeout in seconds
work_dir
str
Working directory in container
Execute code locally (use with caution).
from autogen_ext.code_executors import LocalCommandLineCodeExecutor

executor = LocalCommandLineCodeExecutor(
    timeout=30,
    work_dir="./code_execution"
)

# WARNING: Executes code on your local machine
result = await executor.execute_code_blocks(
    code_blocks=[("python", "print('Hello')")]
)
timeout
int
Execution timeout in seconds
work_dir
str
Working directory for execution

Tools & Extensions

Tool for web searching.
from autogen_ext.tools import WebSearchTool

search_tool = WebSearchTool(
    api_key="your-search-api-key",
    max_results=5
)

# Use with AssistantAgent
agent = AssistantAgent(
    name="researcher",
    model_client=client,
    tools=[search_tool],
    description="Research assistant with web search"
)
Tools for file operations.
from autogen_ext.tools import FileReadTool, FileWriteTool

read_tool = FileReadTool(allowed_paths=["/data"])
write_tool = FileWriteTool(allowed_paths=["/output"])

agent = AssistantAgent(
    name="file_handler",
    model_client=client,
    tools=[read_tool, write_tool],
    description="Agent with file access"
)

Memory Extensions

Vector-based semantic memory.
from autogen_ext.memory import VectorMemory
from autogen_ext.models.openai import OpenAIEmbeddingClient

memory = VectorMemory(
    embedding_client=OpenAIEmbeddingClient(model="text-embedding-3-small"),
    collection_name="agent_memory",
    top_k=5
)

# Use with AssistantAgent
agent = AssistantAgent(
    name="assistant",
    model_client=client,
    memory=[memory],
    description="Assistant with semantic memory"
)
Redis-backed persistent memory.
from autogen_ext.memory import RedisMemory

memory = RedisMemory(
    redis_url="redis://localhost:6379",
    namespace="agent_memory"
)

Runtimes & Deployment

Distributed runtime using gRPC.
from autogen_ext.runtimes.grpc import GrpcAgentRuntime

runtime = GrpcAgentRuntime(
    host="0.0.0.0",
    port=50051
)

await runtime.start()
Cloud-based runtime for distributed agents.
from autogen_ext.runtimes.cloud import CloudRuntime

runtime = CloudRuntime(
    project_id="my-project",
    region="us-central1"
)

Utilities

Rate limiting for API calls.
from autogen_ext.utils import RateLimiter

limiter = RateLimiter(max_calls=100, time_window=60)

async with limiter:
    result = await client.create(messages=messages)
Count tokens for cost estimation.
from autogen_ext.utils import TokenCounter

counter = TokenCounter(model="gpt-4")
tokens = counter.count_messages(messages)
print(f"Estimated cost: ${tokens * 0.00003}")

Configuration Models

All model clients support declarative configuration through the component system:
from autogen_core import ComponentModel
from autogen_ext.models.openai import OpenAIChatCompletionClient

# Create from config
config = ComponentModel(
    component_type="OpenAIChatCompletionClient",
    config={
        "model": "gpt-4o",
        "temperature": 0.7
    }
)

client = OpenAIChatCompletionClient.from_config(config)

See Also

Build docs developers (and LLMs) love