Overview
Tool functions extend agent capabilities by providing access to external data sources and services. This reference documents the database query and knowledge base search tools used in the Building Reliable Agents course.
query_database()
Execute SQL queries against the inventory database.
query (str): SQL query to execute. Must be valid SQLite syntax.
db_path (str): Path to the SQLite database file (e.g., "./inventory/inventory.db")
Returns: Query results as a string representation of a list of tuples, or an error message if the query fails
Example:
import sqlite3
from langsmith import traceable
@traceable(name="query_database", run_type="tool")
def query_database(query: str, db_path: str) -> str:
    """Execute a SQL query against the inventory database.

    Args:
        query: SQL query to execute. Must be valid SQLite syntax.
        db_path: Path to the SQLite database file.

    Returns:
        String representation of the fetched rows (a list of tuples),
        or an "Error: ..." message if the query fails.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute(query)
        return str(cursor.fetchall())
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Fix: the original only closed the connection on success, leaking
        # it whenever execute() raised. Always release it.
        if conn is not None:
            conn.close()
# Usage
# Schema discovery first: list all tables before querying data.
results = query_database(
    query="SELECT name FROM sqlite_master WHERE type='table'",
    db_path="./inventory/inventory.db"
)
print(results)
Tool definition for OpenAI’s function calling:
# OpenAI function-calling schema for query_database.
# The parameter description deliberately instructs the model to discover
# the database schema (tables, then columns) before writing data queries,
# since the model has no prior knowledge of the schema.
QUERY_DATABASE_TOOL = {
    "type": "function",
    "function": {
        "name": "query_database",
        "description": "SQL query to get information about our inventory for customers like products, quantities and prices.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": """SQL query to execute against the inventory database.
YOU DO NOT KNOW THE SCHEMA. ALWAYS discover it first:
1. Query 'SELECT name FROM sqlite_master WHERE type="table"' to see available tables
2. Use 'PRAGMA table_info(table_name)' to inspect columns for each table
3. Only after understanding the schema, construct your search queries"""
                }
            },
            "required": ["query"]
        }
    }
}
Best Practices:
- Schema Discovery First: Always instruct agents to discover the database schema before querying data
- Error Handling: Wrap database operations in try/except blocks
- Connection Management: Close database connections after each query
- Query Validation: Validate SQL queries to prevent injection attacks in production
search_knowledge_base()
Search company knowledge base documents using semantic similarity.
query (str): Natural language search query or question
top_k (int): Number of most relevant documents to return
Returns: Formatted string containing the top-k most relevant documents with relevance scores
Example:
import numpy as np
from openai import AsyncOpenAI
from langsmith import traceable
client = AsyncOpenAI()

# Global storage, populated at startup by load_knowledge_base().
knowledge_base_docs: list[tuple[str, str]] = []  # (filename, content) pairs
knowledge_base_embeddings: list[list[float]] = []  # parallel to knowledge_base_docs
@traceable(name="search_knowledge_base", run_type="tool")
async def search_knowledge_base(query: str, top_k: int = 2) -> str:
    """Search the knowledge base using semantic (cosine) similarity.

    Args:
        query: Natural language search query or question.
        top_k: Number of most relevant documents to return.

    Returns:
        Formatted string containing the top-k most relevant documents with
        relevance scores, or an error message if the knowledge base has not
        been loaded.
    """
    if not knowledge_base_docs or not knowledge_base_embeddings:
        return "Error: Knowledge base not loaded"

    # Embed the query once.
    response = await client.embeddings.create(
        model="text-embedding-3-small",
        input=query
    )
    query_embedding = np.asarray(response.data[0].embedding)
    # Hoisted out of the loop: the query norm is loop-invariant.
    query_norm = np.linalg.norm(query_embedding)

    # Cosine similarity against every document embedding.
    similarities = []
    for i, doc_embedding in enumerate(knowledge_base_embeddings):
        doc_vec = np.asarray(doc_embedding)
        similarity = np.dot(query_embedding, doc_vec) / (
            query_norm * np.linalg.norm(doc_vec)
        )
        similarities.append((i, similarity))

    # Sort by similarity, highest first, and keep the top k.
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_results = similarities[:top_k]

    # Format results. Fix: the header now includes the source filename —
    # the original emitted a literal "(unknown)" placeholder and never
    # used the unpacked `filename` variable.
    results = []
    for idx, score in top_results:
        filename, content = knowledge_base_docs[idx]
        results.append(f"=== {filename} (relevance: {score:.3f}) ===\n{content}\n")
    return "\n".join(results)
# Usage
# NOTE: `await` requires an async context (e.g. inside an async main()).
result = await search_knowledge_base(
    query="What is your return policy?",
    top_k=2
)
print(result)
# OpenAI function-calling schema for search_knowledge_base.
# The description routes non-product questions (policies, shipping,
# returns, hours, ...) to this tool rather than the database tool.
SEARCH_KNOWLEDGE_BASE_TOOL = {
    "type": "function",
    "function": {
        "name": "search_knowledge_base",
        "description": "Search company knowledge base for information about policies, procedures, company info, shipping, returns, ordering, contact information, store locations, and business hours. Use this for non-product questions.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Natural language question or search query about company policies or information"
                }
            },
            "required": ["query"]
        }
    }
}
load_knowledge_base()
Load knowledge base documents and generate embeddings (called at startup).
kb_dir
str
default:"./knowledge_base"
Path to knowledge base directory containing documents/ subfolder with markdown files
Example:
import json
from pathlib import Path
from typing import List, Tuple
from openai import AsyncOpenAI
client = AsyncOpenAI()

# Global storage shared with search_knowledge_base(); filled in by
# load_knowledge_base() / _generate_and_cache_embeddings().
knowledge_base_docs: List[Tuple[str, str]] = []  # (filename, content) pairs
knowledge_base_embeddings: List[List[float]] = []  # parallel to knowledge_base_docs
def _embeddings_are_stale(kb_path: Path, cache_path: Path) -> bool:
"""Check if any document has been modified after embeddings were generated."""
if not cache_path.exists():
return True
cache_mtime = cache_path.stat().st_mtime
for file_path in kb_path.glob("*.md"):
if file_path.name == "CHUNKING_NOTES.md":
continue
if file_path.stat().st_mtime > cache_mtime:
return True
return False
async def _generate_and_cache_embeddings(kb_path: Path, cache_path: Path) -> None:
    """Generate embeddings for all documents and save them to the cache.

    Reads every *.md file in *kb_path* (skipping CHUNKING_NOTES.md),
    embeds each whole document, updates the module-level globals, and
    writes a JSON cache of {"docs", "embeddings"} to *cache_path*.
    """
    global knowledge_base_docs, knowledge_base_embeddings

    docs = []
    for file_path in kb_path.glob("*.md"):
        if file_path.name == "CHUNKING_NOTES.md":
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        docs.append((file_path.name, content))

    if not docs:
        print(f"Warning: No documents found in '{kb_path}'")
        return

    knowledge_base_docs = docs
    print(f"Generating embeddings for {len(docs)} documents...")

    embeddings = []
    for filename, content in docs:
        response = await client.embeddings.create(
            model="text-embedding-3-small",
            input=content
        )
        embeddings.append(response.data[0].embedding)
        # Fix: report the file just embedded — the original printed a
        # garbled "(unknown)" placeholder and never used `filename`.
        print(f"  Embedded: {filename}")

    knowledge_base_embeddings = embeddings

    # Persist so future startups can skip the embedding API calls.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    cache_data = {"docs": docs, "embeddings": embeddings}
    with open(cache_path, 'w') as f:
        json.dump(cache_data, f)
    print(f"Embeddings cached to {cache_path}")
async def load_knowledge_base(kb_dir: str = "./knowledge_base") -> None:
    """Load knowledge base documents and embeddings.

    Automatically regenerates embeddings if any source documents have been
    modified, and also falls back to regeneration if the cache file turns
    out to be unreadable or corrupt instead of crashing at startup.

    Args:
        kb_dir: Path to the knowledge base directory containing a
            documents/ subfolder with markdown files.
    """
    global knowledge_base_docs, knowledge_base_embeddings

    kb_path = Path(kb_dir) / "documents"
    cache_path = Path(kb_dir) / "embeddings" / "embeddings.json"

    if not kb_path.exists():
        print(f"Warning: Knowledge base directory '{kb_dir}' not found")
        return

    # Regenerate when the cache is missing or older than any document.
    if _embeddings_are_stale(kb_path, cache_path):
        print("Knowledge base documents changed, regenerating embeddings...")
        await _generate_and_cache_embeddings(kb_path, cache_path)
        return

    # Load from cache. Fix: a truncated or hand-edited cache file used to
    # raise and abort startup; now it triggers regeneration instead.
    try:
        with open(cache_path, 'r') as f:
            cache_data = json.load(f)
        knowledge_base_docs = [tuple(doc) for doc in cache_data["docs"]]
        knowledge_base_embeddings = cache_data["embeddings"]
    except (OSError, json.JSONDecodeError, KeyError):
        print("Warning: embeddings cache unreadable, regenerating...")
        await _generate_and_cache_embeddings(kb_path, cache_path)
        return
    print(f"Knowledge base loaded from cache: {len(knowledge_base_docs)} documents")
# Usage - call at application startup
import asyncio

async def main():
    # Populates the module-level document/embedding globals.
    await load_knowledge_base("./knowledge_base")
    # Now search_knowledge_base() can be used

asyncio.run(main())
Features:
- Automatic Caching: Embeddings are cached to avoid regenerating on every startup
- Staleness Detection: Automatically detects when source documents change and regenerates embeddings
- Whole Document Retrieval: Returns entire documents (not chunks) for better context
Text Chunking (Alternative Approach)
For large documents, you can chunk them before embedding:
chunk_text()
Split text into overlapping chunks.
text (str): Text to split into chunks
chunk_size (int): Size of each chunk in characters
overlap (int): Number of overlapping characters between chunks
Returns: List of text chunks with overlap
Example:
from typing import List
def chunk_text(text: str, chunk_size: int = 200, overlap: int = 20) -> List[str]:
    """Split text into chunks of at most *chunk_size* characters with overlap.

    Args:
        text: Text to split into chunks.
        chunk_size: Size of each chunk in characters; must be positive.
        overlap: Number of overlapping characters between consecutive
            chunks; must satisfy 0 <= overlap < chunk_size.

    Returns:
        List of text chunks; whitespace-only chunks are skipped.

    Raises:
        ValueError: If chunk_size <= 0 or overlap is not in [0, chunk_size).
            (With overlap >= chunk_size the original loop never advanced
            and hung forever.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")

    chunks = []
    start = 0
    while start < len(text):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        # Advance by the chunk size minus the retained overlap; the
        # validation above guarantees this step is positive.
        start += chunk_size - overlap
    return chunks
# Usage
# 200-character chunks, each sharing 20 characters with its predecessor.
text = "Long document text..."
chunks = chunk_text(text, chunk_size=200, overlap=20)
print(f"Created {len(chunks)} chunks")
Combining multiple tools in an agent:
import json
from openai import AsyncOpenAI
from langsmith import traceable
client = AsyncOpenAI()
@traceable(name="MultiToolAgent")
async def agent_with_tools(question: str, max_tool_rounds: int = 10) -> dict:
    """Answer a question, letting the model call the database/KB tools.

    Runs the standard OpenAI tool-calling loop: the model may request tool
    calls, each call is executed and its result appended to the history,
    and the model is re-invoked until it produces a final text answer.

    Args:
        question: The user's question.
        max_tool_rounds: Safety cap on tool-calling round trips so a
            misbehaving model cannot loop forever (backward-compatible
            addition; the original loop was unbounded).

    Returns:
        Dict with "output" (final assistant text) and "messages" (the full
        conversation, including tool calls and tool results).
    """
    tools = [QUERY_DATABASE_TOOL, SEARCH_KNOWLEDGE_BASE_TOOL]
    messages = [{"role": "user", "content": question}]

    response = await client.chat.completions.create(
        model="gpt-5-nano",
        messages=messages,
        tools=tools,
        tool_choice="auto"
    )
    response_message = response.choices[0].message

    rounds = 0
    while response_message.tool_calls and rounds < max_tool_rounds:
        rounds += 1

        # Echo the assistant's tool-call request back into the history so
        # the API can associate the upcoming tool results with it.
        messages.append({
            "role": "assistant",
            "content": response_message.content or "",
            "tool_calls": [{
                "id": tc.id,
                "type": tc.type,
                "function": {
                    "name": tc.function.name,
                    "arguments": tc.function.arguments
                }
            } for tc in response_message.tool_calls]
        })

        # Execute every requested tool call and append its result.
        for tool_call in response_message.tool_calls:
            function_args = json.loads(tool_call.function.arguments)
            # Route to the appropriate tool implementation.
            if tool_call.function.name == "query_database":
                result = query_database(
                    query=function_args["query"],
                    db_path="./inventory/inventory.db"
                )
            elif tool_call.function.name == "search_knowledge_base":
                result = await search_knowledge_base(
                    query=function_args["query"]
                )
            else:
                result = f"Error: Unknown tool {tool_call.function.name}"
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": tool_call.function.name,
                "content": result
            })

        # Let the model see the tool output and continue (or finish).
        response = await client.chat.completions.create(
            model="gpt-5-nano",
            messages=messages,
            tools=tools,
            tool_choice="auto"
        )
        response_message = response.choices[0].message

    return {"output": response_message.content, "messages": messages}
Best Practices
Always use @traceable with run_type="tool":
from langsmith import traceable
@traceable(name="my_tool", run_type="tool")
def my_tool(arg: str) -> str:
    """Minimal traced tool example: echoes its argument in a Result string."""
    return f"Result: {arg}"
Schema Instructions
Provide clear instructions in tool descriptions:
"description": """SQL query to execute.
IMPORTANT:
1. Always discover schema first
2. Use PRAGMA table_info(table_name)
3. Then construct your query"""
Error Messages
Return helpful error messages:
# Fragment: return actionable guidance alongside the raw exception text.
try:
    result = execute_tool()
    return str(result)
except Exception as e:
    return f"Error: {str(e)}. Please check your query syntax."
Use async for I/O-bound operations:
@traceable(name="api_tool", run_type="tool")
async def api_tool(query: str) -> str:
    """Call the search API asynchronously and return the raw response body.

    Args:
        query: Free-text search string, sent as the `q` query parameter.

    Returns:
        The HTTP response body as text.
    """
    async with httpx.AsyncClient() as client:
        # Fix: let httpx URL-encode the query via `params` instead of
        # interpolating it into the URL, which broke on spaces, '&', '#'
        # and other reserved characters.
        response = await client.get("/api/search", params={"q": query})
        return response.text