Define Pydantic schemas for structured, validated data extraction
Custom schemas allow you to define the exact structure of extracted data using Pydantic models. This ensures type safety, validation, and consistent output format.
"""Full example: extract structured data with SmartScraperGraph and a Pydantic schema."""
import os
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from scrapegraphai.graphs import SmartScraperGraph

# Load environment variables (expects the OpenAI key in a .env file)
load_dotenv()


# Define the output schema for the graph.
# Field descriptions guide the LLM on what to extract for each attribute.
class Project(BaseModel):
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    projects: List[Project]


# Define the configuration for the graph
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "openai/gpt-4o-mini",
    },
    "verbose": True,
    "headless": False,
}

# Create the SmartScraperGraph instance and run it.
# Passing `schema=Projects` constrains the output to the model's structure.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(result)
# Import Pydantic components for schema definition.
from typing import List

from pydantic import BaseModel, Field
Import Pydantic components for schema definition.
2
Define your models
# Nested output schema: a Projects container holding a list of Project items.
# Field descriptions help the AI understand what to extract.
class Project(BaseModel):
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    projects: List[Project]
Create nested models with clear field descriptions. The descriptions help the AI understand what to extract.
3
Pass schema to graph
# The `schema` parameter ensures the output matches the defined structure.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,  # Pass your Pydantic model
    config=graph_config,
)
The schema parameter ensures the output matches your defined structure.
4
Get validated results
result = smart_scraper_graph.run()
# result is already validated and structured according to your schema
The output automatically validates against your schema.
# Schema with custom validation and a derived field.
# NOTE: `validator` is the Pydantic v1 API; in Pydantic v2 use
# `field_validator` / `model_validator` instead.
from typing import Optional  # required: `Optional` was referenced without being imported

from pydantic import BaseModel, Field, validator


class Product(BaseModel):
    name: str = Field(description="Product name")
    price: float = Field(description="Price in USD")
    discount_percent: float = Field(description="Discount percentage")
    # Computed from price and discount_percent by the validator below.
    final_price: Optional[float] = None

    @validator('discount_percent')
    def validate_discount(cls, v):
        # Reject discounts outside the 0-100% range.
        if v < 0 or v > 100:
            raise ValueError('Discount must be between 0 and 100')
        return v

    @validator('final_price', always=True)
    def calculate_final_price(cls, v, values):
        # Derive the discounted price once both inputs validated successfully.
        if 'price' in values and 'discount_percent' in values:
            price = values['price']
            discount = values['discount_percent']
            return price * (1 - discount / 100)
        return v
# Export the validated results to different formats.
from scrapegraphai.utils import convert_to_csv, convert_to_json

# Run scraper with schema
result = smart_scraper_graph.run()

# Export to different formats
convert_to_json(result, "output")  # Saves as output.json
convert_to_csv(result, "output")   # Saves as output.csv

# Access as Python objects
# NOTE(review): attribute access below assumes `run()` returns a Projects
# model instance rather than a plain dict — confirm against your
# scrapegraphai version.
for project in result.projects:
    print(f"Project: {project.title}")
    print(f"Description: {project.description}")