ScriptCreatorGraph is a unique graph that generates reusable Python scraping scripts instead of just extracting data. It analyzes a web page and creates a custom extract_data(html: str) -> dict function using BeautifulSoup.
The ScriptCreatorGraph constructor accepts the following parameters:
ScriptCreatorGraph(
    prompt: str,                         # Description of what data to extract
    source: str,                         # URL or path to local HTML file
    config: dict,                        # Configuration dictionary (must include 'library')
    schema: Optional[BaseModel] = None,  # Pydantic schema for structured output
)
# Example: generate a BeautifulSoup scraping script using an OpenAI model.
import os

from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

# Read OPENAI_API_KEY from a local .env file into the environment.
load_dotenv()

# Define the configuration
graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "library": "beautifulsoup",  # Required!
    "verbose": True,
    "headless": False,
}

# Create the ScriptCreatorGraph instance
script_creator = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    source="https://perinim.github.io/projects",
    config=graph_config,
)

# Run the graph and get the generated code.
# The result is Python source text, so print it as-is: JSON-encoding a string
# would collapse it into a single quoted line with escaped newlines.
result = script_creator.run()
print(result)

# Get execution info
graph_exec_info = script_creator.get_execution_info()
print(prettify_exec_info(graph_exec_info))
# Example: generate a BeautifulSoup scraping script using a local Ollama model.
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

# Define the configuration for local Ollama
graph_config = {
    "llm": {
        "model": "ollama/llama3.1",
        "temperature": 0.5,
        "base_url": "http://localhost:11434",
    },
    "library": "beautifulsoup",  # Required!
    "verbose": True,
}

# Create the ScriptCreatorGraph instance
script_creator = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    source="https://perinim.github.io/projects",
    config=graph_config,
)

# Run the graph; the result is the generated script.
result = script_creator.run()
print(result)

# Get execution info
graph_exec_info = script_creator.get_execution_info()
print(prettify_exec_info(graph_exec_info))
The graph generates a complete Python function that you can reuse:
# `script_creator` is a ScriptCreatorGraph instance created beforehand.
result = script_creator.run()

# Example output:
'''
from bs4 import BeautifulSoup

def extract_data(html: str) -> dict:
    """
    Extract news articles with their descriptions from the HTML.

    Args:
        html: Raw HTML string of the page

    Returns:
        Dictionary containing extracted news articles
    """
    soup = BeautifulSoup(html, 'html.parser')
    articles = []

    # Find all article elements
    for article in soup.find_all('article', class_='post'):
        title = article.find('h2').text.strip()
        description = article.find('p', class_='description').text.strip()
        articles.append({
            'title': title,
            'description': description
        })

    return {'articles': articles}
'''
# Example: save the generated scraper to a module, then import and reuse it.
import requests

from scrapegraphai.graphs import ScriptCreatorGraph

# Generate the script.
# `graph_config` must already be defined: a configuration dict that includes
# the "llm" settings and the required "library" key.
script_creator = ScriptCreatorGraph(
    prompt="Extract all product names and prices",
    source="https://example.com/products",
    config=graph_config,
)
generated_code = script_creator.run()

# Save to file; an explicit encoding keeps the module readable on every platform.
with open("my_scraper.py", "w", encoding="utf-8") as f:
    f.write(generated_code)

# Later, import and use it (the import only works once my_scraper.py exists).
from my_scraper import extract_data

# Bound the request time and fail loudly on HTTP errors, so an error page is
# never handed to the scraper as if it were the product listing.
response = requests.get("https://example.com/products", timeout=30)
response.raise_for_status()

html = response.text
data = extract_data(html)
print(data)
# Example: validate the generated script before saving it.
# `script_creator` is a ScriptCreatorGraph instance created beforehand.
try:
    code = script_creator.run()

    # Accept the result only if it is non-empty and defines extract_data.
    if code and "def extract_data" in code:
        print("Script generated successfully!")

        # Save or use the code
        with open("scraper.py", "w", encoding="utf-8") as f:
            f.write(code)
    else:
        print("Failed to generate valid code")
except Exception as e:
    # Top-level boundary of the example: report the failure instead of crashing.
    print(f"Error during code generation: {e}")