DocumentScraperGraph is specialized for extracting information from plain-text documents and Markdown files. Unlike SmartScraperGraph, which handles HTML, this graph processes raw text content efficiently.
# Example: summarize a passage of inline text with an OpenAI-hosted model.
import os
import json

from dotenv import load_dotenv
from scrapegraphai.graphs import DocumentScraperGraph

# Load variables from a .env file (if one exists) so that OPENAI_API_KEY
# can be read from the environment below.
load_dotenv()

openai_key = os.getenv("OPENAI_API_KEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "openai/gpt-4o",
    }
}

# Example: Direct text input
# The passage is written as adjacent string literals, which Python joins into
# a single string, so the long text can be wrapped without adding newlines.
source = (
    " The Divine Comedy, Italian La Divina Commedia, original name La commedia, "
    "long narrative poem written in Italian circa 1308/21 by Dante. "
    "It is usually held to be one of the world's great works of literature. "
    "Divided into three major sections—Inferno, Purgatorio, and Paradiso—the "
    "narrative traces the journey of Dante from darkness and error to the "
    "revelation of the divine light, culminating in the Beatific Vision of God. "
    "Dante is guided by the Roman poet Virgil, who represents the epitome of "
    "human knowledge, from the dark wood through the descending circles of the "
    "pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided by "
    "the Roman poet Statius, who represents the fulfilment of human knowledge, "
    "and is finally led by his lifelong love, the Beatrice of his earlier "
    "poetry, through the celestial spheres of Paradise."
)

document_scraper = DocumentScraperGraph(
    prompt="Summarize the text and find the main topics",
    source=source,
    config=graph_config,
)

# Run the graph and pretty-print whatever it returns as JSON.
result = document_scraper.run()
print(json.dumps(result, indent=4))
# Example: the same summarization task, using a local Ollama model.
import json

from scrapegraphai.graphs import DocumentScraperGraph

# Define the configuration for local Ollama
graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",
        "model_tokens": 4000,
    },
    "verbose": True,
    "headless": False,
}

# The passage is written as adjacent string literals, which Python joins into
# a single string, so the long text can be wrapped without adding newlines.
source = (
    " The Divine Comedy, Italian La Divina Commedia, original name La commedia, "
    "long narrative poem written in Italian circa 1308/21 by Dante. "
    "It is usually held to be one of the world's great works of literature. "
    "Divided into three major sections—Inferno, Purgatorio, and Paradiso—the "
    "narrative traces the journey of Dante from darkness and error to the "
    "revelation of the divine light, culminating in the Beatific Vision of God."
)

document_scraper = DocumentScraperGraph(
    prompt="Summarize the text and find the main topics",
    source=source,
    config=graph_config,
)

# Run the graph and pretty-print whatever it returns as JSON.
result = document_scraper.run()
print(json.dumps(result, indent=4))
# Example: point the graph at a directory instead of an inline string.
# DocumentScraperGraph and graph_config are not defined in this snippet; it
# reuses the import and configuration set up in the earlier examples.
# NOTE(review): os and Path are imported here but not used by this snippet.
import os
from pathlib import Path

# Process all markdown files in a directory
docs_dir = "/path/to/documentation/"

document_scraper = DocumentScraperGraph(
    prompt="Extract all API endpoints and their descriptions",
    source=docs_dir,
    config=graph_config,
)

result = document_scraper.run()
# Example: run the graph defensively. `document_scraper` is the graph
# instance built in one of the earlier examples.
try:
    result = document_scraper.run()
    # A falsy result (None, empty dict, empty string, ...) is reported as
    # "nothing extracted" rather than as a success.
    if result:
        print("Extraction successful:", result)
    else:
        print("No information extracted")
except FileNotFoundError:
    # Handled before the generic case so a missing file gets its own message.
    print("Document file not found")
except Exception as e:
    # Catch-all at the script's top level: report the failure instead of
    # letting the traceback propagate.
    print(f"Error during extraction: {e}")