Custom nodes allow you to extend ScrapeGraphAI with specialized functionality tailored to your scraping needs. All nodes inherit from the BaseNode abstract base class.
from typing import List, Optional
from scrapegraphai.nodes import BaseNode
2
Define Your Node Class
class CustomTextCleanerNode(BaseNode):
    """Node that cleans and normalizes text content.

    Three options are read from ``node_config`` and stored as attributes.
    Each falls back to its default when ``node_config`` is ``None`` or
    does not contain the key:

    * ``remove_html`` -- defaults to ``True``
    * ``lowercase`` -- defaults to ``False``
    * ``verbose`` -- defaults to ``False``
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "CustomTextCleaner",
    ):
        # NOTE(review): the literal 1 is presumably BaseNode's minimum input
        # count -- confirm against the BaseNode signature.
        super().__init__(node_name, "node", input, output, 1, node_config)

        # A missing config behaves like an empty one, so every option goes
        # through the same lookup with its own default.
        options = {} if node_config is None else node_config
        self.remove_html = options.get("remove_html", True)
        self.lowercase = options.get("lowercase", False)
        self.verbose = options.get("verbose", False)
3
Implement the execute() Method
def execute(self, state: dict) -> dict:
    """Clean the node's first input and store it under the first output key.

    Args:
        state: Current graph state.

    Returns:
        The same state object, updated with the cleaned text.
    """
    self.logger.info(f"--- Executing {self.node_name} Node ---")

    # Every input key is looked up in the state, but only the first value
    # is cleaned.
    keys = self.get_input_keys(state)
    values = [state[key] for key in keys]
    cleaned = values[0]

    if self.remove_html:
        import re

        # Drop anything that looks like a tag: "<", non-">" chars, ">".
        cleaned = re.sub(r'<[^>]+>', '', cleaned)

    if self.lowercase:
        cleaned = cleaned.lower()

    # Collapse each run of whitespace to one space and trim both ends.
    cleaned = ' '.join(cleaned.split())

    if self.verbose:
        self.logger.info(f"Cleaned text length: {len(cleaned)} characters")

    state[self.output[0]] = cleaned
    return state
Conditional nodes determine the next node based on logic:
class ContentTypeRouter(BaseNode):
    """Conditional node that routes execution by detected content type.

    ``execute`` returns ``true_node_name`` when the first input contains
    the substring ``<table`` (compared case-insensitively) and
    ``false_node_name`` otherwise.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "ContentRouter",
    ):
        super().__init__(
            node_name, "conditional_node", input, output, 1, node_config
        )

        # Placeholders only: BaseGraph sets the actual target names.
        self.true_node_name = None
        self.false_node_name = None

    def execute(self, state: dict) -> str:
        """Return the name of the next node to execute.

        Args:
            state: Current graph state; the value under the first input
                key is inspected.

        Returns:
            ``true_node_name`` for table content, else ``false_node_name``.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        first_key = self.get_input_keys(state)[0]
        content = state[first_key]

        # Lower-casing first makes the check match "<TABLE", "<Table", etc.
        has_table = "<table" in content.lower()
        if not has_table:
            self.logger.info("Detected regular content")
            return self.false_node_name  # regular-parser branch

        self.logger.info("Detected table content")
        return self.true_node_name  # table-parser branch
Conditional nodes must return a node name (string) instead of updating the state. They require exactly two outgoing edges in the graph.