Nodes are the fundamental processing units in ScrapeGraphAI. Each node performs a specific task (fetching, parsing, generating, etc.) and updates the graph state with its results.
from logging import Logger
from typing import List


class BaseNode:
    """
    Overview of the attributes every node carries.

    This listing only declares the attribute names and their types; it does
    not assign values or define behavior.
    """

    node_name: str  # Unique identifier for the node
    node_type: str  # Either "node" or "conditional_node"
    input: str  # Boolean expression defining input keys
    output: List[str]  # List of output keys to update in state
    min_input_len: int  # Minimum required input keys (default: 1)
    node_config: dict  # Additional configuration
    logger: Logger  # Logging instance
Extend BaseNode to create custom processing logic:
from typing import List, Optional

from scrapegraphai.nodes import BaseNode


class MyCustomNode(BaseNode):
    """
    Custom node that processes data in a specific way.

    The node reads the state entries selected by its input expression,
    passes them to ``process_data`` and stores the result under its first
    output key.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "MyCustomNode",
    ):
        """
        Set up the node and read its optional settings.

        Args:
            input: Boolean expression defining the input keys
            output: List of output keys to update in state
            node_config: Optional settings; ``llm_model`` and ``verbose``
                are read from it when present
            node_name: Unique identifier for the node
        """
        super().__init__(node_name, "node", input, output, 1, node_config)

        # Initialize node-specific attributes from config. A missing or
        # empty config falls back to llm_model=None and verbose=False.
        config = node_config or {}
        self.llm_model = config.get("llm_model")
        self.verbose = config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Execute the node's logic.

        Args:
            state: Current graph state

        Returns:
            Updated state dictionary (the same object that was passed in,
            modified in place)
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Get input keys based on the input expression
        # (get_input_keys is inherited from BaseNode).
        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]

        # Process the input data
        result = self.process_data(input_data)

        # Update state with output; only the first output key is written.
        state.update({self.output[0]: result})
        return state

    def process_data(self, input_data):
        """
        Custom processing logic.

        Args:
            input_data: List of state values, one per resolved input key

        Returns:
            The processed data. This placeholder returns the input
            unchanged so the example runs as written.
        """
        # Your custom logic here
        processed_data = input_data
        return processed_data
# Change selected parameters on a single node
node.update_config(
    {"verbose": True, "timeout": 60},
    overwrite=True,
)

# Alternatively, call set_common_params on an AbstractGraph
graph.set_common_params(
    {"verbose": True, "headless": False},
    overwrite=True,
)
class SafeNode(BaseNode):
    """
    Node that records validation failures in the state instead of raising.

    NOTE(review): ``self.process`` is not defined in this snippet; it is
    presumably supplied by the concrete node - confirm before reuse.
    """

    def execute(self, state: dict) -> dict:
        """
        Run the node and store its result under the first output key.

        Args:
            state: Current graph state

        Returns:
            The same state dictionary, updated in place. After a
            ``ValueError`` the first output key holds ``{"error": <message>}``.

        Raises:
            Exception: Any error other than ``ValueError`` is logged and
                re-raised.
        """
        try:
            self.logger.info(f"--- Executing {self.node_name} ---")
            values = [state[name] for name in self.get_input_keys(state)]

            # Process data
            state[self.output[0]] = self.process(values)
        except ValueError as err:
            # Validation problems are reported through the state.
            self.logger.error(f"Validation error in {self.node_name}: {err}")
            state[self.output[0]] = {"error": str(err)}
        except Exception as err:
            self.logger.error(f"Error in {self.node_name}: {err}")
            raise  # Re-raise for graph-level handling
        return state
Each node should do one thing well. Split complex logic into multiple nodes:
# Good: each node owns exactly one concern
fetch_node = FetchNode(...)        # fetching only
parse_node = ParseNode(...)        # parsing only
generate_node = GenerateNode(...)  # generation only

# Avoid: a single node that does too much