Browser Use Integration

Overview

JARVIS uses Browser Use for autonomous web navigation and data extraction. Browser Use provides both a local SDK and a Cloud API for running browser agents at scale.

Architecture Decision

backend/agents/browser_use_client.py

# RESEARCH: Browser Use Cloud API v2 (docs.cloud.browser-use.com)
# DECISION: Using httpx async client — pure REST, no SDK needed
# ALT: browser-use pip package (not needed for Cloud API)

JARVIS uses two integration modes:

Browser Use Cloud API (REST) - For marketplace skills and task execution
Browser Use SDK (Python) - For custom agents with browser-use Agent class

Cloud API Client

Implementation

backend/agents/browser_use_client.py

import httpx
from loguru import logger

BU_BASE = "https://api.browser-use.com/api/v2"

class BrowserUseClient:
    """Thin asynchronous wrapper around the Browser Use Cloud API v2."""

    def __init__(self, settings: Settings):
        # Keep the raw key and pre-build the auth header reused by every request.
        api_key = settings.browser_use_api_key
        self._key = api_key
        self._headers = {"X-Browser-Use-API-Key": api_key if api_key else ""}

    def _check(self, resp: httpx.Response) -> dict[str, Any]:
        """Return the decoded JSON body, or log and raise on a 4xx/5xx status."""
        if resp.status_code < 400:
            return resp.json()
        # Only the first 200 characters of the body are logged and raised.
        snippet = resp.text[:200]
        logger.error("Browser Use API error: {} {}", resp.status_code, snippet)
        raise BrowserUseError(resp.status_code, snippet)

Creating Sessions

backend/agents/browser_use_client.py

async def create_session(
    self,
    start_url: str | None = None,
    width: int = 1280,
    height: int = 800,
) -> dict[str, Any]:
    """Open a cloud browser session via POST /api/v2/sessions.

    The viewport size and keepAlive=True are always sent; ``startUrl`` is
    included only when a non-empty ``start_url`` is given. Returns the
    decoded response produced by ``self._check``.
    """
    payload: dict[str, Any] = {
        "browserScreenWidth": width,
        "browserScreenHeight": height,
        "keepAlive": True,
        **({"startUrl": start_url} if start_url else {}),
    }

    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(
            f"{BU_BASE}/sessions", json=payload, headers=self._headers
        )

    session = self._check(response)
    logger.info("Created BU session: {}", session.get("id"))
    return session

Creating Tasks

backend/agents/browser_use_client.py

async def create_task(
    self,
    session_id: str,
    task: str,
    start_url: str | None = None,
    allowed_domains: list[str] | None = None,
) -> dict[str, Any]:
    """Start a task on an existing session via POST /api/v2/tasks.

    Every task is sent with maxSteps=50 and vision enabled. ``startUrl`` and
    ``allowedDomains`` are added only when the matching argument is truthy.
    Returns the decoded response produced by ``self._check``.
    """
    payload: dict[str, Any] = {
        "task": task,
        "sessionId": session_id,
        "maxSteps": 50,
        "vision": True,
    }
    # Optional fields are skipped entirely when not provided (None or empty).
    optional_fields = {"startUrl": start_url, "allowedDomains": allowed_domains}
    payload.update({key: value for key, value in optional_fields.items() if value})

    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(
            f"{BU_BASE}/tasks", json=payload, headers=self._headers
        )

    created = self._check(response)
    logger.info("Created BU task: {} on session {}", created.get("id"), session_id)
    return created

Getting Task Status

backend/agents/browser_use_client.py

async def get_task(self, task_id: str) -> dict[str, Any]:
    """Fetch a task (steps and status) via GET /api/v2/tasks/{id}."""
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.get(
            f"{BU_BASE}/tasks/{task_id}",
            headers=self._headers,
        )
    # _check raises on HTTP error statuses and otherwise returns the JSON body.
    task_info = self._check(response)
    return task_info

Making Sessions Public

backend/agents/browser_use_client.py

async def make_session_public(self, session_id: str) -> dict[str, Any]:
    """Request a public replay link via POST /api/v2/sessions/{id}/public-share."""
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(
            f"{BU_BASE}/sessions/{session_id}/public-share",
            headers=self._headers,
        )

    share = self._check(response)
    # The share link is read from the "shareUrl" field of the response.
    logger.info("Public share created for session {}: {}", session_id, share.get("shareUrl"))
    return share

Cloud Skills Runner

The CloudSkillRunner uses the browser_use_sdk AsyncBrowserUse client to run Browser Use Cloud tasks, either guided by a marketplace skill or as freeform tasks:

backend/agents/cloud_skills.py

# Browser Use Cloud SDK skill runner + autonomous task executor
from browser_use_sdk import AsyncBrowserUse

class CloudSkillRunner:
    """Executes Browser Use Cloud tasks, optionally guided by a skill.

    Supported modes:
    1. run_skill(skill_name, task) — task steered by a marketplace/cloned skill
    2. run_task(task) — freeform browser task (for deep URL extraction)
    """

    def __init__(self, settings: Settings):
        # Copy the settings this runner needs; the SDK client is built lazily.
        self._api_key = settings.browser_use_api_key
        self._profile_id = settings.browser_use_profile_id
        self._op_vault_id = settings.op_vault_id
        self._client = None

    def _get_client(self):
        """Return the cached SDK client, building it on first use.

        Returns None (after logging a warning) when browser_use_sdk cannot
        be imported.
        """
        if self._client is None:
            try:
                from browser_use_sdk import AsyncBrowserUse
                self._client = AsyncBrowserUse(api_key=self._api_key)
            except ImportError:
                logger.warning("browser_use_sdk not installed")
                return None
        return self._client

Marketplace Skills

backend/agents/cloud_skills.py

# Cloned skill registry (26 skills in account; 15 are mapped here).
# Maps the short skill name accepted by run_skill() to the skill's ID.
SKILL_IDS = {
    # Social Media — person research
    "tiktok_profile": "60e6940b-eb2c-4d54-aa54-0012ff6b6a8d",
    "instagram_posts": "dc6b4dac-f983-488b-9f15-81cdef09a2a5",
    "facebook_page": "07bca652-0668-4836-9a89-68f728a6b922",
    "linktree_profile": "4d3cc402-f840-4939-9f2b-0f707adf67d1",
    "pinterest_pins": "7dffd877-5c0f-4c72-a1c2-6f7d641a3283",
    "youtube_filmography": "f5d4c977-126e-4374-b1b3-8b02942dfa1c",
    "reddit_subreddit": "4693909a-7d0e-4628-8e0d-1cab01271a3c",
    # Professional / OSINT
    "linkedin_company_posts": "b50a1f27-22a6-489f-9768-83c0c52e8207",
    "company_employees": "47634a96-110e-4da0-ace5-8c520af7d9c3",
    "github_profile": "bcc9987f-7bc7-4af5-8c8e-7046be487e64",
    "sec_filings": "ce6cc606-837a-44d0-94e7-977d8b470113",
    "whois_lookup": "a9bf3a53-d58f-4a09-9a29-d5663a33937b",
    "osint_scraper": "2a607934-bf3e-43d9-aaa2-30f2b419eaf5",
    "yc_company": "cc661bb6-e3f3-4a48-9012-08b252a0a0bb",
    "ancestry_records": "b985c331-4515-4702-bc64-be75f429aee3",
}

Running Skills

backend/agents/cloud_skills.py

async def run_skill(
    self,
    skill_name: str,
    task: str,
    *,
    max_steps: int = 5,
    timeout: float = 60.0,
    secrets: dict[str, str] | None = None,
    allowed_domains: list[str] | None = None,
) -> dict | None:
    """Execute ``task`` under the named skill and wait for the outcome.

    ``skill_name`` is looked up in SKILL_IDS. An unknown name is logged and
    yields None; otherwise the work is delegated to ``_create_and_poll``
    with the skill name as the log label.
    """
    resolved_id = SKILL_IDS.get(skill_name)
    if resolved_id:
        return await self._create_and_poll(
            task,
            skill_ids=[resolved_id],
            max_steps=max_steps,
            timeout=timeout,
            secrets=secrets,
            allowed_domains=allowed_domains,
            label=skill_name,
        )

    logger.error("cloud_skills: unknown skill '{}'", skill_name)
    return None

Polling Tasks

backend/agents/cloud_skills.py

POLL_INTERVAL_SECONDS = 2.0

async def _create_and_poll(
    self,
    task: str,
    *,
    skill_ids: list[str] | None = None,
    max_steps: int = 10,
    timeout: float = 60.0,
    secrets: dict[str, str] | None = None,
    allowed_domains: list[str] | None = None,
    label: str = "task",
) -> dict | None:
    """Create a cloud task and poll it until it finishes or the timeout elapses.

    Args:
        task: Natural-language instruction for the browser agent.
        skill_ids: Optional skill IDs to guide the task.
        max_steps: Step budget forwarded to the task.
        timeout: Approximate wall-clock budget in seconds; the task is polled
            every POLL_INTERVAL_SECONDS for at most timeout / interval rounds.
        secrets: Optional secrets forwarded to the task.
        allowed_domains: Optional domain allow-list forwarded to the task.
        label: Name used in log lines and echoed in the result.

    Returns:
        A result dict with "output", "success", "cost", "task_id",
        "live_url" and "label" (plus "timed_out": True when the budget ran
        out), or None when the SDK client is unavailable or any exception
        was raised.
    """

    client = self._get_client()
    if client is None:
        return None

    try:
        create_kwargs: dict = {
            "task": task,
            "llm": "browser-use-2.0",
            "max_steps": max_steps,
        }
        if skill_ids:
            create_kwargs["skill_ids"] = skill_ids
        if secrets:
            create_kwargs["secrets"] = secrets
        # Previously accepted but silently dropped, so callers' domain
        # restrictions never reached the task.
        if allowed_domains:
            create_kwargs["allowed_domains"] = allowed_domains

        result = await client.tasks.create_task(**create_kwargs)
        task_id = result.id
        live_url = getattr(result, "live_url", None)
        max_polls = int(timeout / POLL_INTERVAL_SECONDS)

        for _ in range(max_polls):
            await asyncio.sleep(POLL_INTERVAL_SECONDS)
            status = await client.tasks.get_task_status(task_id)

            if status.status in ("finished", "stopped"):
                output = {
                    "output": status.output or "",
                    "success": bool(status.is_success),
                    "cost": str(status.cost) if status.cost else "unknown",
                    "task_id": task_id,
                    "live_url": live_url,
                    "label": label,
                }
                logger.info(
                    "cloud_skills: completed label={} success={} cost={}",
                    label, output["success"], output["cost"],
                )
                return output

        logger.warning("cloud_skills: timed out label={} after {:.0f}s", label, timeout)
        # Same keys as the success result so callers can read either shape.
        return {
            "output": "",
            "success": False,
            "cost": "unknown",
            "task_id": task_id,
            "live_url": live_url,
            "label": label,
            "timed_out": True,
        }

    except Exception as exc:
        # Best-effort boundary: failures are logged and reported as None.
        logger.error("cloud_skills: error label={}: {}", label, str(exc))
        return None

Browser Use SDK Integration

BaseBrowserAgent

Agents that need custom browser automation use the SDK:

backend/agents/browser_agent.py

from browser_use import Agent, Browser, ChatBrowserUse
from langchain_openai import ChatOpenAI

class BaseBrowserAgent(ABC):
    def _build_llm(self):
        """Build the LLM instance for browser-use agents.

        Prefers ChatBrowserUse (optimized for browser automation, 3-5x faster)
        when a Browser Use API key is configured. If that fails for any
        reason, or no key is set, falls back to ChatOpenAI with gpt-4o-mini
        when an OpenAI API key is configured.

        Raises:
            RuntimeError: If neither backend can be used.
        """
        if self._settings.browser_use_api_key:
            try:
                from browser_use import ChatBrowserUse
                logger.debug("agent={} using ChatBrowserUse bu-2-0", self.agent_name)
                return ChatBrowserUse(model="bu-2-0")
            except Exception as exc:
                # Exception already covers ImportError; log the cause so the
                # reason for the fallback is not lost.
                logger.debug(
                    "agent={} ChatBrowserUse unavailable ({}), trying ChatOpenAI",
                    self.agent_name, exc,
                )

        if self._settings.openai_api_key:
            from langchain_openai import ChatOpenAI
            logger.debug("agent={} using ChatOpenAI gpt-4o-mini", self.agent_name)
            return ChatOpenAI(
                model="gpt-4o-mini",
                api_key=self._settings.openai_api_key,
            )

        raise RuntimeError("No LLM configured: set BROWSER_USE_API_KEY or OPENAI_API_KEY")

Creating Agents

backend/agents/browser_agent.py

def _create_browser_agent(
    self, task: str, *, max_steps: int = 10, needs_login: bool = False,
):
    """Build a Browser Use Agent configured to favour speed.

    Settings applied to every agent:
      - flash_mode=True for faster LLM responses
      - max_failures=2 to fail fast instead of retrying 5x
      - enable_planning=False to skip planning overhead on focused tasks
      - step_timeout=60 to kill stuck navigation steps quickly
      - max_actions_per_step=3 to keep each step small
      - use_vision="auto" to only send screenshots when needed

    A cloud browser is attached when a Browser Use API key is configured;
    if that setup fails, a warning is logged and the agent is created
    without it.

    Note: ``max_steps`` and ``needs_login`` are accepted but not read
    anywhere in this body.
    """
    from browser_use import Agent

    agent_kwargs: dict = dict(
        task=task,
        llm=self._build_llm(),
        max_failures=2,
        flash_mode=True,
        enable_planning=False,
        step_timeout=60,
        max_actions_per_step=3,
        use_vision="auto",
    )

    # Without an API key there is no cloud browser to attach.
    if not self._settings.browser_use_api_key:
        return Agent(**agent_kwargs)

    try:
        from browser_use import Browser
        agent_kwargs["browser"] = Browser(use_cloud=True)
        logger.debug("agent={} using Browser Use cloud", self.agent_name)
    except Exception as exc:
        logger.warning("agent={} cloud browser setup failed: {}", self.agent_name, str(exc))

    return Agent(**agent_kwargs)

Usage Examples

Using Cloud Skills

from agents.cloud_skills import CloudSkillRunner
from config import Settings

# NOTE: the `await` below must run inside a coroutine (e.g. an `async def`).
settings = Settings()
runner = CloudSkillRunner(settings)

# Run a marketplace skill
result = await runner.run_skill(
    "tiktok_profile",
    "Get TikTok profile info for Elon Musk",
    timeout=60.0,
)

# run_skill returns None for an unknown skill name, so check the result
# before indexing into it.
if result and result["success"]:
    print(f"Output: {result['output']}")
    print(f"Cost: {result['cost']}")
    print(f"Live URL: {result['live_url']}")

Using SDK Agents

from agents.browser_agent import BaseBrowserAgent
from agents.models import AgentResult, ResearchRequest

class CustomAgent(BaseBrowserAgent):
    """Example agent: run one search task and return its final result as a snippet."""

    agent_name = "custom"

    async def _run_task(self, request: ResearchRequest) -> AgentResult:
        """Search for the request's query and wrap the agent's final result."""
        query = self._build_search_query(request)
        task = f"Search for {query} and extract information"

        agent = self._create_browser_agent(task, max_steps=5)
        result = await agent.run()
        final_result = result.final_result() if result else None

        # Emit a snippet only when the agent produced something; str(None)
        # would otherwise put the literal text "None" into the result.
        snippets = [str(final_result)] if final_result is not None else []

        # NOTE(review): AgentStatus is not imported in this example — confirm
        # which module provides it and add the import.
        return AgentResult(
            agent_name=self.agent_name,
            status=AgentStatus.SUCCESS,
            snippets=snippets,
        )

Deep URL Extraction

from agents.cloud_skills import CloudSkillRunner

# Assumes `settings` is an existing Settings instance; it is not created in
# this snippet.
runner = CloudSkillRunner(settings)

# NOTE(review): deep_extract_url is not among the CloudSkillRunner methods
# shown on this page — confirm its signature against the source.
result = await runner.deep_extract_url(
    url="https://example.com/profile",
    person_name="John Doe",
    timeout=60.0,
)

# Guard against a None result before reading keys from it.
if result and result["success"]:
    print(f"Extracted: {result['output']}")

Environment Variables

.env

# Browser Use Cloud API key
BROWSER_USE_API_KEY=bu_...

# Optional: Browser Use Profile ID for synced cookies
BROWSER_USE_PROFILE_ID=profile_...

# Optional: 1Password Vault ID for cross-session auth
OP_VAULT_ID=vault_...

# Fallback: OpenAI API key for local browser agents
OPENAI_API_KEY=sk-...

Troubleshooting

API Key Not Working

# Verify API key is set
from config import Settings
settings = Settings()
# Prints True when settings.browser_use_api_key is truthy, False otherwise.
print(f"Browser Use configured: {bool(settings.browser_use_api_key)}")

Task Timing Out

# Increase timeout and max_steps
# Assumes `runner` is an existing CloudSkillRunner and that this runs inside
# a coroutine.
result = await runner.run_skill(
    "tiktok_profile",
    "Get TikTok profile",
    max_steps=10,  # Default is 5
    timeout=120.0,  # Default is 60s
)

Session Not Loading

# Check session status
# NOTE(review): get_session is not among the BrowserUseClient methods shown
# on this page — confirm it exists before relying on this snippet.
# Assumes `settings` and `session_id` are already defined.
client = BrowserUseClient(settings)
session = await client.get_session(session_id)
print(f"Session status: {session.get('status')}")
print(f"Live URL: {session.get('liveUrl')}")

Cloud SDK Import Error

# Install browser-use-sdk
pip install browser-use-sdk

Performance Tips

Use Skills for Known Platforms: Marketplace skills are faster than freeform tasks
Limit max_steps: Default to 5 for focused tasks, use 10+ only for complex workflows
Set Tight Timeouts: Use 30-60s for simple tasks, 120s+ for complex multi-step workflows
Parallel Execution: Use run_parallel() to run multiple skills concurrently
Semaphore Limiting: DeepResearcher limits to 25 concurrent sessions to avoid rate limits

Get Started

Core Concepts

Hardware Integration

Backend Services

Agent System

Frontend

Data & Storage

Deployment

Browser Use Integration

Overview

Architecture Decision

Cloud API Client

Implementation

Creating Sessions

Creating Tasks

Getting Task Status

Making Sessions Public

Cloud Skills Runner

Marketplace Skills

Running Skills

Polling Tasks

Browser Use SDK Integration

BaseBrowserAgent

Creating Agents

Usage Examples

Using Cloud Skills

Using SDK Agents

Deep URL Extraction

Environment Variables

Troubleshooting

API Key Not Working

Task Timing Out

Session Not Loading

Cloud SDK Import Error

Performance Tips

Next Steps

LinkedIn Agent

Deep Researcher

Build docs developers (and LLMs) love

Get Started

Core Concepts

Hardware Integration

Backend Services

Agent System

Frontend

Data & Storage

Deployment

​Overview

​Architecture Decision

​Cloud API Client

​Implementation

​Creating Sessions

​Creating Tasks

​Getting Task Status

​Making Sessions Public

​Cloud Skills Runner

​Marketplace Skills

​Running Skills

​Polling Tasks

​Browser Use SDK Integration

​BaseBrowserAgent

​Creating Agents

​Usage Examples

​Using Cloud Skills

​Using SDK Agents

​Deep URL Extraction

​Environment Variables

​Troubleshooting

​API Key Not Working

​Task Timing Out

​Session Not Loading

​Cloud SDK Import Error

​Performance Tips

​Next Steps

LinkedIn Agent

Deep Researcher

Build docs developers (and LLMs) love

Overview

Architecture Decision

Cloud API Client

Implementation

Creating Sessions

Creating Tasks

Getting Task Status

Making Sessions Public

Cloud Skills Runner

Marketplace Skills

Running Skills

Polling Tasks

Browser Use SDK Integration

BaseBrowserAgent

Creating Agents

Usage Examples

Using Cloud Skills

Using SDK Agents

Deep URL Extraction

Environment Variables

Troubleshooting

API Key Not Working

Task Timing Out

Session Not Loading

Cloud SDK Import Error

Performance Tips

Next Steps