Skip to main content

Overview

JARVIS uses Browser Use for autonomous web navigation and data extraction. Browser Use provides both a local SDK and a Cloud API for running browser agents at scale.

Architecture Decision

backend/agents/browser_use_client.py
# RESEARCH: Browser Use Cloud API v2 (docs.cloud.browser-use.com)
# DECISION: Using httpx async client — pure REST, no SDK needed
# ALT: browser-use pip package (not needed for Cloud API)
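JARVIS uses two integration modes:
  1. Browser Use Cloud API - For marketplace skills and task execution (called over REST with httpx in BrowserUseClient, and through the browser_use_sdk client in CloudSkillRunner)
  2. Browser Use SDK (Python) - For custom agents built on the browser-use Agent class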

Cloud API Client

Implementation

backend/agents/browser_use_client.py
import httpx
from loguru import logger

BU_BASE = "https://api.browser-use.com/api/v2"

class BrowserUseClient:
    """Thin async wrapper around the Browser Use Cloud REST API (v2).

    Keeps the API key read from settings together with the auth header
    built from it.
    """

    def __init__(self, settings: Settings):
        """Capture the API key and precompute the request headers."""
        api_key = settings.browser_use_api_key
        self._key = api_key
        # A missing key is sent as an empty header value.
        self._headers = {"X-Browser-Use-API-Key": api_key or ""}

    def _check(self, resp: httpx.Response) -> dict[str, Any]:
        """Return the decoded JSON body, or raise on an HTTP error status.

        Raises:
            BrowserUseError: when the status code is 400 or above; it is
                given the status code and the first 200 characters of the
                response text.
        """
        if resp.status_code < 400:
            return resp.json()

        snippet = resp.text[:200]
        logger.error("Browser Use API error: {} {}", resp.status_code, snippet)
        raise BrowserUseError(resp.status_code, snippet)

Creating Sessions

backend/agents/browser_use_client.py
async def create_session(
    self,
    start_url: str | None = None,
    width: int = 1280,
    height: int = 800,
) -> dict[str, Any]:
    """Open a cloud browser session via POST /api/v2/sessions.

    Args:
        start_url: Page to load first; left out of the request when falsy.
        width: Sent as ``browserScreenWidth``.
        height: Sent as ``browserScreenHeight``.

    Returns:
        The parsed JSON response describing the new session.
    """
    payload: dict[str, Any] = {
        "browserScreenWidth": width,
        "browserScreenHeight": height,
        "keepAlive": True,
    }
    if start_url:
        payload["startUrl"] = start_url

    endpoint = f"{BU_BASE}/sessions"
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(endpoint, json=payload, headers=self._headers)

    session = self._check(response)
    logger.info("Created BU session: {}", session.get("id"))
    return session

Creating Tasks

backend/agents/browser_use_client.py
async def create_task(
    self,
    session_id: str,
    task: str,
    start_url: str | None = None,
    allowed_domains: list[str] | None = None,
) -> dict[str, Any]:
    """Start a task on an existing session via POST /api/v2/tasks.

    Args:
        session_id: Session the task is attached to (sent as ``sessionId``).
        task: Instruction text for the task.
        start_url: Sent as ``startUrl`` when truthy.
        allowed_domains: Sent as ``allowedDomains`` when truthy.

    Returns:
        The parsed JSON response describing the created task.
    """
    payload: dict[str, Any] = {
        "task": task,
        "sessionId": session_id,
        "maxSteps": 50,
        "vision": True,
    }
    # Optional fields are only included when the caller supplied a truthy value.
    optional_fields = {"startUrl": start_url, "allowedDomains": allowed_domains}
    payload.update({key: value for key, value in optional_fields.items() if value})

    endpoint = f"{BU_BASE}/tasks"
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(endpoint, json=payload, headers=self._headers)

    created = self._check(response)
    logger.info("Created BU task: {} on session {}", created.get("id"), session_id)
    return created

Getting Task Status

backend/agents/browser_use_client.py
async def get_task(self, task_id: str) -> dict[str, Any]:
    """Fetch a task, with its steps and status, via GET /api/v2/tasks/{id}."""
    task_url = f"{BU_BASE}/tasks/{task_id}"
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.get(task_url, headers=self._headers)
    return self._check(response)

Making Sessions Public

backend/agents/browser_use_client.py
async def make_session_public(self, session_id: str) -> dict[str, Any]:
    """Request a public replay URL via POST /api/v2/sessions/{id}/public-share.

    Returns:
        The parsed JSON response; its ``shareUrl`` value is logged.
    """
    share_endpoint = f"{BU_BASE}/sessions/{session_id}/public-share"
    async with httpx.AsyncClient(timeout=30) as http:
        response = await http.post(share_endpoint, headers=self._headers)

    share = self._check(response)
    logger.info("Public share created for session {}: {}", session_id, share.get("shareUrl"))
    return share

Cloud Skills Runner

The CloudSkillRunner wraps the Cloud API (through the browser_use_sdk AsyncBrowserUse client) to run marketplace skills and freeform browser tasks:
backend/agents/cloud_skills.py
# Browser Use Cloud SDK skill runner + autonomous task executor
from browser_use_sdk import AsyncBrowserUse

class CloudSkillRunner:
    """Executes Browser Use Cloud tasks, optionally guided by a skill.

    Two modes:
    1. run_skill(skill_name, task) — guided by a marketplace/cloned skill
    2. run_task(task) — freeform browser task (for deep URL extraction)
    """

    def __init__(self, settings: Settings):
        """Copy the Browser Use settings; the SDK client is created lazily."""
        self._client = None
        self._api_key = settings.browser_use_api_key
        self._profile_id = settings.browser_use_profile_id
        self._op_vault_id = settings.op_vault_id

    def _get_client(self):
        """Return the cached AsyncBrowserUse client, building it on first use.

        Returns None, after logging a warning, when an ImportError is raised
        while importing or constructing the client.
        """
        if self._client is None:
            try:
                from browser_use_sdk import AsyncBrowserUse
                self._client = AsyncBrowserUse(api_key=self._api_key)
            except ImportError:
                logger.warning("browser_use_sdk not installed")
                return None
        return self._client

Marketplace Skills

backend/agents/cloud_skills.py
# Cloned skill registry (26 skills in account)
# Maps the short skill name used by callers to the ID of the cloned skill.
# NOTE(review): the header says 26 skills, but only the 15 entries below are
# visible here — confirm whether the rest were elided from this listing.
SKILL_IDS: dict[str, str] = {
    # Social Media — person research
    "tiktok_profile": "60e6940b-eb2c-4d54-aa54-0012ff6b6a8d",
    "instagram_posts": "dc6b4dac-f983-488b-9f15-81cdef09a2a5",
    "facebook_page": "07bca652-0668-4836-9a89-68f728a6b922",
    "linktree_profile": "4d3cc402-f840-4939-9f2b-0f707adf67d1",
    "pinterest_pins": "7dffd877-5c0f-4c72-a1c2-6f7d641a3283",
    "youtube_filmography": "f5d4c977-126e-4374-b1b3-8b02942dfa1c",
    "reddit_subreddit": "4693909a-7d0e-4628-8e0d-1cab01271a3c",
    # Professional / OSINT
    "linkedin_company_posts": "b50a1f27-22a6-489f-9768-83c0c52e8207",
    "company_employees": "47634a96-110e-4da0-ace5-8c520af7d9c3",
    "github_profile": "bcc9987f-7bc7-4af5-8c8e-7046be487e64",
    "sec_filings": "ce6cc606-837a-44d0-94e7-977d8b470113",
    "whois_lookup": "a9bf3a53-d58f-4a09-9a29-d5663a33937b",
    "osint_scraper": "2a607934-bf3e-43d9-aaa2-30f2b419eaf5",
    "yc_company": "cc661bb6-e3f3-4a48-9012-08b252a0a0bb",
    "ancestry_records": "b985c331-4515-4702-bc64-be75f429aee3",
}

Running Skills

backend/agents/cloud_skills.py
async def run_skill(
    self,
    skill_name: str,
    task: str,
    *,
    max_steps: int = 5,
    timeout: float = 60.0,
    secrets: dict[str, str] | None = None,
    allowed_domains: list[str] | None = None,
) -> dict | None:
    """Resolve a skill by name, run the task with it, and await the outcome.

    Args:
        skill_name: Key looked up in ``SKILL_IDS``.
        task: Instruction text for the task.
        max_steps: Step limit passed through to the polling helper.
        timeout: Time budget in seconds passed through to the polling helper.
        secrets: Optional secrets passed through unchanged.
        allowed_domains: Optional domain list passed through unchanged.

    Returns:
        Whatever ``_create_and_poll`` returns, or None (after logging an
        error) when ``skill_name`` has no registered ID.
    """
    skill_id = SKILL_IDS.get(skill_name)
    if skill_id:
        poll_options = {
            "skill_ids": [skill_id],
            "max_steps": max_steps,
            "timeout": timeout,
            "secrets": secrets,
            "allowed_domains": allowed_domains,
            "label": skill_name,
        }
        return await self._create_and_poll(task, **poll_options)

    logger.error("cloud_skills: unknown skill '{}'", skill_name)
    return None

Polling Tasks

backend/agents/cloud_skills.py
POLL_INTERVAL_SECONDS = 2.0  # delay between task status checks

async def _create_and_poll(
    self,
    task: str,
    *,
    skill_ids: list[str] | None = None,
    max_steps: int = 10,
    timeout: float = 60.0,
    secrets: dict[str, str] | None = None,
    allowed_domains: list[str] | None = None,
    label: str = "task",
) -> dict | None:
    """Create a cloud task, then poll its status until it ends or times out.

    Args:
        task: Instruction text for the task.
        skill_ids: Skill IDs to attach; omitted from the request when empty.
        max_steps: Step limit sent with the task.
        timeout: Polling budget in seconds; the number of status checks is
            ``int(timeout / POLL_INTERVAL_SECONDS)``.
        secrets: Secrets sent with the task; omitted when empty.
        allowed_domains: Domain restriction sent with the task; omitted
            when empty.
        label: Name used in log lines and echoed back in the result.

    Returns:
        None when no SDK client is available or any exception is raised.
        Otherwise a dict with ``output``, ``success``, ``cost``, ``task_id``,
        ``live_url`` and ``label``. When the polling budget runs out the dict
        has ``success`` False and additionally ``timed_out`` True.
    """

    client = self._get_client()
    if client is None:
        return None

    try:
        create_kwargs: dict = {
            "task": task,
            "llm": "browser-use-2.0",
            "max_steps": max_steps,
        }
        if skill_ids:
            create_kwargs["skill_ids"] = skill_ids
        if secrets:
            create_kwargs["secrets"] = secrets
        if allowed_domains:
            # This argument used to be accepted but never sent, so a caller's
            # domain restriction was silently ignored.
            create_kwargs["allowed_domains"] = allowed_domains

        result = await client.tasks.create_task(**create_kwargs)
        task_id = result.id
        live_url = getattr(result, "live_url", None)
        max_polls = int(timeout / POLL_INTERVAL_SECONDS)

        for _ in range(max_polls):
            await asyncio.sleep(POLL_INTERVAL_SECONDS)
            status = await client.tasks.get_task_status(task_id)

            if status.status in ("finished", "stopped"):
                output = {
                    "output": status.output or "",
                    "success": bool(status.is_success),
                    "cost": str(status.cost) if status.cost else "unknown",
                    "task_id": task_id,
                    "live_url": live_url,
                    "label": label,
                }
                logger.info(
                    "cloud_skills: completed label={} success={} cost={}",
                    label, output["success"], output["cost"],
                )
                return output

        logger.warning("cloud_skills: timed out label={} after {:.0f}s", label, timeout)
        # Same keys as the completed result, so callers can read cost,
        # live_url and label without a KeyError.
        return {
            "output": "",
            "success": False,
            "cost": "unknown",
            "task_id": task_id,
            "live_url": live_url,
            "label": label,
            "timed_out": True,
        }

    except Exception as exc:
        # Best-effort boundary: every failure is logged and reported as None.
        logger.error("cloud_skills: error label={}: {}", label, str(exc))
        return None

Browser Use SDK Integration

BaseBrowserAgent

Agents that need custom browser automation use the SDK:
backend/agents/browser_agent.py
from browser_use import Agent, Browser, ChatBrowserUse
from langchain_openai import ChatOpenAI

class BaseBrowserAgent(ABC):
    def _build_llm(self):
        """Build the LLM instance for browser-use agents.

        Prefers ChatBrowserUse (optimized for browser automation, 3-5x faster).
        Falls back to ChatOpenAI with gpt-4o-mini if OPENAI_API_KEY is set.

        Raises:
            RuntimeError: when no Browser Use key is set (or ChatBrowserUse
                could not be built) and no OpenAI key is set either.
        """
        if self._settings.browser_use_api_key:
            try:
                from browser_use import ChatBrowserUse
                logger.debug("agent={} using ChatBrowserUse bu-2-0", self.agent_name)
                return ChatBrowserUse(model="bu-2-0")
            except Exception as exc:
                # Any failure here falls through to the OpenAI path; the
                # reason is logged so the fallback can be diagnosed.
                logger.debug(
                    "agent={} ChatBrowserUse unavailable ({}), trying ChatOpenAI",
                    self.agent_name, exc,
                )

        if self._settings.openai_api_key:
            from langchain_openai import ChatOpenAI
            logger.debug("agent={} using ChatOpenAI gpt-4o-mini", self.agent_name)
            return ChatOpenAI(
                model="gpt-4o-mini",
                api_key=self._settings.openai_api_key,
            )

        raise RuntimeError("No LLM configured: set BROWSER_USE_API_KEY or OPENAI_API_KEY")

Creating Agents

backend/agents/browser_agent.py
def _create_browser_agent(
    self, task: str, *, max_steps: int = 10, needs_login: bool = False,
):
    """Assemble a browser-use Agent configured for fast, focused runs.

    Speed-oriented settings passed to the Agent:
      - flash_mode=True for faster LLM responses
      - max_failures=2 to fail fast instead of retrying 5x
      - enable_planning=False to skip planning overhead on focused tasks
      - step_timeout=60 to kill stuck navigation steps quickly
      - max_actions_per_step=3 to keep each step small
      - use_vision="auto" to only send screenshots when needed

    When a Browser Use API key is configured, a cloud Browser is attached;
    if that setup raises, a warning is logged and the Agent is built
    without it.

    NOTE(review): ``max_steps`` and ``needs_login`` are accepted but not
    read anywhere in this function — confirm whether they should be applied.
    """
    from browser_use import Agent

    options: dict = {
        "task": task,
        "llm": self._build_llm(),
        "max_failures": 2,
        "flash_mode": True,
        "enable_planning": False,
        "step_timeout": 60,
        "max_actions_per_step": 3,
        "use_vision": "auto",
    }

    use_cloud_browser = bool(self._settings.browser_use_api_key)
    if use_cloud_browser:
        try:
            from browser_use import Browser
            options["browser"] = Browser(use_cloud=True)
            logger.debug("agent={} using Browser Use cloud", self.agent_name)
        except Exception as exc:
            logger.warning("agent={} cloud browser setup failed: {}", self.agent_name, str(exc))

    return Agent(**options)

Usage Examples

Using Cloud Skills

from agents.cloud_skills import CloudSkillRunner
from config import Settings

# Build the runner from application settings.
settings = Settings()
runner = CloudSkillRunner(settings)

# Run a marketplace skill
# (top-level `await`: run this inside an async function or an async REPL)
result = await runner.run_skill(
    "tiktok_profile",
    "Get TikTok profile info for Elon Musk",
    timeout=60.0,
)

# run_skill returns None for an unknown skill name, so check the result
# before indexing into it.
if result and result["success"]:
    print(f"Output: {result['output']}")
    print(f"Cost: {result['cost']}")
    print(f"Live URL: {result['live_url']}")

Using SDK Agents

from agents.browser_agent import BaseBrowserAgent
from agents.models import AgentResult, ResearchRequest

class CustomAgent(BaseBrowserAgent):
    """Example agent that runs one browser search for a research request."""

    agent_name = "custom"

    async def _run_task(self, request: ResearchRequest) -> AgentResult:
        """Search for the request's query and wrap the agent's final result.

        Returns:
            An AgentResult whose snippets hold the final result as text, or
            no snippets when the agent produced no final result.
        """
        query = self._build_search_query(request)
        task = f"Search for {query} and extract information"

        agent = self._create_browser_agent(task, max_steps=5)
        result = await agent.run()
        final_result = result.final_result() if result else None

        # Without this guard a missing result would be reported as the
        # literal snippet text "None".
        snippets = [str(final_result)] if final_result is not None else []

        return AgentResult(
            agent_name=self.agent_name,
            # NOTE(review): status is SUCCESS even when nothing was
            # extracted — confirm that is intended. AgentStatus must also be
            # imported; the example's import line does not list it.
            status=AgentStatus.SUCCESS,
            snippets=snippets,
        )

Deep URL Extraction

from agents.cloud_skills import CloudSkillRunner

# `settings` is not defined in this snippet — presumably a Settings()
# instance as in the cloud-skills example; create one first.
runner = CloudSkillRunner(settings)

# NOTE(review): deep_extract_url is not shown on this page; the call below
# assumes it returns the same result dict shape as run_skill — confirm.
# (top-level `await`: run this inside an async function or an async REPL)
result = await runner.deep_extract_url(
    url="https://example.com/profile",
    person_name="John Doe",
    timeout=60.0,
)

# Guard against a None result before reading keys from it.
if result and result["success"]:
    print(f"Extracted: {result['output']}")

Environment Variables

.env
# Browser Use Cloud API key
BROWSER_USE_API_KEY=bu_...

# Optional: Browser Use Profile ID for synced cookies
BROWSER_USE_PROFILE_ID=profile_...

# Optional: 1Password Vault ID for cross-session auth
OP_VAULT_ID=vault_...

# Fallback: OpenAI API key for local browser agents
OPENAI_API_KEY=sk-...

Troubleshooting

API Key Not Working

# Verify API key is set
from config import Settings
settings = Settings()
# Prints only True/False, never the key itself.
print(f"Browser Use configured: {bool(settings.browser_use_api_key)}")

Task Timing Out

# Increase timeout and max_steps
# (`runner` is a CloudSkillRunner instance; top-level `await` needs an async
# function or an async REPL)
result = await runner.run_skill(
    "tiktok_profile",
    "Get TikTok profile",
    max_steps=10,  # Default is 5
    timeout=120.0,  # Default is 60s
)

Session Not Loading

# Check session status
# NOTE(review): get_session is not among the BrowserUseClient methods shown
# on this page — confirm it exists. `session_id` must come from an earlier
# create_session call.
client = BrowserUseClient(settings)
session = await client.get_session(session_id)
print(f"Session status: {session.get('status')}")
print(f"Live URL: {session.get('liveUrl')}")

Cloud SDK Import Error

# Install browser-use-sdk
pip install browser-use-sdk

Performance Tips

  1. Use Skills for Known Platforms: Marketplace skills are faster than freeform tasks
  2. Limit max_steps: Default to 5 for focused tasks, use 10+ only for complex workflows
  3. Set Tight Timeouts: Use 30-60s for simple tasks, 120s+ for complex multi-step workflows
  4. Parallel Execution: Use run_parallel() to run multiple skills concurrently
  5. Semaphore Limiting: DeepResearcher limits to 25 concurrent sessions to avoid rate limits

Next Steps

LinkedIn Agent

See Browser Use in action for LinkedIn scraping

Deep Researcher

Multi-phase pipeline using Cloud Skills

Build docs developers (and LLMs) love