Skip to main content

Overview

The Instagram agent extracts profile information and posts from Instagram. It uses a Browser Use Cloud SDK skill as its primary method and falls back to a browser-use Agent that performs Google-first scraping.

Implementation

backend/agents/instagram_agent.py
class InstagramAgent(BaseBrowserAgent):
    """Scrapes Instagram profiles via Cloud SDK skill, falls back to browser-use.

    Primary: Cloud SDK Instagram Profile Posts skill (deterministic, $0.01/run)
    Fallback: browser-use Agent + Google snippet extraction
    """

    # Identifier stamped onto every AgentResult produced by this agent.
    agent_name = "instagram"

    def __init__(self, settings: Settings, *, inbox_pool=None):
        # BaseBrowserAgent owns browser-use setup; inbox_pool is forwarded
        # unchanged (its semantics are defined by the base class).
        super().__init__(settings, inbox_pool=inbox_pool)
        # Runner for the primary (marketplace skill) extraction path.
        self._cloud = CloudSkillRunner(settings)

Architecture Decision

backend/agents/instagram_agent.py
# RESEARCH: Checked instaloader (8k stars), instagram-private-api (archived), instagrapi (5k stars)
# DECISION: Browser Use Cloud SDK skill (Instagram Profile Posts, $0.01/run, ~23s, deterministic)
# FALLBACK: browser-use Agent with Google-first scraping
# ALT: instagrapi for heavier scraping needs (risk of account bans)
Why Cloud SDK?
  • Marketplace skill handles Instagram’s anti-bot measures
  • Deterministic results (~$0.01/run, 20-30s)
  • No account bans or IP blocks
  • Structured output parsing
Why not instagrapi/instaloader?
  • Risk of account bans
  • Requires managing Instagram sessions
  • More complex setup and maintenance

Two-Tier Approach

1
Try Cloud SDK Skill First
2
Uses the instagram_posts marketplace skill for reliable extraction:
3
async def _run_task(self, request: ResearchRequest) -> AgentResult:
    """Run the research task: Cloud SDK skill first, browser-use second.

    The cloud result is only trusted when it both succeeded and produced
    at least one profile; anything less falls through to the fallback.
    """
    if self._cloud.configured:
        via_cloud = await self._try_cloud_skill(request)
        if via_cloud and via_cloud.status == AgentStatus.SUCCESS and via_cloud.profiles:
            return via_cloud
    # Cloud path unavailable or came back empty -- scrape Google snippets.
    return await self._try_browser_use(request)
4
Cloud SDK Implementation
5
async def _try_cloud_skill(self, request: ResearchRequest) -> AgentResult | None:
    """Attempt extraction via the Instagram Profile Posts marketplace skill.

    Returns None (so the caller can fall back) when the skill reports
    failure, returns nothing, or raises.
    """
    query = self._build_search_query(request)
    task = (
        f"Search for Instagram profile of {query} and extract profile info "
        f"including username, bio, followers, following, and post count."
    )

    try:
        skill_response = await self._cloud.run_skill(
            "instagram_posts",
            task,
            timeout=60.0,
        )

        # A missing or unsuccessful response is not an error -- just a
        # signal that the fallback path should run.
        if not skill_response or not skill_response.get("success"):
            logger.info("instagram cloud skill returned no result, falling back")
            return None

        parsed = _parse_instagram_output(
            skill_response.get("output", ""), request.person_name
        )
        profile = parsed["profile"]
        return AgentResult(
            agent_name=self.agent_name,
            status=AgentStatus.SUCCESS,
            profiles=[profile],
            snippets=parsed["snippets"],
            urls_found=[profile.url] if profile.url else [],
        )

    except Exception as exc:
        logger.warning("instagram cloud skill error: {}", str(exc))
        return None
6
Browser-Use Fallback
7
If Cloud SDK fails, falls back to Google-first scraping:
8
async def _try_browser_use(self, request: ResearchRequest) -> AgentResult:
    """Fallback path: scrape Google result snippets with a browser-use Agent.

    Never clicks through to instagram.com; the agent is instructed to
    extract only from what Google surfaces in its results page.
    """
    if not self.configured:
        return AgentResult(
            agent_name=self.agent_name,
            status=AgentStatus.FAILED,
            error="Browser Use not configured (BROWSER_USE_API_KEY or OPENAI_API_KEY missing)",
        )

    query = self._build_search_query(request)
    logger.info("instagram agent (fallback) searching: {}", query)

    try:
        task = (
            "Go to https://www.google.com/search"
            f"?q={query.replace(' ', '+')}+site:instagram.com "
            f"and use the extract tool to get this JSON from the Google results:\n"
            f'{{"username": "", "display_name": "", "bio": "", '
            f'"followers": 0, "following": 0, "post_count": 0, '
            f'"profile_url": ""}}\n'
            f"Extract from Google snippets. Do NOT click into Instagram. Do NOT scroll. "
            f"After extracting, immediately call done with the JSON result."
        )

        browser_agent = self._create_browser_agent(task, max_steps=3)
        run_outcome = await browser_agent.run()
        payload = run_outcome.final_result() if run_outcome else None

        if not payload:
            # Agent finished without producing output: report "nothing
            # found" as a successful (empty) result, not a failure.
            return AgentResult(
                agent_name=self.agent_name,
                status=AgentStatus.SUCCESS,
                snippets=["No Instagram profile found"],
            )

        parsed = _parse_instagram_output(str(payload), request.person_name)
        profile = parsed["profile"]
        return AgentResult(
            agent_name=self.agent_name,
            status=AgentStatus.SUCCESS,
            profiles=[profile],
            snippets=parsed["snippets"],
            urls_found=[profile.url] if profile.url else [],
        )

    except Exception as exc:
        logger.error("instagram agent error: {}", str(exc))
        return AgentResult(
            agent_name=self.agent_name,
            status=AgentStatus.FAILED,
            error=f"Instagram agent error: {exc}",
        )

Output Parsing

JSON Extraction

backend/agents/instagram_agent.py
def _extract_json(raw: str) -> dict:
    """Robustly extract JSON from browser-use output."""
    cleaned = raw.strip()
    if "```json" in cleaned:
        cleaned = cleaned.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in cleaned:
        cleaned = cleaned.split("```", 1)[1].split("```", 1)[0]
    cleaned = cleaned.strip()
    
    for text in [cleaned, raw]:
        try:
            return json.loads(text)
        except (json.JSONDecodeError, ValueError):
            pass
        start = text.find("{")
        end = text.rfind("}") + 1
        if start >= 0 and end > start:
            try:
                return json.loads(text[start:end])
            except (json.JSONDecodeError, ValueError):
                pass
    return {}

Profile Parsing

backend/agents/instagram_agent.py
def _parse_instagram_output(raw_output: str, person_name: str) -> dict:
    """Parse browser-use or Cloud SDK output into structured Instagram profile data.

    Args:
        raw_output: Raw scraper text; may wrap JSON in markdown fences or prose.
        person_name: Fallback display name when the output lacks one.

    Returns:
        Dict with keys ``profile`` (SocialProfile) and ``snippets`` (list of
        non-empty human-readable summary strings; falls back to a truncated
        copy of ``raw_output`` when nothing structured was parsed).
    """
    data = _extract_json(raw_output)

    username = data.get("username", "")
    display_name = data.get("display_name", person_name)
    bio = data.get("bio", "")
    followers = parse_human_number(data.get("followers"))
    following = parse_human_number(data.get("following"))
    post_count = parse_human_number(data.get("post_count"))
    is_verified = data.get("is_verified", False)
    is_private = data.get("is_private", False)
    recent_posts = data.get("recent_posts", [])
    profile_url = data.get("profile_url", "")

    # Fields without a dedicated SocialProfile column ride along in raw_data.
    raw_data = {
        "post_count": post_count,
        "is_private": is_private,
        "recent_posts": recent_posts,
        "browser_use_output": raw_output,
    }

    profile = SocialProfile(
        platform="instagram",
        url=profile_url if profile_url else f"https://instagram.com/{username}" if username else "",
        username=username or None,
        display_name=display_name,
        bio=bio or None,
        followers=followers,
        following=following,
        verified=bool(is_verified),
        raw_data=raw_data,
    )

    snippets: list[str] = []
    if bio:
        snippets.append(
            f"@{username}: {bio}" if username else f"Instagram: {bio}"
        )
    # BUG FIX: previously a non-None, non-int followers value appended an
    # empty string, which made the "no snippets" fallback below never fire
    # (the empty string was only filtered out afterwards). Only append a
    # snippet when the count can actually be formatted.
    if isinstance(followers, int):
        snippets.append(f"Followers: {followers:,}")
    if post_count is not None:
        snippets.append(f"Posts: {post_count}")
    if is_private:
        snippets.append("Account is private")
    for post in recent_posts[:3]:
        caption = post.get("caption", "")
        if caption:
            snippets.append(f"Post: {caption[:150]}")

    # Filter empties BEFORE the fallback check so the fallback fires
    # whenever nothing meaningful was parsed.
    snippets = [s for s in snippets if s]
    if not snippets and raw_output:
        snippets.append(raw_output[:500])

    return {"profile": profile, "snippets": snippets}

Extracted Data

The Instagram agent extracts:
  • Profile Info: Username, display name, bio
  • Metrics: Followers, following, post count
  • Verification: Blue checkmark status
  • Privacy: Whether account is private
  • Recent Posts: Latest post captions and media
  • Profile URL: Direct link to Instagram profile

Usage Example

from agents.instagram_agent import InstagramAgent
from agents.models import ResearchRequest, AgentStatus
from config import Settings

settings = Settings()
agent = InstagramAgent(settings)

request = ResearchRequest(
    person_name="Cristiano Ronaldo",
    timeout_seconds=60.0,  # per-run budget; the skill path typically takes 20-30s
)

# NOTE: top-level `await` requires an async context (e.g. asyncio.run(main())).
result = await agent.run(request)

if result.status == AgentStatus.SUCCESS:
    for profile in result.profiles:
        print(f"Found: @{profile.username} ({profile.display_name})")
        print(f"  URL: {profile.url}")
        print(f"  Bio: {profile.bio}")
        # NOTE(review): followers/following may be None when parsing failed;
        # the {:,} format would then raise -- this example assumes they parsed.
        print(f"  Followers: {profile.followers:,}")
        print(f"  Following: {profile.following:,}")
        print(f"  Posts: {profile.raw_data.get('post_count')}")
        print(f"  Verified: {profile.verified}")
        print(f"  Private: {profile.raw_data.get('is_private')}")
    
    print("\nRecent Posts:")
    for snippet in result.snippets:
        if snippet.startswith("Post:"):
            print(f"  - {snippet}")

Performance

Cloud SDK Path

  • Duration: 20-30s typical
  • Cost: $0.01 per run
  • Success Rate: ~80%
  • Data Quality: High (direct Instagram extraction)

Browser-Use Fallback

  • Duration: 5-10s (Google snippets only)
  • Cost: Browser Use API usage
  • Success Rate: ~50%
  • Data Quality: Medium (summary from Google knowledge panel)

Handling Private Accounts

# Private accounts return limited data; the flag lives in raw_data.
if result.status == AgentStatus.SUCCESS:
    for profile in result.profiles:
        if profile.raw_data.get("is_private"):
            print(f"Account @{profile.username} is private")
            print("Available data: username, bio, follower count only")
            print("Recent posts: Not accessible")

Troubleshooting

Cloud Skill Failing

# Check if Cloud SDK is configured.
# NOTE(review): `configured` presumably reflects cloud API credentials in
# Settings -- confirm against CloudSkillRunner.
from agents.cloud_skills import CloudSkillRunner
from config import Settings

settings = Settings()
runner = CloudSkillRunner(settings)
print(f"Cloud configured: {runner.configured}")

Empty Results

# The agent returns SUCCESS with empty snippets if no profile found --
# an empty result is not treated as a failure.
if result.status == AgentStatus.SUCCESS and not result.profiles:
    print("No Instagram profile found for this person")
    # Possible reasons:
    # 1. Person doesn't have Instagram
    # 2. Account was deleted/banned
    # 3. Username doesn't match search query

Timeout Issues

# Instagram skill can take 30-60s, increase timeout if needed
# The Instagram skill can take 30-60s; raise the request timeout if runs
# are being cut off.
request = ResearchRequest(
    person_name="John Doe",
    timeout_seconds=90.0,  # Default is 60s
)

Post Count Parsing

# Handles human-readable count formats as emitted by Instagram/Google UIs.
from agents.models import parse_human_number

post_count = parse_human_number("1.2K")  # Returns 1200
post_count = parse_human_number("500")   # Returns 500
post_count = parse_human_number("5M+")   # Returns 5000000

Private vs Public Profiles

# Check if profile is private; privacy is stored in raw_data, not as a
# top-level SocialProfile field.
for profile in result.profiles:
    is_private = profile.raw_data.get("is_private", False)
    if is_private:
        print("Limited data available (account is private)")
    else:
        print("Full profile data available")
        recent_posts = profile.raw_data.get("recent_posts", [])
        print(f"Found {len(recent_posts)} recent posts")

Advanced Usage

Extracting Recent Posts

# Access recent posts from raw_data.
# NOTE(review): post dict keys (caption/likes/comments/timestamp) depend on
# the Cloud SDK skill's output shape -- confirm against actual skill runs.
for profile in result.profiles:
    recent_posts = profile.raw_data.get("recent_posts", [])
    for post in recent_posts:
        print(f"Caption: {post.get('caption', 'No caption')}")
        print(f"Likes: {post.get('likes', 0)}")
        print(f"Comments: {post.get('comments', 0)}")
        print(f"Posted: {post.get('timestamp', 'Unknown')}")
        print()

Checking Verification Status

# Verified accounts have the blue checkmark; `verified` is a plain bool
# on the profile model.
for profile in result.profiles:
    if profile.verified:
        print(f"@{profile.username} is verified")
    else:
        print(f"@{profile.username} is not verified")

Follower Growth Analysis

# Store follower counts over time for growth tracking.
import json
from datetime import datetime

result = await agent.run(request)
for profile in result.profiles:
    data_point = {
        # NOTE(review): datetime.now() is naive local time; prefer
        # datetime.now(timezone.utc) for comparable time-series data.
        "date": datetime.now().isoformat(),
        "username": profile.username,
        "followers": profile.followers,
        "following": profile.following,
        "posts": profile.raw_data.get("post_count"),
    }
    # Save to database or file for time-series analysis
    print(json.dumps(data_point, indent=2))

Next Steps

Twitter Agent

Twitter/X profile scraping with twscrape

Google Agent

Google search-based intelligence gathering

Browser Use Integration

Deep dive into Cloud SDK skills

Agent Overview

Full agent system architecture

Build docs developers (and LLMs) love