Browser Agents: AI-Powered Web Automation

What Are Browser Agents?

Browser agents combine LLMs with browser automation to navigate websites, fill forms, extract data, and complete tasks autonomously. They can "see" web pages and decide what actions to take.

Visual understanding: Analyze page layouts and content
Navigation: Click, type, scroll intelligently
Data extraction: Scrape information with AI
Task automation: Complete multi-step workflows

Playwright Basics

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    # Start Chromium with a visible window (headless=False) and open a tab.
    chromium = playwright.chromium.launch(headless=False)
    tab = chromium.new_page()

    # Load the target site.
    tab.goto("https://example.com")

    # Enter the query into the search input, then submit the form.
    tab.fill("input[name='search']", "AI agents")
    tab.click("button[type='submit']")

    # Wait for the results container, then print the text of every result item.
    tab.wait_for_selector(".results")
    for item in tab.query_selector_all(".result-item"):
        print(item.text_content())

    chromium.close()

AI-Powered Browser Agent

from playwright.async_api import async_playwright
from openai import OpenAI
import base64
import asyncio

class BrowserAgent:
    """Screenshot-driven browser agent.

    Each step takes a screenshot of the current page, asks a vision-capable
    chat model which action to perform next, and executes that action through
    the page object.

    Args:
        model: Name of the chat-completions model used by ``analyze_page``.
            Defaults to the model the original implementation hard-coded.
    """

    def __init__(self, model: str = "gpt-4-vision-preview"):
        self.client = OpenAI()
        self.model = model

    async def screenshot_to_base64(self, page) -> str:
        """Return a screenshot of ``page`` as a base64-encoded ASCII string."""
        screenshot = await page.screenshot()
        return base64.b64encode(screenshot).decode()

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present."""
        cleaned = text.strip()
        if not cleaned.startswith("```"):
            return cleaned
        # Drop the opening fence line, including any language tag after it.
        newline_at = cleaned.find("\n")
        cleaned = cleaned[newline_at + 1:] if newline_at != -1 else cleaned[3:]
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    async def analyze_page(self, page, task: str) -> dict:
        """Use vision to understand the page and decide the next action.

        Args:
            page: Page object exposing an awaitable ``screenshot()`` and a
                ``url`` attribute.
            task: Natural-language description of the goal.

        Returns:
            The model's reply parsed from JSON. The prompt asks for the keys
            ``action``, ``selector``, ``value`` and ``reasoning``; the reply
            is not validated against that shape.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON once an
                optional surrounding code fence has been removed.
        """
        import json

        screenshot = await self.screenshot_to_base64(page)

        # NOTE(review): this is a synchronous client call inside a coroutine,
        # so the event loop is blocked until the model replies.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": f"""
You are a browser automation agent. Analyze this webpage screenshot.

Task: {task}
Current URL: {page.url}

Decide the next action. Respond with JSON:
{{
    "action": "click|type|scroll|navigate|done",
    "selector": "CSS selector if clicking/typing",
    "value": "text to type or URL to navigate",
    "reasoning": "why this action"
}}"""},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{screenshot}"}
                    }
                ]
            }],
            max_tokens=500
        )

        # The reply may be empty or wrapped in a Markdown code fence; normalise
        # it before parsing so a fenced-but-valid JSON answer is accepted.
        content = response.choices[0].message.content or ""
        return json.loads(self._strip_code_fence(content))

    async def execute_action(self, page, action: dict):
        """Execute the decided action on ``page``.

        Supported ``action["action"]`` values are ``click``, ``type``,
        ``scroll`` and ``navigate``; any other value (including ``done``)
        is ignored.

        Raises:
            KeyError: If a key the chosen action needs (``selector`` and/or
                ``value``) is missing from ``action``.
        """
        kind = action["action"]
        if kind == "click":
            await page.click(action["selector"])
        elif kind == "type":
            await page.fill(action["selector"], action["value"])
        elif kind == "scroll":
            # Fixed 500px downward scroll; the model does not choose the distance.
            await page.evaluate("window.scrollBy(0, 500)")
        elif kind == "navigate":
            await page.goto(action["value"])

    async def run(self, task: str, start_url: str, max_steps: int = 10):
        """Drive a headed Chromium browser until the task is done.

        Args:
            task: Natural-language description of the goal.
            start_url: URL opened before the first step.
            max_steps: Upper bound on analyze/execute iterations.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                page = await browser.new_page()
                await page.goto(start_url)

                for step in range(max_steps):
                    action = await self.analyze_page(page, task)
                    # "reasoning" is informational only; do not fail if the
                    # model left it out.
                    print(f"Step {step + 1}: {action['action']} - {action.get('reasoning', '')}")

                    if action["action"] == "done":
                        print("Task completed!")
                        break

                    await self.execute_action(page, action)
                    await page.wait_for_load_state("networkidle")
                else:
                    # Make step exhaustion visible instead of ending silently.
                    print(f"Stopped after {max_steps} steps without completing the task.")
            finally:
                # Close the browser even when a step raises.
                await browser.close()

# Usage: build the agent, then drive its coroutine to completion.
agent = BrowserAgent()
asyncio.run(
    agent.run(
        "Find the pricing page and extract the price of the Pro plan",
        "https://example.com",
    )
)

Using Browser-Use Library

# browser-use: Purpose-built for LLM browser automation
# Install it first (shell command, not Python): pip install browser-use

from browser_use import Agent
from langchain_openai import ChatOpenAI

import asyncio

# Create agent
agent = Agent(
    task="Go to amazon.com and find the best-rated laptop under $1000",
    llm=ChatOpenAI(model="gpt-4"),
    browser_config={
        "headless": False,
        # NOTE(review): presumably relaxes browser security checks; confirm
        # what this disables before pointing the agent at untrusted sites.
        "disable_security": True
    }
)

# Run
# A bare top-level `await` is a SyntaxError in a regular script, so the
# coroutine is driven with asyncio.run(). In a notebook or another already
# running event loop, use `result = await agent.run()` instead.
result = asyncio.run(agent.run())
print(result)

Web Scraping with AI

from playwright.async_api import async_playwright
from openai import OpenAI

async def ai_scrape(
    url: str,
    extraction_prompt: str,
    *,
    model: str = "gpt-4-vision-preview",
) -> dict:
    """Use AI to intelligently extract data from any webpage.

    Loads ``url`` in Chromium, captures the page's visible text and a
    screenshot, and asks a vision-capable chat model to extract what
    ``extraction_prompt`` describes.

    Args:
        url: Page to load.
        extraction_prompt: Description of the fields to extract.
        model: Chat-completions model name; defaults to the model the
            original implementation hard-coded.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON once an optional
            surrounding code fence has been removed.
    """
    # Imported locally because this snippet's own import block does not bring
    # these stdlib modules into scope.
    import base64
    import json

    client = OpenAI()

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)

            # Visible text only; the raw HTML was fetched but never used, so
            # that extra round-trip has been dropped.
            text = await page.evaluate("document.body.innerText")

            # Take screenshot for visual context
            screenshot = await page.screenshot()
        finally:
            # Close the browser even if navigation or capture fails.
            await browser.close()

    screenshot_b64 = base64.b64encode(screenshot).decode()

    # Use LLM to extract; only the first 5000 characters of page text are sent.
    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": f"""
Extract the following from this webpage:
{extraction_prompt}

Page text:
{text[:5000]}

Return as JSON."""},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}
                }
            ]
        }]
    )

    # The reply may be empty or wrapped in a Markdown code fence
    # (``` or ```json); strip the fence before parsing.
    raw = (response.choices[0].message.content or "").strip()
    if raw.startswith("```"):
        newline_at = raw.find("\n")
        raw = raw[newline_at + 1:] if newline_at != -1 else raw[3:]
        raw = raw.rstrip()
        if raw.endswith("```"):
            raw = raw[:-3]

    return json.loads(raw.strip())

# Usage
# A bare top-level `await` is a SyntaxError in a regular script, so the
# coroutine is driven with asyncio.run(). In a notebook or another already
# running event loop, use `data = await ai_scrape(...)` instead.
import asyncio

data = asyncio.run(ai_scrape(
    "https://news.site.com/article",
    "Extract: title, author, publish date, main points (as list)"
))

Common Use Cases

Data Collection

Scrape product info, prices, reviews from any site.

Form Automation

Fill complex forms, applications, registrations.

Testing

AI-powered end-to-end testing that adapts to UI changes.

Research

Autonomous web research and information gathering.

Best Practices

Respect robots.txt: Follow website rules
Rate limiting: Don't overwhelm servers
Error handling: Sites change - handle failures gracefully
Human verification: Keep a human in the loop for important actions
Headless vs headed: Use headed mode for debugging

Build Intelligent Web Agents

Our Agentic AI program covers browser automation and web agents.

Explore Agentic AI Program

Browser Agents