What Are Browser Agents?
Browser agents combine LLMs with browser automation to navigate websites, fill forms, extract data, and complete tasks autonomously. They can "see" web pages and decide what actions to take.
- Visual understanding: Analyze page layouts and content
- Navigation: Click, type, scroll intelligently
- Data extraction: Scrape information with AI
- Task automation: Complete multi-step workflows
Playwright Basics
from playwright.sync_api import sync_playwright
# Minimal synchronous Playwright session: open a page, submit a search,
# then print the text of every result item.
with sync_playwright() as p:
    # headless=False opens a visible browser window.
    browser = p.chromium.launch(headless=False)
    try:
        page = browser.new_page()
        # Navigate
        page.goto("https://example.com")
        # Interact
        page.fill("input[name='search']", "AI agents")
        page.click("button[type='submit']")
        # Wait and extract
        page.wait_for_selector(".results")
        results = page.query_selector_all(".result-item")
        for result in results:
            print(result.text_content())
    finally:
        # Close the browser even if navigation or a selector step raised.
        browser.close()
AI-Powered Browser Agent
from playwright.async_api import async_playwright
from openai import OpenAI
import base64
import asyncio
class BrowserAgent:
    """Vision-driven browser agent.

    Each step takes a screenshot of the current page, asks an OpenAI chat
    model which action to perform next, and executes that action with
    Playwright until the model answers ``done`` or the step budget runs out.
    """

    def __init__(self, model: str = "gpt-4-vision-preview"):
        """Create the OpenAI client.

        Args:
            model: Name of the chat model sent with every ``analyze_page``
                request. NOTE(review): the default is a preview model name
                and may no longer be served; confirm, or pass a current
                vision-capable model.
        """
        self.client = OpenAI()
        self.model = model

    async def screenshot_to_base64(self, page) -> str:
        """Return a screenshot of ``page`` as a base64-encoded string."""
        screenshot = await page.screenshot()
        return base64.b64encode(screenshot).decode()

    @staticmethod
    def _parse_action(raw) -> dict:
        """Parse the model's reply into the action dictionary.

        Models often wrap JSON in Markdown code fences or surround it with
        prose. When the reply is not valid JSON as a whole, the text between
        the first ``{`` and the last ``}`` is parsed instead.

        Raises:
            json.JSONDecodeError: If no parseable JSON object is found.
        """
        import json

        text = (raw or "").strip()
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(text[start:end + 1])

    async def analyze_page(self, page, task: str) -> dict:
        """Use vision to understand the page and decide next action.

        Args:
            page: Playwright page to screenshot; its ``url`` is included in
                the prompt.
            task: Natural-language description of the overall goal.

        Returns:
            The parsed JSON reply. The prompt asks for the keys ``action``,
            ``selector``, ``value`` and ``reasoning``, but the model is not
            guaranteed to supply all of them.
        """
        screenshot = await self.screenshot_to_base64(page)
        prompt = (
            "\nYou are a browser automation agent. Analyze this webpage screenshot.\n"
            f"Task: {task}\n"
            f"Current URL: {page.url}\n"
            "Decide the next action. Respond with JSON:\n"
            "{\n"
            '"action": "click|type|scroll|navigate|done",\n'
            '"selector": "CSS selector if clicking/typing",\n'
            '"value": "text to type or URL to navigate",\n'
            '"reasoning": "why this action"\n'
            "}"
        )
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{screenshot}"},
                },
            ],
        }]
        # self.client is the synchronous OpenAI client; run the request in
        # the default executor so the event loop is not blocked while waiting.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=500,
            ),
        )
        return self._parse_action(response.choices[0].message.content)

    async def execute_action(self, page, action: dict):
        """Execute the decided action.

        ``click`` and ``type`` read ``action["selector"]``; ``type`` and
        ``navigate`` read ``action["value"]``; ``scroll`` always scrolls down
        by 500 pixels. Any other action name (including ``done``) is a no-op.

        Raises:
            KeyError: If ``action`` lacks a key the chosen action requires.
        """
        kind = action["action"]
        if kind == "click":
            await page.click(action["selector"])
        elif kind == "type":
            await page.fill(action["selector"], action["value"])
        elif kind == "scroll":
            await page.evaluate("window.scrollBy(0, 500)")
        elif kind == "navigate":
            await page.goto(action["value"])

    async def run(self, task: str, start_url: str, max_steps: int = 10):
        """Open ``start_url`` and loop analyze -> act until done.

        Args:
            task: Goal passed to the model on every step.
            start_url: First page to load.
            max_steps: Upper bound on analyze/act iterations.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                page = await browser.new_page()
                await page.goto(start_url)
                for step in range(max_steps):
                    action = await self.analyze_page(page, task)
                    # "reasoning" is only displayed, so a reply without it
                    # should not abort the run.
                    print(f"Step {step + 1}: {action['action']} - {action.get('reasoning', '')}")
                    if action["action"] == "done":
                        print("Task completed!")
                        break
                    await self.execute_action(page, action)
                    await page.wait_for_load_state("networkidle")
            finally:
                # Close the browser even when a step raised.
                await browser.close()
# Usage: BrowserAgent.run is a coroutine, so a synchronous script drives it
# through asyncio.run.
agent = BrowserAgent()
asyncio.run(
    agent.run(
        start_url="https://example.com",
        task="Find the pricing page and extract the price of the Pro plan",
    )
)
Using Browser-Use Library
# browser-use: Purpose-built for LLM browser automation
# Install it from a shell first (this is not Python): pip install browser-use
from browser_use import Agent
from langchain_openai import ChatOpenAI
import asyncio


async def run_browser_use_agent():
    """Build the browser-use agent, run its task, and print the result.

    ``agent.run()`` must be awaited, which is only legal inside a coroutine,
    so the snippet lives in this function and is driven by ``asyncio.run``.
    """
    # Create agent
    agent = Agent(
        task="Go to amazon.com and find the best-rated laptop under $1000",
        llm=ChatOpenAI(model="gpt-4"),
        # NOTE(review): confirm that the installed browser-use version
        # accepts a `browser_config` dict in this form.
        browser_config={
            "headless": False,
            # NOTE(review): judging by its name this turns off browser
            # security protections; confirm it is wanted outside a demo.
            "disable_security": True,
        },
    )
    # Run
    result = await agent.run()
    print(result)


asyncio.run(run_browser_use_agent())
Web Scraping with AI
from playwright.async_api import async_playwright
from openai import OpenAI
async def ai_scrape(
    url: str,
    extraction_prompt: str,
    *,
    model: str = "gpt-4-vision-preview",
    max_chars: int = 5000,
) -> dict:
    """Use AI to intelligently extract data from any webpage.

    Loads ``url`` in headless Chromium, captures the page's visible text and
    a screenshot, and asks an OpenAI chat model to return the requested
    fields as JSON.

    Args:
        url: Page to load.
        extraction_prompt: Description of what to extract.
        model: Chat model name. NOTE(review): the default is a preview model
            name and may no longer be served; confirm, or pass a current
            vision-capable model.
        max_chars: Maximum number of characters of page text included in
            the prompt.

    Returns:
        The model's reply parsed as JSON.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
    """
    # Imported locally so the function works regardless of which modules the
    # surrounding file imports at module level.
    import asyncio
    import base64
    import json

    client = OpenAI()
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            # Get page content
            text = await page.evaluate("document.body.innerText")
            # Take screenshot for visual context
            screenshot = await page.screenshot()
        finally:
            # Close the browser even if loading or capturing the page failed.
            await browser.close()
    screenshot_b64 = base64.b64encode(screenshot).decode()

    prompt = (
        "\nExtract the following from this webpage:\n"
        f"{extraction_prompt}\n"
        "Page text:\n"
        f"{text[:max_chars]}\n"
        "Return as JSON."
    )
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
            },
        ],
    }]
    # Use LLM to extract. The client is synchronous, so the request runs in
    # the default executor to keep the event loop free.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        lambda: client.chat.completions.create(model=model, messages=messages),
    )

    raw = (response.choices[0].message.content or "").strip()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Replies are often wrapped in Markdown fences or prose; retry on the
        # outermost {...} span before giving up.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(raw[start:end + 1])
# Usage
# ai_scrape is a coroutine function; `await` is not valid at the top level of
# a script, so drive it with asyncio.run.
data = asyncio.run(
    ai_scrape(
        "https://news.site.com/article",
        "Extract: title, author, publish date, main points (as list)",
    )
)
Common Use Cases
Data Collection
Scrape product info, prices, reviews from any site.
Form Automation
Fill complex forms, applications, registrations.
Testing
AI-powered end-to-end testing that adapts to UI changes.
Research
Autonomous web research and information gathering.
Best Practices
- Respect robots.txt: Follow website rules
- Rate limiting: Don't overwhelm servers
- Error handling: Sites change — handle failures gracefully
- Human verification: Keep a human in the loop for important actions
- Headless vs headed: Use headed mode for debugging
Build Intelligent Web Agents
Our Agentic AI program covers browser automation and web agents.
Explore Agentic AI Program