feat: browser use - element extraction + index-based clicking for text models
Some checks failed
CI / build-check-test (push) Has been cancelled

- tool-server.mjs: extractElements() scrapes all interactive elements with coordinates
- tool-server.mjs: formatElements() returns numbered list for LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index, key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so model can see the page
- browser-tool.ts: detailed workflow instructions in tool description
- Enables text-only models (Llama 3.3, etc.) to navigate and interact with web pages
This commit is contained in:
JAE 2026-03-27 23:17:24 +00:00
parent db79dec9e1
commit a2227c7659
2 changed files with 327 additions and 139 deletions

View file

@ -29,7 +29,7 @@ function parseBody(req) {
let browser = null; let browser = null;
let context = null; let context = null;
let page = null; let page = null;
const browserPanelClients = new Set(); // WS clients watching the browser const browserPanelClients = new Set();
async function launchBrowser() { async function launchBrowser() {
if (!browser) { if (!browser) {
@ -52,10 +52,120 @@ async function getPage() {
return page; return page;
} }
// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
/**
 * Collect the interactive elements currently visible in the page viewport.
 *
 * Runs a single `p.evaluate()` call in the page and returns a numbered list
 * that a text-only model can read and refer to by index.
 *
 * @param {object} p - Page-like object exposing `evaluate(fn)`.
 * @returns {Promise<Array<{index: number, kind: string, label: string,
 *   x: number, y: number, value?: string, placeholder?: string, href?: string}>>}
 *   One entry per element, 1-based `index`; `x`/`y` are viewport coordinates
 *   of a point inside the visible part of the element. Returns `[]` (after
 *   logging) when evaluation fails.
 */
async function extractElements(p) {
  try {
    // NOTE: the callback is executed via p.evaluate(), so it is written to be
    // self-contained and does not reference anything from the enclosing module.
    return await p.evaluate(() => {
      // Selectors for all interactive elements
      const selector = [
        'a[href]',
        'button',
        'input',
        'textarea',
        'select',
        '[role="button"]',
        '[role="link"]',
        '[role="tab"]',
        '[role="menuitem"]',
        '[onclick]',
        '[contenteditable="true"]',
        'summary',
        'details',
        'label[for]',
      ].join(',');

      const items = [];
      const seen = new Set();
      const viewWidth = window.innerWidth;
      const viewHeight = window.innerHeight;

      for (const el of document.querySelectorAll(selector)) {
        const rect = el.getBoundingClientRect();

        // Skip tiny elements and anything with no visible area in the viewport.
        if (rect.width < 5 || rect.height < 5) continue;
        if (rect.top >= viewHeight || rect.bottom <= 0) continue;
        if (rect.left >= viewWidth || rect.right <= 0) continue;

        // Resolve the computed style once per element instead of per property.
        const style = window.getComputedStyle(el);
        if (style.visibility === 'hidden' || style.display === 'none') continue;
        if (parseFloat(style.opacity) < 0.1) continue;

        const tag = el.tagName.toLowerCase();
        const type = el.getAttribute('type') || '';
        const role = el.getAttribute('role') || '';
        const href = el.getAttribute('href') || '';
        const placeholder = el.getAttribute('placeholder') || '';
        const ariaLabel = el.getAttribute('aria-label') || '';
        const title = el.getAttribute('title') || '';
        const name = el.getAttribute('name') || '';
        const value = typeof el.value === 'string' ? el.value : '';

        // Build a human-readable, single-line label: whitespace (including
        // newlines from innerText) is collapsed so each element stays on one
        // line when the list is rendered as text.
        let label = (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, 80);
        if (!label) label = ariaLabel || title || placeholder || name || '';
        if (!label && tag === 'img') label = el.getAttribute('alt') || 'image';
        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;

        // Click point: centre of the part of the element that is inside the
        // viewport. For fully visible elements this equals the element centre;
        // for partially visible ones it keeps the point on-screen.
        const visibleLeft = Math.max(rect.left, 0);
        const visibleRight = Math.min(rect.right, viewWidth);
        const visibleTop = Math.max(rect.top, 0);
        const visibleBottom = Math.min(rect.bottom, viewHeight);
        const cx = Math.round((visibleLeft + visibleRight) / 2);
        const cy = Math.round((visibleTop + visibleBottom) / 2);

        // De-duplicate by position (snapped to a 5px grid)
        const key = `${Math.round(cx / 5) * 5},${Math.round(cy / 5) * 5}`;
        if (seen.has(key)) continue;
        seen.add(key);

        let kind = tag;
        if (tag === 'a') kind = 'link';
        if (tag === 'button' || role === 'button') kind = 'button';
        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
        if (tag === 'textarea') kind = 'textarea';
        if (tag === 'select') kind = 'select';

        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
        if (tag === 'input' || tag === 'textarea') {
          // Never hand the contents of a password field back to the caller;
          // only signal that something has been entered.
          const isPassword = type.toLowerCase() === 'password';
          item.value = isPassword ? (value ? '[hidden]' : '') : value.slice(0, 100);
          if (placeholder) item.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          item.href = href.slice(0, 120);
        }
        items.push(item);
      }
      return items;
    });
  } catch (e) {
    console.error('[tool-server] Element extraction error:', e.message);
    return [];
  }
}
/**
 * Format an element list as readable text for the LLM.
 *
 * @param {Array<{index: number, kind: string, label: string, x: number,
 *   y: number, value?: string, placeholder?: string, href?: string}>} elements
 *   Elements to list; `null`, `undefined` or an empty array are accepted.
 * @returns {string} A header line, one line per element, a blank line and two
 *   usage hints, joined with newlines; or a fixed "no elements" message when
 *   the list is empty or missing.
 */
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    // Keep the link target separated from the preceding field so it cannot be
    // read as part of the coordinates or placeholder.
    if (el.href) line += ` href="${el.href}"`;
    lines.push(line);
  }
  lines.push('');
  // The click and type handlers accept an element index, so advertise it here
  // alongside the coordinate / focused-element forms.
  lines.push('To click an element, use action "click" with its index (or its x,y coordinates).');
  lines.push('To type into an input, use action "type" with its index and text (omit index to type into the focused element).');
  return lines.join('\n');
}
// Enhanced snap: screenshot + element list
async function snap() { async function snap() {
const p = await getPage(); const p = await getPage();
const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false }); const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
return { screenshot: buf.toString('base64'), url: p.url(), title: await p.title() }; const elements = await extractElements(p);
return {
screenshot: buf.toString('base64'),
url: p.url(),
title: await p.title(),
elements,
elementsText: formatElements(elements),
};
} }
// Broadcast screenshot to all connected browser panel WebSocket clients // Broadcast screenshot to all connected browser panel WebSocket clients
@ -104,13 +214,21 @@ async function handleNavigate(body) {
const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl; const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl;
await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' }); await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
const result = await snap(); const result = await snap();
broadcastScreenshot(); // sync panel broadcastScreenshot();
return result; return result;
} }
async function handleClick(body) { async function handleClick(body) {
const p = await getPage(); const p = await getPage();
await p.mouse.click(body.x || 0, body.y || 0); // Support clicking by element index
if (body.index && !body.x && !body.y) {
const elements = await extractElements(p);
const el = elements.find(e => e.index === body.index);
if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
await p.mouse.click(el.x, el.y);
} else {
await p.mouse.click(body.x || 0, body.y || 0);
}
await p.waitForTimeout(500); await p.waitForTimeout(500);
const result = await snap(); const result = await snap();
broadcastScreenshot(); broadcastScreenshot();
@ -119,8 +237,24 @@ async function handleClick(body) {
async function handleType(body) { async function handleType(body) {
const p = await getPage(); const p = await getPage();
if (body.selector) await p.fill(body.selector, body.text || ''); if (body.selector) {
else await p.keyboard.type(body.text || ''); await p.fill(body.selector, body.text || '');
} else if (body.index) {
// Click the element first, then type
const elements = await extractElements(p);
const el = elements.find(e => e.index === body.index);
if (el) {
await p.mouse.click(el.x, el.y);
await p.waitForTimeout(200);
// Clear existing content and type new text
await p.keyboard.press('Control+a');
await p.keyboard.type(body.text || '');
} else {
return { error: `Element [${body.index}] not found.` };
}
} else {
await p.keyboard.type(body.text || '');
}
await p.waitForTimeout(300); await p.waitForTimeout(300);
const result = await snap(); const result = await snap();
broadcastScreenshot(); broadcastScreenshot();
@ -163,7 +297,8 @@ async function handleReload() {
async function handleText() { async function handleText() {
const p = await getPage(); const p = await getPage();
const text = await p.evaluate(() => document.body.innerText); const text = await p.evaluate(() => document.body.innerText);
return { url: p.url(), title: await p.title(), text: text.slice(0, 8000) }; const elements = await extractElements(p);
return { url: p.url(), title: await p.title(), text: text.slice(0, 8000), elements, elementsText: formatElements(elements) };
} }
async function handleEval(body) { async function handleEval(body) {
@ -174,6 +309,22 @@ async function handleEval(body) {
return { ...ss, evalResult: String(result) }; return { ...ss, evalResult: String(result) };
} }
/**
 * Route handler: return the interactive-element list for the current page,
 * both as structured data and as LLM-readable text (no screenshot is taken).
 *
 * @returns {Promise<{url: string, title: string, elements: Array, elementsText: string}>}
 */
async function handleElements() {
  const activePage = await getPage();
  const found = await extractElements(activePage);

  const url = activePage.url();
  const title = await activePage.title();
  const elementsText = formatElements(found);

  return { url, title, elements: found, elementsText };
}
/**
 * Route handler: press a single keyboard key (Enter, Tab, Escape, etc.) on
 * the current page, then return a fresh snapshot.
 *
 * @param {{key?: string}} body - Request body; `key` falls back to 'Enter'
 *   when missing or empty.
 * @returns {Promise<object>} The result of `snap()` taken after the key press.
 */
async function handleKeypress(body) {
  const activePage = await getPage();
  const keyName = body.key || 'Enter';

  await activePage.keyboard.press(keyName);
  // Short pause before capturing the page state.
  await activePage.waitForTimeout(300);

  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
// ── HTTP ROUTES ─────────────────────────────────────────────── // ── HTTP ROUTES ───────────────────────────────────────────────
const routes = { const routes = {
'/api/bash': handleBash, '/api/bash': handleBash,
@ -184,9 +335,11 @@ const routes = {
'/api/browser/back': handleBack, '/api/browser/back': handleBack,
'/api/browser/forward': handleForward, '/api/browser/forward': handleForward,
'/api/browser/reload': handleReload, '/api/browser/reload': handleReload,
'/api/browser/screenshot': () => { const r = snap(); broadcastScreenshot(); return r; }, '/api/browser/screenshot': async () => { const r = await snap(); broadcastScreenshot(); return r; },
'/api/browser/text': handleText, '/api/browser/text': handleText,
'/api/browser/eval': handleEval, '/api/browser/eval': handleEval,
'/api/browser/elements': handleElements,
'/api/browser/keypress': handleKeypress,
}; };
// ── HTTP SERVER ─────────────────────────────────────────────── // ── HTTP SERVER ───────────────────────────────────────────────
@ -239,7 +392,7 @@ browserWss.on('connection', async (ws) => {
ws.on('close', () => browserPanelClients.delete(ws)); ws.on('close', () => browserPanelClients.delete(ws));
ws.on('error', () => browserPanelClients.delete(ws)); ws.on('error', () => browserPanelClients.delete(ws));
// Handle panel user interactions (navigate, click, scroll, etc.) // Handle panel user interactions
ws.on('message', async (msg) => { ws.on('message', async (msg) => {
try { try {
const m = JSON.parse(msg.toString()); const m = JSON.parse(msg.toString());
@ -291,14 +444,13 @@ browserWss.on('connection', async (ws) => {
} }
}); });
// Send initial ready + screenshot if browser exists
ws.send(JSON.stringify({ type: 'ready' })); ws.send(JSON.stringify({ type: 'ready' }));
if (page && !page.isClosed()) { if (page && !page.isClosed()) {
try { await broadcastScreenshot(); } catch {} try { await broadcastScreenshot(); } catch {}
} }
}); });
// ── UPGRADE HANDLER (route WS by path) ──────────────────────── // ── UPGRADE HANDLER ───────────────────────────────────────────
server.on('upgrade', (req, socket, head) => { server.on('upgrade', (req, socket, head) => {
const pathname = url.parse(req.url).pathname; const pathname = url.parse(req.url).pathname;
if (pathname === '/ws/terminal') { if (pathname === '/ws/terminal') {

View file

@ -7,157 +7,193 @@ import { registerToolRenderer, renderHeader } from "./renderer-registry.js";
import type { ToolRenderer, ToolRenderResult } from "./types.js"; import type { ToolRenderer, ToolRenderResult } from "./types.js";
const TOOL_SERVER = const TOOL_SERVER =
typeof window !== "undefined" typeof window !== "undefined"
? (window as any).__JAE_TOOL_SERVER__ || "http://localhost:7700" ? (window as any).__JAE_TOOL_SERVER__ || "http://localhost:7700"
: "http://localhost:7700"; : "http://localhost:7700";
const browserSchema = Type.Object({ const browserSchema = Type.Object({
action: Type.Union( action: Type.Union(
[ [
Type.Literal("navigate"), Type.Literal("navigate"),
Type.Literal("click"), Type.Literal("click"),
Type.Literal("type"), Type.Literal("type"),
Type.Literal("scroll"), Type.Literal("scroll"),
Type.Literal("back"), Type.Literal("back"),
Type.Literal("screenshot"), Type.Literal("screenshot"),
Type.Literal("text"), Type.Literal("text"),
Type.Literal("eval"), Type.Literal("eval"),
], Type.Literal("elements"),
{ description: "Browser action to perform" }, Type.Literal("keypress"),
), ],
url: Type.Optional(Type.String({ description: "URL to navigate to" })), { description: "Browser action to perform" },
x: Type.Optional(Type.Number({ description: "Click X coordinate" })), ),
y: Type.Optional(Type.Number({ description: "Click Y coordinate" })), url: Type.Optional(Type.String({ description: "URL to navigate to" })),
text: Type.Optional(Type.String({ description: "Text to type" })), index: Type.Optional(Type.Number({ description: "Element index number from the elements list to click or type into" })),
selector: Type.Optional(Type.String({ description: "CSS selector to type into" })), x: Type.Optional(Type.Number({ description: "Click X coordinate (use index instead when possible)" })),
dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels" })), y: Type.Optional(Type.Number({ description: "Click Y coordinate (use index instead when possible)" })),
script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })), text: Type.Optional(Type.String({ description: "Text to type into focused element or element by index" })),
selector: Type.Optional(Type.String({ description: "CSS selector to type into (prefer index instead)" })),
dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels (positive=down, negative=up)" })),
script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })),
key: Type.Optional(Type.String({ description: "Key to press: Enter, Tab, Escape, Backspace, etc." })),
}); });
export interface BrowserDetails { export interface BrowserDetails {
action: string; action: string;
url?: string; url?: string;
title?: string; title?: string;
screenshot?: string; screenshot?: string;
text?: string; text?: string;
evalResult?: string; elementsText?: string;
error?: string; evalResult?: string;
error?: string;
} }
export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = { export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
name: "browser", name: "browser",
label: "Browser", label: "Browser",
description: description: `Control a browser to navigate, click, type, and read web pages.
"Control a headless browser. Actions: navigate (url), click (x,y), type (text, optional selector), scroll (dy), back, screenshot, text (get page text), eval (run JS).",
parameters: browserSchema, WORKFLOW: Always follow this pattern:
async execute(toolCallId, params, signal) { 1. navigate to a URL - returns page elements list
const { action, ...rest } = params; 2. READ the numbered elements list to find what you need
const endpoint = 3. click by index number OR type into an element by index
action === "navigate" 4. After each action you get an updated elements list
? "/api/browser/navigate"
: action === "click" Actions:
? "/api/browser/click" - navigate: open a URL (provide "url")
: action === "type" - click: click element by "index" (preferred) or by "x","y" coordinates
? "/api/browser/type" - type: type "text" into element by "index", or into currently focused element
: action === "scroll" - keypress: press a key like "Enter", "Tab", "Escape" (provide "key")
? "/api/browser/scroll" - scroll: scroll page (provide "dy", positive=down negative=up)
: action === "back" - back: go back in history
? "/api/browser/back" - screenshot: get fresh screenshot and elements list
: action === "screenshot" - elements: get just the interactive elements list
? "/api/browser/screenshot" - text: get all visible text content of the page
: action === "text" - eval: run JavaScript on the page (provide "script")
? "/api/browser/text"
: action === "eval" EXAMPLE - Search Google:
? "/api/browser/eval" 1. browser({action:"navigate", url:"google.com"}) see elements, find input[text] "Search" at index 5
: null; 2. browser({action:"type", index:5, text:"my search query"}) typed into search box
if (!endpoint) { 3. browser({action:"keypress", key:"Enter"}) submitted search
return { 4. Read results from elements list, click a link by index`,
content: [{ type: "text" as const, text: "Unknown action: " + action }], parameters: browserSchema,
details: { action, error: "Unknown action" }, async execute(toolCallId, params, signal) {
}; const { action, ...rest } = params;
} const endpointMap: Record<string, string> = {
try { navigate: "/api/browser/navigate",
const res = await fetch(TOOL_SERVER + endpoint, { click: "/api/browser/click",
method: "POST", type: "/api/browser/type",
headers: { "Content-Type": "application/json" }, scroll: "/api/browser/scroll",
body: JSON.stringify(rest), back: "/api/browser/back",
signal, screenshot: "/api/browser/screenshot",
}); text: "/api/browser/text",
const data = (await res.json()) as any; eval: "/api/browser/eval",
if (data.error) { elements: "/api/browser/elements",
return { keypress: "/api/browser/keypress",
content: [{ type: "text" as const, text: "Browser error: " + data.error }], };
details: { action, error: data.error }, const endpoint = endpointMap[action];
}; if (!endpoint) {
} return {
// Build text response for LLM content: [{ type: "text" as const, text: "Unknown action: " + action }],
const textParts: string[] = []; details: { action, error: "Unknown action" },
if (data.url) textParts.push("URL: " + data.url); };
if (data.title) textParts.push("Title: " + data.title); }
if (data.text) textParts.push("Page text:\n" + data.text); try {
if (data.evalResult) textParts.push("Eval result: " + data.evalResult); const res = await fetch(TOOL_SERVER + endpoint, {
if (data.screenshot) textParts.push("[Screenshot captured]"); method: "POST",
if (textParts.length === 0) textParts.push("Action completed."); headers: { "Content-Type": "application/json" },
// Include screenshot as image content if available body: JSON.stringify(rest),
const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }]; signal,
if (data.screenshot) { });
content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot }); const data = (await res.json()) as any;
} if (data.error) {
return { return {
content, content: [{ type: "text" as const, text: "Browser error: " + data.error }],
details: { details: { action, error: data.error },
action, };
url: data.url, }
title: data.title, // Build text response for LLM — elements list is the key info
screenshot: data.screenshot, const textParts: string[] = [];
text: data.text, if (data.url) textParts.push("URL: " + data.url);
evalResult: data.evalResult, if (data.title) textParts.push("Title: " + data.title);
}, // Always include elements text when available — this is how the model "sees" the page
}; if (data.elementsText) textParts.push("\n" + data.elementsText);
} catch (err: any) { if (data.text) textParts.push("\nPage text (truncated):\n" + data.text.slice(0, 3000));
return { if (data.evalResult) textParts.push("Eval result: " + data.evalResult);
content: [{ type: "text" as const, text: "Browser tool error: " + err.message }], if (data.screenshot) textParts.push("\n[Screenshot captured and displayed in browser panel]");
details: { action, error: err.message }, if (textParts.length === 0) textParts.push("Action completed.");
}; // Include screenshot as image content if available (for vision models)
} const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }];
}, if (data.screenshot) {
content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot });
}
return {
content,
details: {
action,
url: data.url,
title: data.title,
screenshot: data.screenshot,
text: data.text,
elementsText: data.elementsText,
evalResult: data.evalResult,
},
};
} catch (err: any) {
return {
content: [{ type: "text" as const, text: "Browser tool error: " + err.message }],
details: { action, error: err.message },
};
}
},
}; };
class BrowserToolRenderer implements ToolRenderer<any, BrowserDetails> { class BrowserToolRenderer implements ToolRenderer<any, BrowserDetails> {
render(params: any | undefined, result: ToolResultMessage<BrowserDetails> | undefined): ToolRenderResult { render(params: any | undefined, result: ToolResultMessage<BrowserDetails> | undefined): ToolRenderResult {
const state = result ? (result.isError ? "error" : "complete") : "inprogress"; const state = result ? (result.isError ? "error" : "complete") : "inprogress";
const action = result?.details?.action || params?.action || "..."; const action = result?.details?.action || params?.action || "...";
const url = result?.details?.url || params?.url || ""; const url = result?.details?.url || params?.url || "";
const label = url ? action + ": " + url : action; const label = url ? action + ": " + url : action;
if (result?.details?.screenshot) { if (result?.details?.screenshot) {
return { return {
content: html` content: html`
<div class="flex flex-col gap-2"> <div class="flex flex-col gap-2">
${renderHeader(state, Globe, label)} ${renderHeader(state, Globe, label)}
<img src="data:image/jpeg;base64,${result.details.screenshot}" <img src="data:image/jpeg;base64,${result.details.screenshot}"
class="rounded border border-border max-w-full" style="max-height:400px" class="rounded border border-border max-w-full" style="max-height:400px"
alt="Browser screenshot" /> alt="Browser screenshot" />
${result.details.title ? html`<span class="text-xs text-muted-foreground">${result.details.title}</span>` : html``} ${result.details.title ? html`<span class="text-xs text-muted-foreground">${result.details.title}</span>` : html``}
</div>`, </div>`,
isCustom: false, isCustom: false,
}; };
} }
if (result?.details?.text) { if (result?.details?.elementsText) {
return { return {
content: html` content: html`
<div class="flex flex-col gap-2">
${renderHeader(state, Globe, label)}
<pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.elementsText}</pre>
</div>`,
isCustom: false,
};
}
if (result?.details?.text) {
return {
content: html`
<div class="flex flex-col gap-2"> <div class="flex flex-col gap-2">
${renderHeader(state, Globe, label)} ${renderHeader(state, Globe, label)}
<pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.text}</pre> <pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.text}</pre>
</div>`, </div>`,
isCustom: false, isCustom: false,
}; };
} }
return { content: renderHeader(state, Globe, label), isCustom: false }; return { content: renderHeader(state, Globe, label), isCustom: false };
} }
} }
registerToolRenderer("browser", new BrowserToolRenderer()); registerToolRenderer("browser", new BrowserToolRenderer());
export function createBrowserTool(): AgentTool<typeof browserSchema, BrowserDetails> { export function createBrowserTool(): AgentTool<typeof browserSchema, BrowserDetails> {
return browserTool; return browserTool;
} }