feat: browser use - element extraction + index-based clicking for text models
Some checks failed
CI / build-check-test (push) Has been cancelled

- tool-server.mjs: extractElements() scrapes all interactive elements with coordinates
- tool-server.mjs: formatElements() returns numbered list for LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index, key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so model can see the page
- browser-tool.ts: detailed workflow instructions in tool description
- Enables text-only models (Llama 3.3, etc.) to navigate and interact with web pages
This commit is contained in:
JAE 2026-03-27 23:17:24 +00:00
parent db79dec9e1
commit a2227c7659
2 changed files with 327 additions and 139 deletions

View file

@ -29,7 +29,7 @@ function parseBody(req) {
let browser = null;
let context = null;
let page = null;
// WebSocket clients currently watching the browser panel.
const browserPanelClients = new Set();
async function launchBrowser() {
if (!browser) {
@ -52,10 +52,120 @@ async function getPage() {
return page;
}
// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
/**
 * Collect the visible interactive elements of a page so a text-only model can
 * refer to them by number.
 *
 * The callback passed to `p.evaluate` runs inside the page, so it must be
 * self-contained and may only use page globals (`document`, `window`).
 *
 * @param p Page-like object exposing `evaluate(fn)`.
 * @returns Promise of an array of `{ index, kind, label, x, y }` records
 *   (1-based `index`; `x`/`y` are the element's centre as reported by
 *   `getBoundingClientRect`). Inputs and textareas also carry `value` and,
 *   when present, `placeholder`; links carry `href`. Resolves to `[]` if the
 *   evaluation throws.
 */
async function extractElements(p) {
  try {
    return await p.evaluate(() => {
      const items = [];
      const seen = new Set();

      // Selectors for all interactive elements
      const selectors = [
        'a[href]',
        'button',
        'input',
        'textarea',
        'select',
        '[role="button"]',
        '[role="link"]',
        '[role="tab"]',
        '[role="menuitem"]',
        '[onclick]',
        '[contenteditable="true"]',
        'summary',
        'details',
        'label[for]',
      ];

      for (const el of document.querySelectorAll(selectors.join(','))) {
        const rect = el.getBoundingClientRect();

        // Skip tiny elements and elements entirely outside the viewport.
        if (rect.width < 5 || rect.height < 5) continue;
        if (rect.top > window.innerHeight || rect.bottom < 0) continue;
        if (rect.left > window.innerWidth || rect.right < 0) continue;

        // Resolve the computed style once per element instead of per check.
        const style = window.getComputedStyle(el);
        if (style.visibility === 'hidden') continue;
        if (style.display === 'none') continue;
        if (parseFloat(style.opacity) < 0.1) continue;

        const tag = el.tagName.toLowerCase();
        const type = el.getAttribute('type') || '';
        const role = el.getAttribute('role') || '';
        const href = el.getAttribute('href') || '';
        const placeholder = el.getAttribute('placeholder') || '';
        const ariaLabel = el.getAttribute('aria-label') || '';
        const title = el.getAttribute('title') || '';
        const name = el.getAttribute('name') || '';
        const value = el.value || '';

        // Build a human-readable, single-line label. Whitespace (including
        // newlines in innerText) is collapsed so each element stays on one
        // line when the list is rendered as text.
        let label = (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, 80);
        if (!label) label = ariaLabel || title || placeholder || name || '';
        if (!label && tag === 'img') label = el.getAttribute('alt') || 'image';
        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;

        // Centre coordinates
        const cx = Math.round(rect.left + rect.width / 2);
        const cy = Math.round(rect.top + rect.height / 2);

        // De-duplicate by centre position snapped to a 5px grid.
        const key = `${Math.round(cx / 5) * 5},${Math.round(cy / 5) * 5}`;
        if (seen.has(key)) continue;
        seen.add(key);

        let kind = tag;
        if (tag === 'a') kind = 'link';
        if (tag === 'button' || role === 'button') kind = 'button';
        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
        if (tag === 'textarea') kind = 'textarea';
        if (tag === 'select') kind = 'select';

        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
        if (tag === 'input' || tag === 'textarea') {
          // Never echo password contents; a fixed mask still shows that the
          // field is filled.
          const isSecret = tag === 'input' && type.toLowerCase() === 'password';
          item.value = isSecret && value ? '********' : value.slice(0, 100);
          if (placeholder) item.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          item.href = href.slice(0, 120);
        }
        items.push(item);
      }
      return items;
    });
  } catch (e) {
    // Best effort: callers treat an empty list as "nothing found".
    console.error('[tool-server] Element extraction error:', e.message);
    return [];
  }
}
// Format elements as readable text for the LLM
/**
 * Render an element list as plain text for the LLM, one element per line.
 *
 * @param elements Array of records with `index`, `kind`, `label`, `x`, `y`
 *   and optional `value`, `placeholder`, `href`; may be null/undefined/empty.
 * @returns A multi-line string: a header, one line per element, a blank line
 *   and two usage hints; or a fixed "nothing found" message for empty input.
 */
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    // Keep the URL as its own labelled field so it cannot fuse with the
    // preceding text.
    if (el.href) line += ` href="${el.href}"`;
    lines.push(line);
  }
  lines.push('');
  lines.push('To click an element, use action "click" with its index (or its x,y coordinates).');
  lines.push('To type into an input, use action "type" with its index and text (omit index to type into the focused element).');
  return lines.join('\n');
}
// Enhanced snap: screenshot + element list
/**
 * Capture the current page state: a JPEG screenshot of the viewport plus the
 * interactive-element list in both structured and text form.
 *
 * @returns Promise of `{ screenshot, url, title, elements, elementsText }`,
 *   where `screenshot` is base64-encoded JPEG data.
 */
async function snap() {
  const p = await getPage();
  const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
  // The element list must be part of every snapshot; an earlier early return
  // here skipped it entirely.
  const elements = await extractElements(p);
  return {
    screenshot: buf.toString('base64'),
    url: p.url(),
    title: await p.title(),
    elements,
    elementsText: formatElements(elements),
  };
}
// Broadcast screenshot to all connected browser panel WebSocket clients
@ -104,13 +214,21 @@ async function handleNavigate(body) {
const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl;
await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
const result = await snap();
broadcastScreenshot(); // sync panel
broadcastScreenshot();
return result;
}
async function handleClick(body) {
const p = await getPage();
await p.mouse.click(body.x || 0, body.y || 0);
// Support clicking by element index
if (body.index && !body.x && !body.y) {
const elements = await extractElements(p);
const el = elements.find(e => e.index === body.index);
if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
await p.mouse.click(el.x, el.y);
} else {
await p.mouse.click(body.x || 0, body.y || 0);
}
await p.waitForTimeout(500);
const result = await snap();
broadcastScreenshot();
@ -119,8 +237,24 @@ async function handleClick(body) {
async function handleType(body) {
const p = await getPage();
if (body.selector) await p.fill(body.selector, body.text || '');
else await p.keyboard.type(body.text || '');
if (body.selector) {
await p.fill(body.selector, body.text || '');
} else if (body.index) {
// Click the element first, then type
const elements = await extractElements(p);
const el = elements.find(e => e.index === body.index);
if (el) {
await p.mouse.click(el.x, el.y);
await p.waitForTimeout(200);
// Clear existing content and type new text
await p.keyboard.press('Control+a');
await p.keyboard.type(body.text || '');
} else {
return { error: `Element [${body.index}] not found.` };
}
} else {
await p.keyboard.type(body.text || '');
}
await p.waitForTimeout(300);
const result = await snap();
broadcastScreenshot();
@ -163,7 +297,8 @@ async function handleReload() {
/**
 * Return the page's visible text together with its interactive elements.
 *
 * @returns Promise of `{ url, title, text, elements, elementsText }`; `text`
 *   is capped at 8000 characters.
 */
async function handleText() {
  const p = await getPage();
  // document.body can be null (e.g. before the document has a body), so fall
  // back to an empty string instead of throwing inside the page.
  const text = await p.evaluate(() => (document.body ? document.body.innerText : ''));
  const elements = await extractElements(p);
  return {
    url: p.url(),
    title: await p.title(),
    text: text.slice(0, 8000),
    elements,
    elementsText: formatElements(elements),
  };
}
async function handleEval(body) {
@ -174,6 +309,22 @@ async function handleEval(body) {
return { ...ss, evalResult: String(result) };
}
/**
 * Return only the interactive-element list for the current page (no
 * screenshot), along with the page URL and title.
 */
async function handleElements() {
  const tab = await getPage();
  const found = await extractElements(tab);
  const url = tab.url();
  const title = await tab.title();
  const elementsText = formatElements(found);
  return { url, title, elements: found, elementsText };
}
// Press a specific key (Enter, Tab, Escape, etc.)
/**
 * Press a single key on the page keyboard, wait briefly, then return a fresh
 * snapshot and push it to the browser panel.
 *
 * @param body Request body; `body.key` names the key and falls back to
 *   'Enter' when missing or empty.
 */
async function handleKeypress(body) {
  const tab = await getPage();
  const keyName = body.key || 'Enter';
  await tab.keyboard.press(keyName);
  // Give the page a moment to react before capturing.
  await tab.waitForTimeout(300);
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
// ── HTTP ROUTES ───────────────────────────────────────────────
const routes = {
'/api/bash': handleBash,
@ -184,9 +335,11 @@ const routes = {
'/api/browser/back': handleBack,
'/api/browser/forward': handleForward,
'/api/browser/reload': handleReload,
'/api/browser/screenshot': () => { const r = snap(); broadcastScreenshot(); return r; },
'/api/browser/screenshot': async () => { const r = await snap(); broadcastScreenshot(); return r; },
'/api/browser/text': handleText,
'/api/browser/eval': handleEval,
'/api/browser/elements': handleElements,
'/api/browser/keypress': handleKeypress,
};
// ── HTTP SERVER ───────────────────────────────────────────────
@ -239,7 +392,7 @@ browserWss.on('connection', async (ws) => {
ws.on('close', () => browserPanelClients.delete(ws));
ws.on('error', () => browserPanelClients.delete(ws));
// Handle panel user interactions (navigate, click, scroll, etc.)
// Handle panel user interactions
ws.on('message', async (msg) => {
try {
const m = JSON.parse(msg.toString());
@ -291,14 +444,13 @@ browserWss.on('connection', async (ws) => {
}
});
// Send initial ready + screenshot if browser exists
ws.send(JSON.stringify({ type: 'ready' }));
if (page && !page.isClosed()) {
try { await broadcastScreenshot(); } catch {}
}
});
// ── UPGRADE HANDLER (route WS by path) ────────────────────────
// ── UPGRADE HANDLER ───────────────────────────────────────────
server.on('upgrade', (req, socket, head) => {
const pathname = url.parse(req.url).pathname;
if (pathname === '/ws/terminal') {

View file

@ -7,157 +7,193 @@ import { registerToolRenderer, renderHeader } from "./renderer-registry.js";
import type { ToolRenderer, ToolRenderResult } from "./types.js";
/**
 * Base URL of the tool server. In a browser, `window.__JAE_TOOL_SERVER__`
 * overrides the default when it is set to a truthy value.
 */
const TOOL_SERVER =
  typeof window !== "undefined"
    ? (window as any).__JAE_TOOL_SERVER__ || "http://localhost:7700"
    : "http://localhost:7700";
/**
 * Parameter schema for the browser tool. `action` selects the operation; all
 * other fields are optional and only meaningful for some actions.
 */
const browserSchema = Type.Object({
  action: Type.Union(
    [
      Type.Literal("navigate"),
      Type.Literal("click"),
      Type.Literal("type"),
      Type.Literal("scroll"),
      Type.Literal("back"),
      Type.Literal("screenshot"),
      Type.Literal("text"),
      Type.Literal("eval"),
      Type.Literal("elements"),
      Type.Literal("keypress"),
    ],
    { description: "Browser action to perform" },
  ),
  url: Type.Optional(Type.String({ description: "URL to navigate to" })),
  index: Type.Optional(Type.Number({ description: "Element index number from the elements list to click or type into" })),
  x: Type.Optional(Type.Number({ description: "Click X coordinate (use index instead when possible)" })),
  y: Type.Optional(Type.Number({ description: "Click Y coordinate (use index instead when possible)" })),
  text: Type.Optional(Type.String({ description: "Text to type into focused element or element by index" })),
  selector: Type.Optional(Type.String({ description: "CSS selector to type into (prefer index instead)" })),
  dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels (positive=down, negative=up)" })),
  script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })),
  key: Type.Optional(Type.String({ description: "Key to press: Enter, Tab, Escape, Backspace, etc." })),
});
/** Structured result of a browser tool call, consumed by the renderer. */
export interface BrowserDetails {
  /** The action that was requested. */
  action: string;
  url?: string;
  title?: string;
  /** Base64 JPEG data, embedded by the renderer as a data URL. */
  screenshot?: string;
  text?: string;
  /** Numbered interactive-element listing as plain text. */
  elementsText?: string;
  evalResult?: string;
  error?: string;
}
/**
 * Agent tool that forwards browser actions to the tool server over HTTP and
 * turns the JSON reply into text (and, when present, image) content.
 *
 * `execute` never throws: unknown actions, server-reported errors and
 * fetch/parse failures are all returned as text content with `details.error`.
 */
export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
  name: "browser",
  label: "Browser",
  description: `Control a browser to navigate, click, type, and read web pages.
WORKFLOW: Always follow this pattern:
1. navigate to a URL - returns page elements list
2. READ the numbered elements list to find what you need
3. click by index number OR type into an element by index
4. After each action you get an updated elements list
Actions:
- navigate: open a URL (provide "url")
- click: click element by "index" (preferred) or by "x","y" coordinates
- type: type "text" into element by "index", or into currently focused element
- keypress: press a key like "Enter", "Tab", "Escape" (provide "key")
- scroll: scroll page (provide "dy", positive=down negative=up)
- back: go back in history
- screenshot: get fresh screenshot and elements list
- elements: get just the interactive elements list
- text: get all visible text content of the page
- eval: run JavaScript on the page (provide "script")
EXAMPLE - Search Google:
1. browser({action:"navigate", url:"google.com"}) -> see elements, find input[text] "Search" at index 5
2. browser({action:"type", index:5, text:"my search query"}) -> typed into search box
3. browser({action:"keypress", key:"Enter"}) -> submitted search
4. Read results from elements list, click a link by index`,
  parameters: browserSchema,
  async execute(toolCallId, params, signal) {
    // Everything except `action` is sent to the server as the request body.
    const { action, ...rest } = params;
    const endpointMap: Record<string, string> = {
      navigate: "/api/browser/navigate",
      click: "/api/browser/click",
      type: "/api/browser/type",
      scroll: "/api/browser/scroll",
      back: "/api/browser/back",
      screenshot: "/api/browser/screenshot",
      text: "/api/browser/text",
      eval: "/api/browser/eval",
      elements: "/api/browser/elements",
      keypress: "/api/browser/keypress",
    };
    const endpoint = endpointMap[action];
    if (!endpoint) {
      return {
        content: [{ type: "text" as const, text: "Unknown action: " + action }],
        details: { action, error: "Unknown action" },
      };
    }
    try {
      const res = await fetch(TOOL_SERVER + endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(rest),
        signal,
      });
      const data = (await res.json()) as any;
      if (data.error) {
        return {
          content: [{ type: "text" as const, text: "Browser error: " + data.error }],
          details: { action, error: data.error },
        };
      }
      // Build text response for LLM — elements list is the key info
      const textParts: string[] = [];
      if (data.url) textParts.push("URL: " + data.url);
      if (data.title) textParts.push("Title: " + data.title);
      // Always include elements text when available — this is how the model "sees" the page
      if (data.elementsText) textParts.push("\n" + data.elementsText);
      if (data.text) textParts.push("\nPage text (truncated):\n" + data.text.slice(0, 3000));
      if (data.evalResult) textParts.push("Eval result: " + data.evalResult);
      if (data.screenshot) textParts.push("\n[Screenshot captured and displayed in browser panel]");
      if (textParts.length === 0) textParts.push("Action completed.");
      // Include screenshot as image content if available (for vision models)
      const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }];
      if (data.screenshot) {
        content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot });
      }
      return {
        content,
        details: {
          action,
          url: data.url,
          title: data.title,
          screenshot: data.screenshot,
          text: data.text,
          elementsText: data.elementsText,
          evalResult: data.evalResult,
        },
      };
    } catch (err: unknown) {
      // Network failures, aborts and non-JSON replies all land here.
      const message = err instanceof Error ? err.message : String(err);
      return {
        content: [{ type: "text" as const, text: "Browser tool error: " + message }],
        details: { action, error: message },
      };
    }
  },
};
/**
 * Renders a browser tool call in the chat UI. It shows, in priority order,
 * the screenshot, else the element listing, else the page text, else just
 * the header line.
 */
class BrowserToolRenderer implements ToolRenderer<any, BrowserDetails> {
  render(params: any | undefined, result: ToolResultMessage<BrowserDetails> | undefined): ToolRenderResult {
    // No result yet means the call is still running.
    const state = result ? (result.isError ? "error" : "complete") : "inprogress";
    const action = result?.details?.action || params?.action || "...";
    const url = result?.details?.url || params?.url || "";
    const label = url ? action + ": " + url : action;

    if (result?.details?.screenshot) {
      return {
        content: html`
          <div class="flex flex-col gap-2">
            ${renderHeader(state, Globe, label)}
            <img src="data:image/jpeg;base64,${result.details.screenshot}"
              class="rounded border border-border max-w-full" style="max-height:400px"
              alt="Browser screenshot" />
            ${result.details.title ? html`<span class="text-xs text-muted-foreground">${result.details.title}</span>` : html``}
          </div>`,
        isCustom: false,
      };
    }

    if (result?.details?.elementsText) {
      return {
        content: html`
          <div class="flex flex-col gap-2">
            ${renderHeader(state, Globe, label)}
            <pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.elementsText}</pre>
          </div>`,
        isCustom: false,
      };
    }

    if (result?.details?.text) {
      return {
        content: html`
          <div class="flex flex-col gap-2">
            ${renderHeader(state, Globe, label)}
            <pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.text}</pre>
          </div>`,
        isCustom: false,
      };
    }

    return { content: renderHeader(state, Globe, label), isCustom: false };
  }
}
// Register a BrowserToolRenderer instance under the tool name "browser".
registerToolRenderer("browser", new BrowserToolRenderer());
/** Returns the module-level `browserTool` object; no new instance is created. */
export function createBrowserTool(): AgentTool<typeof browserSchema, BrowserDetails> {
  return browserTool;
}