feat: browser use - element extraction + index-based clicking for text models
Some checks failed
CI / build-check-test (push) Has been cancelled
Some checks failed
CI / build-check-test (push) Has been cancelled
- tool-server.mjs: extractElements() scrapes the interactive elements visible in the viewport, with their centre coordinates
- tool-server.mjs: formatElements() returns numbered list for LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index, key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so model can see the page
- browser-tool.ts: detailed workflow instructions in tool description
- Enables text-only models (Llama 3.3, etc.) to navigate and interact with web pages
This commit is contained in:
parent
db79dec9e1
commit
a2227c7659
2 changed files with 327 additions and 139 deletions
|
|
@ -29,7 +29,7 @@ function parseBody(req) {
|
|||
let browser = null;
|
||||
let context = null;
|
||||
let page = null;
|
||||
const browserPanelClients = new Set(); // WS clients watching the browser
|
||||
const browserPanelClients = new Set();
|
||||
|
||||
async function launchBrowser() {
|
||||
if (!browser) {
|
||||
|
|
@ -52,10 +52,120 @@ async function getPage() {
|
|||
return page;
|
||||
}
|
||||
|
||||
// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
/**
 * Collect the interactive elements currently visible in the page viewport.
 *
 * The scan runs inside the page through `p.evaluate`, so the callback may only
 * use browser globals (`document`, `window`) and must return plain data.
 *
 * @param p Page-like object exposing `evaluate(fn)`.
 * @returns A promise for an array of
 *   `{ index, kind, label, x, y, value?, placeholder?, href? }`, where `index`
 *   is 1-based and `x`/`y` are viewport coordinates of the click point.
 *   Resolves to `[]` (after logging) when the evaluation fails.
 */
async function extractElements(p) {
  try {
    return await p.evaluate(() => {
      // Selectors for everything a user could plausibly click or type into.
      const selectors = [
        'a[href]',
        'button',
        'input',
        'textarea',
        'select',
        '[role="button"]',
        '[role="link"]',
        '[role="tab"]',
        '[role="menuitem"]',
        '[onclick]',
        '[contenteditable="true"]',
        'summary',
        'details',
        'label[for]',
      ];
      const viewportW = window.innerWidth;
      const viewportH = window.innerHeight;
      const items = [];
      const seen = new Set();

      for (const el of document.querySelectorAll(selectors.join(','))) {
        const rect = el.getBoundingClientRect();
        // Skip tiny elements and anything with no visible area inside the viewport.
        if (rect.width < 5 || rect.height < 5) continue;
        if (rect.top >= viewportH || rect.bottom <= 0) continue;
        if (rect.left >= viewportW || rect.right <= 0) continue;

        // One computed-style lookup per element instead of three.
        const style = window.getComputedStyle(el);
        if (style.visibility === 'hidden' || style.display === 'none') continue;
        if (parseFloat(style.opacity) < 0.1) continue;

        const tag = el.tagName.toLowerCase();
        const type = el.getAttribute('type') || '';
        const role = el.getAttribute('role') || '';
        const href = el.getAttribute('href') || '';
        const placeholder = el.getAttribute('placeholder') || '';
        const ariaLabel = el.getAttribute('aria-label') || '';
        const title = el.getAttribute('title') || '';
        const name = el.getAttribute('name') || '';
        const value = el.value || '';

        // Build a human-readable, single-line label. innerText can span several
        // lines, which would break the one-line-per-element listing downstream.
        let label = (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, 80);
        if (!label) label = ariaLabel || title || placeholder || name || '';
        if (!label && tag === 'img') label = el.getAttribute('alt') || 'image';
        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;

        // Click point: centre of the part of the element that is inside the
        // viewport. For fully visible elements this is the plain centre; for
        // partially visible ones it keeps the point on-screen so a mouse click
        // at (x, y) actually lands on the element.
        const visLeft = Math.max(rect.left, 0);
        const visRight = Math.min(rect.right, viewportW);
        const visTop = Math.max(rect.top, 0);
        const visBottom = Math.min(rect.bottom, viewportH);
        const cx = Math.round((visLeft + visRight) / 2);
        const cy = Math.round((visTop + visBottom) / 2);

        // De-duplicate by position (within 5px) so nested clickable wrappers
        // sharing the same centre are listed once.
        const key = `${Math.round(cx / 5) * 5},${Math.round(cy / 5) * 5}`;
        if (seen.has(key)) continue;
        seen.add(key);

        let kind = tag;
        if (tag === 'a') kind = 'link';
        if (tag === 'button' || role === 'button') kind = 'button';
        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
        if (tag === 'textarea') kind = 'textarea';
        if (tag === 'select') kind = 'select';

        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
        if (tag === 'input' || tag === 'textarea') {
          item.value = String(value).slice(0, 100);
          if (placeholder) item.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          item.href = href.slice(0, 120);
        }
        items.push(item);
      }
      return items;
    });
  } catch (e) {
    // Best effort: a failed scan must not break the calling browser action.
    console.error('[tool-server] Element extraction error:', e && e.message ? e.message : e);
    return [];
  }
}
|
||||
|
||||
/**
 * Format an extracted element list as readable text for the LLM.
 *
 * Each element becomes one line of the form
 * `[index] kind "label" at (x, y)`, followed by its value, placeholder and
 * link target when present. The closing hint lines describe index-based
 * interaction.
 *
 * @param elements Array of element records (`index`, `kind`, `label`, `x`, `y`,
 *   optional `value`, `placeholder`, `href`); may be empty or nullish.
 * @returns The multi-line listing, or a fixed "nothing found" sentence when
 *   there are no elements.
 */
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    if (el.href) line += ` → ${el.href}`;
    lines.push(line);
  }
  lines.push('');
  // Point the model at the index the list just gave it; coordinates remain a fallback.
  lines.push('To click an element, use action "click" with its index (or its x,y coordinates).');
  lines.push('To type into an element, use action "type" with its index and text.');
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Enhanced snap: capture a viewport screenshot plus the interactive element list.
 *
 * @returns A promise for `{ screenshot, url, title, elements, elementsText }`,
 *   where `screenshot` is a base64-encoded JPEG of the current viewport and
 *   `elementsText` is the `formatElements` rendering of `elements`.
 */
async function snap() {
  const p = await getPage();
  const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
  // No early return here: the element list must always be part of the result,
  // otherwise text-only models get nothing to act on.
  const elements = await extractElements(p);
  return {
    screenshot: buf.toString('base64'),
    url: p.url(),
    title: await p.title(),
    elements,
    elementsText: formatElements(elements),
  };
}
|
||||
|
||||
// Broadcast screenshot to all connected browser panel WebSocket clients
|
||||
|
|
@ -104,13 +214,21 @@ async function handleNavigate(body) {
|
|||
const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl;
|
||||
await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
|
||||
const result = await snap();
|
||||
broadcastScreenshot(); // sync panel
|
||||
broadcastScreenshot();
|
||||
return result;
|
||||
}
|
||||
|
||||
async function handleClick(body) {
|
||||
const p = await getPage();
|
||||
// Support clicking by element index
|
||||
if (body.index && !body.x && !body.y) {
|
||||
const elements = await extractElements(p);
|
||||
const el = elements.find(e => e.index === body.index);
|
||||
if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
|
||||
await p.mouse.click(el.x, el.y);
|
||||
} else {
|
||||
await p.mouse.click(body.x || 0, body.y || 0);
|
||||
}
|
||||
await p.waitForTimeout(500);
|
||||
const result = await snap();
|
||||
broadcastScreenshot();
|
||||
|
|
@ -119,8 +237,24 @@ async function handleClick(body) {
|
|||
|
||||
async function handleType(body) {
|
||||
const p = await getPage();
|
||||
if (body.selector) await p.fill(body.selector, body.text || '');
|
||||
else await p.keyboard.type(body.text || '');
|
||||
if (body.selector) {
|
||||
await p.fill(body.selector, body.text || '');
|
||||
} else if (body.index) {
|
||||
// Click the element first, then type
|
||||
const elements = await extractElements(p);
|
||||
const el = elements.find(e => e.index === body.index);
|
||||
if (el) {
|
||||
await p.mouse.click(el.x, el.y);
|
||||
await p.waitForTimeout(200);
|
||||
// Clear existing content and type new text
|
||||
await p.keyboard.press('Control+a');
|
||||
await p.keyboard.type(body.text || '');
|
||||
} else {
|
||||
return { error: `Element [${body.index}] not found.` };
|
||||
}
|
||||
} else {
|
||||
await p.keyboard.type(body.text || '');
|
||||
}
|
||||
await p.waitForTimeout(300);
|
||||
const result = await snap();
|
||||
broadcastScreenshot();
|
||||
|
|
@ -163,7 +297,8 @@ async function handleReload() {
|
|||
/**
 * Return the page's text (truncated to 8000 characters) together with the
 * interactive element list.
 *
 * @returns A promise for `{ url, title, text, elements, elementsText }`.
 */
async function handleText() {
  const p = await getPage();
  // document.body can be absent (e.g. non-HTML documents); fall back to an empty string.
  const text = await p.evaluate(() => (document.body ? document.body.innerText : ''));
  // No early return before this point: the elements payload must reach the caller.
  const elements = await extractElements(p);
  return {
    url: p.url(),
    title: await p.title(),
    text: text.slice(0, 8000),
    elements,
    elementsText: formatElements(elements),
  };
}
|
||||
|
||||
async function handleEval(body) {
|
||||
|
|
@ -174,6 +309,22 @@ async function handleEval(body) {
|
|||
return { ...ss, evalResult: String(result) };
|
||||
}
|
||||
|
||||
/**
 * Report the current page's interactive elements without taking a screenshot.
 *
 * @returns A promise for `{ url, title, elements, elementsText }`.
 */
async function handleElements() {
  const currentPage = await getPage();
  const found = await extractElements(currentPage);
  const pageUrl = currentPage.url();
  const pageTitle = await currentPage.title();
  const listing = formatElements(found);
  return {
    url: pageUrl,
    title: pageTitle,
    elements: found,
    elementsText: listing,
  };
}
|
||||
|
||||
/**
 * Press a single key (Enter, Tab, Escape, etc.) on the current page.
 *
 * @param body Request payload; `body.key` names the key and defaults to
 *   'Enter' when missing or empty.
 * @returns A promise for the `snap()` result taken after a 300 ms pause.
 */
async function handleKeypress(body) {
  const activePage = await getPage();
  const keyName = body.key ? body.key : 'Enter';
  await activePage.keyboard.press(keyName);
  // Give the page a moment to react before capturing its new state.
  await activePage.waitForTimeout(300);
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
||||
|
||||
// ── HTTP ROUTES ───────────────────────────────────────────────
|
||||
const routes = {
|
||||
'/api/bash': handleBash,
|
||||
|
|
@ -184,9 +335,11 @@ const routes = {
|
|||
'/api/browser/back': handleBack,
|
||||
'/api/browser/forward': handleForward,
|
||||
'/api/browser/reload': handleReload,
|
||||
'/api/browser/screenshot': () => { const r = snap(); broadcastScreenshot(); return r; },
|
||||
'/api/browser/screenshot': async () => { const r = await snap(); broadcastScreenshot(); return r; },
|
||||
'/api/browser/text': handleText,
|
||||
'/api/browser/eval': handleEval,
|
||||
'/api/browser/elements': handleElements,
|
||||
'/api/browser/keypress': handleKeypress,
|
||||
};
|
||||
|
||||
// ── HTTP SERVER ───────────────────────────────────────────────
|
||||
|
|
@ -239,7 +392,7 @@ browserWss.on('connection', async (ws) => {
|
|||
ws.on('close', () => browserPanelClients.delete(ws));
|
||||
ws.on('error', () => browserPanelClients.delete(ws));
|
||||
|
||||
// Handle panel user interactions (navigate, click, scroll, etc.)
|
||||
// Handle panel user interactions
|
||||
ws.on('message', async (msg) => {
|
||||
try {
|
||||
const m = JSON.parse(msg.toString());
|
||||
|
|
@ -291,14 +444,13 @@ browserWss.on('connection', async (ws) => {
|
|||
}
|
||||
});
|
||||
|
||||
// Send initial ready + screenshot if browser exists
|
||||
ws.send(JSON.stringify({ type: 'ready' }));
|
||||
if (page && !page.isClosed()) {
|
||||
try { await broadcastScreenshot(); } catch {}
|
||||
}
|
||||
});
|
||||
|
||||
// ── UPGRADE HANDLER (route WS by path) ────────────────────────
|
||||
// ── UPGRADE HANDLER ───────────────────────────────────────────
|
||||
server.on('upgrade', (req, socket, head) => {
|
||||
const pathname = url.parse(req.url).pathname;
|
||||
if (pathname === '/ws/terminal') {
|
||||
|
|
|
|||
|
|
@ -22,16 +22,20 @@ const browserSchema = Type.Object({
|
|||
Type.Literal("screenshot"),
|
||||
Type.Literal("text"),
|
||||
Type.Literal("eval"),
|
||||
Type.Literal("elements"),
|
||||
Type.Literal("keypress"),
|
||||
],
|
||||
{ description: "Browser action to perform" },
|
||||
),
|
||||
url: Type.Optional(Type.String({ description: "URL to navigate to" })),
|
||||
x: Type.Optional(Type.Number({ description: "Click X coordinate" })),
|
||||
y: Type.Optional(Type.Number({ description: "Click Y coordinate" })),
|
||||
text: Type.Optional(Type.String({ description: "Text to type" })),
|
||||
selector: Type.Optional(Type.String({ description: "CSS selector to type into" })),
|
||||
dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels" })),
|
||||
index: Type.Optional(Type.Number({ description: "Element index number from the elements list to click or type into" })),
|
||||
x: Type.Optional(Type.Number({ description: "Click X coordinate (use index instead when possible)" })),
|
||||
y: Type.Optional(Type.Number({ description: "Click Y coordinate (use index instead when possible)" })),
|
||||
text: Type.Optional(Type.String({ description: "Text to type into focused element or element by index" })),
|
||||
selector: Type.Optional(Type.String({ description: "CSS selector to type into (prefer index instead)" })),
|
||||
dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels (positive=down, negative=up)" })),
|
||||
script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })),
|
||||
key: Type.Optional(Type.String({ description: "Key to press: Enter, Tab, Escape, Backspace, etc." })),
|
||||
});
|
||||
|
||||
export interface BrowserDetails {
|
||||
|
|
@ -40,6 +44,7 @@ export interface BrowserDetails {
|
|||
title?: string;
|
||||
screenshot?: string;
|
||||
text?: string;
|
||||
elementsText?: string;
|
||||
evalResult?: string;
|
||||
error?: string;
|
||||
}
|
||||
|
|
@ -47,29 +52,47 @@ export interface BrowserDetails {
|
|||
export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
|
||||
name: "browser",
|
||||
label: "Browser",
|
||||
description:
|
||||
"Control a headless browser. Actions: navigate (url), click (x,y), type (text, optional selector), scroll (dy), back, screenshot, text (get page text), eval (run JS).",
|
||||
description: `Control a browser to navigate, click, type, and read web pages.
|
||||
|
||||
WORKFLOW: Always follow this pattern:
|
||||
1. navigate to a URL - returns page elements list
|
||||
2. READ the numbered elements list to find what you need
|
||||
3. click by index number OR type into an element by index
|
||||
4. After each action you get an updated elements list
|
||||
|
||||
Actions:
|
||||
- navigate: open a URL (provide "url")
|
||||
- click: click element by "index" (preferred) or by "x","y" coordinates
|
||||
- type: type "text" into element by "index", or into currently focused element
|
||||
- keypress: press a key like "Enter", "Tab", "Escape" (provide "key")
|
||||
- scroll: scroll page (provide "dy", positive=down negative=up)
|
||||
- back: go back in history
|
||||
- screenshot: get fresh screenshot and elements list
|
||||
- elements: get just the interactive elements list
|
||||
- text: get all visible text content of the page
|
||||
- eval: run JavaScript on the page (provide "script")
|
||||
|
||||
EXAMPLE - Search Google:
|
||||
1. browser({action:"navigate", url:"google.com"}) → see elements, find input[text] "Search" at index 5
|
||||
2. browser({action:"type", index:5, text:"my search query"}) → typed into search box
|
||||
3. browser({action:"keypress", key:"Enter"}) → submitted search
|
||||
4. Read results from elements list, click a link by index`,
|
||||
parameters: browserSchema,
|
||||
async execute(toolCallId, params, signal) {
|
||||
const { action, ...rest } = params;
|
||||
const endpoint =
|
||||
action === "navigate"
|
||||
? "/api/browser/navigate"
|
||||
: action === "click"
|
||||
? "/api/browser/click"
|
||||
: action === "type"
|
||||
? "/api/browser/type"
|
||||
: action === "scroll"
|
||||
? "/api/browser/scroll"
|
||||
: action === "back"
|
||||
? "/api/browser/back"
|
||||
: action === "screenshot"
|
||||
? "/api/browser/screenshot"
|
||||
: action === "text"
|
||||
? "/api/browser/text"
|
||||
: action === "eval"
|
||||
? "/api/browser/eval"
|
||||
: null;
|
||||
const endpointMap: Record<string, string> = {
|
||||
navigate: "/api/browser/navigate",
|
||||
click: "/api/browser/click",
|
||||
type: "/api/browser/type",
|
||||
scroll: "/api/browser/scroll",
|
||||
back: "/api/browser/back",
|
||||
screenshot: "/api/browser/screenshot",
|
||||
text: "/api/browser/text",
|
||||
eval: "/api/browser/eval",
|
||||
elements: "/api/browser/elements",
|
||||
keypress: "/api/browser/keypress",
|
||||
};
|
||||
const endpoint = endpointMap[action];
|
||||
if (!endpoint) {
|
||||
return {
|
||||
content: [{ type: "text" as const, text: "Unknown action: " + action }],
|
||||
|
|
@ -90,15 +113,17 @@ export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
|
|||
details: { action, error: data.error },
|
||||
};
|
||||
}
|
||||
// Build text response for LLM
|
||||
// Build text response for LLM — elements list is the key info
|
||||
const textParts: string[] = [];
|
||||
if (data.url) textParts.push("URL: " + data.url);
|
||||
if (data.title) textParts.push("Title: " + data.title);
|
||||
if (data.text) textParts.push("Page text:\n" + data.text);
|
||||
// Always include elements text when available — this is how the model "sees" the page
|
||||
if (data.elementsText) textParts.push("\n" + data.elementsText);
|
||||
if (data.text) textParts.push("\nPage text (truncated):\n" + data.text.slice(0, 3000));
|
||||
if (data.evalResult) textParts.push("Eval result: " + data.evalResult);
|
||||
if (data.screenshot) textParts.push("[Screenshot captured]");
|
||||
if (data.screenshot) textParts.push("\n[Screenshot captured and displayed in browser panel]");
|
||||
if (textParts.length === 0) textParts.push("Action completed.");
|
||||
// Include screenshot as image content if available
|
||||
// Include screenshot as image content if available (for vision models)
|
||||
const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }];
|
||||
if (data.screenshot) {
|
||||
content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot });
|
||||
|
|
@ -111,6 +136,7 @@ export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
|
|||
title: data.title,
|
||||
screenshot: data.screenshot,
|
||||
text: data.text,
|
||||
elementsText: data.elementsText,
|
||||
evalResult: data.evalResult,
|
||||
},
|
||||
};
|
||||
|
|
@ -142,6 +168,16 @@ class BrowserToolRenderer implements ToolRenderer<any, BrowserDetails> {
|
|||
isCustom: false,
|
||||
};
|
||||
}
|
||||
if (result?.details?.elementsText) {
|
||||
return {
|
||||
content: html`
|
||||
<div class="flex flex-col gap-2">
|
||||
${renderHeader(state, Globe, label)}
|
||||
<pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.elementsText}</pre>
|
||||
</div>`,
|
||||
isCustom: false,
|
||||
};
|
||||
}
|
||||
if (result?.details?.text) {
|
||||
return {
|
||||
content: html`
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue