diff --git a/packages/web-ui/example/server/tool-server.mjs b/packages/web-ui/example/server/tool-server.mjs index a1ff6bb..646b5c4 100644 --- a/packages/web-ui/example/server/tool-server.mjs +++ b/packages/web-ui/example/server/tool-server.mjs @@ -29,7 +29,7 @@ function parseBody(req) { let browser = null; let context = null; let page = null; -const browserPanelClients = new Set(); // WS clients watching the browser +const browserPanelClients = new Set(); async function launchBrowser() { if (!browser) { @@ -52,10 +52,120 @@ async function getPage() { return page; } +// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ── +async function extractElements(p) { + try { + const elements = await p.evaluate(() => { + const items = []; + const seen = new Set(); + // Selectors for all interactive elements + const selectors = [ + 'a[href]', + 'button', + 'input', + 'textarea', + 'select', + '[role="button"]', + '[role="link"]', + '[role="tab"]', + '[role="menuitem"]', + '[onclick]', + '[contenteditable="true"]', + 'summary', + 'details', + 'label[for]', + ]; + const allEls = document.querySelectorAll(selectors.join(',')); + for (const el of allEls) { + const rect = el.getBoundingClientRect(); + // Skip invisible, off-screen, or tiny elements + if (rect.width < 5 || rect.height < 5) continue; + if (rect.top > window.innerHeight || rect.bottom < 0) continue; + if (rect.left > window.innerWidth || rect.right < 0) continue; + if (window.getComputedStyle(el).visibility === 'hidden') continue; + if (window.getComputedStyle(el).display === 'none') continue; + if (parseFloat(window.getComputedStyle(el).opacity) < 0.1) continue; + + const tag = el.tagName.toLowerCase(); + const type = el.getAttribute('type') || ''; + const role = el.getAttribute('role') || ''; + const href = el.getAttribute('href') || ''; + const placeholder = el.getAttribute('placeholder') || ''; + const ariaLabel = el.getAttribute('aria-label') || ''; + const title = el.getAttribute('title') 
|| ''; + const name = el.getAttribute('name') || ''; + const value = el.value || ''; + + // Build a human-readable label + let label = (el.innerText || '').trim().slice(0, 80); + if (!label) label = ariaLabel || title || placeholder || name || ''; + if (!label && tag === 'img') label = el.getAttribute('alt') || 'image'; + if (!label) label = `(${tag}${type ? ' type=' + type : ''})`; + + // Centre coordinates + const cx = Math.round(rect.left + rect.width / 2); + const cy = Math.round(rect.top + rect.height / 2); + + // De-duplicate by position (within 5px) + const key = `${Math.round(cx/5)*5},${Math.round(cy/5)*5}`; + if (seen.has(key)) continue; + seen.add(key); + + let kind = tag; + if (tag === 'a') kind = 'link'; + if (tag === 'button' || role === 'button') kind = 'button'; + if (tag === 'input') kind = 'input' + (type ? `[${type}]` : ''); + if (tag === 'textarea') kind = 'textarea'; + if (tag === 'select') kind = 'select'; + + const item = { index: items.length + 1, kind, label, x: cx, y: cy }; + if (tag === 'input' || tag === 'textarea') { + item.value = value.slice(0, 100); + if (placeholder) item.placeholder = placeholder; + } + if (tag === 'a' && href) { + item.href = href.slice(0, 120); + } + items.push(item); + } + return items; + }); + return elements; + } catch (e) { + console.error('[tool-server] Element extraction error:', e.message); + return []; + } +} + +// Format elements as readable text for the LLM +function formatElements(elements) { + if (!elements || elements.length === 0) return 'No interactive elements found on page.'; + const lines = ['Interactive elements on page:']; + for (const el of elements) { + let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`; + if (el.value) line += ` value="${el.value}"`; + if (el.placeholder) line += ` placeholder="${el.placeholder}"`; + if (el.href) line += ` → ${el.href}`; + lines.push(line); + } + lines.push(''); + lines.push('To click an element, use action "click" with its x,y 
coordinates.'); + lines.push('To type into a focused input, use action "type" with text.'); + return lines.join('\n'); +} + +// Enhanced snap: screenshot + element list async function snap() { const p = await getPage(); const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false }); - return { screenshot: buf.toString('base64'), url: p.url(), title: await p.title() }; + const elements = await extractElements(p); + return { + screenshot: buf.toString('base64'), + url: p.url(), + title: await p.title(), + elements, + elementsText: formatElements(elements), + }; } // Broadcast screenshot to all connected browser panel WebSocket clients @@ -104,13 +214,21 @@ async function handleNavigate(body) { const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl; await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' }); const result = await snap(); - broadcastScreenshot(); // sync panel + broadcastScreenshot(); return result; } async function handleClick(body) { const p = await getPage(); - await p.mouse.click(body.x || 0, body.y || 0); + // Support clicking by element index + if (body.index && !body.x && !body.y) { + const elements = await extractElements(p); + const el = elements.find(e => e.index === body.index); + if (!el) return { error: `Element [${body.index}] not found. 
Use action "screenshot" to refresh element list.` }; + await p.mouse.click(el.x, el.y); + } else { + await p.mouse.click(body.x || 0, body.y || 0); + } await p.waitForTimeout(500); const result = await snap(); broadcastScreenshot(); @@ -119,8 +237,24 @@ async function handleClick(body) { async function handleType(body) { const p = await getPage(); - if (body.selector) await p.fill(body.selector, body.text || ''); - else await p.keyboard.type(body.text || ''); + if (body.selector) { + await p.fill(body.selector, body.text || ''); + } else if (body.index) { + // Click the element first, then type + const elements = await extractElements(p); + const el = elements.find(e => e.index === body.index); + if (el) { + await p.mouse.click(el.x, el.y); + await p.waitForTimeout(200); + // Clear existing content and type new text + await p.keyboard.press('Control+a'); + await p.keyboard.type(body.text || ''); + } else { + return { error: `Element [${body.index}] not found.` }; + } + } else { + await p.keyboard.type(body.text || ''); + } await p.waitForTimeout(300); const result = await snap(); broadcastScreenshot(); @@ -163,7 +297,8 @@ async function handleReload() { async function handleText() { const p = await getPage(); const text = await p.evaluate(() => document.body.innerText); - return { url: p.url(), title: await p.title(), text: text.slice(0, 8000) }; + const elements = await extractElements(p); + return { url: p.url(), title: await p.title(), text: text.slice(0, 8000), elements, elementsText: formatElements(elements) }; } async function handleEval(body) { @@ -174,6 +309,22 @@ async function handleEval(body) { return { ...ss, evalResult: String(result) }; } +async function handleElements() { + const p = await getPage(); + const elements = await extractElements(p); + return { url: p.url(), title: await p.title(), elements, elementsText: formatElements(elements) }; +} + +// Press a specific key (Enter, Tab, Escape, etc.) 
/**
 * Press a single key on the active page and report the resulting page state.
 * Uses `body.key` when it is truthy and falls back to 'Enter' otherwise.
 * After the key press it waits 300 ms, takes a snap, triggers a broadcast to
 * the browser panel and returns the snap.
 */
async function handleKeypress(body) {
  const activePage = await getPage();
  const keyName = body.key ? body.key : 'Enter';
  await activePage.keyboard.press(keyName);
  await activePage.waitForTimeout(300);
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
/** Base URL of the tool server; a page can override it via `window.__JAE_TOOL_SERVER__`. */
const TOOL_SERVER =
  typeof window !== "undefined"
    ? (window as any).__JAE_TOOL_SERVER__ || "http://localhost:7700"
    : "http://localhost:7700";

/** Parameter schema for the browser tool; every field except `action` is optional. */
const browserSchema = Type.Object({
  action: Type.Union(
    [
      Type.Literal("navigate"),
      Type.Literal("click"),
      Type.Literal("type"),
      Type.Literal("scroll"),
      Type.Literal("back"),
      Type.Literal("screenshot"),
      Type.Literal("text"),
      Type.Literal("eval"),
      Type.Literal("elements"),
      Type.Literal("keypress"),
    ],
    { description: "Browser action to perform" },
  ),
  url: Type.Optional(Type.String({ description: "URL to navigate to" })),
  index: Type.Optional(Type.Number({ description: "Element index number from the elements list to click or type into" })),
  x: Type.Optional(Type.Number({ description: "Click X coordinate (use index instead when possible)" })),
  y: Type.Optional(Type.Number({ description: "Click Y coordinate (use index instead when possible)" })),
  text: Type.Optional(Type.String({ description: "Text to type into focused element or element by index" })),
  selector: Type.Optional(Type.String({ description: "CSS selector to type into (prefer index instead)" })),
  dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels (positive=down, negative=up)" })),
  script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })),
  key: Type.Optional(Type.String({ description: "Key to press: Enter, Tab, Escape, Backspace, etc." })),
});

/** Structured result attached to every browser tool call. */
export interface BrowserDetails {
  action: string;
  url?: string;
  title?: string;
  screenshot?: string;
  text?: string;
  elementsText?: string;
  evalResult?: string;
  error?: string;
}

/** Fields this tool reads from the tool server's JSON reply. */
interface BrowserServerResponse {
  url?: string;
  title?: string;
  screenshot?: string;
  text?: string;
  elementsText?: string;
  evalResult?: string;
  error?: string;
}

/**
 * Action name → tool-server route. A Map (rather than a plain object) so that
 * inherited keys such as "constructor" can never resolve to a bogus endpoint.
 */
const BROWSER_ENDPOINTS: ReadonlyMap<string, string> = new Map([
  ["navigate", "/api/browser/navigate"],
  ["click", "/api/browser/click"],
  ["type", "/api/browser/type"],
  ["scroll", "/api/browser/scroll"],
  ["back", "/api/browser/back"],
  ["screenshot", "/api/browser/screenshot"],
  ["text", "/api/browser/text"],
  ["eval", "/api/browser/eval"],
  ["elements", "/api/browser/elements"],
  ["keypress", "/api/browser/keypress"],
]);

/**
 * Agent tool that forwards browser actions to the tool server over HTTP and
 * turns the reply into text (plus an image part when a screenshot is present).
 * Failures are reported as a text result with `details.error`; `execute` does
 * not throw for network or server errors.
 */
// NOTE(review): any type arguments on AgentTool are not visible in this view — confirm against the original file.
export const browserTool: AgentTool = {
  name: "browser",
  label: "Browser",
  description: `Control a browser to navigate, click, type, and read web pages.

WORKFLOW: Always follow this pattern:
1. navigate to a URL - returns page elements list
2. READ the numbered elements list to find what you need
3. click by index number OR type into an element by index
4. After each action you get an updated elements list

Actions:
- navigate: open a URL (provide "url")
- click: click element by "index" (preferred) or by "x","y" coordinates
- type: type "text" into element by "index", or into currently focused element
- keypress: press a key like "Enter", "Tab", "Escape" (provide "key")
- scroll: scroll page (provide "dy", positive=down negative=up)
- back: go back in history
- screenshot: get fresh screenshot and elements list
- elements: get just the interactive elements list
- text: get all visible text content of the page
- eval: run JavaScript on the page (provide "script")

EXAMPLE - Search Google:
1. browser({action:"navigate", url:"google.com"}) → see elements, find input[text] "Search" at index 5
2. browser({action:"type", index:5, text:"my search query"}) → typed into search box
3. browser({action:"keypress", key:"Enter"}) → submitted search
4. Read results from elements list, click a link by index`,
  parameters: browserSchema,
  async execute(toolCallId, params, signal) {
    const { action, ...rest } = params;
    const endpoint = BROWSER_ENDPOINTS.get(action);
    if (!endpoint) {
      return {
        content: [{ type: "text" as const, text: "Unknown action: " + action }],
        details: { action, error: "Unknown action" },
      };
    }
    try {
      const res = await fetch(TOOL_SERVER + endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(rest),
        signal,
      });
      const data = (await res.json()) as BrowserServerResponse;
      // A non-2xx reply without an explicit error field is still a failure.
      const serverError = data.error || (res.ok ? "" : "HTTP " + res.status);
      if (serverError) {
        return {
          content: [{ type: "text" as const, text: "Browser error: " + serverError }],
          details: { action, error: serverError },
        };
      }
      // Build text response for LLM — elements list is the key info
      const textParts: string[] = [];
      if (data.url) textParts.push("URL: " + data.url);
      if (data.title) textParts.push("Title: " + data.title);
      // Always include elements text when available — this is how the model "sees" the page
      if (data.elementsText) textParts.push("\n" + data.elementsText);
      if (data.text) textParts.push("\nPage text (truncated):\n" + data.text.slice(0, 3000));
      if (data.evalResult) textParts.push("Eval result: " + data.evalResult);
      if (data.screenshot) textParts.push("\n[Screenshot captured and displayed in browser panel]");
      if (textParts.length === 0) textParts.push("Action completed.");
      // Include screenshot as image content if available (for vision models)
      const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }];
      if (data.screenshot) {
        content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot });
      }
      return {
        content,
        details: {
          action,
          url: data.url,
          title: data.title,
          screenshot: data.screenshot,
          text: data.text,
          elementsText: data.elementsText,
          evalResult: data.evalResult,
        },
      };
    } catch (err: unknown) {
      // Network failure, abort, or a reply that is not valid JSON.
      const message = err instanceof Error ? err.message : String(err);
      return {
        content: [{ type: "text" as const, text: "Browser tool error: " + message }],
        details: { action, error: message },
      };
    }
  },
};
"error" : "complete") : "inprogress"; + const action = result?.details?.action || params?.action || "..."; + const url = result?.details?.url || params?.url || ""; + const label = url ? action + ": " + url : action; + if (result?.details?.screenshot) { + return { + content: html`
${renderHeader(state, Globe, label)} Browser screenshot + class="rounded border border-border max-w-full" style="max-height:400px" + alt="Browser screenshot" /> ${result.details.title ? html`${result.details.title}` : html``}
`, - isCustom: false, - }; - } - if (result?.details?.text) { - return { - content: html` + isCustom: false, + }; + } + if (result?.details?.elementsText) { + return { + content: html` +
+ ${renderHeader(state, Globe, label)} +
${result.details.elementsText}
+
`, + isCustom: false, + }; + } + if (result?.details?.text) { + return { + content: html`
${renderHeader(state, Globe, label)}
${result.details.text}
`, - isCustom: false, - }; - } - return { content: renderHeader(state, Globe, label), isCustom: false }; - } + isCustom: false, + }; + } + return { content: renderHeader(state, Globe, label), isCustom: false }; + } } registerToolRenderer("browser", new BrowserToolRenderer()); export function createBrowserTool(): AgentTool { - return browserTool; + return browserTool; }