feat: browser use - element extraction + index-based clicking for text models

- tool-server.mjs: extractElements() scrapes all visible interactive elements with their coordinates
- tool-server.mjs: formatElements() returns a numbered list for the LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index and key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so the model can see the page
- browser-tool.ts: detailed workflow instructions in the tool description
- Enables text-only models (Llama 3.3 etc.) to navigate and interact with web pages
2026-03-27 23:17:24 +00:00 · 2026-03-27 23:17:24 +00:00 · a2227c7659
commit a2227c7659
parent db79dec9e1
2 changed files with 327 additions and 139 deletions
--- a/packages/web-ui/example/server/tool-server.mjs
+++ b/packages/web-ui/example/server/tool-server.mjs
@ -29,7 +29,7 @@ function parseBody(req) {
 let browser = null;
 let context = null;
 let page = null;
-const browserPanelClients = new Set();  // WS clients watching the browser
+const browserPanelClients = new Set();

 async function launchBrowser() {
  if (!browser) {
@ -52,10 +52,120 @@ async function getPage() {
  return page;
 }

+// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
+/**
+ * Collect the interactive elements currently visible in the page viewport.
+ *
+ * The scan runs inside the page through `p.evaluate`. Elements smaller than
+ * 5px, outside the viewport, hidden, or nearly transparent are skipped, and
+ * elements whose centres fall into the same 5px grid cell are reported once.
+ *
+ * @param p Page-like object exposing `evaluate(fn)`.
+ * @returns Array of `{ index, kind, label, x, y }` items (1-based `index`;
+ *   inputs/textareas add `value`/`placeholder`, links add `href`), or `[]`
+ *   when the evaluation throws.
+ */
+async function extractElements(p) {
+  try {
+    // `await` inside the try so a rejected evaluation is caught below.
+    return await p.evaluate(() => {
+      const items = [];
+      const seen = new Set();
+      // Selectors for all interactive elements
+      const selectors = [
+        'a[href]',
+        'button',
+        'input',
+        'textarea',
+        'select',
+        '[role="button"]',
+        '[role="link"]',
+        '[role="tab"]',
+        '[role="menuitem"]',
+        '[onclick]',
+        '[contenteditable="true"]',
+        'summary',
+        'details',
+        'label[for]',
+      ];
+      for (const el of document.querySelectorAll(selectors.join(','))) {
+        const rect = el.getBoundingClientRect();
+        // Skip tiny or off-screen elements
+        if (rect.width < 5 || rect.height < 5) continue;
+        if (rect.top > window.innerHeight || rect.bottom < 0) continue;
+        if (rect.left > window.innerWidth || rect.right < 0) continue;
+        // Resolve the computed style once instead of once per property.
+        const style = window.getComputedStyle(el);
+        if (style.visibility === 'hidden' || style.display === 'none') continue;
+        if (parseFloat(style.opacity) < 0.1) continue;
+
+        const tag = el.tagName.toLowerCase();
+        const type = el.getAttribute('type') || '';
+        const role = el.getAttribute('role') || '';
+        const href = el.getAttribute('href') || '';
+        const placeholder = el.getAttribute('placeholder') || '';
+        const ariaLabel = el.getAttribute('aria-label') || '';
+        const title = el.getAttribute('title') || '';
+        const name = el.getAttribute('name') || '';
+
+        // Build a single-line, human-readable label; whitespace is collapsed
+        // so a label never spans several lines.
+        let label = (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, 80);
+        if (!label) label = ariaLabel || title || placeholder || name || '';
+        if (!label) {
+          // Icon-only links/buttons: fall back to the alt text of a nested image.
+          const img = el.querySelector('img[alt]');
+          if (img) label = (img.getAttribute('alt') || '').trim().slice(0, 80);
+        }
+        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;
+
+        // Centre coordinates
+        const cx = Math.round(rect.left + rect.width / 2);
+        const cy = Math.round(rect.top + rect.height / 2);
+
+        // De-duplicate by position (within 5px)
+        const key = `${Math.round(cx/5)*5},${Math.round(cy/5)*5}`;
+        if (seen.has(key)) continue;
+        seen.add(key);
+
+        let kind = tag;
+        if (tag === 'a') kind = 'link';
+        if (tag === 'button' || role === 'button') kind = 'button';
+        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
+        if (tag === 'textarea') kind = 'textarea';
+        if (tag === 'select') kind = 'select';
+
+        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
+        if (tag === 'input' || tag === 'textarea') {
+          item.value = String(el.value || '').slice(0, 100);
+          if (placeholder) item.placeholder = placeholder;
+        }
+        if (tag === 'a' && href) {
+          item.href = href.slice(0, 120);
+        }
+        items.push(item);
+      }
+      return items;
+    });
+  } catch (e) {
+    console.error('[tool-server] Element extraction error:', e.message);
+    return [];
+  }
+}
+
+/**
+ * Render extracted elements as a numbered, line-per-element text list for the LLM.
+ *
+ * Each line has the form `[index] kind "label" at (x, y)`, followed by the
+ * optional `value`, `placeholder` and link target when present. The trailing
+ * hints point the model at index-based interaction, with coordinates as the
+ * fallback.
+ *
+ * @param elements Array of element items (`index`, `kind`, `label`, `x`, `y`,
+ *   optional `value`, `placeholder`, `href`); may be empty or nullish.
+ * @returns The formatted list, or a fixed "no elements" message.
+ */
+function formatElements(elements) {
+  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
+  const lines = ['Interactive elements on page:'];
+  for (const el of elements) {
+    let line = `  [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
+    if (el.value) line += ` value="${el.value}"`;
+    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
+    if (el.href) line += ` → ${el.href}`;
+    lines.push(line);
+  }
+  lines.push('');
+  lines.push('To click an element, use action "click" with its index (or its x,y coordinates).');
+  lines.push('To type into an input, use action "type" with its index and text.');
+  return lines.join('\n');
+}
+
+// Enhanced snap: screenshot + element list.
+// Captures the current viewport (not the full page) as a quality-70 JPEG and
+// runs extractElements() on the same page. Returns the base64 image, the URL,
+// the title, the raw element array, and its formatted text form.
 async function snap() {
  const p = await getPage();
  const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
-  return { screenshot: buf.toString('base64'), url: p.url(), title: await p.title() };
+  const elements = await extractElements(p);
+  return {
+    screenshot: buf.toString('base64'),
+    url: p.url(),
+    title: await p.title(),
+    elements,
+    elementsText: formatElements(elements),
+  };
 }

 // Broadcast screenshot to all connected browser panel WebSocket clients
@ -104,13 +214,21 @@ async function handleNavigate(body) {
  const target = targetUrl.startsWith('http') ? targetUrl : 'https://' + targetUrl;
  await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
  const result = await snap();
-  broadcastScreenshot();  // sync panel
+  broadcastScreenshot();
  return result;
 }

 async function handleClick(body) {
  const p = await getPage();
+  // Support clicking by element index
+  if (body.index && !body.x && !body.y) {
+    const elements = await extractElements(p);
+    const el = elements.find(e => e.index === body.index);
+    if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
+    await p.mouse.click(el.x, el.y);
+  } else {
    await p.mouse.click(body.x || 0, body.y || 0);
+  }
  await p.waitForTimeout(500);
  const result = await snap();
  broadcastScreenshot();
@ -119,8 +237,24 @@ async function handleClick(body) {

 async function handleType(body) {
  const p = await getPage();
-  if (body.selector) await p.fill(body.selector, body.text || '');
-  else await p.keyboard.type(body.text || '');
+  if (body.selector) {
+    await p.fill(body.selector, body.text || '');
+  } else if (body.index) {
+    // Click the element first, then type
+    const elements = await extractElements(p);
+    const el = elements.find(e => e.index === body.index);
+    if (el) {
+      await p.mouse.click(el.x, el.y);
+      await p.waitForTimeout(200);
+      // Clear existing content and type new text
+      await p.keyboard.press('Control+a');
+      await p.keyboard.type(body.text || '');
+    } else {
+      return { error: `Element [${body.index}] not found.` };
+    }
+  } else {
+    await p.keyboard.type(body.text || '');
+  }
  await p.waitForTimeout(300);
  const result = await snap();
  broadcastScreenshot();
@ -163,7 +297,8 @@ async function handleReload() {
 async function handleText() {
  const p = await getPage();
  const text = await p.evaluate(() => document.body.innerText);
-  return { url: p.url(), title: await p.title(), text: text.slice(0, 8000) };
+  // Return the interactive-element list alongside the page text, which is
+  // capped at 8000 characters.
+  const elements = await extractElements(p);
+  return { url: p.url(), title: await p.title(), text: text.slice(0, 8000), elements, elementsText: formatElements(elements) };
 }

 async function handleEval(body) {
@ -174,6 +309,22 @@ async function handleEval(body) {
  return { ...ss, evalResult: String(result) };
 }

+/**
+ * Return only the interactive-element list for the current page (no screenshot).
+ *
+ * @returns `{ url, title, elements, elementsText }` for the active page.
+ */
+async function handleElements() {
+  const activePage = await getPage();
+  const found = await extractElements(activePage);
+  const url = activePage.url();
+  const title = await activePage.title();
+  const elementsText = formatElements(found);
+  return { url, title, elements: found, elementsText };
+}
+
+/**
+ * Press a specific key (Enter, Tab, Escape, etc.) on the active page.
+ *
+ * Falls back to 'Enter' when `body.key` is missing or empty, waits 300 ms for
+ * the page to react, then returns a fresh snapshot and pushes it to the
+ * browser panel.
+ *
+ * @param body Request body; reads `body.key`.
+ * @returns The result of `snap()` taken after the key press.
+ */
+async function handleKeypress(body) {
+  const activePage = await getPage();
+  const keyName = body.key || 'Enter';
+  await activePage.keyboard.press(keyName);
+  await activePage.waitForTimeout(300);
+  const snapshot = await snap();
+  broadcastScreenshot();
+  return snapshot;
+}
+
 // ── HTTP ROUTES ───────────────────────────────────────────────
 const routes = {
  '/api/bash': handleBash,
@ -184,9 +335,11 @@ const routes = {
  '/api/browser/back': handleBack,
  '/api/browser/forward': handleForward,
  '/api/browser/reload': handleReload,
-  '/api/browser/screenshot': () => { const r = snap(); broadcastScreenshot(); return r; },
+  '/api/browser/screenshot': async () => { const r = await snap(); broadcastScreenshot(); return r; },
  '/api/browser/text': handleText,
  '/api/browser/eval': handleEval,
+  '/api/browser/elements': handleElements,
+  '/api/browser/keypress': handleKeypress,
 };

 // ── HTTP SERVER ───────────────────────────────────────────────
@ -239,7 +392,7 @@ browserWss.on('connection', async (ws) => {
  ws.on('close', () => browserPanelClients.delete(ws));
  ws.on('error', () => browserPanelClients.delete(ws));

-  // Handle panel user interactions (navigate, click, scroll, etc.)
+  // Handle panel user interactions
  ws.on('message', async (msg) => {
    try {
      const m = JSON.parse(msg.toString());
@ -291,14 +444,13 @@ browserWss.on('connection', async (ws) => {
    }
  });

-  // Send initial ready + screenshot if browser exists
  ws.send(JSON.stringify({ type: 'ready' }));
  if (page && !page.isClosed()) {
    try { await broadcastScreenshot(); } catch {}
  }
 });

-// ── UPGRADE HANDLER (route WS by path) ────────────────────────
+// ── UPGRADE HANDLER ───────────────────────────────────────────
 server.on('upgrade', (req, socket, head) => {
  const pathname = url.parse(req.url).pathname;
  if (pathname === '/ws/terminal') {
--- a/packages/web-ui/src/tools/browser-tool.ts
+++ b/packages/web-ui/src/tools/browser-tool.ts
@ -22,16 +22,20 @@ const browserSchema = Type.Object({
      Type.Literal("screenshot"),
      Type.Literal("text"),
      Type.Literal("eval"),
+      Type.Literal("elements"),
+      Type.Literal("keypress"),
    ],
    { description: "Browser action to perform" },
  ),
  url: Type.Optional(Type.String({ description: "URL to navigate to" })),
-	x: Type.Optional(Type.Number({ description: "Click X coordinate" })),
-	y: Type.Optional(Type.Number({ description: "Click Y coordinate" })),
-	text: Type.Optional(Type.String({ description: "Text to type" })),
-	selector: Type.Optional(Type.String({ description: "CSS selector to type into" })),
-	dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels" })),
+  index: Type.Optional(Type.Number({ description: "Element index number from the elements list to click or type into" })),
+  x: Type.Optional(Type.Number({ description: "Click X coordinate (use index instead when possible)" })),
+  y: Type.Optional(Type.Number({ description: "Click Y coordinate (use index instead when possible)" })),
+  text: Type.Optional(Type.String({ description: "Text to type into focused element or element by index" })),
+  selector: Type.Optional(Type.String({ description: "CSS selector to type into (prefer index instead)" })),
+  dy: Type.Optional(Type.Number({ description: "Scroll delta Y pixels (positive=down, negative=up)" })),
  script: Type.Optional(Type.String({ description: "JavaScript to evaluate in page" })),
+  key: Type.Optional(Type.String({ description: "Key to press: Enter, Tab, Escape, Backspace, etc." })),
 });

 export interface BrowserDetails {
@ -40,6 +44,7 @@ export interface BrowserDetails {
  title?: string;
  screenshot?: string;
  text?: string;
+  elementsText?: string;
  evalResult?: string;
  error?: string;
 }
@ -47,29 +52,47 @@ export interface BrowserDetails {
 export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
  name: "browser",
  label: "Browser",
-	description:
-		"Control a headless browser. Actions: navigate (url), click (x,y), type (text, optional selector), scroll (dy), back, screenshot, text (get page text), eval (run JS).",
+  description: `Control a browser to navigate, click, type, and read web pages.
+
+WORKFLOW: Always follow this pattern:
+1. navigate to a URL - returns page elements list
+2. READ the numbered elements list to find what you need
+3. click by index number OR type into an element by index
+4. After each action you get an updated elements list
+
+Actions:
+- navigate: open a URL (provide "url")
+- click: click element by "index" (preferred) or by "x","y" coordinates
+- type: type "text" into element by "index", or into currently focused element
+- keypress: press a key like "Enter", "Tab", "Escape" (provide "key")
+- scroll: scroll page (provide "dy", positive=down negative=up)
+- back: go back in history
+- screenshot: get fresh screenshot and elements list
+- elements: get just the interactive elements list
+- text: get all visible text content of the page
+- eval: run JavaScript on the page (provide "script")
+
+EXAMPLE - Search Google:
+1. browser({action:"navigate", url:"google.com"}) → see elements, find input[text] "Search" at index 5
+2. browser({action:"type", index:5, text:"my search query"}) → typed into search box
+3. browser({action:"keypress", key:"Enter"}) → submitted search
+4. Read results from elements list, click a link by index`,
  parameters: browserSchema,
  async execute(toolCallId, params, signal) {
    const { action, ...rest } = params;
-		const endpoint =
-			action === "navigate"
-				? "/api/browser/navigate"
-				: action === "click"
-					? "/api/browser/click"
-					: action === "type"
-						? "/api/browser/type"
-						: action === "scroll"
-							? "/api/browser/scroll"
-							: action === "back"
-								? "/api/browser/back"
-								: action === "screenshot"
-									? "/api/browser/screenshot"
-									: action === "text"
-										? "/api/browser/text"
-										: action === "eval"
-											? "/api/browser/eval"
-											: null;
+    const endpointMap: Record<string, string> = {
+      navigate: "/api/browser/navigate",
+      click: "/api/browser/click",
+      type: "/api/browser/type",
+      scroll: "/api/browser/scroll",
+      back: "/api/browser/back",
+      screenshot: "/api/browser/screenshot",
+      text: "/api/browser/text",
+      eval: "/api/browser/eval",
+      elements: "/api/browser/elements",
+      keypress: "/api/browser/keypress",
+    };
+    const endpoint = endpointMap[action];
    if (!endpoint) {
      return {
        content: [{ type: "text" as const, text: "Unknown action: " + action }],
@ -90,15 +113,17 @@ export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
          details: { action, error: data.error },
        };
      }
-			// Build text response for LLM
+      // Build text response for LLM — elements list is the key info
      const textParts: string[] = [];
      if (data.url) textParts.push("URL: " + data.url);
      if (data.title) textParts.push("Title: " + data.title);
-			if (data.text) textParts.push("Page text:\n" + data.text);
+      // Always include elements text when available — this is how the model "sees" the page
+      if (data.elementsText) textParts.push("\n" + data.elementsText);
+      if (data.text) textParts.push("\nPage text (truncated):\n" + data.text.slice(0, 3000));
      if (data.evalResult) textParts.push("Eval result: " + data.evalResult);
-			if (data.screenshot) textParts.push("[Screenshot captured]");
+      if (data.screenshot) textParts.push("\n[Screenshot captured and displayed in browser panel]");
      if (textParts.length === 0) textParts.push("Action completed.");
-			// Include screenshot as image content if available
+      // Include screenshot as image content if available (for vision models)
      const content: any[] = [{ type: "text" as const, text: textParts.join("\n") }];
      if (data.screenshot) {
        content.push({ type: "image" as const, mimeType: "image/jpeg", data: data.screenshot });
@ -111,6 +136,7 @@ export const browserTool: AgentTool<typeof browserSchema, BrowserDetails> = {
          title: data.title,
          screenshot: data.screenshot,
          text: data.text,
+          elementsText: data.elementsText,
          evalResult: data.evalResult,
        },
      };
@ -142,6 +168,16 @@ class BrowserToolRenderer implements ToolRenderer<any, BrowserDetails> {
        isCustom: false,
      };
    }
+    if (result?.details?.elementsText) {
+      return {
+        content: html`
+          <div class="flex flex-col gap-2">
+            ${renderHeader(state, Globe, label)}
+            <pre class="text-xs p-3 rounded border border-border overflow-auto max-h-48 whitespace-pre-wrap">${result.details.elementsText}</pre>
+          </div>`,
+        isCustom: false,
+      };
+    }
    if (result?.details?.text) {
      return {
        content: html`