Agent-JAE/packages/web-ui/example/server/tool-server.mjs
JAE a2227c7659
Some checks failed
CI / build-check-test (push) Has been cancelled
feat: browser use - element extraction + index-based clicking for text models
- tool-server.mjs: extractElements() scrapes all interactive elements with coordinates
- tool-server.mjs: formatElements() returns numbered list for LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index, key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so model can see the page
- browser-tool.ts: detailed workflow instructions in tool description
- Enables text-only models (Llama 3.3 etc) to navigate and interact with web pages
2026-03-27 23:17:24 +00:00

472 lines
17 KiB
JavaScript

import http from 'http';
import { exec } from 'child_process';
import { chromium } from 'playwright';
import { WebSocketServer, WebSocket } from 'ws';
import { spawn } from 'child_process';
import url from 'url';
// Port the unified tool server listens on; override with TOOL_SERVER_PORT.
// The explicit radix makes the value always parse as a decimal number.
const PORT = Number.parseInt(process.env.TOOL_SERVER_PORT || '7700', 10);
// ── CORS ──────────────────────────────────────────────────────
// Headers passed to every res.writeHead() call: permissive CORS plus a JSON content type.
const cors = {
  'Access-Control-Allow-Origin': '*',
  'Access-Control-Allow-Methods': 'POST, GET, OPTIONS',
  'Access-Control-Allow-Headers': 'Content-Type',
  'Content-Type': 'application/json',
};
// Reads the whole request body and parses it as JSON.
// Always resolves (never rejects) with an object: an empty body, invalid JSON,
// a non-object JSON value (null, number, string) or a stream error all yield {}.
function parseBody(req) {
  return new Promise((resolve) => {
    // Keep raw chunks and decode once at the end, so a multi-byte UTF-8
    // character split across two chunks is not corrupted.
    const chunks = [];
    req.on('data', (chunk) => {
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
    });
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      try {
        const parsed = JSON.parse(raw || '{}');
        // Handlers read properties off the body, so never hand them null or a primitive.
        resolve(parsed !== null && typeof parsed === 'object' ? parsed : {});
      } catch {
        resolve({});
      }
    });
    // Without this listener an aborted or failed request would leave the promise pending forever.
    req.on('error', () => resolve({}));
  });
}
// ── SHARED PLAYWRIGHT BROWSER ─────────────────────────────────
// Module-level Playwright state. All three start as null and are assigned
// lazily: `browser` by launchBrowser(), `context` and `page` by getPage().
let browser = null;
let context = null;
let page = null;
// Sockets connected on /ws/browser; broadcastScreenshot() sends to every open one.
const browserPanelClients = new Set();
// Returns the shared Chromium instance, launching it (headless) on first use.
// The launched instance is stored in the module-level `browser` variable.
async function launchBrowser() {
  if (browser) return browser;
  const launchOptions = {
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  };
  browser = await chromium.launch(launchOptions);
  console.log('[tool-server] Playwright browser launched');
  return browser;
}
// Returns the shared page, creating a fresh context and page when there is
// none yet or the previous page has been closed. Any previous context is
// closed first; errors from that close are ignored.
async function getPage() {
  const hasLivePage = page && !page.isClosed();
  if (hasLivePage) return page;
  const sharedBrowser = await launchBrowser();
  if (context) {
    await context.close().catch(() => {});
  }
  const viewport = { width: 1280, height: 800 };
  context = await sharedBrowser.newContext({ viewport });
  page = await context.newPage();
  return page;
}
// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
// Collects the visible interactive elements of page `p` as a numbered list.
// Every entry is { index, kind, label, x, y }, where x/y is the centre of the
// element's bounding rect. Inputs and textareas also carry `value` (plus
// `placeholder` when set); links carry `href`.
// Returns [] (after logging the error) if the in-page evaluation throws.
async function extractElements(p) {
  let collected;
  try {
    // The callback is passed to p.evaluate, so every helper it uses is defined inside it.
    collected = await p.evaluate(() => {
      const INTERACTIVE_SELECTOR = [
        'a[href]',
        'button',
        'input',
        'textarea',
        'select',
        '[role="button"]',
        '[role="link"]',
        '[role="tab"]',
        '[role="menuitem"]',
        '[onclick]',
        '[contenteditable="true"]',
        'summary',
        'details',
        'label[for]',
      ].join(',');

      // Maps tag/type/role to the short kind shown to the model.
      const classify = (tag, type, role) => {
        if (tag === 'input') return type ? `input[${type}]` : 'input';
        if (tag === 'textarea' || tag === 'select') return tag;
        if (tag === 'button' || role === 'button') return 'button';
        if (tag === 'a') return 'link';
        return tag;
      };

      const results = [];
      const takenSpots = new Set();

      for (const node of document.querySelectorAll(INTERACTIVE_SELECTOR)) {
        const box = node.getBoundingClientRect();

        // Skip tiny elements and anything entirely outside the viewport.
        const tooSmall = box.width < 5 || box.height < 5;
        const outsideVertically = box.top > window.innerHeight || box.bottom < 0;
        const outsideHorizontally = box.left > window.innerWidth || box.right < 0;
        if (tooSmall || outsideVertically || outsideHorizontally) continue;

        // Skip elements hidden through CSS.
        const css = window.getComputedStyle(node);
        if (css.visibility === 'hidden') continue;
        if (css.display === 'none') continue;
        if (parseFloat(css.opacity) < 0.1) continue;

        const attr = (attrName) => node.getAttribute(attrName) || '';
        const tag = node.tagName.toLowerCase();
        const type = attr('type');
        const role = attr('role');
        const href = attr('href');
        const placeholder = attr('placeholder');
        const ariaLabel = attr('aria-label');
        const title = attr('title');
        const name = attr('name');
        const currentValue = node.value || '';

        // Human-readable label: visible text first, then descriptive attributes,
        // finally a "(tag type=...)" placeholder.
        const visibleText = (node.innerText || '').trim().slice(0, 80);
        let label = visibleText || ariaLabel || title || placeholder || name;
        if (!label && tag === 'img') label = node.getAttribute('alt') || 'image';
        if (!label) label = type ? `(${tag} type=${type})` : `(${tag})`;

        // Centre of the bounding rect.
        const centreX = Math.round(box.left + box.width / 2);
        const centreY = Math.round(box.top + box.height / 2);

        // De-duplicate elements whose centres round to the same 5px grid cell.
        const spot = `${Math.round(centreX / 5) * 5},${Math.round(centreY / 5) * 5}`;
        if (takenSpots.has(spot)) continue;
        takenSpots.add(spot);

        const entry = {
          index: results.length + 1,
          kind: classify(tag, type, role),
          label,
          x: centreX,
          y: centreY,
        };
        const isTextField = tag === 'input' || tag === 'textarea';
        if (isTextField) {
          entry.value = currentValue.slice(0, 100);
          if (placeholder) entry.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          entry.href = href.slice(0, 120);
        }
        results.push(entry);
      }
      return results;
    });
  } catch (e) {
    console.error('[tool-server] Element extraction error:', e.message);
    return [];
  }
  return collected;
}
// Format elements as readable text for the LLM.
// Takes the array produced by extractElements() and returns one line per
// element, followed by short usage hints. A missing or empty list yields a
// fixed "nothing found" sentence.
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    // Collapse whitespace so a multi-line label cannot break the one-element-per-line layout.
    const label = String(el.label).replace(/\s+/g, ' ').trim();
    let line = ` [${el.index}] ${el.kind} "${label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    // Previously the href was appended with no separator and ran into the preceding text.
    if (el.href) line += ` href="${el.href}"`;
    lines.push(line);
  }
  lines.push('');
  lines.push('To click an element, use action "click" with its x,y coordinates.');
  lines.push('To type into a focused input, use action "type" with text.');
  return lines.join('\n');
}
// Enhanced snap: screenshot + element list
// Captures the current viewport as a base64 JPEG and pairs it with the page
// URL, title, the extracted element list and its text rendering.
async function snap() {
  const activePage = await getPage();
  const jpeg = await activePage.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
  const elements = await extractElements(activePage);
  const currentUrl = activePage.url();
  const title = await activePage.title();
  return {
    screenshot: jpeg.toString('base64'),
    url: currentUrl,
    title,
    elements,
    elementsText: formatElements(elements),
  };
}
// Broadcast screenshot to all connected browser panel WebSocket clients
// Does nothing when no panel is connected. If taking or sending the snapshot
// throws, an { type: 'error' } message is sent to the open clients instead.
async function broadcastScreenshot() {
  if (browserPanelClients.size === 0) return;
  // Sends one serialised message to every client whose socket is OPEN.
  const sendToOpenClients = (payload) => {
    for (const client of browserPanelClients) {
      if (client.readyState === WebSocket.OPEN) client.send(payload);
    }
  };
  try {
    const shot = await snap();
    sendToOpenClients(
      JSON.stringify({ type: 'screenshot', data: shot.screenshot, url: shot.url, title: shot.title }),
    );
  } catch (e) {
    sendToOpenClients(JSON.stringify({ type: 'error', msg: String(e) }));
  }
}
// ── BASH HANDLER ──────────────────────────────────────────────
// Runs `body.command` through child_process.exec and resolves with
// { stdout, stderr, exitCode, output }. `body.timeout` (ms) defaults to 30000.
// A missing command resolves to { error }. The promise resolves even when the
// command fails: exitCode is then error.code, or 1 if that is null/undefined.
async function handleBash(body) {
  const { command, timeout = 30000 } = body;
  if (!command) return { error: 'No command provided' };
  return new Promise((resolve) => {
    const options = {
      timeout,
      maxBuffer: 10 * 1024 * 1024,
      cwd: process.env.WORKSPACE || process.env.HOME || '/root',
      env: { ...process.env, TERM: 'dumb', COLUMNS: '200' },
    };
    const onDone = (error, stdout, stderr) => {
      const out = stdout || '';
      const err = stderr || '';
      let exitCode = 0;
      if (error) exitCode = error.code ?? 1;
      // `output` is stdout with stderr appended under a marker when there is any.
      const output = err ? out + '\nSTDERR: ' + err : out;
      resolve({ stdout: out, stderr: err, exitCode, output });
    };
    exec(command, options, onDone);
  });
}
// ── BROWSER HTTP HANDLERS (used by LLM tool) ─────────────────
// Navigates the shared page to `body.url` and returns a fresh snap().
// "https://" is prepended unless the URL already starts with an explicit
// http:// or https:// scheme. A missing, blank or non-string URL yields { error }.
async function handleNavigate(body) {
  const { url: targetUrl } = body;
  if (typeof targetUrl !== 'string' || targetUrl.trim() === '') return { error: 'No URL' };
  const trimmed = targetUrl.trim();
  // Match the full scheme: a bare startsWith('http') check treats hosts such as
  // "httpbin.org" as already having one and passes them to goto() unprefixed.
  const hasHttpScheme = /^https?:\/\//i.test(trimmed);
  const target = hasHttpScheme ? trimmed : `https://${trimmed}`;
  const p = await getPage();
  await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
  const result = await snap();
  // Not awaited: panel clients are refreshed in the background.
  broadcastScreenshot();
  return result;
}
// Clicks on the shared page and returns a fresh snap().
// With `body.index` (and no x/y) the element list is re-extracted and the
// element with that index is clicked at its centre. Otherwise the click goes
// to (body.x, body.y), each defaulting to 0.
// An unknown index returns { error } without clicking.
async function handleClick(body) {
  const p = await getPage();
  // Support clicking by element index
  if (body.index && !body.x && !body.y) {
    // Coerce to a number: an index sent as a string ("3") would never match under ===.
    const wantedIndex = Number(body.index);
    const elements = await extractElements(p);
    const el = elements.find((e) => e.index === wantedIndex);
    if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
    await p.mouse.click(el.x, el.y);
  } else {
    await p.mouse.click(body.x || 0, body.y || 0);
  }
  // Short pause before capturing the result of the click.
  await p.waitForTimeout(500);
  const result = await snap();
  broadcastScreenshot();
  return result;
}
// Types `body.text` into the shared page and returns a fresh snap().
// Target selection, in priority order:
//   body.selector - fill the matching element via page.fill
//   body.index    - click that extracted element, select all, then type
//   neither       - type into whatever currently has focus
// An unknown index returns { error } without typing.
async function handleType(body) {
  const p = await getPage();
  const text = body.text || '';
  if (body.selector) {
    await p.fill(body.selector, text);
  } else if (body.index) {
    // Coerce to a number: an index sent as a string ("3") would never match under ===.
    const wantedIndex = Number(body.index);
    const elements = await extractElements(p);
    const el = elements.find((e) => e.index === wantedIndex);
    if (!el) return { error: `Element [${body.index}] not found.` };
    // Click the element first, then type
    await p.mouse.click(el.x, el.y);
    await p.waitForTimeout(200);
    // Select the existing content so the typed text replaces it
    await p.keyboard.press('Control+a');
    await p.keyboard.type(text);
  } else {
    await p.keyboard.type(text);
  }
  await p.waitForTimeout(300);
  const result = await snap();
  broadcastScreenshot();
  return result;
}
// Scrolls the shared page vertically by `body.dy` wheel units (300 when dy is
// missing or 0), waits briefly, then returns a fresh snap().
async function handleScroll(body) {
  const tab = await getPage();
  const deltaY = body.dy || 300;
  await tab.mouse.wheel(0, deltaY);
  await tab.waitForTimeout(300);
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// Goes one step back in the shared page's history (10 s timeout, failures
// ignored) and returns a fresh snap().
async function handleBack() {
  const tab = await getPage();
  const navigation = tab.goBack({ timeout: 10000 });
  await navigation.catch(() => {});
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// Goes one step forward in the shared page's history (10 s timeout, failures
// ignored) and returns a fresh snap().
async function handleForward() {
  const tab = await getPage();
  const navigation = tab.goForward({ timeout: 10000 });
  await navigation.catch(() => {});
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// Reloads the shared page (15 s timeout, failures ignored) and returns a
// fresh snap().
async function handleReload() {
  const tab = await getPage();
  const reloading = tab.reload({ timeout: 15000 });
  await reloading.catch(() => {});
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// Returns the page's visible text (document.body.innerText, cut to the first
// 8000 characters) together with its URL, title and extracted element list.
// No screenshot is taken.
async function handleText() {
  const tab = await getPage();
  const fullText = await tab.evaluate(() => document.body.innerText);
  const elements = await extractElements(tab);
  const currentUrl = tab.url();
  const title = await tab.title();
  return {
    url: currentUrl,
    title,
    text: fullText.slice(0, 8000),
    elements,
    elementsText: formatElements(elements),
  };
}
// Evaluates `body.script` (default 'null') in the shared page and returns a
// fresh snap() plus `evalResult`, the result rendered as a string.
// Object and array results are JSON-encoded so they are not reduced to
// "[object Object]"; everything else goes through String().
// NOTE(security): this runs caller-supplied script in the page; the server
// performs no authentication, so only expose it to trusted clients.
async function handleEval(body) {
  const p = await getPage();
  const result = await p.evaluate(body.script || 'null');
  let evalResult;
  if (result !== null && typeof result === 'object') {
    try {
      // JSON.stringify can throw (e.g. on BigInt) or return undefined; fall back to String().
      evalResult = JSON.stringify(result) ?? String(result);
    } catch {
      evalResult = String(result);
    }
  } else {
    evalResult = String(result);
  }
  const ss = await snap();
  broadcastScreenshot();
  return { ...ss, evalResult };
}
// Returns the current page's URL, title and interactive element list (both
// as data and as formatted text) without taking a screenshot.
async function handleElements() {
  const tab = await getPage();
  const elements = await extractElements(tab);
  const currentUrl = tab.url();
  const title = await tab.title();
  const elementsText = formatElements(elements);
  return { url: currentUrl, title, elements, elementsText };
}
// Press a specific key (Enter, Tab, Escape, etc.)
// Uses `body.key`, falling back to 'Enter', then waits briefly and returns a
// fresh snap().
async function handleKeypress(body) {
  const tab = await getPage();
  const key = body.key || 'Enter';
  await tab.keyboard.press(key);
  await tab.waitForTimeout(300);
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// ── HTTP ROUTES ───────────────────────────────────────────────
// Handler for /api/browser/screenshot: take a snap, refresh the panels, return it.
async function handleScreenshot() {
  const shot = await snap();
  broadcastScreenshot();
  return shot;
}
// Lookup table from POST path to the async handler that receives the parsed body.
const routes = Object.fromEntries([
  ['/api/bash', handleBash],
  ['/api/browser/navigate', handleNavigate],
  ['/api/browser/click', handleClick],
  ['/api/browser/type', handleType],
  ['/api/browser/scroll', handleScroll],
  ['/api/browser/back', handleBack],
  ['/api/browser/forward', handleForward],
  ['/api/browser/reload', handleReload],
  ['/api/browser/screenshot', handleScreenshot],
  ['/api/browser/text', handleText],
  ['/api/browser/eval', handleEval],
  ['/api/browser/elements', handleElements],
  ['/api/browser/keypress', handleKeypress],
]);
// ── HTTP SERVER ───────────────────────────────────────────────
// Request flow:
//   OPTIONS *          -> 204 with the CORS headers (preflight)
//   any method /health -> 200 { ok, browser } (browser = whether Chromium has been launched)
//   POST <route>       -> 200 with the handler's result, or 500 { error } if it throws
//   anything else      -> 404 { error: 'Not found' }
// Routing uses only the path part of the URL, so a query string no longer
// turns a valid route into a 404.
const server = http.createServer(async (req, res) => {
  const sendJson = (status, payload) => {
    res.writeHead(status, cors);
    res.end(JSON.stringify(payload));
  };
  if (req.method === 'OPTIONS') {
    res.writeHead(204, cors);
    res.end();
    return;
  }
  let pathname;
  try {
    // The base is only needed to resolve the relative request target; its value is irrelevant.
    pathname = new URL(req.url, 'http://localhost').pathname;
  } catch {
    pathname = req.url;
  }
  if (pathname === '/health') {
    sendJson(200, { ok: true, browser: !!browser });
    return;
  }
  // Own-property check so inherited Object.prototype members can never be used as handlers.
  const handler = Object.prototype.hasOwnProperty.call(routes, pathname) ? routes[pathname] : undefined;
  if (req.method === 'POST' && handler) {
    try {
      const body = await parseBody(req);
      const result = await handler(body);
      sendJson(200, result);
    } catch (err) {
      sendJson(500, { error: String(err) });
    }
    return;
  }
  sendJson(404, { error: 'Not found' });
});
// ── WEBSOCKET: TERMINAL (/ws/terminal) ────────────────────────
// Every connection gets its own /bin/bash child process.
//   shell stdout/stderr -> { type: 'data', data } messages
//   { type: 'input', data } messages -> shell stdin
//   shell end -> one { type: 'exit', code } message, then the socket is closed
//   socket close -> the shell is killed
const terminalWss = new WebSocketServer({ noServer: true });
terminalWss.on('connection', (ws) => {
  console.log('[tool-server] Terminal WS client connected');
  const shell = spawn('/bin/bash', [], {
    env: { ...process.env, TERM: 'xterm-256color', COLORTERM: 'truecolor' },
    cwd: process.env.HOME || '/root',
  });
  // Best-effort forwarding: the socket may already be gone when the shell still emits output.
  const forwardOutput = (d) => {
    try { ws.send(JSON.stringify({ type: 'data', data: d.toString('binary') })); } catch {}
  };
  shell.stdout.on('data', forwardOutput);
  shell.stderr.on('data', forwardOutput);
  // Report the end of the shell exactly once, whether it came from 'close' or 'error'.
  let finished = false;
  const finish = (code) => {
    if (finished) return;
    finished = true;
    try { ws.send(JSON.stringify({ type: 'exit', code })); ws.close(); } catch {}
  };
  shell.on('close', (code) => finish(code));
  // An 'error' event with no listener is thrown by the emitter; without this
  // a failed spawn (e.g. no /bin/bash) would take down the whole server.
  shell.on('error', (err) => {
    console.error('[tool-server] Terminal shell error:', err.message);
    finish(null);
  });
  // Writing to a shell that has already exited surfaces as a stdin 'error'; ignore it.
  shell.stdin.on('error', () => {});
  ws.on('message', (msg) => {
    try {
      const m = JSON.parse(msg.toString());
      if (m.type === 'input') shell.stdin.write(m.data);
      if (m.type === 'resize') { /* best effort without node-pty */ }
    } catch {}
  });
  ws.on('close', () => { shell.kill(); });
});
// ── WEBSOCKET: BROWSER PANEL (/ws/browser) ────────────────────
// Each connected panel is added to browserPanelClients so it receives every
// broadcast screenshot. The panel can also drive the shared page with
// { type } messages: navigate, click, scroll, type, back, fwd, reload, screenshot.
// Each recognised message ends with a broadcast; unknown types are ignored.
// Any failure is reported to the sending socket as { type: 'error', msg }.
const browserWss = new WebSocketServer({ noServer: true });
browserWss.on('connection', async (ws) => {
  console.log('[tool-server] Browser panel WS client connected');
  browserPanelClients.add(ws);
  ws.on('close', () => browserPanelClients.delete(ws));
  ws.on('error', () => browserPanelClients.delete(ws));
  // Handle panel user interactions
  ws.on('message', async (msg) => {
    try {
      const m = JSON.parse(msg.toString());
      let recognised = true;
      switch (m.type) {
        case 'navigate': {
          if (typeof m.url !== 'string' || m.url.trim() === '') throw new Error('No URL');
          const p = await getPage();
          const requested = m.url.trim();
          // Match the full scheme: a bare startsWith('http') check treats hosts
          // such as "httpbin.org" as already having one.
          const target = /^https?:\/\//i.test(requested) ? requested : `https://${requested}`;
          ws.send(JSON.stringify({ type: 'loading' }));
          await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
          break;
        }
        case 'click': {
          const p = await getPage();
          await p.mouse.click(m.x || 0, m.y || 0);
          await p.waitForTimeout(500);
          break;
        }
        case 'scroll': {
          const p = await getPage();
          await p.mouse.wheel(0, m.dy || 300);
          await p.waitForTimeout(300);
          break;
        }
        case 'type': {
          const p = await getPage();
          await p.keyboard.type(m.text || '');
          await p.waitForTimeout(300);
          break;
        }
        case 'back': {
          const p = await getPage();
          await p.goBack({ timeout: 10000 }).catch(() => {});
          break;
        }
        case 'fwd': {
          const p = await getPage();
          await p.goForward({ timeout: 10000 }).catch(() => {});
          break;
        }
        case 'reload': {
          const p = await getPage();
          await p.reload({ timeout: 15000 }).catch(() => {});
          break;
        }
        case 'screenshot':
          break;
        default:
          recognised = false;
      }
      if (recognised) await broadcastScreenshot();
    } catch (e) {
      ws.send(JSON.stringify({ type: 'error', msg: String(e) }));
    }
  });
  ws.send(JSON.stringify({ type: 'ready' }));
  // Show the current page right away if one is already open.
  if (page && !page.isClosed()) {
    try { await broadcastScreenshot(); } catch {}
  }
});
// ── UPGRADE HANDLER ───────────────────────────────────────────
// Hands HTTP upgrade requests to the WebSocket server that owns the requested
// path; the socket is destroyed for any other path.
server.on('upgrade', (req, socket, head) => {
  const { pathname } = url.parse(req.url);
  const endpoints = new Map([
    ['/ws/terminal', terminalWss],
    ['/ws/browser', browserWss],
  ]);
  const wss = endpoints.get(pathname);
  if (!wss) {
    socket.destroy();
    return;
  }
  wss.handleUpgrade(req, socket, head, (ws) => wss.emit('connection', ws, req));
});
// ── START ─────────────────────────────────────────────────────
// Start listening on PORT and print the available endpoints once bound.
server.listen(PORT, () => {
  const banner = [
    `[tool-server] Unified server on http://localhost:${PORT}`,
    `[tool-server] HTTP API: POST /api/bash, /api/browser/*`,
    `[tool-server] Terminal WS: ws://localhost:${PORT}/ws/terminal`,
    `[tool-server] Browser WS: ws://localhost:${PORT}/ws/browser`,
    `[tool-server] Health: GET /health`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});