Some checks failed
CI / build-check-test (push) Has been cancelled
- tool-server.mjs: extractElements() scrapes all interactive elements with coordinates
- tool-server.mjs: formatElements() returns numbered list for LLM to read
- tool-server.mjs: click/type now support {index: N} for element-based interaction
- tool-server.mjs: new /api/browser/elements and /api/browser/keypress endpoints
- browser-tool.ts: updated schema with index, key params and elements/keypress actions
- browser-tool.ts: elementsText included in every LLM response so model can see the page
- browser-tool.ts: detailed workflow instructions in tool description
- Enables text-only models (Llama 3.3 etc) to navigate and interact with web pages
472 lines
17 KiB
JavaScript
472 lines
17 KiB
JavaScript
import http from 'http';
|
|
import { exec } from 'child_process';
|
|
import { chromium } from 'playwright';
|
|
import { WebSocketServer, WebSocket } from 'ws';
|
|
import { spawn } from 'child_process';
|
|
import url from 'url';
|
|
|
|
// Port the unified tool server listens on. Read from TOOL_SERVER_PORT and
// parsed as base 10; an unset, non-numeric, or out-of-range value falls back
// to 7700 instead of handing NaN to server.listen().
const PORT = (() => {
  const parsed = Number.parseInt(process.env.TOOL_SERVER_PORT ?? '', 10);
  return Number.isInteger(parsed) && parsed >= 0 && parsed <= 65535 ? parsed : 7700;
})();

// ── CORS ──────────────────────────────────────────────────────
// Headers attached to every HTTP response: any origin may call the API, and
// all bodies are JSON.
const cors = {
  'Access-Control-Allow-Origin': '*',
  'Access-Control-Allow-Methods': 'POST, GET, OPTIONS',
  'Access-Control-Allow-Headers': 'Content-Type',
  'Content-Type': 'application/json',
};
|
|
|
|
// Read the whole request body and parse it as JSON.
//
// req: a readable request stream emitting 'data' / 'end' (and possibly 'error').
// Resolves with the parsed object. An empty body, invalid JSON, a JSON value
// that is not an object (e.g. `null` or `5`), or a stream error all resolve to
// {} so handlers can always read properties off the result. Never rejects.
function parseBody(req) {
  return new Promise((resolve) => {
    const chunks = [];
    req.on('data', (chunk) => {
      // Keep raw bytes: decoding chunk-by-chunk corrupts a multi-byte UTF-8
      // character that happens to be split across two chunks.
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
    });
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      try {
        const parsed = JSON.parse(raw || '{}');
        resolve(parsed !== null && typeof parsed === 'object' ? parsed : {});
      } catch {
        // Best effort: a malformed body is treated as "no parameters".
        resolve({});
      }
    });
    // Without this listener an aborted request would leave the promise pending.
    req.on('error', () => resolve({}));
  });
}
|
|
|
|
// ── SHARED PLAYWRIGHT BROWSER ─────────────────────────────────
|
|
// Lazily created Playwright handles shared by every HTTP and WebSocket
// handler; all three start out null.
let browser = null; // assigned by launchBrowser()
let context = null; // assigned by getPage()
let page = null; // assigned by getPage()

// WebSocket clients of the browser panel that receive screenshot broadcasts.
const browserPanelClients = new Set();
|
|
|
|
// Return the shared headless Chromium instance, launching it on first use.
// A cached handle that is no longer connected (e.g. the browser process
// exited) is discarded and a new browser is launched, so callers are not
// handed a dead browser.
async function launchBrowser() {
  if (browser && !browser.isConnected()) {
    console.warn('[tool-server] Playwright browser disconnected, relaunching');
    browser = null;
  }
  if (!browser) {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });
    console.log('[tool-server] Playwright browser launched');
  }
  return browser;
}
|
|
|
|
// Return the shared page. When there is no page yet, or the current one has
// been closed, the old context (if any) is closed best-effort and a fresh
// 1280x800 context and page are created on the shared browser.
async function getPage() {
  const pageIsUsable = page && !page.isClosed();
  if (pageIsUsable) {
    return page;
  }

  const sharedBrowser = await launchBrowser();
  if (context) {
    // Errors while closing a stale context are deliberately ignored.
    await context.close().catch(() => {});
  }
  context = await sharedBrowser.newContext({ viewport: { width: 1280, height: 800 } });
  page = await context.newPage();
  return page;
}
|
|
|
|
// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──
|
|
// Collect the interactive elements currently visible in the viewport.
//
// p: object exposing evaluate(fn); the callback runs inside the page, so it
//    must stay self-contained and cannot reference anything from this module.
// Returns an array of { index, kind, label, x, y } items (1-based index,
// x/y = centre of the element's bounding box in viewport pixels). Inputs and
// textareas also carry `value` (max 100 chars) and, if present, `placeholder`;
// links carry `href` (max 120 chars). Any evaluation error is logged and
// yields [].
async function extractElements(p) {
  try {
    return await p.evaluate(() => {
      // Selectors for all interactive elements
      const selector = [
        'a[href]',
        'button',
        'input',
        'textarea',
        'select',
        '[role="button"]',
        '[role="link"]',
        '[role="tab"]',
        '[role="menuitem"]',
        '[onclick]',
        '[contenteditable="true"]',
        'summary',
        'details',
        'label[for]',
      ].join(',');

      const items = [];
      const seen = new Set();

      for (const el of document.querySelectorAll(selector)) {
        const rect = el.getBoundingClientRect();

        // Skip tiny or off-screen elements before paying for a style lookup.
        if (rect.width < 5 || rect.height < 5) continue;
        if (rect.top > window.innerHeight || rect.bottom < 0) continue;
        if (rect.left > window.innerWidth || rect.right < 0) continue;

        // One computed-style lookup per element covers all visibility checks.
        const style = window.getComputedStyle(el);
        if (style.visibility === 'hidden') continue;
        if (style.display === 'none') continue;
        if (parseFloat(style.opacity) < 0.1) continue;

        const tag = el.tagName.toLowerCase();
        const type = el.getAttribute('type') || '';
        const role = el.getAttribute('role') || '';
        const href = el.getAttribute('href') || '';
        const placeholder = el.getAttribute('placeholder') || '';
        const ariaLabel = el.getAttribute('aria-label') || '';
        const title = el.getAttribute('title') || '';
        const name = el.getAttribute('name') || '';
        const value = el.value || '';

        // Build a human-readable, single-line label. Whitespace runs
        // (including newlines) are collapsed so each element stays on one
        // line of the formatted list and the 80-char budget is not wasted.
        let label = (el.innerText || '').replace(/\s+/g, ' ').trim().slice(0, 80);
        if (!label) label = ariaLabel || title || placeholder || name || '';
        if (!label && tag === 'img') label = el.getAttribute('alt') || 'image';
        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;

        // Centre coordinates
        const cx = Math.round(rect.left + rect.width / 2);
        const cy = Math.round(rect.top + rect.height / 2);

        // De-duplicate by position: centres are snapped to a 5px grid and
        // only the first element in each cell is kept.
        const key = `${Math.round(cx / 5) * 5},${Math.round(cy / 5) * 5}`;
        if (seen.has(key)) continue;
        seen.add(key);

        // Later rules win, e.g. <a role="button"> is reported as a button.
        let kind = tag;
        if (tag === 'a') kind = 'link';
        if (tag === 'button' || role === 'button') kind = 'button';
        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
        if (tag === 'textarea') kind = 'textarea';
        if (tag === 'select') kind = 'select';

        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
        if (tag === 'input' || tag === 'textarea') {
          item.value = value.slice(0, 100);
          if (placeholder) item.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          item.href = href.slice(0, 120);
        }
        items.push(item);
      }
      return items;
    });
  } catch (e) {
    console.error('[tool-server] Element extraction error:', e.message);
    return [];
  }
}
|
|
|
|
// Format elements as readable text for the LLM.
//
// elements: array of items shaped like the ones extractElements() returns
//           ({ index, kind, label, x, y, value?, placeholder?, href? }).
// Returns one line per element followed by usage hints, or a fixed
// "nothing found" sentence for a missing/empty list. The hints mention the
// element index because the list shows [N] markers and the click/type
// handlers accept an index.
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';

  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    if (el.href) line += ` → ${el.href}`;
    lines.push(line);
  }
  lines.push('');
  lines.push('To click an element, use action "click" with its index (e.g. {"index": 3}) or its x,y coordinates.');
  lines.push('To type into an input, use action "type" with its index and text; without an index the text goes to the focused element.');
  return lines.join('\n');
}
|
|
|
|
// Enhanced snap: capture the viewport as a base64 JPEG (quality 70) together
// with the page URL, title and the interactive-element list, both as raw
// items (`elements`) and as LLM-readable text (`elementsText`).
async function snap() {
  const activePage = await getPage();
  const jpeg = await activePage.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
  const elements = await extractElements(activePage);

  const screenshot = jpeg.toString('base64');
  const currentUrl = activePage.url();
  const title = await activePage.title();
  const elementsText = formatElements(elements);

  return { screenshot, url: currentUrl, title, elements, elementsText };
}
|
|
|
|
// Broadcast a fresh screenshot (with URL and title) to every browser panel
// WebSocket client whose socket is open. Does nothing when no panel is
// connected. If taking or sending the snapshot throws, an
// { type: 'error' } message is sent to the open clients instead.
async function broadcastScreenshot() {
  if (browserPanelClients.size === 0) return;

  // Deliver one already-serialised message to all clients that are open.
  const sendToOpenClients = (serialised) => {
    for (const client of browserPanelClients) {
      if (client.readyState === WebSocket.OPEN) client.send(serialised);
    }
  };

  try {
    const { screenshot, url: pageUrl, title } = await snap();
    sendToOpenClients(JSON.stringify({ type: 'screenshot', data: screenshot, url: pageUrl, title }));
  } catch (e) {
    sendToOpenClients(JSON.stringify({ type: 'error', msg: String(e) }));
  }
}
|
|
|
|
// ── BASH HANDLER ──────────────────────────────────────────────
|
|
// Run a shell command and report its output.
//
// body.command: shell command line (required, non-empty string).
// body.timeout: limit in milliseconds (non-negative integer, default 30000;
//               invalid values fall back to the default).
// Returns { error } when no usable command was given, otherwise
// { stdout, stderr, exitCode, output }, where `output` is stdout followed by
// "\nSTDERR: <stderr>" when stderr is non-empty. exitCode is 0 on success,
// the process exit code when exec reports a numeric one, and 1 for any other
// failure.
//
// SECURITY NOTE(review): this executes arbitrary shell commands taken from
// the request body. Only expose this endpoint to trusted callers.
async function handleBash(body) {
  const { command, timeout = 30000 } = body;
  // exec() throws on a non-string command, so reject it up front.
  if (typeof command !== 'string' || command.length === 0) {
    return { error: 'No command provided' };
  }

  // exec() also throws on a timeout that is not a non-negative integer.
  const requestedTimeout = Number(timeout);
  const timeoutMs = timeout != null && Number.isInteger(requestedTimeout) && requestedTimeout >= 0
    ? requestedTimeout
    : 30000;

  return new Promise((resolve) => {
    exec(command, {
      timeout: timeoutMs,
      maxBuffer: 10 * 1024 * 1024,
      cwd: process.env.WORKSPACE || process.env.HOME || '/root',
      env: { ...process.env, TERM: 'dumb', COLUMNS: '200' },
    }, (error, stdout, stderr) => {
      // error.code is not always a number, so only numeric codes are reported.
      const exitCode = error ? (typeof error.code === 'number' ? error.code : 1) : 0;
      resolve({
        stdout: stdout || '',
        stderr: stderr || '',
        exitCode,
        output: (stdout || '') + (stderr ? '\nSTDERR: ' + stderr : ''),
      });
    });
  });
}
|
|
|
|
// ── BROWSER HTTP HANDLERS (used by LLM tool) ─────────────────
|
|
// Navigate the shared page to body.url and return a fresh snapshot.
//
// body.url: target address. "https://" is prepended unless the value already
//           starts with an explicit http:// or https:// scheme (checked
//           case-insensitively), so a host such as "httpbin.org" is not
//           mistaken for a full URL.
// Returns { error: 'No URL' } for a missing, blank or non-string url,
// otherwise the snap() result. Navigation errors propagate to the caller.
async function handleNavigate(body) {
  const { url: targetUrl } = body;
  const requested = typeof targetUrl === 'string' ? targetUrl.trim() : '';
  if (!requested) return { error: 'No URL' };

  const p = await getPage();
  const target = /^https?:\/\//i.test(requested) ? requested : `https://${requested}`;
  await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });

  const result = await snap();
  // Panel update is fire-and-forget; broadcastScreenshot handles its own errors.
  broadcastScreenshot();
  return result;
}
|
|
|
|
// Click on the page, either by element index or by viewport coordinates,
// then return a fresh snapshot.
//
// body.index: 1-based index from the element list; used when no x/y is given.
//             Numeric strings such as "3" are accepted.
// body.x, body.y: viewport coordinates; a missing one defaults to 0.
// Returns { error } when the index matches no element or when neither an
// index nor coordinates were supplied, otherwise the snap() result.
async function handleClick(body) {
  const p = await getPage();

  // Support clicking by element index
  if (body.index && !body.x && !body.y) {
    // Tool callers often send the index as a string; the list uses numbers.
    const wantedIndex = Number(body.index);
    const elements = await extractElements(p);
    const el = elements.find((e) => e.index === wantedIndex);
    if (!el) return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
    await p.mouse.click(el.x, el.y);
  } else if (body.x == null && body.y == null) {
    // Nothing to aim at: report it rather than silently clicking at (0, 0).
    return { error: 'No click target. Provide an element index or x,y coordinates.' };
  } else {
    await p.mouse.click(Number(body.x) || 0, Number(body.y) || 0);
  }

  // Fixed pause before capturing the snapshot.
  await p.waitForTimeout(500);
  const result = await snap();
  broadcastScreenshot();
  return result;
}
|
|
|
|
// Type text into the page and return a fresh snapshot.
//
// Target selection, in priority order:
//   body.selector - fill the element matching this selector with the text;
//   body.index    - click the indexed element, select its content and replace
//                   it with the text (numeric strings are accepted);
//   neither       - type into whatever currently has keyboard focus.
// body.text: text to enter; missing text is treated as ''.
// Returns { error } when the index matches no element, otherwise the snap()
// result.
async function handleType(body) {
  const p = await getPage();
  const text = body.text == null ? '' : String(body.text);

  if (body.selector) {
    await p.fill(body.selector, text);
  } else if (body.index) {
    // Click the element first, then type
    const wantedIndex = Number(body.index);
    const elements = await extractElements(p);
    const el = elements.find((e) => e.index === wantedIndex);
    if (!el) {
      return { error: `Element [${body.index}] not found.` };
    }
    await p.mouse.click(el.x, el.y);
    await p.waitForTimeout(200);
    // Select the existing content so the typed text replaces it.
    // NOTE(review): 'Control+a' is the Windows/Linux shortcut - confirm if
    // this ever runs against a macOS browser.
    await p.keyboard.press('Control+a');
    if (text === '') {
      // Typing an empty string sends no keys, so delete the selection
      // explicitly when the caller wants the field cleared.
      await p.keyboard.press('Backspace');
    } else {
      await p.keyboard.type(text);
    }
  } else {
    await p.keyboard.type(text);
  }

  await p.waitForTimeout(300);
  const result = await snap();
  broadcastScreenshot();
  return result;
}
|
|
|
|
// Scroll the page vertically with the mouse wheel and return a fresh
// snapshot. body.dy is the wheel delta; a missing or zero dy falls back to 300.
async function handleScroll(body) {
  const activePage = await getPage();
  const deltaY = body.dy || 300;

  await activePage.mouse.wheel(0, deltaY);
  await activePage.waitForTimeout(300);

  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
|
|
|
// Go one step back in the page history (10s timeout) and return a fresh
// snapshot. A failed navigation is ignored; the snapshot is returned anyway.
async function handleBack() {
  const activePage = await getPage();
  try {
    await activePage.goBack({ timeout: 10000 });
  } catch {
    // Best effort: a failed history navigation is not an error here.
  }
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
|
|
|
// Go one step forward in the page history (10s timeout) and return a fresh
// snapshot. A failed navigation is ignored; the snapshot is returned anyway.
async function handleForward() {
  const activePage = await getPage();
  try {
    await activePage.goForward({ timeout: 10000 });
  } catch {
    // Best effort: a failed history navigation is not an error here.
  }
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
|
|
|
// Reload the current page (15s timeout) and return a fresh snapshot.
// A failed reload is ignored; the snapshot is returned anyway.
async function handleReload() {
  const activePage = await getPage();
  try {
    await activePage.reload({ timeout: 15000 });
  } catch {
    // Best effort: a failed reload is not an error here.
  }
  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
|
|
|
// Return the visible text of the page (first 8000 characters) together with
// its URL, title and interactive-element list. No screenshot is taken.
// A document without a <body> yields empty text instead of throwing.
async function handleText() {
  const p = await getPage();
  const text = await p.evaluate(() => document.body?.innerText ?? '');
  const elements = await extractElements(p);
  return {
    url: p.url(),
    title: await p.title(),
    text: text.slice(0, 8000),
    elements,
    elementsText: formatElements(elements),
  };
}
|
|
|
|
// Evaluate body.script in the page and return a fresh snapshot plus the
// result as a string in `evalResult`.
//
// body.script: expression/function source passed to page.evaluate(); a
//              missing script evaluates 'null'.
// Object and array results are serialised as JSON so the caller sees their
// content rather than "[object Object]"; everything else goes through
// String().
//
// SECURITY NOTE(review): this runs caller-supplied script in the page. Only
// expose this endpoint to trusted callers.
async function handleEval(body) {
  const p = await getPage();
  const result = await p.evaluate(body.script || 'null');
  const ss = await snap();
  broadcastScreenshot();

  let evalResult;
  if (result !== null && typeof result === 'object') {
    try {
      evalResult = JSON.stringify(result) ?? String(result);
    } catch {
      // Not JSON-serialisable (e.g. a cycle): fall back to the plain form.
      evalResult = String(result);
    }
  } else {
    evalResult = String(result);
  }
  return { ...ss, evalResult };
}
|
|
|
|
// Return the page URL, title and interactive-element list (raw items plus
// LLM-readable text) without taking a screenshot.
async function handleElements() {
  const activePage = await getPage();
  const elements = await extractElements(activePage);

  const currentUrl = activePage.url();
  const title = await activePage.title();
  return { url: currentUrl, title, elements, elementsText: formatElements(elements) };
}
|
|
|
|
// Press a single key (Enter, Tab, Escape, etc.) on the page and return a
// fresh snapshot. body.key names the key; when it is missing, Enter is pressed.
async function handleKeypress(body) {
  const activePage = await getPage();
  const keyName = body.key || 'Enter';

  await activePage.keyboard.press(keyName);
  await activePage.waitForTimeout(300);

  const snapshot = await snap();
  broadcastScreenshot();
  return snapshot;
}
|
|
|
|
// ── HTTP ROUTES ───────────────────────────────────────────────
|
|
// Maps each POST endpoint path to its async handler. Handlers receive the
// parsed JSON body and return the object that is sent back as JSON.
const routes = {
  // Shell
  '/api/bash': handleBash,

  // Browser control
  '/api/browser/navigate': handleNavigate,
  '/api/browser/click': handleClick,
  '/api/browser/type': handleType,
  '/api/browser/scroll': handleScroll,
  '/api/browser/back': handleBack,
  '/api/browser/forward': handleForward,
  '/api/browser/reload': handleReload,
  '/api/browser/screenshot': async () => {
    const snapshot = await snap();
    broadcastScreenshot();
    return snapshot;
  },

  // Page inspection
  '/api/browser/text': handleText,
  '/api/browser/eval': handleEval,
  '/api/browser/elements': handleElements,
  '/api/browser/keypress': handleKeypress,
};
|
|
|
|
// ── HTTP SERVER ───────────────────────────────────────────────
|
|
// HTTP entry point.
//   OPTIONS *        -> 204 with the CORS headers (preflight)
//   /health          -> 200 { ok, browser } (browser = whether one is launched)
//   POST <route>     -> 200 with the handler's result, or 500 { error } if the
//                       handler throws
//   anything else    -> 404 { error: 'Not found' }
// Routing uses only the path part of the URL, so a query string does not
// turn a known endpoint into a 404.
const server = http.createServer(async (req, res) => {
  const sendJson = (status, payload) => {
    res.writeHead(status, cors);
    res.end(payload);
  };

  if (req.method === 'OPTIONS') {
    res.writeHead(204, cors);
    res.end();
    return;
  }

  const pathname = (req.url || '').split('?')[0];

  if (pathname === '/health') {
    sendJson(200, JSON.stringify({ ok: true, browser: !!browser }));
    return;
  }

  const handler = routes[pathname];
  if (req.method === 'POST' && handler) {
    let status = 200;
    let payload;
    try {
      const body = await parseBody(req);
      const result = await handler(body);
      // Serialise before any header is written, so a failure here can still
      // be reported as a clean 500.
      payload = JSON.stringify(result);
    } catch (err) {
      status = 500;
      payload = JSON.stringify({ error: String(err) });
    }
    sendJson(status, payload);
    return;
  }

  sendJson(404, JSON.stringify({ error: 'Not found' }));
});
|
|
|
|
// ── WEBSOCKET: TERMINAL (/ws/terminal) ────────────────────────
|
|
// WebSocket endpoint that bridges each client to its own /bin/bash process
// (plain pipes, no pty).
//   server -> client: { type: 'data', data } for stdout/stderr chunks
//                     (chunk bytes encoded with 'binary'), and
//                     { type: 'exit', code } when the shell closes.
//   client -> server: { type: 'input', data } is written to the shell's
//                     stdin; { type: 'resize' } is accepted but ignored.
// The shell is killed when the socket closes or errors.
const terminalWss = new WebSocketServer({ noServer: true });
terminalWss.on('connection', (ws) => {
  console.log('[tool-server] Terminal WS client connected');

  const shell = spawn('/bin/bash', [], {
    env: { ...process.env, TERM: 'xterm-256color', COLORTERM: 'truecolor' },
    cwd: process.env.HOME || '/root',
  });

  // Sends are best effort: the client may already be gone.
  const forwardOutput = (d) => {
    try { ws.send(JSON.stringify({ type: 'data', data: d.toString('binary') })); } catch {}
  };
  shell.stdout.on('data', forwardOutput);
  shell.stderr.on('data', forwardOutput);

  shell.on('close', (code) => {
    try { ws.send(JSON.stringify({ type: 'exit', code })); ws.close(); } catch {}
  });

  // An 'error' event without a listener is thrown by the emitter, so a spawn
  // failure or a write to a dead shell must be handled here.
  shell.on('error', (err) => {
    console.error('[tool-server] Terminal shell error:', err.message);
    try { ws.close(); } catch {}
  });
  shell.stdin.on('error', () => {});

  ws.on('message', (msg) => {
    try {
      const m = JSON.parse(msg.toString());
      if (m.type === 'input' && shell.stdin.writable) shell.stdin.write(m.data);
      if (m.type === 'resize') { /* best effort without node-pty */ }
    } catch {
      // Malformed messages are ignored.
    }
  });

  ws.on('close', () => { shell.kill(); });
  ws.on('error', () => { shell.kill(); });
});
|
|
|
|
// ── WEBSOCKET: BROWSER PANEL (/ws/browser) ────────────────────
|
|
// WebSocket endpoint for the browser panel. Each client is registered for
// screenshot broadcasts and may drive the shared page with messages:
//   { type: 'navigate', url }  { type: 'click', x, y }  { type: 'scroll', dy }
//   { type: 'type', text }     { type: 'back' }         { type: 'fwd' }
//   { type: 'reload' }         { type: 'screenshot' }
// Every handled action ends with a screenshot broadcast to all panels.
// Failures are reported to the sending client as { type: 'error', msg }.
const browserWss = new WebSocketServer({ noServer: true });
browserWss.on('connection', async (ws) => {
  console.log('[tool-server] Browser panel WS client connected');
  browserPanelClients.add(ws);
  ws.on('close', () => browserPanelClients.delete(ws));
  ws.on('error', () => browserPanelClients.delete(ws));

  // Send to this client only while its socket is open, so a reply to a
  // client that has just disconnected cannot throw inside an async listener.
  const send = (payload) => {
    if (ws.readyState === WebSocket.OPEN) ws.send(JSON.stringify(payload));
  };

  // Handle panel user interactions
  ws.on('message', async (msg) => {
    try {
      const m = JSON.parse(msg.toString());
      switch (m.type) {
        case 'navigate': {
          const requested = typeof m.url === 'string' ? m.url.trim() : '';
          if (!requested) {
            send({ type: 'error', msg: 'No URL' });
            break;
          }
          const p = await getPage();
          // Only an explicit http(s):// scheme counts as a full URL; a host
          // like "httpbin.org" still gets "https://" prepended.
          const target = /^https?:\/\//i.test(requested) ? requested : `https://${requested}`;
          send({ type: 'loading' });
          await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
          await broadcastScreenshot();
          break;
        }
        case 'click': {
          const p = await getPage();
          await p.mouse.click(m.x || 0, m.y || 0);
          await p.waitForTimeout(500);
          await broadcastScreenshot();
          break;
        }
        case 'scroll': {
          const p = await getPage();
          await p.mouse.wheel(0, m.dy || 300);
          await p.waitForTimeout(300);
          await broadcastScreenshot();
          break;
        }
        case 'type': {
          const p = await getPage();
          await p.keyboard.type(m.text || '');
          await p.waitForTimeout(300);
          await broadcastScreenshot();
          break;
        }
        case 'back': {
          const p = await getPage();
          // History navigation is best effort; failures are ignored.
          await p.goBack({ timeout: 10000 }).catch(() => {});
          await broadcastScreenshot();
          break;
        }
        case 'fwd': {
          const p = await getPage();
          await p.goForward({ timeout: 10000 }).catch(() => {});
          await broadcastScreenshot();
          break;
        }
        case 'reload': {
          const p = await getPage();
          await p.reload({ timeout: 15000 }).catch(() => {});
          await broadcastScreenshot();
          break;
        }
        case 'screenshot':
          await broadcastScreenshot();
          break;
        default:
          // Unknown message types are ignored.
          break;
      }
    } catch (e) {
      send({ type: 'error', msg: String(e) });
    }
  });

  send({ type: 'ready' });
  // Show the current page right away if one is already open.
  if (page && !page.isClosed()) {
    try { await broadcastScreenshot(); } catch {}
  }
});
|
|
|
|
// ── UPGRADE HANDLER ───────────────────────────────────────────
|
|
// Route WebSocket upgrade requests by path: /ws/terminal goes to the terminal
// server, /ws/browser to the browser panel server, and any other path has its
// socket destroyed. The path is taken from the part of req.url before "?",
// which avoids the deprecated legacy url.parse() API.
server.on('upgrade', (req, socket, head) => {
  const pathname = (req.url || '').split('?')[0];

  if (pathname === '/ws/terminal') {
    terminalWss.handleUpgrade(req, socket, head, (ws) => terminalWss.emit('connection', ws, req));
  } else if (pathname === '/ws/browser') {
    browserWss.handleUpgrade(req, socket, head, (ws) => browserWss.emit('connection', ws, req));
  } else {
    socket.destroy();
  }
});
|
|
|
|
// ── START ─────────────────────────────────────────────────────
|
|
// Start listening and print a short banner describing the endpoints.
server.listen(PORT, () => {
  const bannerLines = [
    `Unified server on http://localhost:${PORT}`,
    'HTTP API: POST /api/bash, /api/browser/*',
    `Terminal WS: ws://localhost:${PORT}/ws/terminal`,
    `Browser WS: ws://localhost:${PORT}/ws/browser`,
    'Health: GET /health',
  ];
  for (const text of bannerLines) {
    console.log(`[tool-server] ${text}`);
  }
});
|