import http from 'http';
import url from 'url';
import { exec, spawn } from 'child_process';
import { chromium } from 'playwright';
import { WebSocketServer, WebSocket } from 'ws';

const DEFAULT_PORT = 7700;
const parsedPort = Number.parseInt(process.env.TOOL_SERVER_PORT || String(DEFAULT_PORT), 10);
// Fall back to the default when TOOL_SERVER_PORT is not a usable integer.
const PORT = Number.isInteger(parsedPort) ? parsedPort : DEFAULT_PORT;

// ── CORS ──────────────────────────────────────────────────────
// SECURITY NOTE(review): the wildcard origin lets any web page call this API,
// including /api/bash and /api/browser/eval. Only expose this server on a
// trusted interface.
const cors = {
  'Access-Control-Allow-Origin': '*',
  'Access-Control-Allow-Methods': 'POST, GET, OPTIONS',
  'Access-Control-Allow-Headers': 'Content-Type',
  'Content-Type': 'application/json',
};

// ── SMALL SHARED HELPERS ──────────────────────────────────────

/**
 * Coerce a JSON-supplied value to a finite number.
 * @param {unknown} value - A number or numeric string.
 * @param {number} fallback - Returned when `value` is not a finite number.
 * @returns {number}
 */
function toFiniteNumber(value, fallback) {
  const candidate = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : fallback;
}

/**
 * Convert a JSON-supplied value to the text that should be typed.
 * @param {unknown} value
 * @returns {string} Empty string for null/undefined, otherwise `String(value)`.
 */
function toText(value) {
  return value === null || value === undefined ? '' : String(value);
}

/**
 * Turn user input into a navigable URL. Input that does not already start
 * with `http://`, `https://` or `about:` is prefixed with `https://`.
 * @param {unknown} raw
 * @returns {string|null} The URL, or null when no usable URL was supplied.
 */
function normalizeUrl(raw) {
  if (typeof raw !== 'string') return null;
  const trimmed = raw.trim();
  if (trimmed === '') return null;
  return /^(?:https?:\/\/|about:)/i.test(trimmed) ? trimmed : `https://${trimmed}`;
}

/**
 * Extract the path component of a request URL, ignoring any query string.
 * @param {string|undefined} rawUrl - `req.url` from the HTTP request.
 * @returns {string} The pathname, or '' when the URL cannot be parsed.
 */
function getPathname(rawUrl) {
  try {
    return new url.URL(rawUrl || '/', 'http://localhost').pathname;
  } catch {
    return '';
  }
}

/**
 * Send an already-serialized message to a WebSocket if it is open.
 * Never throws; failures are logged.
 * @param {WebSocket} ws
 * @param {string} text
 * @returns {boolean} True when the message was handed to the socket.
 */
function sendRaw(ws, text) {
  if (ws.readyState !== WebSocket.OPEN) return false;
  try {
    ws.send(text);
    return true;
  } catch (err) {
    console.error('[tool-server] WebSocket send failed:', err.message);
    return false;
  }
}

/**
 * Serialize `payload` as JSON and send it to a WebSocket if it is open.
 * @param {WebSocket} ws
 * @param {object} payload
 * @returns {boolean}
 */
function sendMessage(ws, payload) {
  return sendRaw(ws, JSON.stringify(payload));
}

/**
 * Read and JSON-decode a request body.
 * @param {import('http').IncomingMessage} req
 * @returns {Promise<object>} The parsed object; `{}` when the body is empty,
 *   is not valid JSON, or is not a JSON object/array.
 *   Rejects if the request stream emits an error.
 */
function parseBody(req) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    // Collect raw bytes and decode once at the end so a multi-byte UTF-8
    // character split across two chunks is not corrupted.
    req.on('data', (chunk) => {
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });
    req.on('error', reject);
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      try {
        const parsed = JSON.parse(raw || '{}');
        // Handlers destructure the body, so never hand them null or a primitive.
        resolve(parsed !== null && typeof parsed === 'object' ? parsed : {});
      } catch {
        resolve({});
      }
    });
  });
}

// ── SHARED PLAYWRIGHT BROWSER ─────────────────────────────────
let browser = null;
let context = null;
let page = null;
const browserPanelClients = new Set();

/**
 * Return the shared Chromium instance, launching it on first use and
 * relaunching it if the previous instance is no longer connected.
 * @returns {Promise<import('playwright').Browser>}
 */
async function launchBrowser() {
  if (browser && !browser.isConnected()) {
    // The old process is gone; its context and page are unusable too.
    browser = null;
    context = null;
    page = null;
  }
  if (!browser) {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });
    console.log('[tool-server] Playwright browser launched');
  }
  return browser;
}

/**
 * Return the shared page, creating a fresh 1280x800 context and page when
 * none exists or the current one has been closed.
 * @returns {Promise<import('playwright').Page>}
 */
async function getPage() {
  if (!page || page.isClosed()) {
    const b = await launchBrowser();
    if (context) await context.close().catch(() => {});
    context = await b.newContext({ viewport: { width: 1280, height: 800 } });
    page = await context.newPage();
  }
  return page;
}

// ── ELEMENT EXTRACTION (the key feature for text-model browser control) ──

/**
 * List the visible interactive elements in the current viewport.
 * @param {import('playwright').Page} p
 * @returns {Promise<Array<object>>} Items of the form
 *   `{ index, kind, label, x, y, value?, placeholder?, href? }` where `index`
 *   is 1-based and `x`/`y` are the element's centre in viewport pixels.
 *   Returns `[]` (and logs) if evaluation in the page fails.
 */
async function extractElements(p) {
  try {
    // This callback runs inside the page, so it must be self-contained.
    return await p.evaluate(() => {
      const items = [];
      const seen = new Set();

      // Selectors for all interactive elements
      const selectors = [
        'a[href]', 'button', 'input', 'textarea', 'select',
        '[role="button"]', '[role="link"]', '[role="tab"]', '[role="menuitem"]',
        '[onclick]', '[contenteditable="true"]', 'summary', 'details', 'label[for]',
      ];

      for (const el of document.querySelectorAll(selectors.join(','))) {
        const rect = el.getBoundingClientRect();

        // Skip invisible, off-screen, or tiny elements
        if (rect.width < 5 || rect.height < 5) continue;
        if (rect.top > window.innerHeight || rect.bottom < 0) continue;
        if (rect.left > window.innerWidth || rect.right < 0) continue;
        const style = window.getComputedStyle(el);
        if (style.visibility === 'hidden') continue;
        if (style.display === 'none') continue;
        if (parseFloat(style.opacity) < 0.1) continue;

        const tag = el.tagName.toLowerCase();
        const type = el.getAttribute('type') || '';
        const role = el.getAttribute('role') || '';
        const href = el.getAttribute('href') || '';
        const placeholder = el.getAttribute('placeholder') || '';
        const ariaLabel = el.getAttribute('aria-label') || '';
        const title = el.getAttribute('title') || '';
        const name = el.getAttribute('name') || '';
        const value = el.value || '';

        // Build a human-readable label
        let label = (el.innerText || '').trim().slice(0, 80);
        if (!label) label = ariaLabel || title || placeholder || name || '';
        if (!label && tag === 'img') label = el.getAttribute('alt') || 'image';
        if (!label) label = `(${tag}${type ? ' type=' + type : ''})`;

        // Centre coordinates
        const cx = Math.round(rect.left + rect.width / 2);
        const cy = Math.round(rect.top + rect.height / 2);

        // De-duplicate by position (within 5px)
        const key = `${Math.round(cx / 5) * 5},${Math.round(cy / 5) * 5}`;
        if (seen.has(key)) continue;
        seen.add(key);

        let kind = tag;
        if (tag === 'a') kind = 'link';
        if (tag === 'button' || role === 'button') kind = 'button';
        if (tag === 'input') kind = 'input' + (type ? `[${type}]` : '');
        if (tag === 'textarea') kind = 'textarea';
        if (tag === 'select') kind = 'select';

        const item = { index: items.length + 1, kind, label, x: cx, y: cy };
        if (tag === 'input' || tag === 'textarea') {
          item.value = value.slice(0, 100);
          if (placeholder) item.placeholder = placeholder;
        }
        if (tag === 'a' && href) {
          item.href = href.slice(0, 120);
        }
        items.push(item);
      }
      return items;
    });
  } catch (e) {
    console.error('[tool-server] Element extraction error:', e.message);
    return [];
  }
}

/**
 * Find an extracted element by its 1-based index.
 * @param {import('playwright').Page} p
 * @param {unknown} index - Number or numeric string.
 * @returns {Promise<object|undefined>}
 */
async function findElementByIndex(p, index) {
  const wanted = toFiniteNumber(index, Number.NaN);
  const elements = await extractElements(p);
  return elements.find((candidate) => candidate.index === wanted);
}

/**
 * Format elements as readable text for the LLM.
 * @param {Array<object>|null|undefined} elements - Output of extractElements.
 * @returns {string}
 */
function formatElements(elements) {
  if (!elements || elements.length === 0) return 'No interactive elements found on page.';
  const lines = ['Interactive elements on page:'];
  for (const el of elements) {
    let line = ` [${el.index}] ${el.kind} "${el.label}" at (${el.x}, ${el.y})`;
    if (el.value) line += ` value="${el.value}"`;
    if (el.placeholder) line += ` placeholder="${el.placeholder}"`;
    if (el.href) line += ` → ${el.href}`;
    lines.push(line);
  }
  lines.push('');
  lines.push('To click an element, use action "click" with its x,y coordinates.');
  lines.push('To type into a focused input, use action "type" with text.');
  return lines.join('\n');
}

/**
 * Enhanced snap: viewport screenshot + element list.
 * @returns {Promise<{screenshot: string, url: string, title: string,
 *   elements: Array<object>, elementsText: string}>} `screenshot` is a
 *   base64-encoded JPEG.
 */
async function snap() {
  const p = await getPage();
  const buf = await p.screenshot({ type: 'jpeg', quality: 70, fullPage: false });
  const elements = await extractElements(p);
  return {
    screenshot: buf.toString('base64'),
    url: p.url(),
    title: await p.title(),
    elements,
    elementsText: formatElements(elements),
  };
}

/**
 * Broadcast a screenshot to all connected browser panel WebSocket clients.
 * Does nothing when no panel is connected. Never rejects: failures are
 * reported to the panels as `{ type: 'error' }` messages.
 * @param {object} [snapshot] - A result of snap() to reuse; when omitted a
 *   new snapshot is taken.
 * @returns {Promise<void>}
 */
async function broadcastScreenshot(snapshot) {
  if (browserPanelClients.size === 0) return;
  let text;
  try {
    const s = snapshot ?? (await snap());
    text = JSON.stringify({ type: 'screenshot', data: s.screenshot, url: s.url, title: s.title });
  } catch (e) {
    text = JSON.stringify({ type: 'error', msg: String(e) });
  }
  // Serialized once and reused, since the payload carries a whole image.
  for (const ws of browserPanelClients) {
    sendRaw(ws, text);
  }
}

/**
 * Take one snapshot, push it to the panels, and return it to the caller.
 * @returns {Promise<object>} The snap() result.
 */
async function snapAndBroadcast() {
  const result = await snap();
  // Intentionally not awaited: broadcastScreenshot never rejects and the HTTP
  // response should not wait on panel clients.
  void broadcastScreenshot(result);
  return result;
}

// ── BASH HANDLER ──────────────────────────────────────────────

/**
 * Run a shell command and capture its output.
 * SECURITY NOTE(review): executes caller-supplied shell text verbatim.
 * @param {{command?: string, timeout?: number}} body - `timeout` is in
 *   milliseconds and defaults to 30000.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number,
 *   output: string}|{error: string}>} `exitCode` is 1 when the command failed
 *   without a numeric exit status.
 */
async function handleBash(body) {
  const { command, timeout = 30000 } = body;
  if (typeof command !== 'string' || !command) return { error: 'No command provided' };
  // exec() rejects non-integer or negative timeouts, so sanitize first.
  const timeoutMs = Math.max(0, Math.trunc(toFiniteNumber(timeout, 30000)));
  return new Promise((resolve) => {
    exec(command, {
      timeout: timeoutMs,
      maxBuffer: 10 * 1024 * 1024,
      cwd: process.env.WORKSPACE || process.env.HOME || '/root',
      env: { ...process.env, TERM: 'dumb', COLUMNS: '200' },
    }, (error, stdout, stderr) => {
      let exitCode = 0;
      if (error) {
        // error.code is a string for non-exit failures (e.g. buffer overflow).
        exitCode = typeof error.code === 'number' ? error.code : 1;
      }
      resolve({
        stdout: stdout || '',
        stderr: stderr || '',
        exitCode,
        output: (stdout || '') + (stderr ? '\nSTDERR: ' + stderr : ''),
      });
    });
  });
}

// ── BROWSER HTTP HANDLERS (used by LLM tool) ─────────────────

/**
 * Navigate the shared page.
 * @param {{url?: string}} body
 * @returns {Promise<object>} A snap() result, or `{ error }` without a URL.
 */
async function handleNavigate(body) {
  const target = normalizeUrl(body.url);
  if (!target) return { error: 'No URL' };
  const p = await getPage();
  await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
  return snapAndBroadcast();
}

/**
 * Click by element index, or by x/y viewport coordinates.
 * @param {{index?: number, x?: number, y?: number}} body - `index` is used
 *   only when neither `x` nor `y` is given.
 * @returns {Promise<object>} A snap() result, or `{ error }`.
 */
async function handleClick(body) {
  const p = await getPage();
  // Support clicking by element index
  if (body.index && !body.x && !body.y) {
    const el = await findElementByIndex(p, body.index);
    if (!el) {
      return { error: `Element [${body.index}] not found. Use action "screenshot" to refresh element list.` };
    }
    await p.mouse.click(el.x, el.y);
  } else {
    await p.mouse.click(toFiniteNumber(body.x, 0), toFiniteNumber(body.y, 0));
  }
  await p.waitForTimeout(500);
  return snapAndBroadcast();
}

/**
 * Type text into a selector, into an element chosen by index, or into
 * whatever currently has focus.
 * @param {{selector?: string, index?: number, text?: string}} body
 * @returns {Promise<object>} A snap() result, or `{ error }`.
 */
async function handleType(body) {
  const p = await getPage();
  const text = toText(body.text);
  if (body.selector) {
    await p.fill(body.selector, text);
  } else if (body.index) {
    // Click the element first, then type
    const el = await findElementByIndex(p, body.index);
    if (!el) return { error: `Element [${body.index}] not found.` };
    await p.mouse.click(el.x, el.y);
    await p.waitForTimeout(200);
    // Clear existing content and type new text. The explicit Backspace makes
    // an empty `text` clear the field instead of leaving it selected.
    await p.keyboard.press('Control+a');
    await p.keyboard.press('Backspace');
    await p.keyboard.type(text);
  } else {
    await p.keyboard.type(text);
  }
  await p.waitForTimeout(300);
  return snapAndBroadcast();
}

/**
 * Scroll the page vertically with the mouse wheel.
 * @param {{dy?: number}} body - Pixels to scroll; 300 when missing or zero.
 * @returns {Promise<object>} A snap() result.
 */
async function handleScroll(body) {
  const p = await getPage();
  await p.mouse.wheel(0, toFiniteNumber(body.dy, 0) || 300);
  await p.waitForTimeout(300);
  return snapAndBroadcast();
}

/**
 * Go back in history; navigation failures are ignored.
 * @returns {Promise<object>} A snap() result.
 */
async function handleBack() {
  const p = await getPage();
  await p.goBack({ timeout: 10000 }).catch(() => {});
  return snapAndBroadcast();
}

/**
 * Go forward in history; navigation failures are ignored.
 * @returns {Promise<object>} A snap() result.
 */
async function handleForward() {
  const p = await getPage();
  await p.goForward({ timeout: 10000 }).catch(() => {});
  return snapAndBroadcast();
}

/**
 * Reload the page; navigation failures are ignored.
 * @returns {Promise<object>} A snap() result.
 */
async function handleReload() {
  const p = await getPage();
  await p.reload({ timeout: 15000 }).catch(() => {});
  return snapAndBroadcast();
}

/**
 * Return the page's visible text (first 8000 characters) and its elements.
 * @returns {Promise<{url: string, title: string, text: string,
 *   elements: Array<object>, elementsText: string}>}
 */
async function handleText() {
  const p = await getPage();
  // document.body can be null, so fall back to empty text.
  const text = await p.evaluate(() => document.body?.innerText ?? '');
  const elements = await extractElements(p);
  return {
    url: p.url(),
    title: await p.title(),
    text: text.slice(0, 8000),
    elements,
    elementsText: formatElements(elements),
  };
}

/**
 * Render an evaluate() result as a string: JSON for objects and arrays,
 * `String(value)` for everything else or when JSON serialization fails.
 * @param {unknown} value
 * @returns {string}
 */
function stringifyEvalResult(value) {
  if (value !== null && typeof value === 'object') {
    try {
      return JSON.stringify(value);
    } catch {
      return String(value);
    }
  }
  return String(value);
}

/**
 * Evaluate a script in the page.
 * SECURITY NOTE(review): runs caller-supplied JavaScript in the page context.
 * @param {{script?: string}} body - Defaults to the expression `null`.
 * @returns {Promise<object>} A snap() result plus a string `evalResult`.
 */
async function handleEval(body) {
  const p = await getPage();
  const result = await p.evaluate(body.script || 'null');
  const ss = await snapAndBroadcast();
  return { ...ss, evalResult: stringifyEvalResult(result) };
}

/**
 * Return the element list without taking a screenshot.
 * @returns {Promise<{url: string, title: string, elements: Array<object>,
 *   elementsText: string}>}
 */
async function handleElements() {
  const p = await getPage();
  const elements = await extractElements(p);
  return { url: p.url(), title: await p.title(), elements, elementsText: formatElements(elements) };
}

/**
 * Press a specific key (Enter, Tab, Escape, etc.)
 * @param {{key?: string}} body - Defaults to 'Enter'.
 * @returns {Promise<object>} A snap() result.
 */
async function handleKeypress(body) {
  const p = await getPage();
  await p.keyboard.press(body.key || 'Enter');
  await p.waitForTimeout(300);
  return snapAndBroadcast();
}

// ── HTTP ROUTES ───────────────────────────────────────────────
const routes = {
  '/api/bash': handleBash,
  '/api/browser/navigate': handleNavigate,
  '/api/browser/click': handleClick,
  '/api/browser/type': handleType,
  '/api/browser/scroll': handleScroll,
  '/api/browser/back': handleBack,
  '/api/browser/forward': handleForward,
  '/api/browser/reload': handleReload,
  '/api/browser/screenshot': () => snapAndBroadcast(),
  '/api/browser/text': handleText,
  '/api/browser/eval': handleEval,
  '/api/browser/elements': handleElements,
  '/api/browser/keypress': handleKeypress,
};

// ── HTTP SERVER ───────────────────────────────────────────────
const server = http.createServer(async (req, res) => {
  const respond = (status, payload) => {
    // Serialize before writing headers so a failure cannot leave a
    // half-written response.
    const text = JSON.stringify(payload);
    res.writeHead(status, cors);
    res.end(text);
  };

  if (req.method === 'OPTIONS') {
    res.writeHead(204, cors);
    res.end();
    return;
  }

  // Route on the path only, so a query string does not cause a 404.
  const pathname = getPathname(req.url);

  if (pathname === '/health') {
    respond(200, { ok: true, browser: Boolean(browser?.isConnected()) });
    return;
  }

  const handler = Object.hasOwn(routes, pathname) ? routes[pathname] : undefined;
  if (req.method === 'POST' && handler) {
    try {
      const body = await parseBody(req);
      const result = await handler(body);
      respond(200, result);
    } catch (err) {
      console.error(`[tool-server] ${pathname} failed:`, err);
      if (!res.headersSent) respond(500, { error: String(err) });
      else res.end();
    }
    return;
  }

  respond(404, { error: 'Not found' });
});

// ── WEBSOCKET: TERMINAL (/ws/terminal) ────────────────────────
// Each connection gets its own bash process; output is forwarded as
// { type: 'data' } messages and { type: 'input' } messages feed stdin.
const terminalWss = new WebSocketServer({ noServer: true });

terminalWss.on('connection', (ws) => {
  console.log('[tool-server] Terminal WS client connected');

  const shell = spawn('/bin/bash', [], {
    env: { ...process.env, TERM: 'xterm-256color', COLORTERM: 'truecolor' },
    cwd: process.env.HOME || '/root',
  });

  const forwardOutput = (d) => {
    sendMessage(ws, { type: 'data', data: d.toString('binary') });
  };
  shell.stdout.on('data', forwardOutput);
  shell.stderr.on('data', forwardOutput);

  // Without these listeners an 'error' event would be thrown as an uncaught
  // exception and take the whole server down.
  shell.on('error', (err) => {
    console.error('[tool-server] Terminal shell error:', err.message);
    try { ws.close(); } catch { /* socket already gone */ }
  });
  shell.stdin.on('error', (err) => {
    console.error('[tool-server] Terminal stdin error:', err.message);
  });

  shell.on('close', (code) => {
    sendMessage(ws, { type: 'exit', code });
    try { ws.close(); } catch { /* socket already gone */ }
  });

  ws.on('message', (msg) => {
    let m;
    try {
      m = JSON.parse(msg.toString());
    } catch {
      return; // ignore frames that are not JSON
    }
    if (m?.type === 'input' && typeof m.data === 'string' && shell.stdin.writable) {
      shell.stdin.write(m.data);
    }
    // 'resize' is accepted but has no effect without node-pty.
  });

  ws.on('close', () => { shell.kill(); });
  ws.on('error', (err) => {
    console.error('[tool-server] Terminal WS error:', err.message);
    shell.kill();
  });
});

// ── WEBSOCKET: BROWSER PANEL (/ws/browser) ────────────────────

// Panel message handlers keyed by message `type`. Each receives the parsed
// message and the originating socket.
const panelActions = {
  async navigate(m, ws) {
    const target = normalizeUrl(m.url);
    if (!target) throw new Error('No URL');
    const p = await getPage();
    sendMessage(ws, { type: 'loading' });
    await p.goto(target, { timeout: 30000, waitUntil: 'domcontentloaded' });
    await broadcastScreenshot();
  },
  async click(m) {
    const p = await getPage();
    await p.mouse.click(toFiniteNumber(m.x, 0), toFiniteNumber(m.y, 0));
    await p.waitForTimeout(500);
    await broadcastScreenshot();
  },
  async scroll(m) {
    const p = await getPage();
    await p.mouse.wheel(0, toFiniteNumber(m.dy, 0) || 300);
    await p.waitForTimeout(300);
    await broadcastScreenshot();
  },
  async type(m) {
    const p = await getPage();
    await p.keyboard.type(toText(m.text));
    await p.waitForTimeout(300);
    await broadcastScreenshot();
  },
  async back() {
    const p = await getPage();
    await p.goBack({ timeout: 10000 }).catch(() => {});
    await broadcastScreenshot();
  },
  async fwd() {
    const p = await getPage();
    await p.goForward({ timeout: 10000 }).catch(() => {});
    await broadcastScreenshot();
  },
  async reload() {
    const p = await getPage();
    await p.reload({ timeout: 15000 }).catch(() => {});
    await broadcastScreenshot();
  },
  async screenshot() {
    await broadcastScreenshot();
  },
};

const browserWss = new WebSocketServer({ noServer: true });

browserWss.on('connection', async (ws) => {
  console.log('[tool-server] Browser panel WS client connected');
  browserPanelClients.add(ws);

  ws.on('close', () => browserPanelClients.delete(ws));
  ws.on('error', () => browserPanelClients.delete(ws));

  // Handle panel user interactions
  ws.on('message', async (msg) => {
    try {
      const m = JSON.parse(msg.toString());
      const type = m?.type;
      // Unknown message types are ignored.
      if (typeof type === 'string' && Object.hasOwn(panelActions, type)) {
        await panelActions[type](m, ws);
      }
    } catch (e) {
      sendMessage(ws, { type: 'error', msg: String(e) });
    }
  });

  sendMessage(ws, { type: 'ready' });
  // Show the current page straight away if one is already open.
  if (page && !page.isClosed()) {
    await broadcastScreenshot();
  }
});

// ── UPGRADE HANDLER ───────────────────────────────────────────
server.on('upgrade', (req, socket, head) => {
  const pathname = getPathname(req.url);
  if (pathname === '/ws/terminal') {
    terminalWss.handleUpgrade(req, socket, head, (ws) => terminalWss.emit('connection', ws, req));
  } else if (pathname === '/ws/browser') {
    browserWss.handleUpgrade(req, socket, head, (ws) => browserWss.emit('connection', ws, req));
  } else {
    socket.destroy();
  }
});

// ── START ─────────────────────────────────────────────────────
server.listen(PORT, () => {
  console.log(`[tool-server] Unified server on http://localhost:${PORT}`);
  console.log(`[tool-server] HTTP API: POST /api/bash, /api/browser/*`);
  console.log(`[tool-server] Terminal WS: ws://localhost:${PORT}/ws/terminal`);
  console.log(`[tool-server] Browser WS: ws://localhost:${PORT}/ws/browser`);
  console.log(`[tool-server] Health: GET /health`);
});