Files
nanobot-ts/src/agent/tools/web.ts
2026-03-13 14:46:15 -06:00

195 lines
6.8 KiB
TypeScript

import { Readability } from '@mozilla/readability';
import { parse as parseHtml } from 'node-html-parser';
import { strArg } from './base.ts';
import type { Tool } from './base.ts';
// Hard cap on how long any single fetch may run before it is aborted.
const FETCH_TIMEOUT_MS = 30_000;
// Upper bound on characters returned to the model; longer bodies are truncated.
const MAX_CONTENT_CHARS = 50_000;
// ---------------------------------------------------------------------------
// web_search (Brave Search API)
// ---------------------------------------------------------------------------
export class WebSearchTool implements Tool {
  readonly name = 'web_search';
  readonly description =
    'Search the web using Brave Search. Returns a list of results with titles, URLs, and snippets.';
  readonly parameters = {
    query: { type: 'string', description: 'Search query.' },
    count: { type: 'number', description: 'Number of results (default 10, max 20).' },
  };
  readonly required = ['query'];

  private _apiKey: string | undefined;
  // NOTE(review): proxy is accepted and stored but never applied to fetch()
  // anywhere in this file — wiring it up would need e.g. an undici dispatcher.
  // Confirm whether this is intentionally deferred.
  private _proxy: string | undefined;

  constructor(opts: { apiKey?: string; proxy?: string } = {}) {
    this._apiKey = opts.apiKey;
    this._proxy = opts.proxy;
  }

  /**
   * Run a Brave web search.
   *
   * @param args - `query` (required) and optional `count` (clamped to 1-20,
   *   default 10).
   * @returns A numbered plain-text result list, or an `Error: ...` string on
   *   any failure (missing key, HTTP error, network error) — never throws.
   */
  async execute(args: Record<string, unknown>): Promise<string> {
    const query = strArg(args, 'query').trim();
    if (!query) return 'Error: query is required.';
    if (!this._apiKey)
      return 'Error: BRAVE_API_KEY not configured (set tools.web.braveApiKey in config).';
    // Coerce and clamp count to [1, 20]. The previous code did
    // Math.min(Number(...), 20), which let NaN (e.g. count="five") and
    // zero/negative values leak into the request URL.
    const requestedCount = Number(args['count'] ?? 10);
    const count = Number.isFinite(requestedCount)
      ? Math.min(Math.max(Math.trunc(requestedCount), 1), 20)
      : 10;
    const url = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${count}`;
    try {
      const res = await fetchWithTimeout(url, {
        headers: {
          Accept: 'application/json',
          'Accept-Encoding': 'gzip',
          'X-Subscription-Token': this._apiKey,
        },
      });
      if (!res.ok) return `Error: Brave Search API returned ${res.status}: ${await res.text()}`;
      // description is optional in practice — the formatter below guards with ?? ''.
      const data = (await res.json()) as {
        web?: { results?: Array<{ title: string; url: string; description?: string }> };
      };
      const results = data.web?.results ?? [];
      if (results.length === 0) return 'No results found.';
      return results
        .map((r, i) => `${i + 1}. ${r.title}\n ${r.url}\n ${r.description ?? ''}`)
        .join('\n\n');
    } catch (err) {
      return `Error: ${String(err)}`;
    }
  }
}
// ---------------------------------------------------------------------------
// web_fetch
// ---------------------------------------------------------------------------
export class WebFetchTool implements Tool {
  readonly name = 'web_fetch';
  readonly description =
    'Fetch a URL and return its content. HTML pages are extracted to readable text. Use mode="raw" for JSON/XML/plain text.';
  readonly parameters = {
    url: { type: 'string', description: 'URL to fetch.' },
    mode: {
      type: 'string',
      enum: ['markdown', 'text', 'raw'],
      description: 'Output mode (default: text).',
    },
  };
  readonly required = ['url'];

  // NOTE(review): stored but never read in this file — confirm intent.
  private _proxy: string | undefined;

  constructor(opts: { proxy?: string } = {}) {
    this._proxy = opts.proxy;
  }

  /**
   * Fetch `args.url` and return its content as a string.
   *
   * HTML responses are run through Readability and reduced to readable text
   * (prefixed with `# Title` when a title is found); everything else — or any
   * response when `mode === 'raw'` — is returned as-is. Output is capped at
   * MAX_CONTENT_CHARS. Note: "markdown" and "text" modes are currently
   * handled identically. Never throws; failures come back as `Error ...`
   * strings.
   */
  async execute(args: Record<string, unknown>): Promise<string> {
    const url = strArg(args, 'url').trim();
    if (!url) return 'Error: url is required.';
    const mode = strArg(args, 'mode', 'text');
    try {
      const res = await fetchWithTimeout(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; nanobot/1.0)' },
      });
      if (!res.ok) return `Error: HTTP ${res.status} from ${url}`;
      // Header values can use any casing; normalize so e.g.
      // "Text/HTML; charset=utf-8" is still recognized as HTML.
      const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
      const body = await res.text();
      const looksLikeHtml =
        contentType.includes('text/html') || body.trimStart().startsWith('<');
      if (mode === 'raw' || !looksLikeHtml) return this._clip(body);
      // HTML path: Readability needs a DOM, which Bun/Node lack — build a
      // duck-typed stand-in from node-html-parser. Falls back to naive
      // tag-stripping when Readability cannot extract an article.
      const root = parseHtml(body);
      // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing requires any
      const doc = makePseudoDocument(url, body, root) as any;
      const article = new Readability(doc).parse();
      const title = article?.title ?? '';
      const textContent = article?.textContent ?? stripTags(body);
      // Collapse runs of 3+ newlines that extraction tends to leave behind.
      const trimmed = textContent.replace(/\n{3,}/g, '\n\n').trim();
      const clipped = this._clip(trimmed);
      return title ? `# ${title}\n\n${clipped}` : clipped;
    } catch (err) {
      return `Error fetching ${url}: ${String(err)}`;
    }
  }

  /** Cap text at MAX_CONTENT_CHARS, appending a marker when cut (previously duplicated inline twice). */
  private _clip(text: string): string {
    return text.length > MAX_CONTENT_CHARS
      ? text.slice(0, MAX_CONTENT_CHARS) + '\n... (truncated)'
      : text;
  }
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * fetch() with a hard timeout.
 *
 * @param url - Target URL, passed straight to fetch.
 * @param init - Standard fetch options; `signal` is overridden (see NOTE).
 * @param timeoutMs - Abort deadline in milliseconds (default FETCH_TIMEOUT_MS).
 * @returns The fetch promise; rejects with a descriptive timeout Error when
 *   the deadline passes. The timer is always cleared, win or lose.
 */
function fetchWithTimeout(
  url: string,
  init: RequestInit = {},
  timeoutMs: number = FETCH_TIMEOUT_MS,
): Promise<Response> {
  const controller = new AbortController();
  // Abort with an explicit reason so callers see "timed out after Nms"
  // instead of a bare, uninformative AbortError.
  const timer = setTimeout(
    () => controller.abort(new Error(`fetch timed out after ${timeoutMs}ms: ${url}`)),
    timeoutMs,
  );
  // NOTE(review): a caller-supplied init.signal is clobbered here; acceptable
  // while no caller in this file passes one, but AbortSignal.any() would be
  // needed to compose them.
  return fetch(url, { ...init, signal: controller.signal }).finally(() => clearTimeout(timer));
}
/**
 * Crude HTML-to-text fallback: drop every tag, then squeeze all whitespace
 * runs down to single spaces and trim the ends.
 */
function stripTags(html: string): string {
  const withoutMarkup = html.replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutMarkup.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Build a minimal pseudo-document that satisfies Readability's interface.
 *
 * Bun/Node have no built-in DOMParser, so instead of a real DOM this
 * duck-types the document members Readability reads: the URL fields, title,
 * documentElement/body/head, query/create methods, and innerHTML.
 *
 * @param url  - Source URL; exposed as baseURI/documentURI/URL/location.href.
 * @param html - Raw HTML string; returned verbatim by the innerHTML getter.
 * @param root - node-html-parser root for the same HTML; backs all queries.
 * @returns A plain object to be cast to `Document` at the call site.
 */
function makePseudoDocument(
  url: string,
  html: string,
  root: ReturnType<typeof parseHtml>,
): Record<string, unknown> {
  // node-html-parser's API is close enough for Readability's needs when
  // accessed via a proxy. We create a real DOMParser-like wrapper.
  const pseudoDoc = {
    baseURI: url,
    documentURI: url,
    URL: url,
    // Empty string when the page has no <title> element.
    title: root.querySelector('title')?.text ?? '',
    documentElement: root,
    // Fragment-like HTML may lack <body>/<head>; fall back to the root node.
    body: root.querySelector('body') ?? root,
    head: root.querySelector('head') ?? root,
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    querySelector: (sel: string) => root.querySelector(sel) as any,
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    querySelectorAll: (sel: string) => root.querySelectorAll(sel) as any,
    // NOTE(review): returns a plain array rather than a live HTMLCollection —
    // presumably enough for Readability's iteration; re-verify on upgrades.
    getElementsByTagName: (tag: string) => root.querySelectorAll(tag),
    // Inert stubs for DOM factory methods Readability may invoke; their
    // results appear unused here — NOTE(review): confirm against the pinned
    // @mozilla/readability version.
    createElement: (_tag: string) => ({ innerHTML: '', textContent: '', style: {} }),
    createTreeWalker: () => ({ nextNode: () => null }),
    createRange: () => ({ selectNodeContents: () => {}, cloneContents: () => null }),
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    get innerHTML() {
      return html;
    },
    location: { href: url },
  };
  return pseudoDoc;
}