import { Readability } from '@mozilla/readability';
|
|
import { parse as parseHtml } from 'node-html-parser';
|
|
import { strArg } from './base.ts';
|
|
import type { Tool } from './base.ts';
|
|
|
|
// Abort any in-flight HTTP request after this many milliseconds (see fetchWithTimeout).
const FETCH_TIMEOUT_MS = 30_000;

// Hard cap on the number of characters returned from a fetched/extracted body;
// longer content is cut and marked "... (truncated)".
const MAX_CONTENT_CHARS = 50_000;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// web_search (Brave Search API)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export class WebSearchTool implements Tool {
|
|
readonly name = 'web_search';
|
|
readonly description =
|
|
'Search the web using Brave Search. Returns a list of results with titles, URLs, and snippets.';
|
|
readonly parameters = {
|
|
query: { type: 'string', description: 'Search query.' },
|
|
count: { type: 'number', description: 'Number of results (default 10, max 20).' },
|
|
};
|
|
readonly required = ['query'];
|
|
|
|
private _apiKey: string | undefined;
|
|
private _proxy: string | undefined;
|
|
|
|
constructor(opts: { apiKey?: string; proxy?: string } = {}) {
|
|
this._apiKey = opts.apiKey;
|
|
this._proxy = opts.proxy;
|
|
}
|
|
|
|
async execute(args: Record<string, unknown>): Promise<string> {
|
|
const query = strArg(args, 'query').trim();
|
|
if (!query) return 'Error: query is required.';
|
|
if (!this._apiKey)
|
|
return 'Error: BRAVE_API_KEY not configured (set tools.web.braveApiKey in config).';
|
|
|
|
const count = Math.min(Number(args['count'] ?? 10), 20);
|
|
const url = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${count}`;
|
|
|
|
try {
|
|
const res = await fetchWithTimeout(url, {
|
|
headers: {
|
|
Accept: 'application/json',
|
|
'Accept-Encoding': 'gzip',
|
|
'X-Subscription-Token': this._apiKey,
|
|
},
|
|
});
|
|
|
|
if (!res.ok) return `Error: Brave Search API returned ${res.status}: ${await res.text()}`;
|
|
|
|
const data = (await res.json()) as {
|
|
web?: { results?: Array<{ title: string; url: string; description: string }> };
|
|
};
|
|
const results = data.web?.results ?? [];
|
|
|
|
if (results.length === 0) return 'No results found.';
|
|
|
|
return results
|
|
.map((r, i) => `${i + 1}. ${r.title}\n ${r.url}\n ${r.description ?? ''}`)
|
|
.join('\n\n');
|
|
} catch (err) {
|
|
return `Error: ${String(err)}`;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// web_fetch
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export class WebFetchTool implements Tool {
|
|
readonly name = 'web_fetch';
|
|
readonly description =
|
|
'Fetch a URL and return its content. HTML pages are extracted to readable text. Use mode="raw" for JSON/XML/plain text.';
|
|
readonly parameters = {
|
|
url: { type: 'string', description: 'URL to fetch.' },
|
|
mode: {
|
|
type: 'string',
|
|
enum: ['markdown', 'text', 'raw'],
|
|
description: 'Output mode (default: text).',
|
|
},
|
|
};
|
|
readonly required = ['url'];
|
|
|
|
private _proxy: string | undefined;
|
|
|
|
constructor(opts: { proxy?: string } = {}) {
|
|
this._proxy = opts.proxy;
|
|
}
|
|
|
|
async execute(args: Record<string, unknown>): Promise<string> {
|
|
const url = strArg(args, 'url').trim();
|
|
if (!url) return 'Error: url is required.';
|
|
|
|
const mode = strArg(args, 'mode', 'text');
|
|
|
|
try {
|
|
const res = await fetchWithTimeout(url, {
|
|
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; nanobot/1.0)' },
|
|
});
|
|
|
|
if (!res.ok) return `Error: HTTP ${res.status} from ${url}`;
|
|
|
|
const contentType = res.headers.get('content-type') ?? '';
|
|
const body = await res.text();
|
|
|
|
if (
|
|
mode === 'raw' ||
|
|
(!contentType.includes('text/html') && !body.trimStart().startsWith('<'))
|
|
) {
|
|
const truncated =
|
|
body.length > MAX_CONTENT_CHARS
|
|
? body.slice(0, MAX_CONTENT_CHARS) + '\n... (truncated)'
|
|
: body;
|
|
return truncated;
|
|
}
|
|
|
|
// Parse HTML with Readability
|
|
// Readability needs a DOM — build one from node-html-parser
|
|
const root = parseHtml(body);
|
|
|
|
// Minimal JSDOM-compatible interface for Readability
|
|
// biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing requires any
|
|
const doc = makePseudoDocument(url, body, root) as any;
|
|
const reader = new Readability(doc);
|
|
const article = reader.parse();
|
|
|
|
const title = article?.title ?? '';
|
|
const textContent = article?.textContent ?? stripTags(body);
|
|
const trimmed = textContent.replace(/\n{3,}/g, '\n\n').trim();
|
|
const truncated =
|
|
trimmed.length > MAX_CONTENT_CHARS
|
|
? trimmed.slice(0, MAX_CONTENT_CHARS) + '\n... (truncated)'
|
|
: trimmed;
|
|
|
|
return title ? `# ${title}\n\n${truncated}` : truncated;
|
|
} catch (err) {
|
|
return `Error fetching ${url}: ${String(err)}`;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function fetchWithTimeout(url: string, init: RequestInit = {}): Promise<Response> {
|
|
const controller = new AbortController();
|
|
const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
|
|
return fetch(url, { ...init, signal: controller.signal }).finally(() => clearTimeout(timer));
|
|
}
|
|
|
|
function stripTags(html: string): string {
|
|
return html
|
|
.replace(/<[^>]*>/g, ' ')
|
|
.replace(/\s+/g, ' ')
|
|
.trim();
|
|
}
|
|
|
|
/** Build a minimal pseudo-document that satisfies Readability's interface. */
|
|
function makePseudoDocument(
|
|
url: string,
|
|
html: string,
|
|
root: ReturnType<typeof parseHtml>,
|
|
): Record<string, unknown> {
|
|
// node-html-parser's API is close enough for Readability's needs when
|
|
// accessed via a proxy. We create a real DOMParser-like wrapper.
|
|
// Bun/Node don't have DOMParser built-in, so we duck-type what Readability
|
|
// needs: baseURI, documentURI, querySelector, querySelectorAll, innerHTML.
|
|
const pseudoDoc = {
|
|
baseURI: url,
|
|
documentURI: url,
|
|
URL: url,
|
|
title: root.querySelector('title')?.text ?? '',
|
|
documentElement: root,
|
|
body: root.querySelector('body') ?? root,
|
|
head: root.querySelector('head') ?? root,
|
|
// biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
|
|
querySelector: (sel: string) => root.querySelector(sel) as any,
|
|
// biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
|
|
querySelectorAll: (sel: string) => root.querySelectorAll(sel) as any,
|
|
getElementsByTagName: (tag: string) => root.querySelectorAll(tag),
|
|
createElement: (_tag: string) => ({ innerHTML: '', textContent: '', style: {} }),
|
|
createTreeWalker: () => ({ nextNode: () => null }),
|
|
createRange: () => ({ selectNodeContents: () => {}, cloneContents: () => null }),
|
|
// biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
|
|
get innerHTML() {
|
|
return html;
|
|
},
|
|
location: { href: url },
|
|
};
|
|
|
|
return pseudoDoc;
|
|
}
|