Files
nanobot-ts/src/agent/tools/web.ts
2026-03-13 14:46:15 -06:00

195 lines
6.8 KiB
TypeScript

import { Readability } from '@mozilla/readability';
import { parse as parseHtml } from 'node-html-parser';
import { strArg } from './base.ts';
import type { Tool } from './base.ts';
// Hard cap on how long any single fetch may run before it is aborted.
const FETCH_TIMEOUT_MS = 30_000;
// Upper bound on characters returned to the model; longer bodies are truncated.
const MAX_CONTENT_CHARS = 50_000;
// ---------------------------------------------------------------------------
// web_search (Brave Search API)
// ---------------------------------------------------------------------------
export class WebSearchTool implements Tool {
  readonly name = 'web_search';
  readonly description =
    'Search the web using Brave Search. Returns a list of results with titles, URLs, and snippets.';
  readonly parameters = {
    query: { type: 'string', description: 'Search query.' },
    count: { type: 'number', description: 'Number of results (default 10, max 20).' },
  };
  readonly required = ['query'];

  private _apiKey: string | undefined;
  // NOTE(review): proxy is accepted and stored but never applied to fetch()
  // anywhere in this file — wiring it up would need e.g. an undici dispatcher.
  // Confirm whether this is intentionally deferred.
  private _proxy: string | undefined;

  constructor(opts: { apiKey?: string; proxy?: string } = {}) {
    this._apiKey = opts.apiKey;
    this._proxy = opts.proxy;
  }

  /**
   * Run a Brave web search.
   *
   * @param args - `query` (required) and optional `count` (clamped to 1-20,
   *   default 10).
   * @returns A numbered plain-text result list, or an `Error: ...` string on
   *   any failure (missing key, HTTP error, network error) — never throws.
   */
  async execute(args: Record<string, unknown>): Promise<string> {
    const query = strArg(args, 'query').trim();
    if (!query) return 'Error: query is required.';
    if (!this._apiKey)
      return 'Error: BRAVE_API_KEY not configured (set tools.web.braveApiKey in config).';
    // Coerce and clamp count to [1, 20]. The previous code did
    // Math.min(Number(...), 20), which let NaN (e.g. count="five") and
    // zero/negative values leak into the request URL.
    const requestedCount = Number(args['count'] ?? 10);
    const count = Number.isFinite(requestedCount)
      ? Math.min(Math.max(Math.trunc(requestedCount), 1), 20)
      : 10;
    const url = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${count}`;
    try {
      const res = await fetchWithTimeout(url, {
        headers: {
          Accept: 'application/json',
          'Accept-Encoding': 'gzip',
          'X-Subscription-Token': this._apiKey,
        },
      });
      if (!res.ok) return `Error: Brave Search API returned ${res.status}: ${await res.text()}`;
      // description is optional in practice — the formatter below guards with ?? ''.
      const data = (await res.json()) as {
        web?: { results?: Array<{ title: string; url: string; description?: string }> };
      };
      const results = data.web?.results ?? [];
      if (results.length === 0) return 'No results found.';
      return results
        .map((r, i) => `${i + 1}. ${r.title}\n ${r.url}\n ${r.description ?? ''}`)
        .join('\n\n');
    } catch (err) {
      return `Error: ${String(err)}`;
    }
  }
}
// ---------------------------------------------------------------------------
// web_fetch
// ---------------------------------------------------------------------------
export class WebFetchTool implements Tool {
  readonly name = 'web_fetch';
  readonly description =
    'Fetch a URL and return its content. HTML pages are extracted to readable text. Use mode="raw" for JSON/XML/plain text.';
  readonly parameters = {
    url: { type: 'string', description: 'URL to fetch.' },
    mode: {
      type: 'string',
      enum: ['markdown', 'text', 'raw'],
      description: 'Output mode (default: text).',
    },
  };
  readonly required = ['url'];

  // NOTE(review): stored but never read in this file — confirm intent.
  private _proxy: string | undefined;

  constructor(opts: { proxy?: string } = {}) {
    this._proxy = opts.proxy;
  }

  /**
   * Fetch `args.url` and return its content as a string.
   *
   * HTML responses are run through Readability and reduced to readable text
   * (prefixed with `# Title` when a title is found); everything else — or any
   * response when `mode === 'raw'` — is returned as-is. Output is capped at
   * MAX_CONTENT_CHARS. Note: "markdown" and "text" modes are currently
   * handled identically. Never throws; failures come back as `Error ...`
   * strings.
   */
  async execute(args: Record<string, unknown>): Promise<string> {
    const url = strArg(args, 'url').trim();
    if (!url) return 'Error: url is required.';
    const mode = strArg(args, 'mode', 'text');
    try {
      const res = await fetchWithTimeout(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; nanobot/1.0)' },
      });
      if (!res.ok) return `Error: HTTP ${res.status} from ${url}`;
      // Header values can use any casing; normalize so e.g.
      // "Text/HTML; charset=utf-8" is still recognized as HTML.
      const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
      const body = await res.text();
      const looksLikeHtml =
        contentType.includes('text/html') || body.trimStart().startsWith('<');
      if (mode === 'raw' || !looksLikeHtml) return this._clip(body);
      // HTML path: Readability needs a DOM, which Bun/Node lack — build a
      // duck-typed stand-in from node-html-parser. Falls back to naive
      // tag-stripping when Readability cannot extract an article.
      const root = parseHtml(body);
      // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing requires any
      const doc = makePseudoDocument(url, body, root) as any;
      const article = new Readability(doc).parse();
      const title = article?.title ?? '';
      const textContent = article?.textContent ?? stripTags(body);
      // Collapse runs of 3+ newlines that extraction tends to leave behind.
      const trimmed = textContent.replace(/\n{3,}/g, '\n\n').trim();
      const clipped = this._clip(trimmed);
      return title ? `# ${title}\n\n${clipped}` : clipped;
    } catch (err) {
      return `Error fetching ${url}: ${String(err)}`;
    }
  }

  /** Cap text at MAX_CONTENT_CHARS, appending a marker when cut (previously duplicated inline twice). */
  private _clip(text: string): string {
    return text.length > MAX_CONTENT_CHARS
      ? text.slice(0, MAX_CONTENT_CHARS) + '\n... (truncated)'
      : text;
  }
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * fetch() with a hard timeout.
 *
 * @param url - Target URL, passed straight to fetch.
 * @param init - Standard fetch options; `signal` is overridden (see NOTE).
 * @param timeoutMs - Abort deadline in milliseconds (default FETCH_TIMEOUT_MS).
 * @returns The fetch promise; rejects with a descriptive timeout Error when
 *   the deadline passes. The timer is always cleared, win or lose.
 */
function fetchWithTimeout(
  url: string,
  init: RequestInit = {},
  timeoutMs: number = FETCH_TIMEOUT_MS,
): Promise<Response> {
  const controller = new AbortController();
  // Abort with an explicit reason so callers see "timed out after Nms"
  // instead of a bare, uninformative AbortError.
  const timer = setTimeout(
    () => controller.abort(new Error(`fetch timed out after ${timeoutMs}ms: ${url}`)),
    timeoutMs,
  );
  // NOTE(review): a caller-supplied init.signal is clobbered here; acceptable
  // while no caller in this file passes one, but AbortSignal.any() would be
  // needed to compose them.
  return fetch(url, { ...init, signal: controller.signal }).finally(() => clearTimeout(timer));
}
/**
 * Crude HTML-to-text fallback: drop every tag, then squeeze all whitespace
 * runs down to single spaces and trim the ends.
 */
function stripTags(html: string): string {
  const withoutMarkup = html.replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutMarkup.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Build a minimal pseudo-document that satisfies Readability's interface.
 *
 * Bun/Node have no built-in DOMParser, so instead of a real DOM this
 * duck-types the document members Readability reads: the URL fields, title,
 * documentElement/body/head, query/create methods, and innerHTML.
 *
 * @param url  - Source URL; exposed as baseURI/documentURI/URL/location.href.
 * @param html - Raw HTML string; returned verbatim by the innerHTML getter.
 * @param root - node-html-parser root for the same HTML; backs all queries.
 * @returns A plain object to be cast to `Document` at the call site.
 */
function makePseudoDocument(
  url: string,
  html: string,
  root: ReturnType<typeof parseHtml>,
): Record<string, unknown> {
  // node-html-parser's API is close enough for Readability's needs when
  // accessed via a proxy. We create a real DOMParser-like wrapper.
  const pseudoDoc = {
    baseURI: url,
    documentURI: url,
    URL: url,
    // Empty string when the page has no <title> element.
    title: root.querySelector('title')?.text ?? '',
    documentElement: root,
    // Fragment-like HTML may lack <body>/<head>; fall back to the root node.
    body: root.querySelector('body') ?? root,
    head: root.querySelector('head') ?? root,
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    querySelector: (sel: string) => root.querySelector(sel) as any,
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    querySelectorAll: (sel: string) => root.querySelectorAll(sel) as any,
    // NOTE(review): returns a plain array rather than a live HTMLCollection —
    // presumably enough for Readability's iteration; re-verify on upgrades.
    getElementsByTagName: (tag: string) => root.querySelectorAll(tag),
    // Inert stubs for DOM factory methods Readability may invoke; their
    // results appear unused here — NOTE(review): confirm against the pinned
    // @mozilla/readability version.
    createElement: (_tag: string) => ({ innerHTML: '', textContent: '', style: {} }),
    createTreeWalker: () => ({ nextNode: () => null }),
    createRange: () => ({ selectNodeContents: () => {}, cloneContents: () => null }),
    // biome-ignore lint/suspicious/noExplicitAny: Readability duck-typing
    get innerHTML() {
      return html;
    },
    location: { href: url },
  };
  return pseudoDoc;
}