diff --git a/src/services/firecrawl-client.ts b/src/services/firecrawl-client.ts index 1396f26..69f126e 100644 --- a/src/services/firecrawl-client.ts +++ b/src/services/firecrawl-client.ts @@ -1,11 +1,19 @@ import defaultKy, { type KyInstance } from 'ky' +import pThrottle from 'p-throttle' import z from 'zod' import { aiFunction, AIFunctionsProvider } from '../fns.js' -import { assert, delay, getEnv } from '../utils.js' +import { assert, delay, getEnv, throttleKy } from '../utils.js' import { zodToJsonSchema } from '../zod-to-json-schema.js' export namespace firecrawl { + // Allow up to 1 request per second by default. + export const throttle = pThrottle({ + limit: 1, + interval: 1000, + strict: true + }) + /** * Generic parameter interface. */ @@ -96,11 +104,13 @@ export class FirecrawlClient extends AIFunctionsProvider { apiKey = getEnv('FIRECRAWL_API_KEY'), apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ?? 'https://api.firecrawl.dev', + throttle = true, timeoutMs = 60_000, ky = defaultKy }: { apiKey?: string apiBaseUrl?: string + throttle?: boolean timeoutMs?: number ky?: KyInstance } = {}) { @@ -117,7 +127,9 @@ export class FirecrawlClient extends AIFunctionsProvider { this.apiKey = apiKey this.apiBaseUrl = apiBaseUrl - this.ky = ky.extend({ + const throttledKy = throttle ? throttleKy(ky, firecrawl.throttle) : ky + + this.ky = throttledKy.extend({ prefixUrl: apiBaseUrl, timeout: timeoutMs, headers: { @@ -155,18 +167,7 @@ export class FirecrawlClient extends AIFunctionsProvider { } } - const res = await this.ky - .post('v0/scrape', { json }) - .json() - - if (!res.success || !res.data) return res - - if (res.data.markdown) { - delete res.data.html - delete res.data.content - } - - return res + return this.ky.post('v0/scrape', { json }).json() } async search( diff --git a/src/services/perigon-client.ts b/src/services/perigon-client.ts index e77c28d..5822be0 100644 --- a/src/services/perigon-client.ts +++ b/src/services/perigon-client.ts @@ -681,20 +681,20 @@ export class PerigonClient extends AIFunctionsProvider { }) }) async searchArticles(opts: perigon.ArticlesSearchOptions) { - const searchParams = sanitizeSearchParams({ - sortBy: 'relevance', - ...opts, - apiKey: this.apiKey, - size: Math.max( - 1, - Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE) - ) - }) - console.log('perigon.searchArticles', searchParams) - return this.ky .get('all', { - searchParams + searchParams: sanitizeSearchParams({ + sortBy: 'relevance', + ...opts, + apiKey: this.apiKey, + size: Math.max( + 1, + Math.min( + perigon.MAX_PAGE_SIZE, + opts.size || perigon.DEFAULT_PAGE_SIZE + ) + ) + }) }) .json() } @@ -721,20 +721,20 @@ export class PerigonClient extends AIFunctionsProvider { }) }) async searchStories(opts: perigon.StoriesSearchOptions) { - const searchParams = sanitizeSearchParams({ - sortBy: 'relevance', - ...opts, - apiKey: this.apiKey, - size: Math.max( - 1, - Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE) - ) - }) - console.log('perigon.searchStories', searchParams) - return this.ky .get('stories/all', { - searchParams + searchParams: sanitizeSearchParams({ + sortBy: 'relevance', + ...opts, + apiKey: this.apiKey, + size: Math.max( + 1, + Math.min( + perigon.MAX_PAGE_SIZE, + opts.size || perigon.DEFAULT_PAGE_SIZE + ) + ) + }) }) .json() } diff --git a/src/services/scraper-client.ts b/src/services/scraper-client.ts index b0719f2..8f5eb07 100644 --- a/src/services/scraper-client.ts +++ b/src/services/scraper-client.ts @@ -82,7 +82,7 @@ export class ScraperClient extends AIFunctionsProvider { | string | { url: string - format?: 'html' | 'markdown' | 'plaintext' + format?: 'html' | 'markdown' | 'plaintext' | 'all' timeoutMs?: number } ): Promise> { diff --git a/src/url-utils.test.ts b/src/url-utils.test.ts index ff5e278..061acc7 100644 --- a/src/url-utils.test.ts +++ b/src/url-utils.test.ts @@ -17,6 +17,9 @@ describe('normalizeUrl', () => { expect(normalizeUrl('https://google.com/abc/123//')).toBe( 'https://google.com/abc/123' ) + expect(normalizeUrl('//google.com')).toBe('https://google.com') + expect(normalizeUrl('google.com')).toBe('https://google.com') + expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com') }) test('invalid urls', async () => { diff --git a/src/url-utils.ts b/src/url-utils.ts index 0d014b1..2b8c2cf 100644 --- a/src/url-utils.ts +++ b/src/url-utils.ts @@ -45,10 +45,18 @@ export function normalizeUrl( ): string | undefined { let normalizedUrl: string | undefined - if (!url || isRelativeUrl(url)) { + if (!url || typeof url !== 'string') { return undefined } + if (isRelativeUrl(url)) { + if (!/^[./]/.test(url) && url.indexOf('.') > 0) { + url = `https://${url}` + } else { + return undefined + } + } + const opts = { stripWWW: false, defaultProtocol: 'https',