feat: fix normalizeUrl; improve firecrawl, perigon, scraper clients

2024-06-18 00:36:57 -07:00 · 2024-06-18 00:36:57 -07:00 · c0a5323ecf
commit c0a5323ecf
--- a/src/services/firecrawl-client.ts
+++ b/src/services/firecrawl-client.ts
@ -1,11 +1,19 @@
 import defaultKy, { type KyInstance } from 'ky'
+import pThrottle from 'p-throttle'
 import z from 'zod'

 import { aiFunction, AIFunctionsProvider } from '../fns.js'
-import { assert, delay, getEnv } from '../utils.js'
+import { assert, delay, getEnv, throttleKy } from '../utils.js'
 import { zodToJsonSchema } from '../zod-to-json-schema.js'

 export namespace firecrawl {
+  // Default rate limit (at most 1 request per second) that FirecrawlClient applies via throttleKy unless constructed with `throttle: false`.
+  export const throttle = pThrottle({
+    limit: 1,
+    interval: 1000,
+    strict: true
+  })
+
  /**
   * Generic parameter interface.
   */
@ -96,11 +104,13 @@ export class FirecrawlClient extends AIFunctionsProvider {
    apiKey = getEnv('FIRECRAWL_API_KEY'),
    apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ??
      'https://api.firecrawl.dev',
+    throttle = true,
    timeoutMs = 60_000,
    ky = defaultKy
  }: {
    apiKey?: string
    apiBaseUrl?: string
+    throttle?: boolean
    timeoutMs?: number
    ky?: KyInstance
  } = {}) {
@ -117,7 +127,9 @@ export class FirecrawlClient extends AIFunctionsProvider {
    this.apiKey = apiKey
    this.apiBaseUrl = apiBaseUrl

-    this.ky = ky.extend({
+    const throttledKy = throttle ? throttleKy(ky, firecrawl.throttle) : ky
+
+    this.ky = throttledKy.extend({
      prefixUrl: apiBaseUrl,
      timeout: timeoutMs,
      headers: {
@ -155,18 +167,7 @@ export class FirecrawlClient extends AIFunctionsProvider {
      }
    }

-    const res = await this.ky
-      .post('v0/scrape', { json })
-      .json<firecrawl.ScrapeResponse>()
-
-    if (!res.success || !res.data) return res
-
-    if (res.data.markdown) {
-      delete res.data.html
-      delete res.data.content
-    }
-
-    return res
+    return this.ky.post('v0/scrape', { json }).json<firecrawl.ScrapeResponse>()
  }

  async search(
--- a/src/services/perigon-client.ts
+++ b/src/services/perigon-client.ts
@ -681,20 +681,20 @@ export class PerigonClient extends AIFunctionsProvider {
    })
  })
  async searchArticles(opts: perigon.ArticlesSearchOptions) {
-    const searchParams = sanitizeSearchParams({
-      sortBy: 'relevance',
-      ...opts,
-      apiKey: this.apiKey,
-      size: Math.max(
-        1,
-        Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE)
-      )
-    })
-    console.log('perigon.searchArticles', searchParams)
-
    return this.ky
      .get('all', {
-        searchParams
+        searchParams: sanitizeSearchParams({
+          sortBy: 'relevance',
+          ...opts,
+          apiKey: this.apiKey,
+          size: Math.max(
+            1,
+            Math.min(
+              perigon.MAX_PAGE_SIZE,
+              opts.size || perigon.DEFAULT_PAGE_SIZE
+            )
+          )
+        })
      })
      .json<perigon.ArticlesSearchResponse>()
  }
@ -721,20 +721,20 @@ export class PerigonClient extends AIFunctionsProvider {
    })
  })
  async searchStories(opts: perigon.StoriesSearchOptions) {
-    const searchParams = sanitizeSearchParams({
-      sortBy: 'relevance',
-      ...opts,
-      apiKey: this.apiKey,
-      size: Math.max(
-        1,
-        Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE)
-      )
-    })
-    console.log('perigon.searchStories', searchParams)
-
    return this.ky
      .get('stories/all', {
-        searchParams
+        searchParams: sanitizeSearchParams({
+          sortBy: 'relevance',
+          ...opts,
+          apiKey: this.apiKey,
+          size: Math.max(
+            1,
+            Math.min(
+              perigon.MAX_PAGE_SIZE,
+              opts.size || perigon.DEFAULT_PAGE_SIZE
+            )
+          )
+        })
      })
      .json<perigon.StoriesSearchResponse>()
  }
--- a/src/services/scraper-client.ts
+++ b/src/services/scraper-client.ts
@ -82,7 +82,7 @@ export class ScraperClient extends AIFunctionsProvider {
      | string
      | {
          url: string
-          format?: 'html' | 'markdown' | 'plaintext'
+          format?: 'html' | 'markdown' | 'plaintext' | 'all'
          timeoutMs?: number
        }
  ): Promise<Partial<scraper.ScrapeResult>> {
--- a/src/url-utils.test.ts
+++ b/src/url-utils.test.ts
@ -17,6 +17,9 @@ describe('normalizeUrl', () => {
    expect(normalizeUrl('https://google.com/abc/123//')).toBe(
      'https://google.com/abc/123'
    )
+    expect(normalizeUrl('//google.com')).toBe('https://google.com')
+    expect(normalizeUrl('google.com')).toBe('https://google.com')
+    expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
  })

  test('invalid urls', async () => {
--- a/src/url-utils.ts
+++ b/src/url-utils.ts
@ -45,10 +45,18 @@ export function normalizeUrl(
 ): string | undefined {
  let normalizedUrl: string | undefined

-  if (!url || isRelativeUrl(url)) {
+  if (!url || typeof url !== 'string') {
    return undefined
  }

+  if (isRelativeUrl(url)) {
+    if (!/^[./]/.test(url) && url.indexOf('.') > 0) {
+      url = `https://${url}`
+    } else {
+      return undefined
+    }
+  }
+
  const opts = {
    stripWWW: false,
    defaultProtocol: 'https',