👱

2024-05-26 17:07:47 -05:00 · 2024-05-26 17:07:47 -05:00 · ca31b560a8
commit ca31b560a8
--- a/readme.md
+++ b/readme.md
@ -23,6 +23,7 @@
 - dexa
 - diffbot
 - exa
+- firecrawl
 - people data labs
 - perigon
 - predict leads
@ -43,11 +44,9 @@
    - agentic
    - walter
 - services
-  - exa - need to update to correct format
  - wolfram alpha
  - wikipedia
  - midjourney
-  - firecrawl
  - unstructured
  - pull from [langchain](https://github.com/langchain-ai/langchainjs/tree/main/langchain)
  - pull from other libs
--- a/src/services/exa-client.ts
+++ b/src/services/exa-client.ts
@ -156,24 +156,23 @@ export class ExaClient {
    })
  }

-  async search(query: string, options?: exa.RegularSearchOptions) {
-    return this.ky
-      .post('search', { json: { ...options, query } })
-      .json<exa.SearchResponse>()
+  /**
+   * Performs an Exa search for the given query.
+   */
+  async search(opts: { query: string } & exa.RegularSearchOptions) {
+    return this.ky.post('search', { json: opts }).json<exa.SearchResponse>()
  }

  /**
   * Performs a search with a Exa prompt-engineered query and returns the
   * contents of the documents.
-   *
-   * @param {string} query - The query string.
   */
-  async searchAndContents<T extends exa.ContentsOptions = exa.ContentsOptions>(
-    query: string,
-    options?: exa.RegularSearchOptions & T
-  ) {
-    const { text, highlights, ...rest } = options || {}
-
+  async searchAndContents<T extends exa.ContentsOptions = exa.ContentsOptions>({
+    query,
+    text,
+    highlights,
+    ...rest
+  }: { query: string } & exa.RegularSearchOptions & T) {
    return this.ky
      .post('search', {
        json: {
@ -193,12 +192,10 @@ export class ExaClient {

  /**
   * Finds similar links to the provided URL.
-   *
-   * @param {string} url - The URL for which to find similar links.
   */
-  async findSimilar(url: string, options?: exa.FindSimilarOptions) {
+  async findSimilar(opts: { url: string } & exa.FindSimilarOptions) {
    return this.ky
-      .post('findSimilar', { json: { url, ...options } })
+      .post('findSimilar', { json: opts })
      .json<exa.SearchResponse>()
  }

@ -210,9 +207,12 @@ export class ExaClient {
   */
  async findSimilarAndContents<
    T extends exa.ContentsOptions = exa.ContentsOptions
-  >(url: string, options?: exa.FindSimilarOptions & T) {
-    const { text, highlights, ...rest } = options || {}
-
+  >({
+    url,
+    text,
+    highlights,
+    ...rest
+  }: { url: string } & exa.FindSimilarOptions & T) {
    return this.ky
      .post('findSimilar', {
        json: {
@ -235,10 +235,10 @@ export class ExaClient {
   *
   * @param {string | string[] | SearchResult[]} ids - An array of document IDs.
   */
-  async getContents<T extends exa.ContentsOptions>(
-    ids: string | string[] | exa.SearchResult[],
-    options?: T
-  ) {
+  async getContents<T extends exa.ContentsOptions = exa.ContentsOptions>({
+    ids,
+    ...opts
+  }: { ids: string | string[] | exa.SearchResult[] } & T) {
    let requestIds: string[]

    if (typeof ids === 'string') {
@ -256,8 +256,8 @@ export class ExaClient {
    return this.ky
      .post('contents', {
        json: {
-          ids: requestIds,
-          ...options
+          ...opts,
+          ids: requestIds
        }
      })
      .json<exa.SearchResponse<T>>()
--- a/src/services/firecrawl-client.ts
+++ b/src/services/firecrawl-client.ts
@ -0,0 +1,205 @@
+import defaultKy, { type KyInstance } from 'ky'
+import z from 'zod'
+
+import { assert, delay, getEnv } from '../utils.js'
+import { zodToJsonSchema } from '../zod-to-json-schema.js'
+
+export namespace firecrawl {
+  /**
+   * Generic request parameters accepted by the `FirecrawlClient` methods.
+   *
+   * The index signature allows arbitrary extra keys; the client methods
+   * forward them as part of the JSON request body.
+   */
+  export interface Params {
+    [key: string]: any
+    /** Optional structured-extraction settings (processed by `scrapeUrl`). */
+    extractorOptions?: {
+      /**
+       * Either a Zod schema or an already-serialized schema object.
+       * `FirecrawlClient.scrapeUrl` converts Zod schema instances with
+       * `zodToJsonSchema` before sending the request.
+       */
+      extractionSchema: z.ZodSchema | any
+      /**
+       * `scrapeUrl` fills in `'llm-extraction'` when a schema is given and
+       * this field is omitted.
+       */
+      mode?: 'llm-extraction'
+      /**
+       * NOTE(review): presumably the prompt handed to the extraction LLM —
+       * confirm against the Firecrawl API docs.
+       */
+      extractionPrompt?: string
+    }
+  }
+
+  /**
+   * Response shape returned by `FirecrawlClient.scrapeUrl`.
+   *
+   * NOTE(review): field semantics are defined by the Firecrawl API; `data`
+   * is left untyped here.
+   */
+  export interface ScrapeResponse {
+    success: boolean
+    data?: any
+    error?: string
+  }
+
+  /**
+   * Response shape returned by `FirecrawlClient.search`.
+   *
+   * NOTE(review): field semantics are defined by the Firecrawl API; `data`
+   * is left untyped here.
+   */
+  export interface SearchResponse {
+    success: boolean
+    data?: any
+    error?: string
+  }
+
+  /**
+   * Response shape returned when a crawl job is submitted via
+   * `FirecrawlClient.crawlUrl`.
+   */
+  export interface CrawlResponse {
+    success: boolean
+    /** `crawlUrl` asserts that this is present before continuing. */
+    jobId?: string
+    data?: any
+    error?: string
+  }
+
+  /**
+   * Response shape returned by `FirecrawlClient.checkCrawlStatus`.
+   */
+  export interface JobStatusResponse {
+    success: boolean
+    /**
+     * Job state. `waitForCrawlJob` treats `'completed'` as finished and
+     * `'active'`, `'paused'`, `'pending'` and `'queued'` as still running;
+     * any other value is reported as a failure.
+     */
+    status: string
+    jobId?: string
+    data?: any
+    error?: string
+  }
+}
+
+/**
+ * Client for the Firecrawl API, exposing its `v0` scrape, search and crawl
+ * endpoints.
+ *
+ * @see https://www.firecrawl.dev
+ */
+export class FirecrawlClient {
+  readonly ky: KyInstance
+  readonly apiKey: string
+  readonly apiBaseUrl: string
+
+  /**
+   * @param apiKey - API key sent as a bearer token. Defaults to the
+   * `FIRECRAWL_API_KEY` environment variable.
+   * @param apiBaseUrl - Base URL used as the request prefix. Defaults to the
+   * `FIRECRAWL_API_BASE_URL` environment variable, falling back to
+   * `https://api.firecrawl.dev`.
+   * @param ky - `ky` instance to extend. Defaults to the stock `ky` export.
+   *
+   * Both `apiKey` and `apiBaseUrl` are checked with `assert`.
+   */
+  constructor({
+    apiKey = getEnv('FIRECRAWL_API_KEY'),
+    apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ??
+      'https://api.firecrawl.dev',
+    ky = defaultKy
+  }: {
+    apiKey?: string
+    apiBaseUrl?: string
+    ky?: KyInstance
+  } = {}) {
+    assert(
+      apiKey,
+      'FirecrawlClient missing required "apiKey" (defaults to "FIRECRAWL_API_KEY")'
+    )
+    assert(
+      apiBaseUrl,
+      'FirecrawlClient missing required "apiBaseUrl" (defaults to "FIRECRAWL_API_BASE_URL")'
+    )
+
+    this.apiKey = apiKey
+    this.apiBaseUrl = apiBaseUrl
+
+    // Every request shares the base URL and the bearer-token header.
+    this.ky = ky.extend({
+      prefixUrl: apiBaseUrl,
+      headers: {
+        Authorization: `Bearer ${this.apiKey}`
+      }
+    })
+  }
+
+  /**
+   * Scrapes a single URL by POSTing `opts` to the `v0/scrape` endpoint.
+   *
+   * When `opts.extractorOptions.extractionSchema` is provided, a Zod schema
+   * is converted with `zodToJsonSchema` first, and `mode` defaults to
+   * `'llm-extraction'` unless the caller supplies one.
+   *
+   * @param opts - The `url` to scrape plus any additional request params.
+   * @returns The parsed JSON response.
+   */
+  async scrapeUrl(
+    opts: {
+      url: string
+    } & firecrawl.Params
+  ) {
+    // Shallow copy so that the caller's object is never mutated below.
+    const json = {
+      ...opts
+    }
+
+    if (opts?.extractorOptions?.extractionSchema) {
+      let schema = opts.extractorOptions.extractionSchema
+      if (schema instanceof z.ZodSchema) {
+        schema = zodToJsonSchema(schema)
+      }
+
+      // `mode` comes first so a caller-provided value overrides the default;
+      // `extractionSchema` comes last so the converted schema always wins.
+      json.extractorOptions = {
+        mode: 'llm-extraction',
+        ...opts.extractorOptions,
+        extractionSchema: schema
+      }
+    }
+
+    // The REST endpoint is `v0/scrape`; `scrapeUrl` is only the method name.
+    return this.ky
+      .post('v0/scrape', { json })
+      .json<firecrawl.ScrapeResponse>()
+  }
+
+  /**
+   * Runs a search by POSTing `opts` to the `v0/search` endpoint.
+   *
+   * @param opts - The `query` plus any additional request params.
+   * @returns The parsed JSON response.
+   */
+  async search(
+    opts: {
+      query: string
+    } & firecrawl.Params
+  ) {
+    return this.ky
+      .post('v0/search', { json: opts })
+      .json<firecrawl.SearchResponse>()
+  }
+
+  /**
+   * Starts a crawl job by POSTing to the `v0/crawl` endpoint.
+   *
+   * @param waitUntilDone - When `true` (the default), polls the job via
+   * `waitForCrawlJob` and resolves with its final status response; otherwise
+   * resolves with the initial crawl response.
+   * @param timeoutMs - Used both as the HTTP timeout of the initial request
+   * and as the polling timeout passed to `waitForCrawlJob`. Defaults to
+   * `30_000`.
+   * @param idempotencyKey - When set, sent as the `x-idempotency-key` header.
+   * @param params - Remaining properties (including `url`), sent as the JSON
+   * request body.
+   */
+  async crawlUrl({
+    waitUntilDone = true,
+    timeoutMs = 30_000,
+    idempotencyKey,
+    ...params
+  }: {
+    url: string
+    waitUntilDone?: boolean
+    timeoutMs?: number
+    idempotencyKey?: string
+  } & firecrawl.Params) {
+    const res = await this.ky
+      .post('v0/crawl', {
+        json: params,
+        timeout: timeoutMs,
+        headers: idempotencyKey
+          ? {
+              'x-idempotency-key': idempotencyKey
+            }
+          : undefined
+      })
+      .json<firecrawl.CrawlResponse>()
+
+    // A job id is needed for polling, so it is checked before continuing.
+    assert(res.jobId)
+    if (waitUntilDone) {
+      return this.waitForCrawlJob({ jobId: res.jobId, timeoutMs })
+    }
+
+    return res
+  }
+
+  /**
+   * Fetches the current status of a crawl job from
+   * `v0/crawl/status/{jobId}`.
+   *
+   * @param jobId - Identifier of the crawl job.
+   * @returns The parsed JSON status response.
+   */
+  async checkCrawlStatus(jobId: string) {
+    assert(jobId)
+
+    return this.ky
+      .get(`v0/crawl/status/${jobId}`)
+      .json<firecrawl.JobStatusResponse>()
+  }
+
+  /**
+   * Polls `checkCrawlStatus` until the job reports `'completed'`.
+   *
+   * @param jobId - Identifier of the crawl job.
+   * @param timeoutMs - Maximum elapsed time before giving up. Defaults to
+   * `30_000`.
+   * @returns The status response whose `status` is `'completed'`.
+   * @throws Error when the status is none of `'active'`, `'paused'`,
+   * `'pending'` or `'queued'`, or when more than `timeoutMs` has elapsed
+   * since polling started.
+   */
+  async waitForCrawlJob({
+    jobId,
+    timeoutMs = 30_000
+  }: {
+    jobId: string
+    timeoutMs?: number
+  }) {
+    assert(jobId)
+
+    const start = Date.now()
+    do {
+      const res = await this.checkCrawlStatus(jobId)
+      if (res.status === 'completed') {
+        return res
+      }
+
+      // Any status outside the known in-progress set is treated as failure.
+      if (!['active', 'paused', 'pending', 'queued'].includes(res.status)) {
+        throw new Error(
+          `Crawl job "${jobId}" failed or was stopped. Status: ${res.status}`
+        )
+      }
+
+      // The deadline is checked after each poll, so at least one status
+      // request is always made.
+      if (Date.now() - start > timeoutMs) {
+        throw new Error(
+          `Timeout waiting for crawl job "${jobId}" to complete: ${res.status}`
+        )
+      }
+
+      // Pause between polls (presumably 1000 ms — `delay` is a project helper).
+      await delay(1000)
+    } while (true)
+  }
+}
--- a/src/services/index.ts
+++ b/src/services/index.ts
@ -2,6 +2,7 @@ export * from './clearbit-client.js'
 export * from './dexa-client.js'
 export * from './diffbot-client.js'
 export * from './exa-client.js'
+export * from './firecrawl-client.js'
 export * from './people-data-labs-client.js'
 export * from './perigon-client.js'
 export * from './predict-leads-client.js'