diff --git a/readme.md b/readme.md index 94b45d0..89345bc 100644 --- a/readme.md +++ b/readme.md @@ -23,6 +23,7 @@ - dexa - diffbot - exa +- firecrawl - people data labs - perigon - predict leads @@ -43,11 +44,9 @@ - agentic - walter - services - - exa - need to update to correct format - wolfram alpha - wikipedia - midjourney - - firecrawl - unstructured - pull from [langchain](https://github.com/langchain-ai/langchainjs/tree/main/langchain) - pull from other libs diff --git a/src/services/exa-client.ts b/src/services/exa-client.ts index d4bf627..8bda881 100644 --- a/src/services/exa-client.ts +++ b/src/services/exa-client.ts @@ -156,24 +156,23 @@ export class ExaClient { }) } - async search(query: string, options?: exa.RegularSearchOptions) { - return this.ky - .post('search', { json: { ...options, query } }) - .json() + /** + * Performs an Exa search for the given query. + */ + async search(opts: { query: string } & exa.RegularSearchOptions) { + return this.ky.post('search', { json: opts }).json() } /** * Performs a search with a Exa prompt-engineered query and returns the * contents of the documents. - * - * @param {string} query - The query string. */ - async searchAndContents( - query: string, - options?: exa.RegularSearchOptions & T - ) { - const { text, highlights, ...rest } = options || {} - + async searchAndContents({ + query, + text, + highlights, + ...rest + }: { query: string } & exa.RegularSearchOptions & T) { return this.ky .post('search', { json: { @@ -193,12 +192,10 @@ export class ExaClient { /** * Finds similar links to the provided URL. - * - * @param {string} url - The URL for which to find similar links. 
*/ - async findSimilar(url: string, options?: exa.FindSimilarOptions) { + async findSimilar(opts: { url: string } & exa.FindSimilarOptions) { return this.ky - .post('findSimilar', { json: { url, ...options } }) + .post('findSimilar', { json: opts }) .json() } @@ -210,9 +207,12 @@ export class ExaClient { */ async findSimilarAndContents< T extends exa.ContentsOptions = exa.ContentsOptions - >(url: string, options?: exa.FindSimilarOptions & T) { - const { text, highlights, ...rest } = options || {} - + >({ + url, + text, + highlights, + ...rest + }: { url: string } & exa.FindSimilarOptions & T) { return this.ky .post('findSimilar', { json: { @@ -235,10 +235,10 @@ export class ExaClient { * * @param {string | string[] | SearchResult[]} ids - An array of document IDs. */ - async getContents( - ids: string | string[] | exa.SearchResult[], - options?: T - ) { + async getContents({ + ids, + ...opts + }: { ids: string | string[] | exa.SearchResult[] } & T) { let requestIds: string[] if (typeof ids === 'string') { @@ -256,8 +256,8 @@ export class ExaClient { return this.ky .post('contents', { json: { - ids: requestIds, - ...options + ...opts, + ids: requestIds } }) .json>() diff --git a/src/services/firecrawl-client.ts b/src/services/firecrawl-client.ts new file mode 100644 index 0000000..970b979 --- /dev/null +++ b/src/services/firecrawl-client.ts @@ -0,0 +1,205 @@ +import defaultKy, { type KyInstance } from 'ky' +import z from 'zod' + +import { assert, delay, getEnv } from '../utils.js' +import { zodToJsonSchema } from '../zod-to-json-schema.js' + +export namespace firecrawl { + /** + * Generic parameter interface. + */ + export interface Params { + [key: string]: any + extractorOptions?: { + extractionSchema: z.ZodSchema | any + mode?: 'llm-extraction' + extractionPrompt?: string + } + } + + /** + * Response interface for scraping operations. 
 + */ + export interface ScrapeResponse { + success: boolean + data?: any + error?: string + } + + /** + * Response interface for searching operations. + */ + export interface SearchResponse { + success: boolean + data?: any + error?: string + } + + /** + * Response interface for crawling operations. + */ + export interface CrawlResponse { + success: boolean + jobId?: string + data?: any + error?: string + } + + /** + * Response interface for job status checks. + */ + export interface JobStatusResponse { + success: boolean + status: string + jobId?: string + data?: any + error?: string + } +} + +/** + * @see https://www.firecrawl.dev + */ +export class FirecrawlClient { + readonly ky: KyInstance + readonly apiKey: string + readonly apiBaseUrl: string + + constructor({ + apiKey = getEnv('FIRECRAWL_API_KEY'), + apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ?? + 'https://api.firecrawl.dev', + ky = defaultKy + }: { + apiKey?: string + apiBaseUrl?: string + ky?: KyInstance + } = {}) { + assert( + apiKey, + 'FirecrawlClient missing required "apiKey" (defaults to "FIRECRAWL_API_KEY")' + ) + assert( + apiBaseUrl, + 'FirecrawlClient missing required "apiBaseUrl" (defaults to "FIRECRAWL_API_BASE_URL")' + ) + + this.apiKey = apiKey + this.apiBaseUrl = apiBaseUrl + + this.ky = ky.extend({ + prefixUrl: apiBaseUrl, + headers: { + Authorization: `Bearer ${this.apiKey}` + } + }) + } + + async scrapeUrl( + opts: { + url: string + } & firecrawl.Params + ) { + const json = { + ...opts + } + + if (opts?.extractorOptions?.extractionSchema) { + let schema = opts.extractorOptions.extractionSchema + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema) + } + + json.extractorOptions = { + mode: 'llm-extraction', + ...opts.extractorOptions, + extractionSchema: schema + } + } + + return this.ky + .post('v0/scrape', { json }) + .json() + } + + async search( + opts: { + query: string + } & firecrawl.Params + ) { + return this.ky + .post('v0/search', { json: opts }) + 
.json() + } + + async crawlUrl({ + waitUntilDone = true, + timeoutMs = 30_000, + idempotencyKey, + ...params + }: { + url: string + waitUntilDone?: boolean + timeoutMs?: number + idempotencyKey?: string + } & firecrawl.Params) { + const res = await this.ky + .post('v0/crawl', { + json: params, + timeout: timeoutMs, + headers: idempotencyKey + ? { + 'x-idempotency-key': idempotencyKey + } + : undefined + }) + .json() + + assert(res.jobId) + if (waitUntilDone) { + return this.waitForCrawlJob({ jobId: res.jobId, timeoutMs }) + } + + return res + } + + async checkCrawlStatus(jobId: string) { + assert(jobId) + + return this.ky + .get(`v0/crawl/status/${jobId}`) + .json() + } + + async waitForCrawlJob({ + jobId, + timeoutMs = 30_000 + }: { + jobId: string + timeoutMs?: number + }) { + assert(jobId) + + const start = Date.now() + do { + const res = await this.checkCrawlStatus(jobId) + if (res.status === 'completed') { + return res + } + + if (!['active', 'paused', 'pending', 'queued'].includes(res.status)) { + throw new Error( + `Crawl job "${jobId}" failed or was stopped. Status: ${res.status}` + ) + } + + if (Date.now() - start > timeoutMs) { + throw new Error( + `Timeout waiting for crawl job "${jobId}" to complete: ${res.status}` + ) + } + + await delay(1000) + } while (true) + } +} diff --git a/src/services/index.ts b/src/services/index.ts index 32b1c9f..06daec6 100644 --- a/src/services/index.ts +++ b/src/services/index.ts @@ -2,6 +2,7 @@ export * from './clearbit-client.js' export * from './dexa-client.js' export * from './diffbot-client.js' export * from './exa-client.js' +export * from './firecrawl-client.js' export * from './people-data-labs-client.js' export * from './perigon-client.js' export * from './predict-leads-client.js'