feat: fix normalizeUrl; improve firecrawl, perigon, scraper clients

pull/659/head
Travis Fischer 2024-06-18 00:36:57 -07:00
rodzic c31d4e022c
commit c0a5323ecf
5 zmienionych plików z 52 dodań i 40 usunięć

Wyświetl plik

@@ -1,11 +1,19 @@
import defaultKy, { type KyInstance } from 'ky' import defaultKy, { type KyInstance } from 'ky'
import pThrottle from 'p-throttle'
import z from 'zod' import z from 'zod'
import { aiFunction, AIFunctionsProvider } from '../fns.js' import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { assert, delay, getEnv } from '../utils.js' import { assert, delay, getEnv, throttleKy } from '../utils.js'
import { zodToJsonSchema } from '../zod-to-json-schema.js' import { zodToJsonSchema } from '../zod-to-json-schema.js'
export namespace firecrawl { export namespace firecrawl {
// Allow up to 1 request per second by default.
export const throttle = pThrottle({
limit: 1,
interval: 1000,
strict: true
})
/** /**
* Generic parameter interface. * Generic parameter interface.
*/ */
@@ -96,11 +104,13 @@ export class FirecrawlClient extends AIFunctionsProvider {
apiKey = getEnv('FIRECRAWL_API_KEY'), apiKey = getEnv('FIRECRAWL_API_KEY'),
apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ?? apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ??
'https://api.firecrawl.dev', 'https://api.firecrawl.dev',
throttle = true,
timeoutMs = 60_000, timeoutMs = 60_000,
ky = defaultKy ky = defaultKy
}: { }: {
apiKey?: string apiKey?: string
apiBaseUrl?: string apiBaseUrl?: string
throttle?: boolean
timeoutMs?: number timeoutMs?: number
ky?: KyInstance ky?: KyInstance
} = {}) { } = {}) {
@@ -117,7 +127,9 @@ export class FirecrawlClient extends AIFunctionsProvider {
this.apiKey = apiKey this.apiKey = apiKey
this.apiBaseUrl = apiBaseUrl this.apiBaseUrl = apiBaseUrl
this.ky = ky.extend({ const throttledKy = throttle ? throttleKy(ky, firecrawl.throttle) : ky
this.ky = throttledKy.extend({
prefixUrl: apiBaseUrl, prefixUrl: apiBaseUrl,
timeout: timeoutMs, timeout: timeoutMs,
headers: { headers: {
@@ -155,18 +167,7 @@ export class FirecrawlClient extends AIFunctionsProvider {
} }
} }
const res = await this.ky return this.ky.post('v0/scrape', { json }).json<firecrawl.ScrapeResponse>()
.post('v0/scrape', { json })
.json<firecrawl.ScrapeResponse>()
if (!res.success || !res.data) return res
if (res.data.markdown) {
delete res.data.html
delete res.data.content
}
return res
} }
async search( async search(

Wyświetl plik

@@ -681,20 +681,20 @@ export class PerigonClient extends AIFunctionsProvider {
}) })
}) })
async searchArticles(opts: perigon.ArticlesSearchOptions) { async searchArticles(opts: perigon.ArticlesSearchOptions) {
const searchParams = sanitizeSearchParams({ return this.ky
.get('all', {
searchParams: sanitizeSearchParams({
sortBy: 'relevance', sortBy: 'relevance',
...opts, ...opts,
apiKey: this.apiKey, apiKey: this.apiKey,
size: Math.max( size: Math.max(
1, 1,
Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE) Math.min(
perigon.MAX_PAGE_SIZE,
opts.size || perigon.DEFAULT_PAGE_SIZE
)
) )
}) })
console.log('perigon.searchArticles', searchParams)
return this.ky
.get('all', {
searchParams
}) })
.json<perigon.ArticlesSearchResponse>() .json<perigon.ArticlesSearchResponse>()
} }
@@ -721,20 +721,20 @@ export class PerigonClient extends AIFunctionsProvider {
}) })
}) })
async searchStories(opts: perigon.StoriesSearchOptions) { async searchStories(opts: perigon.StoriesSearchOptions) {
const searchParams = sanitizeSearchParams({ return this.ky
.get('stories/all', {
searchParams: sanitizeSearchParams({
sortBy: 'relevance', sortBy: 'relevance',
...opts, ...opts,
apiKey: this.apiKey, apiKey: this.apiKey,
size: Math.max( size: Math.max(
1, 1,
Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE) Math.min(
perigon.MAX_PAGE_SIZE,
opts.size || perigon.DEFAULT_PAGE_SIZE
)
) )
}) })
console.log('perigon.searchStories', searchParams)
return this.ky
.get('stories/all', {
searchParams
}) })
.json<perigon.StoriesSearchResponse>() .json<perigon.StoriesSearchResponse>()
} }

Wyświetl plik

@@ -82,7 +82,7 @@ export class ScraperClient extends AIFunctionsProvider {
| string | string
| { | {
url: string url: string
format?: 'html' | 'markdown' | 'plaintext' format?: 'html' | 'markdown' | 'plaintext' | 'all'
timeoutMs?: number timeoutMs?: number
} }
): Promise<Partial<scraper.ScrapeResult>> { ): Promise<Partial<scraper.ScrapeResult>> {

Wyświetl plik

@@ -17,6 +17,9 @@ describe('normalizeUrl', () => {
expect(normalizeUrl('https://google.com/abc/123//')).toBe( expect(normalizeUrl('https://google.com/abc/123//')).toBe(
'https://google.com/abc/123' 'https://google.com/abc/123'
) )
expect(normalizeUrl('//google.com')).toBe('https://google.com')
expect(normalizeUrl('google.com')).toBe('https://google.com')
expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
}) })
test('invalid urls', async () => { test('invalid urls', async () => {

Wyświetl plik

@@ -45,10 +45,18 @@ export function normalizeUrl(
): string | undefined { ): string | undefined {
let normalizedUrl: string | undefined let normalizedUrl: string | undefined
if (!url || isRelativeUrl(url)) { if (!url || typeof url !== 'string') {
return undefined return undefined
} }
if (isRelativeUrl(url)) {
if (!/^[./]/.test(url) && url.indexOf('.') > 0) {
url = `https://${url}`
} else {
return undefined
}
}
const opts = { const opts = {
stripWWW: false, stripWWW: false,
defaultProtocol: 'https', defaultProtocol: 'https',