kopia lustrzana https://github.com/transitive-bullshit/chatgpt-api
feat: fix normalizeUrl; improve firecrawl, perigon, scraper clients
rodzic
c31d4e022c
commit
c0a5323ecf
|
@ -1,11 +1,19 @@
|
||||||
import defaultKy, { type KyInstance } from 'ky'
|
import defaultKy, { type KyInstance } from 'ky'
|
||||||
|
import pThrottle from 'p-throttle'
|
||||||
import z from 'zod'
|
import z from 'zod'
|
||||||
|
|
||||||
import { aiFunction, AIFunctionsProvider } from '../fns.js'
|
import { aiFunction, AIFunctionsProvider } from '../fns.js'
|
||||||
import { assert, delay, getEnv } from '../utils.js'
|
import { assert, delay, getEnv, throttleKy } from '../utils.js'
|
||||||
import { zodToJsonSchema } from '../zod-to-json-schema.js'
|
import { zodToJsonSchema } from '../zod-to-json-schema.js'
|
||||||
|
|
||||||
export namespace firecrawl {
|
export namespace firecrawl {
|
||||||
|
// Allow up to 1 request per second by default.
|
||||||
|
export const throttle = pThrottle({
|
||||||
|
limit: 1,
|
||||||
|
interval: 1000,
|
||||||
|
strict: true
|
||||||
|
})
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generic parameter interface.
|
* Generic parameter interface.
|
||||||
*/
|
*/
|
||||||
|
@ -96,11 +104,13 @@ export class FirecrawlClient extends AIFunctionsProvider {
|
||||||
apiKey = getEnv('FIRECRAWL_API_KEY'),
|
apiKey = getEnv('FIRECRAWL_API_KEY'),
|
||||||
apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ??
|
apiBaseUrl = getEnv('FIRECRAWL_API_BASE_URL') ??
|
||||||
'https://api.firecrawl.dev',
|
'https://api.firecrawl.dev',
|
||||||
|
throttle = true,
|
||||||
timeoutMs = 60_000,
|
timeoutMs = 60_000,
|
||||||
ky = defaultKy
|
ky = defaultKy
|
||||||
}: {
|
}: {
|
||||||
apiKey?: string
|
apiKey?: string
|
||||||
apiBaseUrl?: string
|
apiBaseUrl?: string
|
||||||
|
throttle?: boolean
|
||||||
timeoutMs?: number
|
timeoutMs?: number
|
||||||
ky?: KyInstance
|
ky?: KyInstance
|
||||||
} = {}) {
|
} = {}) {
|
||||||
|
@ -117,7 +127,9 @@ export class FirecrawlClient extends AIFunctionsProvider {
|
||||||
this.apiKey = apiKey
|
this.apiKey = apiKey
|
||||||
this.apiBaseUrl = apiBaseUrl
|
this.apiBaseUrl = apiBaseUrl
|
||||||
|
|
||||||
this.ky = ky.extend({
|
const throttledKy = throttle ? throttleKy(ky, firecrawl.throttle) : ky
|
||||||
|
|
||||||
|
this.ky = throttledKy.extend({
|
||||||
prefixUrl: apiBaseUrl,
|
prefixUrl: apiBaseUrl,
|
||||||
timeout: timeoutMs,
|
timeout: timeoutMs,
|
||||||
headers: {
|
headers: {
|
||||||
|
@ -155,18 +167,7 @@ export class FirecrawlClient extends AIFunctionsProvider {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const res = await this.ky
|
return this.ky.post('v0/scrape', { json }).json<firecrawl.ScrapeResponse>()
|
||||||
.post('v0/scrape', { json })
|
|
||||||
.json<firecrawl.ScrapeResponse>()
|
|
||||||
|
|
||||||
if (!res.success || !res.data) return res
|
|
||||||
|
|
||||||
if (res.data.markdown) {
|
|
||||||
delete res.data.html
|
|
||||||
delete res.data.content
|
|
||||||
}
|
|
||||||
|
|
||||||
return res
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async search(
|
async search(
|
||||||
|
|
|
@ -681,20 +681,20 @@ export class PerigonClient extends AIFunctionsProvider {
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
async searchArticles(opts: perigon.ArticlesSearchOptions) {
|
async searchArticles(opts: perigon.ArticlesSearchOptions) {
|
||||||
const searchParams = sanitizeSearchParams({
|
return this.ky
|
||||||
|
.get('all', {
|
||||||
|
searchParams: sanitizeSearchParams({
|
||||||
sortBy: 'relevance',
|
sortBy: 'relevance',
|
||||||
...opts,
|
...opts,
|
||||||
apiKey: this.apiKey,
|
apiKey: this.apiKey,
|
||||||
size: Math.max(
|
size: Math.max(
|
||||||
1,
|
1,
|
||||||
Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE)
|
Math.min(
|
||||||
|
perigon.MAX_PAGE_SIZE,
|
||||||
|
opts.size || perigon.DEFAULT_PAGE_SIZE
|
||||||
|
)
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
console.log('perigon.searchArticles', searchParams)
|
|
||||||
|
|
||||||
return this.ky
|
|
||||||
.get('all', {
|
|
||||||
searchParams
|
|
||||||
})
|
})
|
||||||
.json<perigon.ArticlesSearchResponse>()
|
.json<perigon.ArticlesSearchResponse>()
|
||||||
}
|
}
|
||||||
|
@ -721,20 +721,20 @@ export class PerigonClient extends AIFunctionsProvider {
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
async searchStories(opts: perigon.StoriesSearchOptions) {
|
async searchStories(opts: perigon.StoriesSearchOptions) {
|
||||||
const searchParams = sanitizeSearchParams({
|
return this.ky
|
||||||
|
.get('stories/all', {
|
||||||
|
searchParams: sanitizeSearchParams({
|
||||||
sortBy: 'relevance',
|
sortBy: 'relevance',
|
||||||
...opts,
|
...opts,
|
||||||
apiKey: this.apiKey,
|
apiKey: this.apiKey,
|
||||||
size: Math.max(
|
size: Math.max(
|
||||||
1,
|
1,
|
||||||
Math.min(perigon.MAX_PAGE_SIZE, opts.size || perigon.DEFAULT_PAGE_SIZE)
|
Math.min(
|
||||||
|
perigon.MAX_PAGE_SIZE,
|
||||||
|
opts.size || perigon.DEFAULT_PAGE_SIZE
|
||||||
|
)
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
console.log('perigon.searchStories', searchParams)
|
|
||||||
|
|
||||||
return this.ky
|
|
||||||
.get('stories/all', {
|
|
||||||
searchParams
|
|
||||||
})
|
})
|
||||||
.json<perigon.StoriesSearchResponse>()
|
.json<perigon.StoriesSearchResponse>()
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,7 +82,7 @@ export class ScraperClient extends AIFunctionsProvider {
|
||||||
| string
|
| string
|
||||||
| {
|
| {
|
||||||
url: string
|
url: string
|
||||||
format?: 'html' | 'markdown' | 'plaintext'
|
format?: 'html' | 'markdown' | 'plaintext' | 'all'
|
||||||
timeoutMs?: number
|
timeoutMs?: number
|
||||||
}
|
}
|
||||||
): Promise<Partial<scraper.ScrapeResult>> {
|
): Promise<Partial<scraper.ScrapeResult>> {
|
||||||
|
|
|
@ -17,6 +17,9 @@ describe('normalizeUrl', () => {
|
||||||
expect(normalizeUrl('https://google.com/abc/123//')).toBe(
|
expect(normalizeUrl('https://google.com/abc/123//')).toBe(
|
||||||
'https://google.com/abc/123'
|
'https://google.com/abc/123'
|
||||||
)
|
)
|
||||||
|
expect(normalizeUrl('//google.com')).toBe('https://google.com')
|
||||||
|
expect(normalizeUrl('google.com')).toBe('https://google.com')
|
||||||
|
expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
|
||||||
})
|
})
|
||||||
|
|
||||||
test('invalid urls', async () => {
|
test('invalid urls', async () => {
|
||||||
|
|
|
@ -45,10 +45,18 @@ export function normalizeUrl(
|
||||||
): string | undefined {
|
): string | undefined {
|
||||||
let normalizedUrl: string | undefined
|
let normalizedUrl: string | undefined
|
||||||
|
|
||||||
if (!url || isRelativeUrl(url)) {
|
if (!url || typeof url !== 'string') {
|
||||||
return undefined
|
return undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isRelativeUrl(url)) {
|
||||||
|
if (!/^[./]/.test(url) && url.indexOf('.') > 0) {
|
||||||
|
url = `https://${url}`
|
||||||
|
} else {
|
||||||
|
return undefined
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const opts = {
|
const opts = {
|
||||||
stripWWW: false,
|
stripWWW: false,
|
||||||
defaultProtocol: 'https',
|
defaultProtocol: 'https',
|
||||||
|
|
Ładowanie…
Reference in New Issue