Mirror of https://github.com/transitive-bullshit/chatgpt-api
/**
 * This file was auto-generated from an OpenAPI spec.
 */

import { aiFunction, AIFunctionsProvider, assert, getEnv } from '@agentic/core'
import defaultKy, { type KyInstance } from 'ky'
import { z } from 'zod'

export namespace firecrawl {
  export const apiBaseUrl = 'https://api.firecrawl.dev/v0'

  export const ScrapeResponseSchema = z.object({
    success: z.boolean().optional(),
    /** Warning message to let you know of any issues. */
    warning: z
      .string()
      .nullable()
      .describe('Warning message to let you know of any issues.')
      .optional(),
    data: z
      .object({
        /** Markdown content of the page if the `markdown` format was specified (default) */
        markdown: z
          .string()
          .nullable()
          .describe(
            'Markdown content of the page if the `markdown` format was specified (default)'
          )
          .optional(),
        /** HTML version of the content on page if the `html` format was specified */
        html: z
          .string()
          .nullable()
          .describe(
            'HTML version of the content on page if the `html` format was specified'
          )
          .optional(),
        /** Raw HTML content of the page if the `rawHtml` format was specified */
        rawHtml: z
          .string()
          .nullable()
          .describe(
            'Raw HTML content of the page if the `rawHtml` format was specified'
          )
          .optional(),
        /** Links on the page if the `links` format was specified */
        links: z
          .array(z.string().url())
          .nullable()
          .describe('Links on the page if the `links` format was specified')
          .optional(),
        /** URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified */
        screenshot: z
          .string()
          .nullable()
          .describe(
            'URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified'
          )
          .optional(),
        metadata: z
          .object({
            title: z.string().optional(),
            description: z.string().optional(),
            language: z.string().nullable().optional(),
            sourceURL: z.string().url().optional(),
            '<any other metadata> ': z.string().optional(),
            /** The status code of the page */
            statusCode: z
              .number()
              .int()
              .describe('The status code of the page')
              .optional(),
            /** The error message of the page */
            error: z
              .string()
              .nullable()
              .describe('The error message of the page')
              .optional()
          })
          .optional()
      })
      .optional()
  })
  export type ScrapeResponse = z.infer<typeof ScrapeResponseSchema>
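
  // Illustrative helper (not part of the generated spec): validate a raw
  // /scrape payload against the schema above. `safeParse` returns a tagged
  // result instead of throwing, so a malformed response surfaces as one error.
  export function parseScrapeResponse(input: unknown): ScrapeResponse {
    const result = ScrapeResponseSchema.safeParse(input)
    if (!result.success) {
      throw new Error(`Invalid scrape response: ${result.error.message}`)
    }
    return result.data
  }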

  export const CrawlResponseSchema = z.object({
    success: z.boolean().optional(),
    id: z.string().optional(),
    url: z.string().url().optional()
  })
  export type CrawlResponse = z.infer<typeof CrawlResponseSchema>

  export const SearchResponseSchema = z.object({
    success: z.boolean().optional(),
    data: z.array(z.any()).optional()
  })
  export type SearchResponse = z.infer<typeof SearchResponseSchema>

  export const CrawlStatusResponseObjSchema = z.object({
    /** Markdown content of the page if the `markdown` format was specified (default) */
    markdown: z
      .string()
      .nullable()
      .describe(
        'Markdown content of the page if the `markdown` format was specified (default)'
      )
      .optional(),
    /** HTML version of the content on page if the `html` format was specified */
    html: z
      .string()
      .nullable()
      .describe(
        'HTML version of the content on page if the `html` format was specified'
      )
      .optional(),
    /** Raw HTML content of the page if the `rawHtml` format was specified */
    rawHtml: z
      .string()
      .nullable()
      .describe(
        'Raw HTML content of the page if the `rawHtml` format was specified'
      )
      .optional(),
    /** Links on the page if the `links` format was specified */
    links: z
      .array(z.string().url())
      .nullable()
      .describe('Links on the page if the `links` format was specified')
      .optional(),
    /** URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified */
    screenshot: z
      .string()
      .nullable()
      .describe(
        'URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified'
      )
      .optional(),
    metadata: z
      .object({
        title: z.string().optional(),
        description: z.string().optional(),
        language: z.string().nullable().optional(),
        sourceURL: z.string().url().optional(),
        '<any other metadata> ': z.string().optional(),
        /** The status code of the page */
        statusCode: z
          .number()
          .int()
          .describe('The status code of the page')
          .optional(),
        /** The error message of the page */
        error: z
          .string()
          .nullable()
          .describe('The error message of the page')
          .optional()
      })
      .optional()
  })
  export type CrawlStatusResponseObj = z.infer<
    typeof CrawlStatusResponseObjSchema
  >

  export const ScrapeParamsSchema = z.object({
    /** The URL to scrape */
    url: z.string().url().describe('The URL to scrape'),
    /**
     * Specific formats to return.
     *
     * - markdown: The page in Markdown format.
     * - html: The page's HTML, trimmed to include only meaningful content.
     * - rawHtml: The page's original HTML.
     * - links: The links on the page.
     * - screenshot: A screenshot of the top of the page.
     * - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)
     */
    formats: z
      .array(
        z.enum([
          'markdown',
          'html',
          'rawHtml',
          'links',
          'screenshot',
          'screenshot@fullPage'
        ])
      )
      .describe(
        "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)"
      )
      .default(['markdown']),
    /** Headers to send with the request. Can be used to send cookies, user-agent, etc. */
    headers: z
      .record(z.any())
      .describe(
        'Headers to send with the request. Can be used to send cookies, user-agent, etc.'
      )
      .optional(),
    /** Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer' */
    includeTags: z
      .array(z.string())
      .describe(
        "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
      )
      .optional(),
    /** Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer' */
    excludeTags: z
      .array(z.string())
      .describe(
        "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
      )
      .optional(),
    /** Only return the main content of the page excluding headers, navs, footers, etc. */
    onlyMainContent: z
      .boolean()
      .describe(
        'Only return the main content of the page excluding headers, navs, footers, etc.'
      )
      .default(true),
    /** Timeout in milliseconds for the request */
    timeout: z
      .number()
      .int()
      .describe('Timeout in milliseconds for the request')
      .default(30_000),
    /** Number of milliseconds to wait for the page to load before fetching content */
    waitFor: z
      .number()
      .int()
      .describe(
        'Number of milliseconds to wait for the page to load before fetching content'
      )
      .default(0)
  })
  export type ScrapeParams = z.infer<typeof ScrapeParamsSchema>
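
  // Illustrative sketch (not part of the generated spec): `parse` fills in the
  // documented defaults, so a caller only needs to supply a URL. The URL here
  // is an arbitrary example.
  export const exampleScrapeParams: ScrapeParams = ScrapeParamsSchema.parse({
    url: 'https://example.com'
  })
  // => { url: 'https://example.com', formats: ['markdown'],
  //      onlyMainContent: true, timeout: 30000, waitFor: 0 }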

  export const CrawlUrlsParamsSchema = z.object({
    /** The base URL to start crawling from */
    url: z.string().url().describe('The base URL to start crawling from'),
    crawlerOptions: z
      .object({
        /** URL patterns to include */
        includes: z
          .array(z.string())
          .describe('URL patterns to include')
          .optional(),
        /** URL patterns to exclude */
        excludes: z
          .array(z.string())
          .describe('URL patterns to exclude')
          .optional(),
        /** Generate alt text for images using LLMs (must have a paid plan) */
        generateImgAltText: z
          .boolean()
          .describe(
            'Generate alt text for images using LLMs (must have a paid plan)'
          )
          .default(false),
        /** If true, returns only the URLs as a list on the crawl status. Attention: the response will be a list of URLs inside the data, not a list of documents. */
        returnOnlyUrls: z
          .boolean()
          .describe(
            'If true, returns only the URLs as a list on the crawl status. Attention: the response will be a list of URLs inside the data, not a list of documents.'
          )
          .default(false),
        /** Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern. */
        maxDepth: z
          .number()
          .int()
          .describe(
            'Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.'
          )
          .optional(),
        /** The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may be less accurate and shouldn't be used on heavily JS-rendered websites. */
        mode: z
          .enum(['default', 'fast'])
          .describe(
            "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may be less accurate and shouldn't be used on heavily JS-rendered websites."
          )
          .default('default'),
        /** Ignore the website sitemap when crawling */
        ignoreSitemap: z
          .boolean()
          .describe('Ignore the website sitemap when crawling')
          .default(false),
        /** Maximum number of pages to crawl */
        limit: z
          .number()
          .int()
          .describe('Maximum number of pages to crawl')
          .default(10_000),
        /** Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product' */
        allowBackwardCrawling: z
          .boolean()
          .describe(
            "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'"
          )
          .default(false),
        /** Allows the crawler to follow links to external websites. */
        allowExternalContentLinks: z
          .boolean()
          .describe('Allows the crawler to follow links to external websites.')
          .default(false)
      })
      .optional(),
    pageOptions: z
      .object({
        /** Headers to send with the request. Can be used to send cookies, user-agent, etc. */
        headers: z
          .record(z.any())
          .describe(
            'Headers to send with the request. Can be used to send cookies, user-agent, etc.'
          )
          .optional(),
        /** Include the HTML version of the content on page. Will output an html key in the response. */
        includeHtml: z
          .boolean()
          .describe(
            'Include the HTML version of the content on page. Will output an html key in the response.'
          )
          .default(false),
        /** Include the raw HTML content of the page. Will output a rawHtml key in the response. */
        includeRawHtml: z
          .boolean()
          .describe(
            'Include the raw HTML content of the page. Will output a rawHtml key in the response.'
          )
          .default(false),
        /** Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer' */
        onlyIncludeTags: z
          .array(z.string())
          .describe(
            "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
          )
          .optional(),
        /** Only return the main content of the page excluding headers, navs, footers, etc. */
        onlyMainContent: z
          .boolean()
          .describe(
            'Only return the main content of the page excluding headers, navs, footers, etc.'
          )
          .default(false),
        /** Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer' */
        removeTags: z
          .array(z.string())
          .describe(
            "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
          )
          .optional(),
        /** Replace all relative paths with absolute paths for images and links */
        replaceAllPathsWithAbsolutePaths: z
          .boolean()
          .describe(
            'Replace all relative paths with absolute paths for images and links'
          )
          .default(false),
        /** Include a screenshot of the top of the page that you are scraping. */
        screenshot: z
          .boolean()
          .describe(
            'Include a screenshot of the top of the page that you are scraping.'
          )
          .default(false),
        /** Include a full page screenshot of the page that you are scraping. */
        fullPageScreenshot: z
          .boolean()
          .describe(
            'Include a full page screenshot of the page that you are scraping.'
          )
          .default(false),
        /** Number of milliseconds to wait for the page to load before fetching content */
        waitFor: z
          .number()
          .int()
          .describe(
            'Number of milliseconds to wait for the page to load before fetching content'
          )
          .default(0)
      })
      .optional()
  })
  export type CrawlUrlsParams = z.infer<typeof CrawlUrlsParamsSchema>
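
  // Illustrative sketch (not part of the generated spec): a minimal crawl
  // request. Nested defaults (mode, limit, etc.) are applied only when
  // `crawlerOptions` is present, because the whole object is optional. The
  // URL and exclude pattern are arbitrary examples.
  export const exampleCrawlUrlsParams: CrawlUrlsParams =
    CrawlUrlsParamsSchema.parse({
      url: 'https://example.com',
      crawlerOptions: { maxDepth: 1, excludes: ['blog/*'] }
    })
  // => crawlerOptions also gains mode: 'default', limit: 10000,
  //    ignoreSitemap: false, and the other documented defaults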

  export const CrawlUrlsResponseSchema = CrawlResponseSchema
  export type CrawlUrlsResponse = z.infer<typeof CrawlUrlsResponseSchema>

  export const SearchGoogleParamsSchema = z.object({
    /** The query to search for */
    query: z.string().describe('The query to search for'),
    pageOptions: z
      .object({
        /** Only return the main content of the page excluding headers, navs, footers, etc. */
        onlyMainContent: z
          .boolean()
          .describe(
            'Only return the main content of the page excluding headers, navs, footers, etc.'
          )
          .default(false),
        /** Fetch the content of each page. If false, defaults to a basic fast serp API. */
        fetchPageContent: z
          .boolean()
          .describe(
            'Fetch the content of each page. If false, defaults to a basic fast serp API.'
          )
          .default(true),
        /** Include the HTML version of the content on page. Will output an html key in the response. */
        includeHtml: z
          .boolean()
          .describe(
            'Include the HTML version of the content on page. Will output an html key in the response.'
          )
          .default(false),
        /** Include the raw HTML content of the page. Will output a rawHtml key in the response. */
        includeRawHtml: z
          .boolean()
          .describe(
            'Include the raw HTML content of the page. Will output a rawHtml key in the response.'
          )
          .default(false)
      })
      .optional(),
    searchOptions: z
      .object({
        /** Maximum number of results. Max is 20 during beta. */
        limit: z
          .number()
          .int()
          .describe('Maximum number of results. Max is 20 during beta.')
          .optional()
      })
      .optional()
  })
  export type SearchGoogleParams = z.infer<typeof SearchGoogleParamsSchema>

  export const SearchGoogleResponseSchema = SearchResponseSchema
  export type SearchGoogleResponse = z.infer<typeof SearchGoogleResponseSchema>

  export const GetCrawlStatusParamsSchema = z.object({
    /** ID of the crawl job */
    jobId: z.string().describe('ID of the crawl job')
  })
  export type GetCrawlStatusParams = z.infer<typeof GetCrawlStatusParamsSchema>

  export const GetCrawlStatusResponseSchema = z.object({
    /** Status of the job (completed, active, failed, paused) */
    status: z
      .string()
      .describe('Status of the job (completed, active, failed, paused)')
      .optional(),
    /** Current page number */
    current: z.number().int().describe('Current page number').optional(),
    /** Total number of pages */
    total: z.number().int().describe('Total number of pages').optional(),
    /** Data returned from the job (null when it is in progress) */
    data: z
      .array(CrawlStatusResponseObjSchema)
      .describe('Data returned from the job (null when it is in progress)')
      .optional(),
    /** Partial documents returned while the site is being crawled (streaming). **This feature is currently in alpha - expect breaking changes.** When a page is ready, it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data becomes empty and the result is available in `data`. The array holds at most 50 items; the oldest item (top of the array) is removed when a new item is added. */
    partial_data: z
      .array(CrawlStatusResponseObjSchema)
      .describe(
        'Partial documents returned while the site is being crawled (streaming). **This feature is currently in alpha - expect breaking changes.** When a page is ready, it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data becomes empty and the result is available in `data`. The array holds at most 50 items; the oldest item (top of the array) is removed when a new item is added.'
      )
      .optional()
  })
  export type GetCrawlStatusResponse = z.infer<
    typeof GetCrawlStatusResponseSchema
  >

  export const CancelCrawlJobParamsSchema = z.object({
    /** ID of the crawl job */
    jobId: z.string().describe('ID of the crawl job')
  })
  export type CancelCrawlJobParams = z.infer<typeof CancelCrawlJobParamsSchema>

  export const CancelCrawlJobResponseSchema = z.object({
    /** Returns cancelled. */
    status: z.string().describe('Returns cancelled.').optional()
  })
  export type CancelCrawlJobResponse = z.infer<
    typeof CancelCrawlJobResponseSchema
  >
}

export class FirecrawlClient extends AIFunctionsProvider {
  protected readonly ky: KyInstance

  protected readonly apiKey: string
  protected readonly apiBaseUrl: string

  constructor({
    apiKey = getEnv('FIRECRAWL_API_KEY'),
    apiBaseUrl = firecrawl.apiBaseUrl,
    ky = defaultKy
  }: {
    apiKey?: string
    apiBaseUrl?: string
    ky?: KyInstance
  } = {}) {
    assert(
      apiKey,
      'FirecrawlClient missing required "apiKey" (defaults to "FIRECRAWL_API_KEY")'
    )
    super()

    this.apiKey = apiKey
    this.apiBaseUrl = apiBaseUrl

    this.ky = ky.extend({
      prefixUrl: apiBaseUrl,
      headers: {
        // Firecrawl authenticates with an HTTP bearer token
        Authorization: `Bearer ${apiKey}`
      }
    })
  }
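
  // Usage sketch (illustrative): the client reads FIRECRAWL_API_KEY from the
  // environment by default, or the key can be passed explicitly:
  //
  //   const client = new FirecrawlClient()
  //   const client2 = new FirecrawlClient({ apiKey: '<your-api-key>' })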

  /**
   * Scrape a single URL.
   */
  @aiFunction({
    name: 'scrape',
    description: 'Scrape a single URL.',
    inputSchema: firecrawl.ScrapeParamsSchema
  })
  async scrape(
    params: firecrawl.ScrapeParams
  ): Promise<firecrawl.ScrapeResponse> {
    // Note: ky rejects inputs that begin with a slash when `prefixUrl` is set,
    // so paths are given without a leading slash.
    return this.ky
      .post('scrape', {
        json: params
      })
      .json<firecrawl.ScrapeResponse>()
  }
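
  // Example call (illustrative sketch; parsing through the schema fills in
  // the defaulted fields that the inferred ScrapeParams type requires):
  //
  //   const params = firecrawl.ScrapeParamsSchema.parse({ url: 'https://example.com' })
  //   const res = await client.scrape(params)
  //   console.log(res.data?.markdown)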

  /**
   * Crawl multiple URLs based on options.
   */
  @aiFunction({
    name: 'crawl_urls',
    description: 'Crawl multiple URLs based on options.',
    inputSchema: firecrawl.CrawlUrlsParamsSchema
  })
  async crawlUrls(
    params: firecrawl.CrawlUrlsParams
  ): Promise<firecrawl.CrawlUrlsResponse> {
    return this.ky
      .post('crawl', {
        json: params
      })
      .json<firecrawl.CrawlUrlsResponse>()
  }
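
  // Example call (illustrative sketch): start a crawl and keep the returned
  // job id for status polling with getCrawlStatus below.
  //
  //   const crawl = await client.crawlUrls({ url: 'https://example.com' })
  //   const jobId = crawl.id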

  /**
   * Search for a keyword in Google, returning the top page results with markdown content for each page.
   */
  @aiFunction({
    name: 'search_google',
    description:
      'Search for a keyword in Google, returning the top page results with markdown content for each page.',
    inputSchema: firecrawl.SearchGoogleParamsSchema
  })
  async searchGoogle(
    params: firecrawl.SearchGoogleParams
  ): Promise<firecrawl.SearchGoogleResponse> {
    return this.ky
      .post('search', {
        json: params
      })
      .json<firecrawl.SearchGoogleResponse>()
  }
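
  // Example call (illustrative sketch; the query text is arbitrary):
  //
  //   const results = await client.searchGoogle({ query: 'firecrawl markdown scraping' })
  //   console.log(results.data?.length)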

  /**
   * Get the status of a crawl job.
   */
  @aiFunction({
    name: 'get_crawl_status',
    description: 'Get the status of a crawl job.',
    inputSchema: firecrawl.GetCrawlStatusParamsSchema
  })
  async getCrawlStatus(
    params: firecrawl.GetCrawlStatusParams
  ): Promise<firecrawl.GetCrawlStatusResponse> {
    return this.ky
      .get(`crawl/status/${params.jobId}`)
      .json<firecrawl.GetCrawlStatusResponse>()
  }
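
  // Polling sketch (illustrative): wait for a crawl job to leave the 'active'
  // state, using the status values documented on GetCrawlStatusResponseSchema.
  //
  //   let status = await client.getCrawlStatus({ jobId })
  //   while (status.status === 'active') {
  //     await new Promise((resolve) => setTimeout(resolve, 2000))
  //     status = await client.getCrawlStatus({ jobId })
  //   }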

  /**
   * Cancel a crawl job.
   */
  @aiFunction({
    name: 'cancel_crawl_job',
    description: 'Cancel a crawl job.',
    inputSchema: firecrawl.CancelCrawlJobParamsSchema
  })
  async cancelCrawlJob(
    params: firecrawl.CancelCrawlJobParams
  ): Promise<firecrawl.CancelCrawlJobResponse> {
    return this.ky
      .delete(`crawl/cancel/${params.jobId}`)
      .json<firecrawl.CancelCrawlJobResponse>()
  }
}
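
// Illustrative end-to-end sketch (not part of the generated client). Assumes
// FIRECRAWL_API_KEY is set in the environment; the URL is an arbitrary example.
export async function exampleFirecrawlUsage(): Promise<void> {
  const client = new FirecrawlClient()

  // Scrape a single page; markdown is the default format.
  const scraped = await client.scrape(
    firecrawl.ScrapeParamsSchema.parse({ url: 'https://example.com' })
  )
  console.log(scraped.data?.markdown)

  // Start a crawl and poll until the job is no longer active.
  const crawl = await client.crawlUrls({ url: 'https://example.com' })
  const jobId = crawl.id
  if (!jobId) return

  let status = await client.getCrawlStatus({ jobId })
  while (status.status === 'active') {
    await new Promise((resolve) => setTimeout(resolve, 2000))
    status = await client.getCrawlStatus({ jobId })
  }

  if (status.status !== 'completed') {
    // The job ended in a non-terminal-success state; cancel it explicitly.
    await client.cancelCrawlJob({ jobId })
  } else {
    console.log(`crawled ${status.total ?? 0} pages`)
  }
}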