"Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)"
)
.default(['markdown']),
/** Headers to send with the request (cookies, user-agent, etc.). Optional. NOTE(review): values are typed `z.any()`; presumably they should be strings — confirm against the API consumer. */
headers: z
.record(z.any())
.describe(
'Headers to send with the request. Can be used to send cookies, user-agent, etc.'
)
.optional(),
/** Only include these tags, classes and ids in the final output. Optional. NOTE(review): typed as `string[]` but the description says "comma separated values" — confirm which form the upstream API expects. Example: 'script, .ad, #footer' */
includeTags: z
.array(z.string())
.describe(
"Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
)
.optional(),
/** Tags, classes and ids to remove from the page. Optional. Same `string[]`-vs-CSV caveat as `includeTags`. Example: 'script, .ad, #footer' */
excludeTags: z
.array(z.string())
.describe(
"Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
)
.optional(),
/** Only return the main content of the page, excluding headers, navs, footers, etc. Defaults to `true`. */
onlyMainContent: z
.boolean()
.describe(
'Only return the main content of the page excluding headers, navs, footers, etc.'
)
.default(true),
/** Request timeout in milliseconds (integer). Defaults to `30_000` (30 s). */
timeout: z
.number()
.int()
.describe('Timeout in milliseconds for the request')
.default(30_000),
/** Wait x amount of milliseconds for the page to load to fetch content */
waitFor: z
.number()
.int()
.describe(
'Wait x amount of milliseconds for the page to load to fetch content'
/** The base URL to start crawling from. Required; must be a valid URL. */
url: z.string().url().describe('The base URL to start crawling from'),
/** Crawl-behaviour configuration. The entire object is optional; each field carries its own default or optionality. */
crawlerOptions: z
.object({
/** URL patterns the crawler is allowed to follow. Optional. */
includes: z
.array(z.string())
.describe('URL patterns to include')
.optional(),
/** URL patterns the crawler must skip. Optional. */
excludes: z
.array(z.string())
.describe('URL patterns to exclude')
.optional(),
/** Generate alt text for images using LLMs (must have a paid plan). Defaults to `false`. */
generateImgAltText: z
.boolean()
.describe(
'Generate alt text for images using LLMs (must have a paid plan)'
)
.default(false),
/** If true, the crawl status returns only a list of URLs inside `data`, not a list of documents. Defaults to `false`. */
returnOnlyUrls: z
.boolean()
.describe(
'If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.'
)
.default(false),
/** Maximum depth to crawl relative to the entered URL. 0 = only the entered URL; 1 = entered URL plus pages one level deep; and so on. Optional (no limit when omitted). */
maxDepth: z
.number()
.int()
.describe(
'Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.'
)
.optional(),
/** Crawling mode. 'fast' crawls ~4x faster on sites without a sitemap, but is less accurate and unsuited to heavily JS-rendered sites. Defaults to 'default'. NOTE(review): `['default','fast']` lacks the space after the comma used elsewhere in this file — cosmetic only. */
mode: z
.enum(['default','fast'])
.describe(
"The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites."
)
.default('default'),
/** Ignore the website's sitemap when crawling. Defaults to `false`. */
ignoreSitemap: z
.boolean()
.describe('Ignore the website sitemap when crawling')
.default(false),
/** Maximum number of pages to crawl (integer). Defaults to `10_000`. */
limit: z
.number()
.int()
.describe('Maximum number of pages to crawl')
.default(10_000),
/** Allow navigating from a specific URL back to previously linked pages, e.g. from 'example.com/product/123' back to 'example.com/product'. Defaults to `false`. */
allowBackwardCrawling: z
.boolean()
.describe(
"Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'"
)
.default(false),
/** Allow following links to external websites. Defaults to `false`. */
allowExternalContentLinks: z
.boolean()
.describe('Allows the crawler to follow links to external websites.')
.default(false)
})
.optional(),
pageOptions: z
.object({
/** Headers to send with each page request (cookies, user-agent, etc.). Optional. NOTE(review): values are typed `z.any()`; presumably strings — confirm against the API consumer. */
headers: z
.record(z.any())
.describe(
'Headers to send with the request. Can be used to send cookies, user-agent, etc.'
)
.optional(),
/** Include the HTML version of the content; adds an `html` key to the response. Defaults to `false`. */
includeHtml: z
.boolean()
.describe(
'Include the HTML version of the content on page. Will output a html key in the response.'
)
.default(false),
/** Include the raw HTML content of the page; adds a `rawHtml` key to the response. Defaults to `false`. */
includeRawHtml: z
.boolean()
.describe(
'Include the raw HTML content of the page. Will output a rawHtml key in the response.'
)
.default(false),
/** Only include these tags, classes and ids in the final output. Optional. NOTE(review): typed as `string[]` but described as comma-separated — confirm which form the upstream API expects. Example: 'script, .ad, #footer' */
onlyIncludeTags: z
.array(z.string())
.describe(
"Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
)
.optional(),
/** Only return the main content of the page, excluding headers, navs, footers, etc. Defaults to `false`. NOTE(review): the scrape options earlier in this file default the same field to `true` — confirm this asymmetry is intentional. */
onlyMainContent: z
.boolean()
.describe(
'Only return the main content of the page excluding headers, navs, footers, etc.'
)
.default(false),
/** Tags, classes and ids to remove from the page. Optional. Same `string[]`-vs-CSV caveat as `onlyIncludeTags`. Example: 'script, .ad, #footer' */
removeTags: z
.array(z.string())
.describe(
"Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
)
.optional(),
/** Replace all relative paths with absolute paths for images and links. Defaults to `false`. */
replaceAllPathsWithAbsolutePaths: z
.boolean()
.describe(
'Replace all relative paths with absolute paths for images and links'
)
.default(false),
/** Include a screenshot of the top of the page being scraped. Defaults to `false`. */
screenshot: z
.boolean()
.describe(
'Include a screenshot of the top of the page that you are scraping.'
)
.default(false),
/** Include a full-page screenshot of the page being scraped. Defaults to `false`. */
fullPageScreenshot: z
.boolean()
.describe(
'Include a full page screenshot of the page that you are scraping.'
)
.default(false),
/** Wait x amount of milliseconds for the page to load to fetch content */
waitFor: z
.number()
.int()
.describe(
'Wait x amount of milliseconds for the page to load to fetch content'
/** Total number of pages (integer). Optional. */
total: z.number().int().describe('Total number of pages').optional(),
/** Documents returned from the job; absent/null while the job is still in progress. */
data: z
.array(CrawlStatusResponseObjSchema)
.describe('Data returned from the job (null when it is in progress)')
.optional(),
/** Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array. */
partial_data: z
.array(CrawlStatusResponseObjSchema)
.describe(
'Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array.'