kopia lustrzana https://github.com/transitive-bullshit/chatgpt-api
pull/643/head^2
rodzic
526b96d87a
commit
5e0017be62
|
@ -101,5 +101,6 @@ export class AIFunctionSet implements Iterable<types.AIFunction> {
|
||||||
}
|
}
|
||||||
|
|
||||||
function transformName(name: string): string {
|
function transformName(name: string): string {
|
||||||
|
// TODO: decamelize?
|
||||||
return name.toLowerCase()
|
return name.toLowerCase()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,27 +1,33 @@
|
||||||
import defaultKy, { type KyInstance } from 'ky'
|
import defaultKy, { type KyInstance } from 'ky'
|
||||||
|
import { z } from 'zod'
|
||||||
|
|
||||||
import { assert, getEnv } from '../utils.js'
|
import { aiFunction, AIFunctionsProvider } from '../fns.js'
|
||||||
|
import { assert, getEnv, omit } from '../utils.js'
|
||||||
|
|
||||||
export namespace scraper {
|
export namespace scraper {
|
||||||
export type ScrapeResult = {
|
export type ScrapeResult = {
|
||||||
author: string
|
author: string
|
||||||
byline: string
|
byline: string
|
||||||
/** The HTML for the main content of the page. */
|
|
||||||
content: string
|
|
||||||
description: string
|
description: string
|
||||||
imageUrl: string
|
imageUrl: string
|
||||||
lang: string
|
lang: string
|
||||||
length: number
|
length: number
|
||||||
logoUrl: string
|
logoUrl: string
|
||||||
/** The text for the main content of the page in markdown format. */
|
|
||||||
markdownContent: string
|
|
||||||
publishedTime: string
|
publishedTime: string
|
||||||
|
siteName: string
|
||||||
|
title: string
|
||||||
|
|
||||||
|
/** The HTML for the main content of the page. */
|
||||||
|
content: string
|
||||||
|
|
||||||
/** The raw HTML response from the server. */
|
/** The raw HTML response from the server. */
|
||||||
rawHtml: string
|
rawHtml: string
|
||||||
siteName: string
|
|
||||||
|
/** The text for the main content of the page in markdown format. */
|
||||||
|
markdownContent: string
|
||||||
|
|
||||||
/** The text for the main content of the page. */
|
/** The text for the main content of the page. */
|
||||||
textContent: string
|
textContent: string
|
||||||
title: string
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,7 +39,7 @@ export namespace scraper {
|
||||||
* It tries the simplest and fastest methods first, and falls back to slower
|
* It tries the simplest and fastest methods first, and falls back to slower
|
||||||
* proxies and JavaScript rendering if needed.
|
* proxies and JavaScript rendering if needed.
|
||||||
*/
|
*/
|
||||||
export class ScraperClient {
|
export class ScraperClient extends AIFunctionsProvider {
|
||||||
readonly apiBaseUrl: string
|
readonly apiBaseUrl: string
|
||||||
readonly ky: KyInstance
|
readonly ky: KyInstance
|
||||||
|
|
||||||
|
@ -45,25 +51,64 @@ export class ScraperClient {
|
||||||
apiBaseUrl?: string
|
apiBaseUrl?: string
|
||||||
ky?: KyInstance
|
ky?: KyInstance
|
||||||
} = {}) {
|
} = {}) {
|
||||||
assert(apiBaseUrl, 'ScraperClient apiBaseUrl is required')
|
assert(
|
||||||
|
apiBaseUrl,
|
||||||
|
'ScraperClient missing required "apiBaseUrl" (defaults to "SCRAPER_API_BASE_URL")'
|
||||||
|
)
|
||||||
|
super()
|
||||||
|
|
||||||
this.apiBaseUrl = apiBaseUrl
|
this.apiBaseUrl = apiBaseUrl
|
||||||
this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
|
this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeUrl(
|
@aiFunction({
|
||||||
url: string,
|
name: 'scrape_url',
|
||||||
{
|
description: 'Scrapes the content of a single URL.',
|
||||||
timeout = 60_000
|
inputSchema: z.object({
|
||||||
}: {
|
url: z.string().url().describe('The URL of the web page to scrape'),
|
||||||
timeout?: number
|
format: z
|
||||||
} = {}
|
.enum(['html', 'markdown', 'plaintext'])
|
||||||
): Promise<scraper.ScrapeResult> {
|
.default('markdown')
|
||||||
return this.ky
|
.optional()
|
||||||
.post('scrape', {
|
.describe(
|
||||||
json: { url },
|
'Whether to return the content as HTML, markdown, or plaintext.'
|
||||||
timeout
|
)
|
||||||
})
|
})
|
||||||
.json()
|
})
|
||||||
|
async scrapeUrl(
|
||||||
|
urlOrOpts:
|
||||||
|
| string
|
||||||
|
| {
|
||||||
|
url: string
|
||||||
|
format?: 'html' | 'markdown' | 'plaintext'
|
||||||
|
timeoutMs?: number
|
||||||
|
}
|
||||||
|
): Promise<Partial<scraper.ScrapeResult>> {
|
||||||
|
const {
|
||||||
|
timeoutMs = 60_000,
|
||||||
|
format = 'markdown',
|
||||||
|
...opts
|
||||||
|
} = typeof urlOrOpts === 'string' ? { url: urlOrOpts } : urlOrOpts
|
||||||
|
|
||||||
|
const res = await this.ky
|
||||||
|
.post('scrape', {
|
||||||
|
json: opts,
|
||||||
|
timeout: timeoutMs
|
||||||
|
})
|
||||||
|
.json<scraper.ScrapeResult>()
|
||||||
|
|
||||||
|
switch (format) {
|
||||||
|
case 'html':
|
||||||
|
return omit(res, 'markdownContent', 'textContent', 'rawHtml')
|
||||||
|
|
||||||
|
case 'markdown':
|
||||||
|
return omit(res, 'textContent', 'rawHtml', 'content')
|
||||||
|
|
||||||
|
case 'plaintext':
|
||||||
|
return omit(res, 'markdownContent', 'rawHtml', 'content')
|
||||||
|
|
||||||
|
default:
|
||||||
|
return res
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Ładowanie…
Reference in New Issue