kopia lustrzana https://github.com/transitive-bullshit/chatgpt-api
fix: misc fixes for SeardAndCrawlTool
rodzic
730ed2329d
commit
c60f1a108f
|
@ -10,6 +10,9 @@ async function main() {
|
||||||
|
|
||||||
const res = await agentic
|
const res = await agentic
|
||||||
.gpt4(`Summarize the latest news on {{topic}} using markdown.`)
|
.gpt4(`Summarize the latest news on {{topic}} using markdown.`)
|
||||||
|
.modelParams({
|
||||||
|
model: 'gpt-4-32k'
|
||||||
|
})
|
||||||
.tools([new SearchAndCrawlTool()])
|
.tools([new SearchAndCrawlTool()])
|
||||||
.input(
|
.input(
|
||||||
z.object({
|
z.object({
|
||||||
|
|
|
@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
|
||||||
title: z.string(),
|
title: z.string(),
|
||||||
siteName: z.string(),
|
siteName: z.string(),
|
||||||
author: z.string(),
|
author: z.string(),
|
||||||
authorUrl: z.string(),
|
// authorUrl: z.string(),
|
||||||
pageUrl: z.string(),
|
pageUrl: z.string(),
|
||||||
date: z.string(),
|
date: z.string(),
|
||||||
estimatedDate: z.string(),
|
// estimatedDate: z.string(),
|
||||||
humanLanguage: z.string(),
|
// humanLanguage: z.string(),
|
||||||
text: z.string().describe('main text content of the page'),
|
text: z.string().describe('main text content of the page'),
|
||||||
tags: z.array(z.string()),
|
// tags: z.array(z.string()),
|
||||||
images: z.array(DiffbotImageSchema),
|
// images: z.array(DiffbotImageSchema),
|
||||||
items: z.array(DiffbotListItemSchema)
|
items: z.array(DiffbotListItemSchema)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask<DiffbotInput, DiffbotOutput> {
|
||||||
'type',
|
'type',
|
||||||
'siteName',
|
'siteName',
|
||||||
'author',
|
'author',
|
||||||
'authorUrl',
|
// 'authorUrl',
|
||||||
'pageUrl',
|
'pageUrl',
|
||||||
'date',
|
'date',
|
||||||
'estimatedDate',
|
// 'estimatedDate',
|
||||||
'humanLanguage',
|
// 'humanLanguage',
|
||||||
'items',
|
'items',
|
||||||
'text'
|
'text'
|
||||||
),
|
)
|
||||||
tags: obj.tags?.map((tag) => tag.label),
|
// tags: obj.tags?.map((tag) => tag.label)
|
||||||
images: obj.images?.map((image) => omit(image, 'diffbotUri'))
|
// images: obj.images?.map((image) => omit(image, 'diffbotUri'))
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import { z } from 'zod'
|
||||||
|
|
||||||
import * as types from '@/types'
|
import * as types from '@/types'
|
||||||
import { BaseTask } from '@/task'
|
import { BaseTask } from '@/task'
|
||||||
import { normalizeUrl } from '@/url-utils'
|
import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
|
||||||
import { omit } from '@/utils'
|
import { omit } from '@/utils'
|
||||||
|
|
||||||
import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
|
import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
|
||||||
|
@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
|
||||||
): Promise<Array<DiffbotOutput>> {
|
): Promise<Array<DiffbotOutput>> {
|
||||||
try {
|
try {
|
||||||
if (!url) return []
|
if (!url) return []
|
||||||
|
if (!isValidCrawlableUrl(url)) return []
|
||||||
if (crawledUrls.has(url)) return []
|
if (crawledUrls.has(url)) return []
|
||||||
|
|
||||||
const normalizedUrl = normalizeUrl(url)
|
const normalizedUrl = normalizeUrl(url)
|
||||||
|
@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
|
||||||
)
|
)
|
||||||
).flat()
|
).flat()
|
||||||
|
|
||||||
return [scrapeResult, ...innerScrapeResults]
|
return innerScrapeResults
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn('crawlAndScrape error', url, err)
|
console.warn('crawlAndScrape error', url, err)
|
||||||
return []
|
return []
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
|
const search = await this._serpapiTool.callWithMetadata(
|
||||||
|
{ query, numResults: 3 },
|
||||||
|
ctx
|
||||||
|
)
|
||||||
|
|
||||||
const scrapeResults = (
|
const scrapeResults = (
|
||||||
await pMap(
|
await pMap(
|
||||||
search.result.organic_results || [],
|
(search.result.organic_results || []).slice(0, 3),
|
||||||
async (searchResult) => {
|
async (searchResult) => {
|
||||||
return crawlAndScrape(searchResult.link, {
|
return crawlAndScrape(searchResult.link, {
|
||||||
diffbotTool: this._diffbotTool,
|
diffbotTool: this._diffbotTool,
|
||||||
|
|
|
@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask<SerpAPIInput, SerpAPIOutput> {
|
||||||
// results manuall
|
// results manuall
|
||||||
})
|
})
|
||||||
|
|
||||||
this._logger.debug(
|
this._logger.info(
|
||||||
res,
|
res,
|
||||||
`SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
|
`SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
|
||||||
import normalizeUrlImpl, { type Options } from 'normalize-url'
|
import normalizeUrlImpl, { type Options } from 'normalize-url'
|
||||||
import QuickLRU from 'quick-lru'
|
import QuickLRU from 'quick-lru'
|
||||||
|
|
||||||
// const protocolAllowList = new Set(['https:', 'http:'])
|
const protocolAllowList = new Set(['https:', 'http:'])
|
||||||
const normalizedUrlCache = new QuickLRU<string, string | null>({
|
const normalizedUrlCache = new QuickLRU<string, string | null>({
|
||||||
maxSize: 4000
|
maxSize: 4000
|
||||||
})
|
})
|
||||||
|
|
||||||
|
export function isValidCrawlableUrl(url: string): boolean {
|
||||||
|
try {
|
||||||
|
if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const parsedUrl = new URL(url)
|
||||||
|
if (!protocolAllowList.has(parsedUrl.protocol)) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const normalizedUrl = normalizeUrl(url)
|
||||||
|
if (!normalizedUrl) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
|
} catch (err) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a hash string from normalization options.
|
* Generates a hash string from normalization options.
|
||||||
*
|
*
|
||||||
|
|
Ładowanie…
Reference in New Issue