Mirror of https://github.com/transitive-bullshit/chatgpt-api

fix: misc fixes for SearchAndCrawlTool

parent 730ed2329d
commit c60f1a108f
@@ -10,6 +10,9 @@ async function main() {
   const res = await agentic
     .gpt4(`Summarize the latest news on {{topic}} using markdown.`)
+    .modelParams({
+      model: 'gpt-4-32k'
+    })
     .tools([new SearchAndCrawlTool()])
     .input(
       z.object({

@@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
   title: z.string(),
   siteName: z.string(),
   author: z.string(),
-  authorUrl: z.string(),
+  // authorUrl: z.string(),
   pageUrl: z.string(),
   date: z.string(),
-  estimatedDate: z.string(),
-  humanLanguage: z.string(),
+  // estimatedDate: z.string(),
+  // humanLanguage: z.string(),
   text: z.string().describe('main text content of the page'),
-  tags: z.array(z.string()),
-  images: z.array(DiffbotImageSchema),
+  // tags: z.array(z.string()),
+  // images: z.array(DiffbotImageSchema),
   items: z.array(DiffbotListItemSchema)
 })

@@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask<DiffbotInput, DiffbotOutput> {
         'type',
         'siteName',
         'author',
-        'authorUrl',
+        // 'authorUrl',
         'pageUrl',
         'date',
-        'estimatedDate',
-        'humanLanguage',
+        // 'estimatedDate',
+        // 'humanLanguage',
         'items',
         'text'
-      ),
-      tags: obj.tags?.map((tag) => tag.label),
-      images: obj.images?.map((image) => omit(image, 'diffbotUri'))
+      )
+      // tags: obj.tags?.map((tag) => tag.label)
+      // images: obj.images?.map((image) => omit(image, 'diffbotUri'))
     }))
   }

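The two hunks above comment out the same Diffbot fields in two places: once in the zod schema and once in the pick() key list, so the two lists can drift apart. As a hypothetical alternative (not part of this commit; the schema below is shortened for illustration), the key list could be derived from the schema itself:

    import { z } from 'zod'

    // Shortened stand-in for the real DiffbotObjectSchema.
    const DiffbotObjectSchema = z.object({
      type: z.string(),
      title: z.string(),
      text: z.string()
    })

    // A ZodObject exposes its field definitions via .shape, so the pick
    // keys come straight from the schema and cannot disagree with it.
    const pickKeys = Object.keys(DiffbotObjectSchema.shape)
    // => ['type', 'title', 'text']
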
@@ -3,7 +3,7 @@ import { z } from 'zod'

 import * as types from '@/types'
 import { BaseTask } from '@/task'
-import { normalizeUrl } from '@/url-utils'
+import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
 import { omit } from '@/utils'

 import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'

@@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
     ): Promise<Array<DiffbotOutput>> {
       try {
         if (!url) return []
+        if (!isValidCrawlableUrl(url)) return []
         if (crawledUrls.has(url)) return []

         const normalizedUrl = normalizeUrl(url)

@@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
           )
         ).flat()

-        return [scrapeResult, ...innerScrapeResults]
+        return innerScrapeResults
       } catch (err) {
         console.warn('crawlAndScrape error', url, err)
         return []
       }
     }

-    const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
+    const search = await this._serpapiTool.callWithMetadata(
+      { query, numResults: 3 },
+      ctx
+    )

     const scrapeResults = (
       await pMap(
-        search.result.organic_results || [],
+        (search.result.organic_results || []).slice(0, 3),
         async (searchResult) => {
           return crawlAndScrape(searchResult.link, {
             diffbotTool: this._diffbotTool,

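Both the numResults: 3 request parameter and the slice(0, 3) guard cap the crawl fan-out at three organic results; the slice presumably also covers providers that ignore numResults. A minimal sketch of the bounded pMap pattern, with a hypothetical fetchAndParse standing in for the real crawlAndScrape step:

    import pMap from 'p-map'

    // Hypothetical stand-in for the real crawl step.
    async function fetchAndParse(url: string): Promise<string[]> {
      return [url]
    }

    async function demo() {
      const links = ['https://a.example', 'https://b.example', 'https://c.example', 'https://d.example']

      // Crawl at most 3 links and flatten the per-link result arrays;
      // p-map's concurrency option bounds how many run at once.
      const results = (
        await pMap(links.slice(0, 3), fetchAndParse, { concurrency: 2 })
      ).flat()

      console.log(results.length) // 3
    }
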
@@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask<SerpAPIInput, SerpAPIOutput> {
       // results manuall
     })

-    this._logger.debug(
+    this._logger.info(
       res,
       `SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
     )

@@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
 import normalizeUrlImpl, { type Options } from 'normalize-url'
 import QuickLRU from 'quick-lru'

-// const protocolAllowList = new Set(['https:', 'http:'])
+const protocolAllowList = new Set(['https:', 'http:'])
 const normalizedUrlCache = new QuickLRU<string, string | null>({
   maxSize: 4000
 })

+export function isValidCrawlableUrl(url: string): boolean {
+  try {
+    if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
+      return false
+    }
+
+    const parsedUrl = new URL(url)
+    if (!protocolAllowList.has(parsedUrl.protocol)) {
+      return false
+    }
+
+    const normalizedUrl = normalizeUrl(url)
+    if (!normalizedUrl) {
+      return false
+    }
+
+    return true
+  } catch (err) {
+    return false
+  }
+}
+
 /**
  * Generates a hash string from normalization options.
  *
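The new isValidCrawlableUrl guard rejects anything that is not an absolute http(s) URL before the crawler spends a request on it. A quick sketch of the expected behavior, assuming the export from '@/url-utils' shown in the diff above:

    import { isValidCrawlableUrl } from '@/url-utils'

    isValidCrawlableUrl('https://example.com/post') // true
    isValidCrawlableUrl('http://example.com')       // true
    isValidCrawlableUrl('/relative/path')           // false (relative URL)
    isValidCrawlableUrl('ftp://example.com/file')   // false (protocol not allowlisted)
    isValidCrawlableUrl('')                         // false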