fix: misc fixes for SearchAndCrawlTool

Travis Fischer 2023-06-16 16:09:33 -07:00
rodzic 730ed2329d
commit c60f1a108f
5 zmienionych plików z 46 dodań i 17 usunięć

Wyświetl plik

@ -10,6 +10,9 @@ async function main() {
const res = await agentic const res = await agentic
.gpt4(`Summarize the latest news on {{topic}} using markdown.`) .gpt4(`Summarize the latest news on {{topic}} using markdown.`)
.modelParams({
model: 'gpt-4-32k'
})
.tools([new SearchAndCrawlTool()]) .tools([new SearchAndCrawlTool()])
.input( .input(
z.object({ z.object({

Wyświetl plik

@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
title: z.string(), title: z.string(),
siteName: z.string(), siteName: z.string(),
author: z.string(), author: z.string(),
authorUrl: z.string(), // authorUrl: z.string(),
pageUrl: z.string(), pageUrl: z.string(),
date: z.string(), date: z.string(),
estimatedDate: z.string(), // estimatedDate: z.string(),
humanLanguage: z.string(), // humanLanguage: z.string(),
text: z.string().describe('main text content of the page'), text: z.string().describe('main text content of the page'),
tags: z.array(z.string()), // tags: z.array(z.string()),
images: z.array(DiffbotImageSchema), // images: z.array(DiffbotImageSchema),
items: z.array(DiffbotListItemSchema) items: z.array(DiffbotListItemSchema)
}) })
@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask<DiffbotInput, DiffbotOutput> {
'type', 'type',
'siteName', 'siteName',
'author', 'author',
'authorUrl', // 'authorUrl',
'pageUrl', 'pageUrl',
'date', 'date',
'estimatedDate', // 'estimatedDate',
'humanLanguage', // 'humanLanguage',
'items', 'items',
'text' 'text'
), )
tags: obj.tags?.map((tag) => tag.label), // tags: obj.tags?.map((tag) => tag.label)
images: obj.images?.map((image) => omit(image, 'diffbotUri')) // images: obj.images?.map((image) => omit(image, 'diffbotUri'))
})) }))
} }

Wyświetl plik

@ -3,7 +3,7 @@ import { z } from 'zod'
import * as types from '@/types' import * as types from '@/types'
import { BaseTask } from '@/task' import { BaseTask } from '@/task'
import { normalizeUrl } from '@/url-utils' import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
import { omit } from '@/utils' import { omit } from '@/utils'
import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot' import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
): Promise<Array<DiffbotOutput>> { ): Promise<Array<DiffbotOutput>> {
try { try {
if (!url) return [] if (!url) return []
if (!isValidCrawlableUrl(url)) return []
if (crawledUrls.has(url)) return [] if (crawledUrls.has(url)) return []
const normalizedUrl = normalizeUrl(url) const normalizedUrl = normalizeUrl(url)
@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
) )
).flat() ).flat()
return [scrapeResult, ...innerScrapeResults] return innerScrapeResults
} catch (err) { } catch (err) {
console.warn('crawlAndScrape error', url, err) console.warn('crawlAndScrape error', url, err)
return [] return []
} }
} }
const search = await this._serpapiTool.callWithMetadata({ query }, ctx) const search = await this._serpapiTool.callWithMetadata(
{ query, numResults: 3 },
ctx
)
const scrapeResults = ( const scrapeResults = (
await pMap( await pMap(
search.result.organic_results || [], (search.result.organic_results || []).slice(0, 3),
async (searchResult) => { async (searchResult) => {
return crawlAndScrape(searchResult.link, { return crawlAndScrape(searchResult.link, {
diffbotTool: this._diffbotTool, diffbotTool: this._diffbotTool,

Wyświetl plik

@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask<SerpAPIInput, SerpAPIOutput> {
// results manually // results manually
}) })
this._logger.debug( this._logger.info(
res, res,
`SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"` `SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
) )

Wyświetl plik

@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
import normalizeUrlImpl, { type Options } from 'normalize-url' import normalizeUrlImpl, { type Options } from 'normalize-url'
import QuickLRU from 'quick-lru' import QuickLRU from 'quick-lru'
// const protocolAllowList = new Set(['https:', 'http:']) const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string | null>({ const normalizedUrlCache = new QuickLRU<string, string | null>({
maxSize: 4000 maxSize: 4000
}) })
export function isValidCrawlableUrl(url: string): boolean {
try {
if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
return false
}
const parsedUrl = new URL(url)
if (!protocolAllowList.has(parsedUrl.protocol)) {
return false
}
const normalizedUrl = normalizeUrl(url)
if (!normalizedUrl) {
return false
}
return true
} catch (err) {
return false
}
}
/** /**
* Generates a hash string from normalization options. * Generates a hash string from normalization options.
* *