fix: misc fixes for SearchAndCrawlTool

Travis Fischer 2023-06-16 16:09:33 -07:00
rodzic 730ed2329d
commit c60f1a108f
5 zmienionych plików z 46 dodań i 17 usunięć

Wyświetl plik

@ -10,6 +10,9 @@ async function main() {
const res = await agentic
.gpt4(`Summarize the latest news on {{topic}} using markdown.`)
.modelParams({
model: 'gpt-4-32k'
})
.tools([new SearchAndCrawlTool()])
.input(
z.object({

Wyświetl plik

@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
title: z.string(),
siteName: z.string(),
author: z.string(),
authorUrl: z.string(),
// authorUrl: z.string(),
pageUrl: z.string(),
date: z.string(),
estimatedDate: z.string(),
humanLanguage: z.string(),
// estimatedDate: z.string(),
// humanLanguage: z.string(),
text: z.string().describe('main text content of the page'),
tags: z.array(z.string()),
images: z.array(DiffbotImageSchema),
// tags: z.array(z.string()),
// images: z.array(DiffbotImageSchema),
items: z.array(DiffbotListItemSchema)
})
@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask<DiffbotInput, DiffbotOutput> {
'type',
'siteName',
'author',
'authorUrl',
// 'authorUrl',
'pageUrl',
'date',
'estimatedDate',
'humanLanguage',
// 'estimatedDate',
// 'humanLanguage',
'items',
'text'
),
tags: obj.tags?.map((tag) => tag.label),
images: obj.images?.map((image) => omit(image, 'diffbotUri'))
)
// tags: obj.tags?.map((tag) => tag.label)
// images: obj.images?.map((image) => omit(image, 'diffbotUri'))
}))
}

Wyświetl plik

@ -3,7 +3,7 @@ import { z } from 'zod'
import * as types from '@/types'
import { BaseTask } from '@/task'
import { normalizeUrl } from '@/url-utils'
import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
import { omit } from '@/utils'
import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
): Promise<Array<DiffbotOutput>> {
try {
if (!url) return []
if (!isValidCrawlableUrl(url)) return []
if (crawledUrls.has(url)) return []
const normalizedUrl = normalizeUrl(url)
@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
)
).flat()
return [scrapeResult, ...innerScrapeResults]
return innerScrapeResults
} catch (err) {
console.warn('crawlAndScrape error', url, err)
return []
}
}
const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
const search = await this._serpapiTool.callWithMetadata(
{ query, numResults: 3 },
ctx
)
const scrapeResults = (
await pMap(
search.result.organic_results || [],
(search.result.organic_results || []).slice(0, 3),
async (searchResult) => {
return crawlAndScrape(searchResult.link, {
diffbotTool: this._diffbotTool,

Wyświetl plik

@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask<SerpAPIInput, SerpAPIOutput> {
// results manually
})
this._logger.debug(
this._logger.info(
res,
`SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
)

Wyświetl plik

@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
import normalizeUrlImpl, { type Options } from 'normalize-url'
import QuickLRU from 'quick-lru'
// const protocolAllowList = new Set(['https:', 'http:'])
const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string | null>({
maxSize: 4000
})
export function isValidCrawlableUrl(url: string): boolean {
try {
if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
return false
}
const parsedUrl = new URL(url)
if (!protocolAllowList.has(parsedUrl.protocol)) {
return false
}
const normalizedUrl = normalizeUrl(url)
if (!normalizedUrl) {
return false
}
return true
} catch (err) {
return false
}
}
/**
* Generates a hash string from normalization options.
*