From a4cb592a2deff0217bd9f312bd25d4f63a0c055c Mon Sep 17 00:00:00 2001
From: Travis Fischer
Date: Fri, 16 Jun 2023 16:09:33 -0700
Subject: [PATCH] fix: misc fixes for SearchAndCrawlTool

---
 legacy/examples/search-and-crawl.ts  |  3 +++
 legacy/src/tools/diffbot.ts          | 22 +++++++++++-----------
 legacy/src/tools/search-and-crawl.ts | 12 ++++++++----
 legacy/src/tools/serpapi.ts          |  2 +-
 legacy/src/url-utils.ts              | 24 +++++++++++++++++++++++-
 5 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/legacy/examples/search-and-crawl.ts b/legacy/examples/search-and-crawl.ts
index 2107a3eb..f89ae79c 100644
--- a/legacy/examples/search-and-crawl.ts
+++ b/legacy/examples/search-and-crawl.ts
@@ -10,6 +10,9 @@ async function main() {
 
   const res = await agentic
     .gpt4(`Summarize the latest news on {{topic}} using markdown.`)
+    .modelParams({
+      model: 'gpt-4-32k'
+    })
     .tools([new SearchAndCrawlTool()])
     .input(
       z.object({
diff --git a/legacy/src/tools/diffbot.ts b/legacy/src/tools/diffbot.ts
index 61553887..adfe80ce 100644
--- a/legacy/src/tools/diffbot.ts
+++ b/legacy/src/tools/diffbot.ts
@@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
   title: z.string(),
   siteName: z.string(),
   author: z.string(),
-  authorUrl: z.string(),
+  // authorUrl: z.string(),
   pageUrl: z.string(),
   date: z.string(),
-  estimatedDate: z.string(),
-  humanLanguage: z.string(),
+  // estimatedDate: z.string(),
+  // humanLanguage: z.string(),
   text: z.string().describe('main text content of the page'),
-  tags: z.array(z.string()),
-  images: z.array(DiffbotImageSchema),
+  // tags: z.array(z.string()),
+  // images: z.array(DiffbotImageSchema),
   items: z.array(DiffbotListItemSchema)
 })
 
@@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask {
         'type',
         'siteName',
         'author',
-        'authorUrl',
+        // 'authorUrl',
         'pageUrl',
         'date',
-        'estimatedDate',
-        'humanLanguage',
+        // 'estimatedDate',
+        // 'humanLanguage',
         'items',
         'text'
-      ),
-      tags: obj.tags?.map((tag) => tag.label),
-      images: obj.images?.map((image) => omit(image, 'diffbotUri'))
+      )
+      // tags: obj.tags?.map((tag) => tag.label)
+      // images: obj.images?.map((image) => omit(image, 'diffbotUri'))
     }))
   }
 
diff --git a/legacy/src/tools/search-and-crawl.ts b/legacy/src/tools/search-and-crawl.ts
index 97533f91..065ca224 100644
--- a/legacy/src/tools/search-and-crawl.ts
+++ b/legacy/src/tools/search-and-crawl.ts
@@ -3,7 +3,7 @@ import { z } from 'zod'
 
 import * as types from '@/types'
 import { BaseTask } from '@/task'
-import { normalizeUrl } from '@/url-utils'
+import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
 import { omit } from '@/utils'
 
 import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
@@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
     ): Promise<Array<DiffbotOutput>> {
       try {
         if (!url) return []
+        if (!isValidCrawlableUrl(url)) return []
         if (crawledUrls.has(url)) return []
 
         const normalizedUrl = normalizeUrl(url)
@@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
           )
         ).flat()
 
-        return [scrapeResult, ...innerScrapeResults]
+        return innerScrapeResults
       } catch (err) {
         console.warn('crawlAndScrape error', url, err)
         return []
       }
     }
 
-    const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
+    const search = await this._serpapiTool.callWithMetadata(
+      { query, numResults: 3 },
+      ctx
+    )
 
     const scrapeResults = (
       await pMap(
-        search.result.organic_results || [],
+        (search.result.organic_results || []).slice(0, 3),
         async (searchResult) => {
           return crawlAndScrape(searchResult.link, {
             diffbotTool: this._diffbotTool,
diff --git a/legacy/src/tools/serpapi.ts b/legacy/src/tools/serpapi.ts
index d7db55df..9f8e6551 100644
--- a/legacy/src/tools/serpapi.ts
+++ b/legacy/src/tools/serpapi.ts
@@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask {
       // results manually
     })
 
-    this._logger.debug(
+    this._logger.info(
       res,
       `SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
     )
diff --git a/legacy/src/url-utils.ts b/legacy/src/url-utils.ts
index 91b6816a..9ed89d84 100644
--- a/legacy/src/url-utils.ts
+++ b/legacy/src/url-utils.ts
@@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
 import normalizeUrlImpl, { type Options } from 'normalize-url'
 import QuickLRU from 'quick-lru'
 
-// const protocolAllowList = new Set(['https:', 'http:'])
+const protocolAllowList = new Set(['https:', 'http:'])
 
 const normalizedUrlCache = new QuickLRU<string, string>({ maxSize: 4000 })
 
+export function isValidCrawlableUrl(url: string): boolean {
+  try {
+    if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
+      return false
+    }
+
+    const parsedUrl = new URL(url)
+    if (!protocolAllowList.has(parsedUrl.protocol)) {
+      return false
+    }
+
+    const normalizedUrl = normalizeUrl(url)
+    if (!normalizedUrl) {
+      return false
+    }
+
+    return true
+  } catch (err) {
+    return false
+  }
+}
+
 /**
  * Generates a hash string from normalization options.
  *
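
Note (not part of the patch): a quick sketch of how the new isValidCrawlableUrl
helper in legacy/src/url-utils.ts is expected to behave, given the code above.
The example URLs are hypothetical; results assume the https/http protocol
allowlist and WHATWG URL parsing, where relative inputs and unparseable strings
are rejected:

    import { isValidCrawlableUrl } from '@/url-utils'

    isValidCrawlableUrl('https://example.com/news') // true: absolute https URL
    isValidCrawlableUrl('http://example.com')       // true: http is allowlisted
    isValidCrawlableUrl('ftp://example.com/file')   // false: protocol not allowlisted
    isValidCrawlableUrl('mailto:foo@example.com')   // false: protocol not allowlisted
    isValidCrawlableUrl('/news/latest')             // false: relative URL
    isValidCrawlableUrl('not a url')                // false: new URL() throws; caught

This is what the new `if (!isValidCrawlableUrl(url)) return []` guard in
crawlAndScrape relies on to skip mailto:, javascript:, and relative links
before spending a Diffbot call on them.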