From a4cb592a2deff0217bd9f312bd25d4f63a0c055c Mon Sep 17 00:00:00 2001
From: Travis Fischer
Date: Fri, 16 Jun 2023 16:09:33 -0700
Subject: [PATCH] fix: misc fixes for SearchAndCrawlTool

---
 legacy/examples/search-and-crawl.ts  |  3 +++
 legacy/src/tools/diffbot.ts          | 22 +++++++++++-----------
 legacy/src/tools/search-and-crawl.ts | 12 ++++++++----
 legacy/src/tools/serpapi.ts          |  2 +-
 legacy/src/url-utils.ts              | 24 +++++++++++++++++++++++-
 5 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/legacy/examples/search-and-crawl.ts b/legacy/examples/search-and-crawl.ts
index 2107a3eb..f89ae79c 100644
--- a/legacy/examples/search-and-crawl.ts
+++ b/legacy/examples/search-and-crawl.ts
@@ -10,6 +10,9 @@ async function main() {
 
   const res = await agentic
     .gpt4(`Summarize the latest news on {{topic}} using markdown.`)
+    .modelParams({
+      model: 'gpt-4-32k'
+    })
     .tools([new SearchAndCrawlTool()])
     .input(
       z.object({
diff --git a/legacy/src/tools/diffbot.ts b/legacy/src/tools/diffbot.ts
index 61553887..adfe80ce 100644
--- a/legacy/src/tools/diffbot.ts
+++ b/legacy/src/tools/diffbot.ts
@@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
   title: z.string(),
   siteName: z.string(),
   author: z.string(),
-  authorUrl: z.string(),
+  // authorUrl: z.string(),
   pageUrl: z.string(),
   date: z.string(),
-  estimatedDate: z.string(),
-  humanLanguage: z.string(),
+  // estimatedDate: z.string(),
+  // humanLanguage: z.string(),
   text: z.string().describe('main text content of the page'),
-  tags: z.array(z.string()),
-  images: z.array(DiffbotImageSchema),
+  // tags: z.array(z.string()),
+  // images: z.array(DiffbotImageSchema),
   items: z.array(DiffbotListItemSchema)
 })
 
@@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask {
         'type',
         'siteName',
         'author',
-        'authorUrl',
+        // 'authorUrl',
         'pageUrl',
         'date',
-        'estimatedDate',
-        'humanLanguage',
+        // 'estimatedDate',
+        // 'humanLanguage',
         'items',
         'text'
-      ),
-      tags: obj.tags?.map((tag) => tag.label),
-      images: obj.images?.map((image) => omit(image, 'diffbotUri'))
+      )
+      // tags: obj.tags?.map((tag) => tag.label)
+      // images: obj.images?.map((image) => omit(image, 'diffbotUri'))
     }))
   }
 
diff --git a/legacy/src/tools/search-and-crawl.ts b/legacy/src/tools/search-and-crawl.ts
index 97533f91..065ca224 100644
--- a/legacy/src/tools/search-and-crawl.ts
+++ b/legacy/src/tools/search-and-crawl.ts
@@ -3,7 +3,7 @@ import { z } from 'zod'
 
 import * as types from '@/types'
 import { BaseTask } from '@/task'
-import { normalizeUrl } from '@/url-utils'
+import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
 import { omit } from '@/utils'
 
 import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
@@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
     ): Promise<Array<DiffbotOutput>> {
       try {
         if (!url) return []
+        if (!isValidCrawlableUrl(url)) return []
         if (crawledUrls.has(url)) return []
 
         const normalizedUrl = normalizeUrl(url)
@@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
           )
         ).flat()
 
-        return [scrapeResult, ...innerScrapeResults]
+        return innerScrapeResults
       } catch (err) {
         console.warn('crawlAndScrape error', url, err)
         return []
       }
     }
 
-    const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
+    const search = await this._serpapiTool.callWithMetadata(
+      { query, numResults: 3 },
+      ctx
+    )
 
     const scrapeResults = (
       await pMap(
-        search.result.organic_results || [],
+        (search.result.organic_results || []).slice(0, 3),
         async (searchResult) => {
           return crawlAndScrape(searchResult.link, {
             diffbotTool: this._diffbotTool,
diff --git a/legacy/src/tools/serpapi.ts b/legacy/src/tools/serpapi.ts
index d7db55df..9f8e6551 100644
--- a/legacy/src/tools/serpapi.ts
+++ b/legacy/src/tools/serpapi.ts
@@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask {
       // results manually
     })
 
-    this._logger.debug(
+    this._logger.info(
       res,
       `SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
     )
diff --git a/legacy/src/url-utils.ts b/legacy/src/url-utils.ts
index 91b6816a..9ed89d84 100644
--- a/legacy/src/url-utils.ts
+++ b/legacy/src/url-utils.ts
@@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
 import normalizeUrlImpl, { type Options } from 'normalize-url'
 import QuickLRU from 'quick-lru'
 
-// const protocolAllowList = new Set(['https:', 'http:'])
+const protocolAllowList = new Set(['https:', 'http:'])
 
 const normalizedUrlCache = new QuickLRU<string, string>({ maxSize: 4000 })
 
+export function isValidCrawlableUrl(url: string): boolean {
+  try {
+    if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
+      return false
+    }
+
+    const parsedUrl = new URL(url)
+    if (!protocolAllowList.has(parsedUrl.protocol)) {
+      return false
+    }
+
+    const normalizedUrl = normalizeUrl(url)
+    if (!normalizedUrl) {
+      return false
+    }
+
+    return true
+  } catch (err) {
+    return false
+  }
+}
+
 /**
  * Generates a hash string from normalization options.
  *
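
Note (not part of the patch): a quick sketch of how the new isValidCrawlableUrl
helper in legacy/src/url-utils.ts is expected to behave, given the code above.
The example URLs are hypothetical; results assume the https/http protocol
allowlist and WHATWG URL parsing, where relative inputs and unparseable strings
are rejected:

    import { isValidCrawlableUrl } from '@/url-utils'

    isValidCrawlableUrl('https://example.com/news') // true: absolute https URL
    isValidCrawlableUrl('http://example.com')       // true: http is allowlisted
    isValidCrawlableUrl('ftp://example.com/file')   // false: protocol not allowlisted
    isValidCrawlableUrl('mailto:foo@example.com')   // false: protocol not allowlisted
    isValidCrawlableUrl('/news/latest')             // false: relative URL
    isValidCrawlableUrl('not a url')                // false: new URL() throws; caught

This is what the new `if (!isValidCrawlableUrl(url)) return []` guard in
crawlAndScrape relies on to skip mailto:, javascript:, and relative links
before spending a Diffbot call on them.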