fix: misc fixes for SearchAndCrawlTool

Travis Fischer 2023-06-16 16:09:33 -07:00
rodzic 730ed2329d
commit c60f1a108f
5 zmienionych plików z 46 dodań i 17 usunięć

Wyświetl plik

@ -10,6 +10,9 @@ async function main() {
const res = await agentic
.gpt4(`Summarize the latest news on {{topic}} using markdown.`)
.modelParams({
model: 'gpt-4-32k'
})
.tools([new SearchAndCrawlTool()])
.input(
z.object({

Wyświetl plik

@ -43,14 +43,14 @@ export const DiffbotObjectSchema = z.object({
title: z.string(),
siteName: z.string(),
author: z.string(),
authorUrl: z.string(),
// authorUrl: z.string(),
pageUrl: z.string(),
date: z.string(),
estimatedDate: z.string(),
humanLanguage: z.string(),
// estimatedDate: z.string(),
// humanLanguage: z.string(),
text: z.string().describe('main text content of the page'),
tags: z.array(z.string()),
images: z.array(DiffbotImageSchema),
// tags: z.array(z.string()),
// images: z.array(DiffbotImageSchema),
items: z.array(DiffbotListItemSchema)
})
@ -115,16 +115,16 @@ export class DiffbotTool extends BaseTask<DiffbotInput, DiffbotOutput> {
'type',
'siteName',
'author',
'authorUrl',
// 'authorUrl',
'pageUrl',
'date',
'estimatedDate',
'humanLanguage',
// 'estimatedDate',
// 'humanLanguage',
'items',
'text'
),
tags: obj.tags?.map((tag) => tag.label),
images: obj.images?.map((image) => omit(image, 'diffbotUri'))
)
// tags: obj.tags?.map((tag) => tag.label)
// images: obj.images?.map((image) => omit(image, 'diffbotUri'))
}))
}

Wyświetl plik

@ -3,7 +3,7 @@ import { z } from 'zod'
import * as types from '@/types'
import { BaseTask } from '@/task'
import { normalizeUrl } from '@/url-utils'
import { isValidCrawlableUrl, normalizeUrl } from '@/url-utils'
import { omit } from '@/utils'
import { DiffbotOutput, DiffbotOutputSchema, DiffbotTool } from './diffbot'
@ -88,6 +88,7 @@ export class SearchAndCrawlTool extends BaseTask<
): Promise<Array<DiffbotOutput>> {
try {
if (!url) return []
if (!isValidCrawlableUrl(url)) return []
if (crawledUrls.has(url)) return []
const normalizedUrl = normalizeUrl(url)
@ -136,18 +137,21 @@ export class SearchAndCrawlTool extends BaseTask<
)
).flat()
return [scrapeResult, ...innerScrapeResults]
return innerScrapeResults
} catch (err) {
console.warn('crawlAndScrape error', url, err)
return []
}
}
const search = await this._serpapiTool.callWithMetadata({ query }, ctx)
const search = await this._serpapiTool.callWithMetadata(
{ query, numResults: 3 },
ctx
)
const scrapeResults = (
await pMap(
search.result.organic_results || [],
(search.result.organic_results || []).slice(0, 3),
async (searchResult) => {
return crawlAndScrape(searchResult.link, {
diffbotTool: this._diffbotTool,

Wyświetl plik

@ -103,7 +103,7 @@ export class SerpAPITool extends BaseTask<SerpAPIInput, SerpAPIOutput> {
// results manually
})
this._logger.debug(
this._logger.info(
res,
`SerpAPI response for query ${JSON.stringify(ctx.input, null, 2)}"`
)

Wyświetl plik

@ -2,11 +2,33 @@ import isRelativeUrl from 'is-relative-url'
import normalizeUrlImpl, { type Options } from 'normalize-url'
import QuickLRU from 'quick-lru'
// const protocolAllowList = new Set(['https:', 'http:'])
const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string | null>({
maxSize: 4000
})
export function isValidCrawlableUrl(url: string): boolean {
try {
if (!url || (isRelativeUrl(url) && !url.startsWith('//'))) {
return false
}
const parsedUrl = new URL(url)
if (!protocolAllowList.has(parsedUrl.protocol)) {
return false
}
const normalizedUrl = normalizeUrl(url)
if (!normalizedUrl) {
return false
}
return true
} catch (err) {
return false
}
}
/**
* Generates a hash string from normalization options.
*