feat: add SearchAndCrawl

pull/643/head^2
Travis Fischer 2024-06-05 01:46:52 -05:00
rodzic 1c10060651
commit 71fa4b2e7c
4 zmienionych plików z 205 dodań i 48 usunięć

Wyświetl plik

@ -0,0 +1,30 @@
#!/usr/bin/env node
import 'dotenv/config'
import { ChatModel, createAIRunner } from '@dexaai/dexter'
import { DiffbotClient, SerpAPIClient } from '../../src/index.js'
import { createDexterFunctions } from '../../src/sdks/dexter.js'
import { SearchAndCrawl } from '../../src/tools/search-and-crawl.js'
/**
 * Example entry point: wires a `SearchAndCrawl` tool into a Dexter AI runner
 * and prints the model's Markdown summary of recent news about a fixed topic.
 */
async function main() {
  // Build the two service clients explicitly so the tool shares them.
  const searchClient = new SerpAPIClient()
  const scrapeClient = new DiffbotClient()
  const tool = new SearchAndCrawl({
    serpapi: searchClient,
    diffbot: scrapeClient
  })

  const chatModel = new ChatModel({
    params: { model: 'gpt-4o', temperature: 0 }
    // debug: true
  })

  const runner = createAIRunner({
    chatModel,
    functions: createDexterFunctions(tool),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const prompt = `Summarize the latest news on ${topic}`
  const summary = await runner(prompt)

  console.log(summary)
}

await main()

Wyświetl plik

@ -0,0 +1,135 @@
import pMap from 'p-map'
import { z } from 'zod'
import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { type diffbot, DiffbotClient } from '../services/diffbot-client.js'
import { SerpAPIClient } from '../services/serpapi-client.js'
import { isValidCrawlableUrl, normalizeUrl } from '../url-utils.js'
import { omit, pick } from '../utils.js'
/**
 * AI function provider that runs a web search through the SerpAPI client and
 * then scrapes each organic result with the Diffbot client, following links
 * found on pages that Diffbot classifies as `list`.
 */
export class SearchAndCrawl extends AIFunctionsProvider {
  readonly serpapi: SerpAPIClient
  readonly diffbot: DiffbotClient

  /**
   * @param opts - optional pre-built clients; any client that is missing is
   * created with its no-argument constructor.
   */
  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
    super()

    const { serpapi, diffbot } = opts
    this.serpapi = serpapi ?? new SerpAPIClient()
    this.diffbot = diffbot ?? new DiffbotClient()
  }

  /**
   * Searches for `query`, then scrapes the first `numSearchResults` organic
   * results.
   *
   * A page whose Diffbot `type` is `list` is expanded while `depth` is below
   * `maxCrawlDepth`: up to `maxListItems` linked items of its first object are
   * scraped instead, and their results take the place of the list page itself.
   * Every URL is scraped at most once per call (both its raw and normalized
   * forms are tracked). A failure while scraping a URL is logged with
   * `console.warn` and contributes no results.
   *
   * @returns the search response without `organic_results`, plus a
   * `scrape_results` array holding the flattened Diffbot responses.
   */
  @aiFunction({
    name: 'search_and_crawl',
    description:
      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results.',
    inputSchema: z.object({
      query: z.string().describe('search query')
    })
  })
  async searchAndCrawl({
    query,
    numSearchResults = 3,
    maxCrawlDepth = 1,
    maxListItems = 3
  }: {
    query: string
    numSearchResults?: number
    maxCrawlDepth?: number
    maxListItems?: number
  }) {
    // Raw and normalized forms of every URL already handed to the scraper.
    const seen = new Set<string>()

    // Scrapes one URL and, for list pages, recurses into its linked items.
    const visit = async (
      url: string | undefined,
      { depth = 0 }: { depth?: number }
    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
      try {
        if (!url || !isValidCrawlableUrl(url) || seen.has(url)) {
          return []
        }

        const canonical = normalizeUrl(url)
        if (!canonical || seen.has(canonical)) {
          return []
        }

        // Mark before awaiting so concurrent visits cannot pick the URL twice.
        seen.add(url)
        seen.add(canonical)

        console.log('\n\n')
        const page = await this.diffbot.analyzeUrl({ url })
        console.log(
          `SearchAndCrawl depth ${depth} - "${url}"`,
          pick(page, 'type', 'title')
        )

        // Only list pages are expanded, and only until the depth limit.
        if (page.type !== 'list' || depth >= maxCrawlDepth) {
          return [page]
        }

        const linkedItems = page.objects?.[0]?.items
          ?.filter((item) => item.link)
          .slice(0, maxListItems)
        if (!linkedItems?.length) {
          return [page]
        }

        const nested = await pMap(
          linkedItems,
          async (item) => visit(item.link, { depth: depth + 1 }),
          { concurrency: 4 }
        )

        return nested.flat()
      } catch (err) {
        console.warn('crawlAndScrape error', url, err)
        return []
      }
    }

    const searchResponse = await this.serpapi.search({
      q: query,
      num: numSearchResults
    })
    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)

    const topHits = (searchResponse.organic_results || []).slice(
      0,
      numSearchResults
    )
    const perHit = await pMap(
      topHits,
      async (hit) => visit(hit.link, { depth: 0 }),
      { concurrency: 5 }
    )
    const scrapeResults = perHit.flat()

    const output = {
      ...omit(searchResponse, 'organic_results'),
      scrape_results: scrapeResults
    }
    console.log(`SearchAndCrawl response for query "${query}"`, output)

    return output
  }
}

Wyświetl plik

@ -1,20 +1,16 @@
import isRelativeUrlImpl from 'is-relative-url' import isRelativeUrlImpl from 'is-relative-url'
import normalizeUrlImpl, { type Options } from 'normalize-url' import normalizeUrlImpl, {
type Options as NormalizeUrlOptions
} from 'normalize-url'
import QuickLRU from 'quick-lru' import QuickLRU from 'quick-lru'
import { hashObject } from './utils.js' import { hashObject } from './utils.js'
const protocolAllowList = new Set(['https:', 'http:']) const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string | undefined>({ const normalizedUrlCache = new QuickLRU<string, string | null>({
maxSize: 4000 maxSize: 4000
}) })
/**
* Checks if a URL is crawlable.
*
* @param url - URL string to check
* @returns whether the URL is crawlable
*/
export function isValidCrawlableUrl(url: string): boolean { export function isValidCrawlableUrl(url: string): boolean {
try { try {
if (!url || isRelativeUrl(url)) { if (!url || isRelativeUrl(url)) {
@ -43,42 +39,35 @@ export function isRelativeUrl(url: string): boolean {
return isRelativeUrlImpl(url) && !url.startsWith('//') return isRelativeUrlImpl(url) && !url.startsWith('//')
} }
/**
* Normalizes a URL string.
*
* @param url - URL string to normalize
* @param options - options for normalization.
* @returns normalized URL string or null if an invalid URL was passed
*/
export function normalizeUrl( export function normalizeUrl(
url: string, url: string,
options?: Options options?: NormalizeUrlOptions
): string | undefined { ): string | null {
let normalizedUrl: string | undefined let normalizedUrl: string | null | undefined
let cacheKey: string | undefined
if (!url || isRelativeUrl(url)) {
return null
}
const opts = {
stripWWW: false,
defaultProtocol: 'https',
normalizeProtocol: true,
forceHttps: false,
stripHash: false,
stripTextFragment: true,
removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
removeTrailingSlash: true,
removeSingleSlash: true,
removeExplicitPort: true,
sortQueryParameters: true,
...options
} as Required<NormalizeUrlOptions>
const optionsHash = hashObject(opts)
const cacheKey = `${url}-${optionsHash}`
try { try {
if (!url || isRelativeUrl(url)) {
return
}
const opts = {
stripWWW: false,
defaultProtocol: 'https',
normalizeProtocol: true,
forceHttps: false,
stripHash: false,
stripTextFragment: true,
removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
removeTrailingSlash: true,
removeSingleSlash: true,
removeExplicitPort: true,
sortQueryParameters: true,
...options
} as Required<Options>
const optionsHash = hashObject(opts)
cacheKey = `${url}-${optionsHash}`
normalizedUrl = normalizedUrlCache.get(cacheKey) normalizedUrl = normalizedUrlCache.get(cacheKey)
if (normalizedUrl !== undefined) { if (normalizedUrl !== undefined) {
@ -86,14 +75,14 @@ export function normalizeUrl(
} }
normalizedUrl = normalizeUrlImpl(url, opts) normalizedUrl = normalizeUrlImpl(url, opts)
if (!normalizeUrl) {
normalizedUrl = null
}
} catch { } catch {
// ignore invalid urls // ignore invalid urls
normalizedUrl = undefined normalizedUrl = null
}
if (cacheKey) {
normalizedUrlCache.set(cacheKey, normalizedUrl!)
} }
normalizedUrlCache.set(cacheKey, normalizedUrl!)
return normalizedUrl return normalizedUrl
} }

Wyświetl plik

@ -1,6 +1,6 @@
import type { Jsonifiable } from 'type-fest' import type { Jsonifiable } from 'type-fest'
import dedent from 'dedent' import dedent from 'dedent'
import hashObjectImpl from 'hash-object' import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'
import type * as types from './types.js' import type * as types from './types.js'
@ -142,6 +142,9 @@ export function cleanStringForModel(text: string): string {
return dedenter(text).trim() return dedenter(text).trim()
} }
export function hashObject(object: Record<string, any>): string { export function hashObject(
return hashObjectImpl(object, { algorithm: 'sha256' }) object: Record<string, any>,
options?: HashObjectOptions
): string {
return hashObjectImpl(object, { algorithm: 'sha256', ...options })
} }