mirror of https://github.com/transitive-bullshit/chatgpt-api
feat: add SearchAndCrawl
parent 1c10060651
commit 71fa4b2e7c
@@ -0,0 +1,30 @@
#!/usr/bin/env node
import 'dotenv/config'

import { ChatModel, createAIRunner } from '@dexaai/dexter'

import { DiffbotClient, SerpAPIClient } from '../../src/index.js'
import { createDexterFunctions } from '../../src/sdks/dexter.js'
import { SearchAndCrawl } from '../../src/tools/search-and-crawl.js'

async function main() {
  const serpapi = new SerpAPIClient()
  const diffbot = new DiffbotClient()
  const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o', temperature: 0 }
      // debug: true
    }),
    functions: createDexterFunctions(searchAndCrawl),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const result = await runner(`Summarize the latest news on ${topic}`)
  console.log(result)
}

await main()
@@ -0,0 +1,135 @@
import pMap from 'p-map'
import { z } from 'zod'

import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { type diffbot, DiffbotClient } from '../services/diffbot-client.js'
import { SerpAPIClient } from '../services/serpapi-client.js'
import { isValidCrawlableUrl, normalizeUrl } from '../url-utils.js'
import { omit, pick } from '../utils.js'

export class SearchAndCrawl extends AIFunctionsProvider {
  readonly serpapi: SerpAPIClient
  readonly diffbot: DiffbotClient

  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
    super()

    this.serpapi = opts.serpapi ?? new SerpAPIClient()
    this.diffbot = opts.diffbot ?? new DiffbotClient()
  }

  @aiFunction({
    name: 'search_and_crawl',
    description:
      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results.',
    inputSchema: z.object({
      query: z.string().describe('search query')
    })
  })
  async searchAndCrawl({
    query,
    numSearchResults = 3,
    maxCrawlDepth = 1,
    maxListItems = 3
  }: {
    query: string
    numSearchResults?: number
    maxCrawlDepth?: number
    maxListItems?: number
  }) {
    const crawledUrls = new Set<string>()

    const crawlAndScrape = async (
      url: string | undefined,
      {
        depth = 0
      }: {
        depth?: number
      }
    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
      try {
        if (!url) return []
        if (!isValidCrawlableUrl(url)) return []
        if (crawledUrls.has(url)) return []

        const normalizedUrl = normalizeUrl(url)
        if (!normalizedUrl) return []
        if (crawledUrls.has(normalizedUrl)) return []

        crawledUrls.add(url)
        crawledUrls.add(normalizedUrl)

        console.log('\n\n')
        const scrapeResult = await this.diffbot.analyzeUrl({ url })
        console.log(
          `SearchAndCrawl depth ${depth} - "${url}"`,
          pick(scrapeResult, 'type', 'title')
        )

        if (scrapeResult.type !== 'list') {
          return [scrapeResult]
        }

        if (depth >= maxCrawlDepth) {
          return [scrapeResult]
        }

        const object = scrapeResult.objects?.[0]
        if (!object) return [scrapeResult]

        const items = object.items
          ?.filter((item) => item.link)
          .slice(0, maxListItems)
        if (!items?.length) return [scrapeResult]

        const innerScrapeResults = (
          await pMap(
            items,
            async (item) => {
              const innerScrapeResult = await crawlAndScrape(item.link, {
                depth: depth + 1
              })
              return innerScrapeResult
            },
            {
              concurrency: 4
            }
          )
        ).flat()

        return innerScrapeResults
      } catch (err) {
        console.warn('crawlAndScrape error', url, err)
        return []
      }
    }

    const searchResponse = await this.serpapi.search({
      q: query,
      num: numSearchResults
    })

    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
    const scrapeResults = (
      await pMap(
        (searchResponse.organic_results || []).slice(0, numSearchResults),
        async (searchResult) => {
          return crawlAndScrape(searchResult.link, {
            depth: 0
          })
        },
        {
          concurrency: 5
        }
      )
    ).flat()

    const output = {
      ...omit(searchResponse, 'organic_results'),
      scrape_results: scrapeResults
    }

    console.log(`SearchAndCrawl response for query "${query}"`, output)
    return output
  }
}
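For reference, the new tool can also be invoked directly, without wiring it into an LLM runner. A minimal sketch, reusing the classes added above (the query string is arbitrary; only query is required since the other parameters have defaults):

const searchAndCrawl = new SearchAndCrawl({
  serpapi: new SerpAPIClient(),
  diffbot: new DiffbotClient()
})

// Call the @aiFunction method directly instead of exposing it to a model.
const res = await searchAndCrawl.searchAndCrawl({ query: 'the 2024 olympics' })

// The result mirrors the SerpAPI response minus organic_results, plus the
// Diffbot scrape_results gathered while crawling.
console.log(res.scrape_results.length)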
@@ -1,20 +1,16 @@
 import isRelativeUrlImpl from 'is-relative-url'
-import normalizeUrlImpl, { type Options } from 'normalize-url'
+import normalizeUrlImpl, {
+  type Options as NormalizeUrlOptions
+} from 'normalize-url'
 import QuickLRU from 'quick-lru'
 
 import { hashObject } from './utils.js'
 
 const protocolAllowList = new Set(['https:', 'http:'])
-const normalizedUrlCache = new QuickLRU<string, string | undefined>({
+const normalizedUrlCache = new QuickLRU<string, string | null>({
   maxSize: 4000
 })
 
-/**
- * Checks if a URL is crawlable.
- *
- * @param url - URL string to check
- * @returns whether the URL is crawlable
- */
 export function isValidCrawlableUrl(url: string): boolean {
   try {
     if (!url || isRelativeUrl(url)) {
@@ -43,42 +39,35 @@ export function isRelativeUrl(url: string): boolean {
   return isRelativeUrlImpl(url) && !url.startsWith('//')
 }
 
-/**
- * Normalizes a URL string.
- *
- * @param url - URL string to normalize
- * @param options - options for normalization.
- * @returns normalized URL string or null if an invalid URL was passed
- */
 export function normalizeUrl(
   url: string,
-  options?: Options
-): string | undefined {
-  let normalizedUrl: string | undefined
-  let cacheKey: string | undefined
+  options?: NormalizeUrlOptions
+): string | null {
+  let normalizedUrl: string | null | undefined
+
+  if (!url || isRelativeUrl(url)) {
+    return null
+  }
+
+  const opts = {
+    stripWWW: false,
+    defaultProtocol: 'https',
+    normalizeProtocol: true,
+    forceHttps: false,
+    stripHash: false,
+    stripTextFragment: true,
+    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
+    removeTrailingSlash: true,
+    removeSingleSlash: true,
+    removeExplicitPort: true,
+    sortQueryParameters: true,
+    ...options
+  } as Required<NormalizeUrlOptions>
+
+  const optionsHash = hashObject(opts)
+  const cacheKey = `${url}-${optionsHash}`
 
   try {
-    if (!url || isRelativeUrl(url)) {
-      return
-    }
-
-    const opts = {
-      stripWWW: false,
-      defaultProtocol: 'https',
-      normalizeProtocol: true,
-      forceHttps: false,
-      stripHash: false,
-      stripTextFragment: true,
-      removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
-      removeTrailingSlash: true,
-      removeSingleSlash: true,
-      removeExplicitPort: true,
-      sortQueryParameters: true,
-      ...options
-    } as Required<Options>
-
-    const optionsHash = hashObject(opts)
-    cacheKey = `${url}-${optionsHash}`
     normalizedUrl = normalizedUrlCache.get(cacheKey)
 
     if (normalizedUrl !== undefined) {
@@ -86,14 +75,14 @@ export function normalizeUrl(
     }
 
     normalizedUrl = normalizeUrlImpl(url, opts)
+    if (!normalizedUrl) {
+      normalizedUrl = null
+    }
   } catch {
     // ignore invalid urls
-    normalizedUrl = undefined
+    normalizedUrl = null
  }
 
-  if (cacheKey) {
-    normalizedUrlCache.set(cacheKey, normalizedUrl!)
-  }
-
+  normalizedUrlCache.set(cacheKey, normalizedUrl!)
   return normalizedUrl
 }
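For reference, a rough sketch of the normalizeUrl contract implied by the options above (exact output ultimately depends on the normalize-url package):

normalizeUrl('https://www.example.com/foo/?utm_source=news&a=1')
// → 'https://www.example.com/foo?a=1' (utm_* params stripped, trailing slash removed,
//   'www' kept because stripWWW is false)

normalizeUrl('/relative/path')
// → null (relative or otherwise un-normalizable input now yields null instead of undefined)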
@@ -1,6 +1,6 @@
 import type { Jsonifiable } from 'type-fest'
 import dedent from 'dedent'
-import hashObjectImpl from 'hash-object'
+import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'
 
 import type * as types from './types.js'
 
@@ -142,6 +142,9 @@ export function cleanStringForModel(text: string): string {
   return dedenter(text).trim()
 }
 
-export function hashObject(object: Record<string, any>): string {
-  return hashObjectImpl(object, { algorithm: 'sha256' })
+export function hashObject(
+  object: Record<string, any>,
+  options?: HashObjectOptions
+): string {
+  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
 }
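The new optional parameter keeps existing hashObject call sites unchanged while letting callers override the hash-object defaults. A hypothetical example, assuming the underlying hash-object package accepts any Node.js crypto algorithm name:

hashObject({ model: 'gpt-4o', temperature: 0 })
// sha256 hash, same behavior as before

hashObject({ model: 'gpt-4o', temperature: 0 }, { algorithm: 'sha512' })
// caller options are spread after the defaults, so sha512 takes precedence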