Mirror of https://github.com/transitive-bullshit/chatgpt-api
fix: remove hash-object and search-and-crawl package
parent e0e4bbbfe7
commit 7ce3acca89
@@ -1,29 +0,0 @@
-#!/usr/bin/env node
-import 'dotenv/config'
-
-import { createDexterFunctions } from '@agentic/dexter'
-import { DiffbotClient, SearchAndCrawl, SerpAPIClient } from '@agentic/stdlib'
-import { ChatModel, createAIRunner } from '@dexaai/dexter'
-
-async function main() {
-  const serpapi = new SerpAPIClient()
-  const diffbot = new DiffbotClient()
-
-  const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })
-
-  const runner = createAIRunner({
-    chatModel: new ChatModel({
-      params: { model: 'gpt-4o-mini', temperature: 0 }
-      // debug: true
-    }),
-    functions: createDexterFunctions(searchAndCrawl),
-    systemMessage:
-      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
-  })
-
-  const topic = 'the 2024 olympics'
-  const result = await runner(`Summarize the latest news on ${topic}`)
-  console.log(result)
-}
-
-await main()
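Since the commit deletes this example outright, anyone still wanting a runnable variant has to drop the SearchAndCrawl tool. A minimal search-only sketch under that assumption, keeping the rest of the wiring intact (it assumes SerpAPIClient remains exported from @agentic/stdlib and that createDexterFunctions accepts any AIFunctionsProvider, as it does elsewhere in this repo):

#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  // Search-only variant: the crawling half of the deleted tool has no
  // drop-in replacement in this commit.
  const serpapi = new SerpAPIClient()

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
    }),
    functions: createDexterFunctions(serpapi),
    systemMessage:
      'You are a research assistant. Always cite your sources and respond using Markdown.'
  })

  const result = await runner('Summarize the latest news on the 2024 olympics')
  console.log(result)
}

await main()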
@@ -34,15 +34,11 @@
   "dependencies": {
     "dedent": "^1.5.3",
     "delay": "^6.0.0",
-    "hash-object": "^5.0.1",
-    "is-relative-url": "^4.0.0",
     "jsonrepair": "^3.9.0",
     "ky": "^1.7.2",
-    "normalize-url": "^8.0.1",
     "openai-zod-to-json-schema": "^1.0.3",
     "p-map": "^7.0.2",
     "p-throttle": "^6.2.0",
-    "quick-lru": "^7.0.0",
     "type-fest": "^4.26.1",
     "zod-validation-error": "^3.4.0"
   },
@@ -9,6 +9,5 @@ export * from './message'
 export * from './parse-structured-output'
 export * from './schema'
 export type * from './types'
-export * from './url-utils'
 export * from './utils'
 export * from './zod-to-json-schema'
@@ -1,34 +0,0 @@
-import { describe, expect, test } from 'vitest'
-
-import { normalizeUrl } from './url-utils'
-
-describe('normalizeUrl', () => {
-  test('valid urls', async () => {
-    expect(normalizeUrl('https://www.google.com')).toBe(
-      'https://www.google.com'
-    )
-    expect(normalizeUrl('//www.google.com')).toBe('https://www.google.com')
-    expect(normalizeUrl('https://www.google.com/foo?')).toBe(
-      'https://www.google.com/foo'
-    )
-    expect(normalizeUrl('https://www.google.com/?foo=bar&dog=cat')).toBe(
-      'https://www.google.com/?dog=cat&foo=bar'
-    )
-    expect(normalizeUrl('https://google.com/abc/123//')).toBe(
-      'https://google.com/abc/123'
-    )
-    expect(normalizeUrl('//google.com')).toBe('https://google.com')
-    expect(normalizeUrl('google.com')).toBe('https://google.com')
-    expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
-  })
-
-  test('invalid urls', async () => {
-    expect(normalizeUrl('/foo')).toBe(undefined)
-    expect(normalizeUrl('/foo/bar/baz')).toBe(undefined)
-    expect(normalizeUrl('://foo.com')).toBe(undefined)
-    expect(normalizeUrl('foo')).toBe(undefined)
-    expect(normalizeUrl('')).toBe(undefined)
-    expect(normalizeUrl(undefined as unknown as string)).toBe(undefined)
-    expect(normalizeUrl(null as unknown as string)).toBe(undefined)
-  })
-})
@@ -1,108 +0,0 @@
-import isRelativeUrlImpl from 'is-relative-url'
-import normalizeUrlImpl, {
-  type Options as NormalizeUrlImplOptions
-} from 'normalize-url'
-import QuickLRU from 'quick-lru'
-
-import { hashObject } from './utils'
-
-const protocolAllowList = new Set(['https:', 'http:'])
-const normalizedUrlCache = new QuickLRU<string, string>({
-  maxSize: 4000
-})
-
-export function isValidCrawlableUrl(url: string): boolean {
-  try {
-    if (!url || isRelativeUrl(url)) {
-      return false
-    }
-
-    const parsedUrl = new URL(url)
-    if (!protocolAllowList.has(parsedUrl.protocol)) {
-      return false
-    }
-
-    const normalizedUrl = normalizeUrl(url)
-    if (!normalizedUrl) {
-      return false
-    }
-
-    return true
-  } catch {
-    return false
-  }
-}
-
-export function isRelativeUrl(url: string): boolean {
-  if (!url || typeof url !== 'string') return false
-
-  return isRelativeUrlImpl(url) && !url.startsWith('//')
-}
-
-export type NormalizeUrlOptions = NormalizeUrlImplOptions & {
-  allowSloppyUris?: boolean
-}
-
-export function normalizeUrl(
-  url?: string,
-  { allowSloppyUris = true, ...options }: NormalizeUrlOptions = {}
-): string | undefined {
-  let normalizedUrl: string | undefined
-
-  if (!url || typeof url !== 'string') {
-    return undefined
-  }
-
-  if (isRelativeUrl(url)) {
-    if (allowSloppyUris && !/^[#./]/.test(url) && url.indexOf('.') > 0) {
-      url = `https://${url}`
-    } else {
-      return undefined
-    }
-  }
-
-  const opts = {
-    stripWWW: false,
-    defaultProtocol: 'https',
-    normalizeProtocol: true,
-    forceHttps: false,
-    stripHash: false,
-    stripTextFragment: true,
-    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
-    removeTrailingSlash: true,
-    removeSingleSlash: true,
-    removeExplicitPort: true,
-    sortQueryParameters: true,
-    ...options
-  } as Required<NormalizeUrlOptions>
-
-  const optionsHash = hashObject(opts)
-  const cacheKey = `${url}-${optionsHash}`
-
-  try {
-    normalizedUrl = normalizedUrlCache.get(cacheKey)
-
-    if (normalizedUrl !== undefined) {
-      if (normalizedUrl) {
-        return normalizedUrl
-      } else {
-        return undefined
-      }
-    }
-
-    normalizedUrl = normalizeUrlImpl(url, opts)
-    if (!normalizeUrl) {
-      normalizedUrl = ''
-    }
-  } catch {
-    // ignore invalid urls
-    normalizedUrl = ''
-  }
-
-  normalizedUrlCache.set(cacheKey, normalizedUrl!)
-  if (normalizedUrl) {
-    return normalizedUrl
-  } else {
-    return undefined
-  }
-}
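Consumers that imported normalizeUrl from @agentic/core lose it with this deletion. A minimal uncached stand-in, assuming a direct dependency on the normalize-url package and dropping the QuickLRU cache and allowSloppyUris fixup of the deleted helper:

import normalizeUrlImpl from 'normalize-url'

export function normalizeUrl(url?: string): string | undefined {
  if (!url || typeof url !== 'string') return undefined

  try {
    // Same defaults as the deleted helper; no LRU cache or sloppy-URI fixup.
    return normalizeUrlImpl(url, {
      stripWWW: false,
      defaultProtocol: 'https',
      normalizeProtocol: true,
      forceHttps: false,
      stripHash: false,
      stripTextFragment: true,
      removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
      removeTrailingSlash: true,
      removeSingleSlash: true,
      removeExplicitPort: true,
      sortQueryParameters: true
    })
  } catch {
    // Invalid URLs normalize to undefined, matching the old behavior.
    return undefined
  }
}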
@@ -1,5 +1,4 @@
 import dedent from 'dedent'
-import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'
 
 import type * as types from './types'
 
@@ -253,13 +252,6 @@ export function cleanStringForModel(text: string): string {
   return dedenter(text).trim()
 }
 
-export function hashObject(
-  object: Record<string, any>,
-  options?: HashObjectOptions
-): string {
-  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
-}
-
 export function isAIFunction(obj: any): obj is types.AIFunction {
   if (!obj) return false
   if (typeof obj !== 'function') return false
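The hashObject helper is removed from @agentic/core here without a replacement. Downstream code can inline an equivalent by depending on hash-object directly; this sketch mirrors the deleted implementation:

import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'

// Mirrors the helper removed from @agentic/core in this commit:
// sha256 by default, with caller-supplied options taking precedence.
export function hashObject(
  object: Record<string, any>,
  options?: HashObjectOptions
): string {
  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
}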
@@ -1,29 +0,0 @@
-# @agentic/search-and-crawl
-
-## 7.1.0
-
-### Minor Changes
-
-- 33bcbe0: Update deps
-
-### Patch Changes
-
-- Updated dependencies [33bcbe0]
-  - @agentic/core@7.1.0
-  - @agentic/diffbot@7.1.0
-  - @agentic/serpapi@7.1.0
-
-## 7.0.0
-
-### Major Changes
-
-- cba1cc7: Move to monorepo and multiple packages
-
-  See https://github.com/transitive-bullshit/agentic/issues/654 and https://github.com/transitive-bullshit/agentic/pull/657 for more info.
-
-### Patch Changes
-
-- Updated dependencies [cba1cc7]
-  - @agentic/diffbot@7.0.0
-  - @agentic/serpapi@7.0.0
-  - @agentic/core@7.0.0
@@ -1,48 +0,0 @@
-{
-  "name": "@agentic/search-and-crawl",
-  "version": "7.1.0",
-  "description": "Agentic SDK for Google search and crawling the top results.",
-  "author": "Travis Fischer <travis@transitivebullsh.it>",
-  "license": "MIT",
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/transitive-bullshit/agentic.git"
-  },
-  "type": "module",
-  "source": "./src/index.ts",
-  "types": "./dist/index.d.ts",
-  "sideEffects": false,
-  "exports": {
-    ".": {
-      "types": "./dist/index.d.ts",
-      "import": "./dist/index.js",
-      "default": "./dist/index.js"
-    }
-  },
-  "files": [
-    "dist"
-  ],
-  "scripts": {
-    "build": "tsup --config ../../tsup.config.ts",
-    "dev": "tsup --config ../../tsup.config.ts --watch",
-    "clean": "del dist",
-    "test": "run-s test:*",
-    "test:lint": "eslint .",
-    "test:typecheck": "tsc --noEmit"
-  },
-  "dependencies": {
-    "@agentic/core": "workspace:*",
-    "@agentic/diffbot": "workspace:*",
-    "@agentic/serpapi": "workspace:*",
-    "p-map": "^7.0.2"
-  },
-  "peerDependencies": {
-    "zod": "^3.23.8"
-  },
-  "devDependencies": {
-    "@agentic/tsconfig": "workspace:*"
-  },
-  "publishConfig": {
-    "access": "public"
-  }
-}
@@ -1 +0,0 @@
-export * from './search-and-crawl'
@@ -1,142 +0,0 @@
-import {
-  aiFunction,
-  AIFunctionsProvider,
-  isValidCrawlableUrl,
-  normalizeUrl,
-  omit,
-  pick
-} from '@agentic/core'
-import { type diffbot, DiffbotClient } from '@agentic/diffbot'
-import { SerpAPIClient } from '@agentic/serpapi'
-import pMap from 'p-map'
-import { z } from 'zod'
-
-// TODO: allow `search` tool to support other search clients
-// (e.g. Bing, Exa, Searxng, Serper, Tavily)
-
-export class SearchAndCrawl extends AIFunctionsProvider {
-  readonly serpapi: SerpAPIClient
-  readonly diffbot: DiffbotClient
-
-  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
-    super()
-
-    this.serpapi = opts.serpapi ?? new SerpAPIClient()
-    this.diffbot = opts.diffbot ?? new DiffbotClient()
-  }
-
-  @aiFunction({
-    name: 'search_and_crawl',
-    description:
-      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results. Useful for creating in-depth summaries of topics along with sources.',
-    inputSchema: z.object({
-      query: z.string().describe('search query')
-    })
-  })
-  async searchAndCrawl({
-    query,
-    numSearchResults = 3,
-    maxCrawlDepth = 1,
-    maxListItems = 3
-  }: {
-    query: string
-    numSearchResults?: number
-    maxCrawlDepth?: number
-    maxListItems?: number
-  }) {
-    const crawledUrls = new Set<string>()
-
-    const crawlAndScrape = async (
-      url: string | undefined,
-      {
-        depth = 0
-      }: {
-        depth?: number
-      }
-    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
-      try {
-        if (!url) return []
-        if (!isValidCrawlableUrl(url)) return []
-        if (crawledUrls.has(url)) return []
-
-        const normalizedUrl = normalizeUrl(url)
-        if (!normalizedUrl) return []
-        if (crawledUrls.has(normalizedUrl)) return []
-
-        crawledUrls.add(url)
-        crawledUrls.add(normalizedUrl)
-
-        console.log('\n\n')
-        const scrapeResult = await this.diffbot.analyzeUrl({ url })
-        console.log(
-          `SearchAndCrawl depth ${depth} - "${url}"`,
-          pick(scrapeResult, 'type', 'title')
-        )
-
-        if (scrapeResult.type !== 'list') {
-          return [scrapeResult]
-        }
-
-        if (depth >= maxCrawlDepth) {
-          return [scrapeResult]
-        }
-
-        const object = scrapeResult.objects?.[0]
-        if (!object) return [scrapeResult]
-
-        const items = object.items
-          ?.filter((item) => item.link)
-          .slice(0, maxListItems)
-        if (!items?.length) return [scrapeResult]
-
-        const innerScrapeResults = (
-          await pMap(
-            items,
-            async (item) => {
-              const innerScrapeResult = await crawlAndScrape(item.link, {
-                depth: depth + 1
-              })
-              return innerScrapeResult
-            },
-            {
-              concurrency: 4
-            }
-          )
-        ).flat()
-
-        return innerScrapeResults
-      } catch (err) {
-        console.warn('crawlAndScrape error', url, err)
-        return []
-      }
-    }
-
-    const searchResponse = await this.serpapi.search({
-      q: query,
-      num: numSearchResults
-    })
-
-    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
-    const scrapeResults = (
-      await pMap(
-        (searchResponse.organic_results || []).slice(0, numSearchResults),
-        async (searchResult) => {
-          return crawlAndScrape(searchResult.link, {
-            depth: 0
-          })
-        },
-        {
-          concurrency: 5
-        }
-      )
-    ).flat()
-
-    const output = {
-      ...omit(searchResponse, 'organic_results'),
-      scrape_results: scrapeResults
-    }
-
-    console.log(`SearchAndCrawl response for query "${query}"`, output)
-    return output
-  }
-}
@@ -1,5 +0,0 @@
-{
-  "extends": "@agentic/tsconfig/base.json",
-  "include": ["src"],
-  "exclude": ["node_modules", "dist"]
-}
@@ -56,7 +56,6 @@
     "@agentic/polygon": "workspace:*",
    "@agentic/predict-leads": "workspace:*",
     "@agentic/proxycurl": "workspace:*",
-    "@agentic/search-and-crawl": "workspace:*",
     "@agentic/searxng": "workspace:*",
     "@agentic/serpapi": "workspace:*",
     "@agentic/serper": "workspace:*",
@@ -17,7 +17,6 @@ export * from '@agentic/perigon'
 export * from '@agentic/polygon'
 export * from '@agentic/predict-leads'
 export * from '@agentic/proxycurl'
-export * from '@agentic/search-and-crawl'
 export * from '@agentic/searxng'
 export * from '@agentic/serpapi'
 export * from '@agentic/serper'
pnpm-lock.yaml: 3875 changes (diff too large to display)