Mirror of https://github.com/transitive-bullshit/chatgpt-api
fix: remove hash-object and search-and-crawl package
parent e0e4bbbfe7
commit 7ce3acca89
@@ -1,29 +0,0 @@
#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SearchAndCrawl, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  const serpapi = new SerpAPIClient()
  const diffbot = new DiffbotClient()

  const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
      // debug: true
    }),
    functions: createDexterFunctions(searchAndCrawl),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const result = await runner(`Summarize the latest news on ${topic}`)
  console.log(result)
}

await main()
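The deleted example above wired SearchAndCrawl into a Dexter runner. As an illustration only (not part of this commit), a comparable example could pass the remaining SerpAPIClient and DiffbotClient tools to the runner directly; this sketch assumes createDexterFunctions accepts multiple AIFunctionsProvider instances.

#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  // Without SearchAndCrawl, hand the search and extraction tools to the
  // runner individually (assumes createDexterFunctions is variadic).
  const serpapi = new SerpAPIClient()
  const diffbot = new DiffbotClient()

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
    }),
    functions: createDexterFunctions(serpapi, diffbot),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const result = await runner(`Summarize the latest news on ${topic}`)
  console.log(result)
}

await main()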
@@ -34,15 +34,11 @@
  "dependencies": {
    "dedent": "^1.5.3",
    "delay": "^6.0.0",
    "hash-object": "^5.0.1",
    "is-relative-url": "^4.0.0",
    "jsonrepair": "^3.9.0",
    "ky": "^1.7.2",
    "normalize-url": "^8.0.1",
    "openai-zod-to-json-schema": "^1.0.3",
    "p-map": "^7.0.2",
    "p-throttle": "^6.2.0",
    "quick-lru": "^7.0.0",
    "type-fest": "^4.26.1",
    "zod-validation-error": "^3.4.0"
  },
@@ -9,6 +9,5 @@ export * from './message'
export * from './parse-structured-output'
export * from './schema'
export type * from './types'
export * from './url-utils'
export * from './utils'
export * from './zod-to-json-schema'
@@ -1,34 +0,0 @@
import { describe, expect, test } from 'vitest'

import { normalizeUrl } from './url-utils'

describe('normalizeUrl', () => {
  test('valid urls', async () => {
    expect(normalizeUrl('https://www.google.com')).toBe(
      'https://www.google.com'
    )
    expect(normalizeUrl('//www.google.com')).toBe('https://www.google.com')
    expect(normalizeUrl('https://www.google.com/foo?')).toBe(
      'https://www.google.com/foo'
    )
    expect(normalizeUrl('https://www.google.com/?foo=bar&dog=cat')).toBe(
      'https://www.google.com/?dog=cat&foo=bar'
    )
    expect(normalizeUrl('https://google.com/abc/123//')).toBe(
      'https://google.com/abc/123'
    )
    expect(normalizeUrl('//google.com')).toBe('https://google.com')
    expect(normalizeUrl('google.com')).toBe('https://google.com')
    expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
  })

  test('invalid urls', async () => {
    expect(normalizeUrl('/foo')).toBe(undefined)
    expect(normalizeUrl('/foo/bar/baz')).toBe(undefined)
    expect(normalizeUrl('://foo.com')).toBe(undefined)
    expect(normalizeUrl('foo')).toBe(undefined)
    expect(normalizeUrl('')).toBe(undefined)
    expect(normalizeUrl(undefined as unknown as string)).toBe(undefined)
    expect(normalizeUrl(null as unknown as string)).toBe(undefined)
  })
})
@@ -1,108 +0,0 @@
import isRelativeUrlImpl from 'is-relative-url'
import normalizeUrlImpl, {
  type Options as NormalizeUrlImplOptions
} from 'normalize-url'
import QuickLRU from 'quick-lru'

import { hashObject } from './utils'

const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string>({
  maxSize: 4000
})

export function isValidCrawlableUrl(url: string): boolean {
  try {
    if (!url || isRelativeUrl(url)) {
      return false
    }

    const parsedUrl = new URL(url)
    if (!protocolAllowList.has(parsedUrl.protocol)) {
      return false
    }

    const normalizedUrl = normalizeUrl(url)
    if (!normalizedUrl) {
      return false
    }

    return true
  } catch {
    return false
  }
}

export function isRelativeUrl(url: string): boolean {
  if (!url || typeof url !== 'string') return false

  return isRelativeUrlImpl(url) && !url.startsWith('//')
}

export type NormalizeUrlOptions = NormalizeUrlImplOptions & {
  allowSloppyUris?: boolean
}

export function normalizeUrl(
  url?: string,
  { allowSloppyUris = true, ...options }: NormalizeUrlOptions = {}
): string | undefined {
  let normalizedUrl: string | undefined

  if (!url || typeof url !== 'string') {
    return undefined
  }

  if (isRelativeUrl(url)) {
    if (allowSloppyUris && !/^[#./]/.test(url) && url.indexOf('.') > 0) {
      url = `https://${url}`
    } else {
      return undefined
    }
  }

  const opts = {
    stripWWW: false,
    defaultProtocol: 'https',
    normalizeProtocol: true,
    forceHttps: false,
    stripHash: false,
    stripTextFragment: true,
    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
    removeTrailingSlash: true,
    removeSingleSlash: true,
    removeExplicitPort: true,
    sortQueryParameters: true,
    ...options
  } as Required<NormalizeUrlOptions>

  const optionsHash = hashObject(opts)
  const cacheKey = `${url}-${optionsHash}`

  try {
    normalizedUrl = normalizedUrlCache.get(cacheKey)

    if (normalizedUrl !== undefined) {
      if (normalizedUrl) {
        return normalizedUrl
      } else {
        return undefined
      }
    }

    normalizedUrl = normalizeUrlImpl(url, opts)
    if (!normalizedUrl) {
      normalizedUrl = ''
    }
  } catch {
    // ignore invalid urls
    normalizedUrl = ''
  }

  normalizedUrlCache.set(cacheKey, normalizedUrl!)
  if (normalizedUrl) {
    return normalizedUrl
  } else {
    return undefined
  }
}
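The removed normalizeUrl implementation keyed its QuickLRU cache on hashObject(opts). As an illustration only, the same kind of `${url}-${optionsHash}` cache key can be derived without the hash-object dependency this commit removes; optionsCacheKey below is a hypothetical helper, not code from the repo, and it assumes the flat options shape used above.

import { createHash } from 'node:crypto'

// Hypothetical stand-in for hashObject(opts) when building the cache key.
// Keys are sorted so equivalent option objects produce the same key, and
// array values (e.g. removeQueryParameters, which mixes strings and RegExps)
// are stringified explicitly.
function optionsCacheKey(url: string, opts: Record<string, unknown>): string {
  const stable = Object.keys(opts)
    .sort()
    .map((key) => {
      const value = opts[key]
      const serialized = Array.isArray(value)
        ? value.map(String).join(',')
        : String(value)
      return `${key}=${serialized}`
    })
    .join('&')

  const optionsHash = createHash('sha256').update(stable).digest('hex')
  return `${url}-${optionsHash}`
}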
@@ -1,5 +1,4 @@
import dedent from 'dedent'
import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'

import type * as types from './types'
@@ -253,13 +252,6 @@ export function cleanStringForModel(text: string): string {
  return dedenter(text).trim()
}

export function hashObject(
  object: Record<string, any>,
  options?: HashObjectOptions
): string {
  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
}

export function isAIFunction(obj: any): obj is types.AIFunction {
  if (!obj) return false
  if (typeof obj !== 'function') return false
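The deleted hashObject helper wrapped the hash-object package with a sha256 default. If equivalent behavior is still needed after this commit, a rough substitute can be built on node:crypto alone; the sketch below is an assumption rather than repo code, and unlike hash-object it only canonicalizes top-level key order.

import { createHash } from 'node:crypto'

// Hypothetical replacement for the removed hashObject helper. It hashes a
// JSON serialization of the object's entries with sorted top-level keys;
// nested objects are not canonicalized the way the hash-object package does.
export function hashObject(
  object: Record<string, any>,
  algorithm = 'sha256'
): string {
  const entries = Object.keys(object)
    .sort()
    .map((key) => [key, object[key]])
  return createHash(algorithm).update(JSON.stringify(entries)).digest('hex')
}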
@@ -1,29 +0,0 @@
# @agentic/search-and-crawl

## 7.1.0

### Minor Changes

- 33bcbe0: Update deps

### Patch Changes

- Updated dependencies [33bcbe0]
  - @agentic/core@7.1.0
  - @agentic/diffbot@7.1.0
  - @agentic/serpapi@7.1.0

## 7.0.0

### Major Changes

- cba1cc7: Move to monorepo and multiple packages

  See https://github.com/transitive-bullshit/agentic/issues/654 and https://github.com/transitive-bullshit/agentic/pull/657 for more info.

### Patch Changes

- Updated dependencies [cba1cc7]
  - @agentic/diffbot@7.0.0
  - @agentic/serpapi@7.0.0
  - @agentic/core@7.0.0
@@ -1,48 +0,0 @@
{
  "name": "@agentic/search-and-crawl",
  "version": "7.1.0",
  "description": "Agentic SDK for Google search and crawling the top results.",
  "author": "Travis Fischer <travis@transitivebullsh.it>",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/transitive-bullshit/agentic.git"
  },
  "type": "module",
  "source": "./src/index.ts",
  "types": "./dist/index.d.ts",
  "sideEffects": false,
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js",
      "default": "./dist/index.js"
    }
  },
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsup --config ../../tsup.config.ts",
    "dev": "tsup --config ../../tsup.config.ts --watch",
    "clean": "del dist",
    "test": "run-s test:*",
    "test:lint": "eslint .",
    "test:typecheck": "tsc --noEmit"
  },
  "dependencies": {
    "@agentic/core": "workspace:*",
    "@agentic/diffbot": "workspace:*",
    "@agentic/serpapi": "workspace:*",
    "p-map": "^7.0.2"
  },
  "peerDependencies": {
    "zod": "^3.23.8"
  },
  "devDependencies": {
    "@agentic/tsconfig": "workspace:*"
  },
  "publishConfig": {
    "access": "public"
  }
}
@@ -1 +0,0 @@
export * from './search-and-crawl'
@@ -1,142 +0,0 @@
import {
  aiFunction,
  AIFunctionsProvider,
  isValidCrawlableUrl,
  normalizeUrl,
  omit,
  pick
} from '@agentic/core'
import { type diffbot, DiffbotClient } from '@agentic/diffbot'
import { SerpAPIClient } from '@agentic/serpapi'
import pMap from 'p-map'
import { z } from 'zod'

// TODO: allow `search` tool to support other search clients
// (e.g. Bing, Exa, Searxng, Serper, Tavily)

export class SearchAndCrawl extends AIFunctionsProvider {
  readonly serpapi: SerpAPIClient
  readonly diffbot: DiffbotClient

  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
    super()

    this.serpapi = opts.serpapi ?? new SerpAPIClient()
    this.diffbot = opts.diffbot ?? new DiffbotClient()
  }

  @aiFunction({
    name: 'search_and_crawl',
    description:
      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results. Useful for creating in-depth summaries of topics along with sources.',
    inputSchema: z.object({
      query: z.string().describe('search query')
    })
  })
  async searchAndCrawl({
    query,
    numSearchResults = 3,
    maxCrawlDepth = 1,
    maxListItems = 3
  }: {
    query: string
    numSearchResults?: number
    maxCrawlDepth?: number
    maxListItems?: number
  }) {
    const crawledUrls = new Set<string>()

    const crawlAndScrape = async (
      url: string | undefined,
      {
        depth = 0
      }: {
        depth?: number
      }
    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
      try {
        if (!url) return []
        if (!isValidCrawlableUrl(url)) return []
        if (crawledUrls.has(url)) return []

        const normalizedUrl = normalizeUrl(url)
        if (!normalizedUrl) return []
        if (crawledUrls.has(normalizedUrl)) return []

        crawledUrls.add(url)
        crawledUrls.add(normalizedUrl)

        console.log('\n\n')
        const scrapeResult = await this.diffbot.analyzeUrl({ url })
        console.log(
          `SearchAndCrawl depth ${depth} - "${url}"`,
          pick(scrapeResult, 'type', 'title')
        )

        if (scrapeResult.type !== 'list') {
          return [scrapeResult]
        }

        if (depth >= maxCrawlDepth) {
          return [scrapeResult]
        }

        const object = scrapeResult.objects?.[0]
        if (!object) return [scrapeResult]

        const items = object.items
          ?.filter((item) => item.link)
          .slice(0, maxListItems)
        if (!items?.length) return [scrapeResult]

        const innerScrapeResults = (
          await pMap(
            items,
            async (item) => {
              const innerScrapeResult = await crawlAndScrape(item.link, {
                depth: depth + 1
              })
              return innerScrapeResult
            },
            {
              concurrency: 4
            }
          )
        ).flat()

        return innerScrapeResults
      } catch (err) {
        console.warn('crawlAndScrape error', url, err)
        return []
      }
    }

    const searchResponse = await this.serpapi.search({
      q: query,
      num: numSearchResults
    })

    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
    const scrapeResults = (
      await pMap(
        (searchResponse.organic_results || []).slice(0, numSearchResults),
        async (searchResult) => {
          return crawlAndScrape(searchResult.link, {
            depth: 0
          })
        },
        {
          concurrency: 5
        }
      )
    ).flat()

    const output = {
      ...omit(searchResponse, 'organic_results'),
      scrape_results: scrapeResults
    }

    console.log(`SearchAndCrawl response for query "${query}"`, output)
    return output
  }
}
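For context, the removed SearchAndCrawl tool could also be invoked directly rather than through an LLM runner. A minimal usage sketch against the class as it existed before this commit, assuming API keys are supplied via the environment as in the deleted example at the top of this diff:

import 'dotenv/config'

import { DiffbotClient } from '@agentic/diffbot'
import { SerpAPIClient } from '@agentic/serpapi'

import { SearchAndCrawl } from './search-and-crawl'

async function main() {
  // Both clients read their API keys from the environment by default.
  const searchAndCrawl = new SearchAndCrawl({
    serpapi: new SerpAPIClient(),
    diffbot: new DiffbotClient()
  })

  // Searches Google via SerpAPI, crawls up to numSearchResults results with
  // Diffbot, and returns the search response plus scrape_results.
  const output = await searchAndCrawl.searchAndCrawl({
    query: 'latest news on the 2024 olympics',
    numSearchResults: 3,
    maxCrawlDepth: 1
  })

  console.log(JSON.stringify(output, null, 2))
}

await main()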
@@ -1,5 +0,0 @@
{
  "extends": "@agentic/tsconfig/base.json",
  "include": ["src"],
  "exclude": ["node_modules", "dist"]
}
@@ -56,7 +56,6 @@
    "@agentic/polygon": "workspace:*",
    "@agentic/predict-leads": "workspace:*",
    "@agentic/proxycurl": "workspace:*",
    "@agentic/search-and-crawl": "workspace:*",
    "@agentic/searxng": "workspace:*",
    "@agentic/serpapi": "workspace:*",
    "@agentic/serper": "workspace:*",
@@ -17,7 +17,6 @@ export * from '@agentic/perigon'
export * from '@agentic/polygon'
export * from '@agentic/predict-leads'
export * from '@agentic/proxycurl'
export * from '@agentic/search-and-crawl'
export * from '@agentic/searxng'
export * from '@agentic/serpapi'
export * from '@agentic/serper'
pnpm-lock.yaml: 3875 changes; the diff is too large to display.