fix: remove hash-object and search-and-crawl package

pull/678/head
Travis Fischer 2024-11-07 13:12:50 -06:00
parent e0e4bbbfe7
commit 7ce3acca89
14 changed files with 1793 additions and 2493 deletions

View file

@@ -1,29 +0,0 @@
#!/usr/bin/env node
import 'dotenv/config'
import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SearchAndCrawl, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'
async function main() {
const serpapi = new SerpAPIClient()
const diffbot = new DiffbotClient()
const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })
const runner = createAIRunner({
chatModel: new ChatModel({
params: { model: 'gpt-4o-mini', temperature: 0 }
// debug: true
}),
functions: createDexterFunctions(searchAndCrawl),
systemMessage:
'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
})
const topic = 'the 2024 olympics'
const result = await runner(`Summarize the latest news on ${topic}`)
console.log(result)
}
await main()
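With `SearchAndCrawl` removed, a comparable example can wire the remaining clients in directly. A minimal sketch, not part of this commit, assuming `createDexterFunctions` accepts multiple function providers as the other adapter examples suggest:

#!/usr/bin/env node
import 'dotenv/config'
import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  // Expose search (SerpAPI) and scraping (Diffbot) as separate tools;
  // the model chains them itself instead of using the removed wrapper.
  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
    }),
    functions: createDexterFunctions(new SerpAPIClient(), new DiffbotClient()),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const result = await runner('Summarize the latest news on the 2024 olympics')
  console.log(result)
}

await main()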

View file

@@ -34,15 +34,11 @@
"dependencies": {
"dedent": "^1.5.3",
"delay": "^6.0.0",
"hash-object": "^5.0.1",
"is-relative-url": "^4.0.0",
"jsonrepair": "^3.9.0",
"ky": "^1.7.2",
"normalize-url": "^8.0.1",
"openai-zod-to-json-schema": "^1.0.3",
"p-map": "^7.0.2",
"p-throttle": "^6.2.0",
"quick-lru": "^7.0.0",
"type-fest": "^4.26.1",
"zod-validation-error": "^3.4.0"
},

View file

@@ -9,6 +9,5 @@ export * from './message'
export * from './parse-structured-output'
export * from './schema'
export type * from './types'
export * from './url-utils'
export * from './utils'
export * from './zod-to-json-schema'

View file

@@ -1,34 +0,0 @@
import { describe, expect, test } from 'vitest'
import { normalizeUrl } from './url-utils'
describe('normalizeUrl', () => {
test('valid urls', async () => {
expect(normalizeUrl('https://www.google.com')).toBe(
'https://www.google.com'
)
expect(normalizeUrl('//www.google.com')).toBe('https://www.google.com')
expect(normalizeUrl('https://www.google.com/foo?')).toBe(
'https://www.google.com/foo'
)
expect(normalizeUrl('https://www.google.com/?foo=bar&dog=cat')).toBe(
'https://www.google.com/?dog=cat&foo=bar'
)
expect(normalizeUrl('https://google.com/abc/123//')).toBe(
'https://google.com/abc/123'
)
expect(normalizeUrl('//google.com')).toBe('https://google.com')
expect(normalizeUrl('google.com')).toBe('https://google.com')
expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
})
test('invalid urls', async () => {
expect(normalizeUrl('/foo')).toBe(undefined)
expect(normalizeUrl('/foo/bar/baz')).toBe(undefined)
expect(normalizeUrl('://foo.com')).toBe(undefined)
expect(normalizeUrl('foo')).toBe(undefined)
expect(normalizeUrl('')).toBe(undefined)
expect(normalizeUrl(undefined as unknown as string)).toBe(undefined)
expect(normalizeUrl(null as unknown as string)).toBe(undefined)
})
})

View file

@@ -1,108 +0,0 @@
import isRelativeUrlImpl from 'is-relative-url'
import normalizeUrlImpl, {
type Options as NormalizeUrlImplOptions
} from 'normalize-url'
import QuickLRU from 'quick-lru'
import { hashObject } from './utils'
const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string>({
maxSize: 4000
})
export function isValidCrawlableUrl(url: string): boolean {
try {
if (!url || isRelativeUrl(url)) {
return false
}
const parsedUrl = new URL(url)
if (!protocolAllowList.has(parsedUrl.protocol)) {
return false
}
const normalizedUrl = normalizeUrl(url)
if (!normalizedUrl) {
return false
}
return true
} catch {
return false
}
}
export function isRelativeUrl(url: string): boolean {
if (!url || typeof url !== 'string') return false
return isRelativeUrlImpl(url) && !url.startsWith('//')
}
export type NormalizeUrlOptions = NormalizeUrlImplOptions & {
allowSloppyUris?: boolean
}
export function normalizeUrl(
url?: string,
{ allowSloppyUris = true, ...options }: NormalizeUrlOptions = {}
): string | undefined {
let normalizedUrl: string | undefined
if (!url || typeof url !== 'string') {
return undefined
}
if (isRelativeUrl(url)) {
if (allowSloppyUris && !/^[#./]/.test(url) && url.indexOf('.') > 0) {
url = `https://${url}`
} else {
return undefined
}
}
const opts = {
stripWWW: false,
defaultProtocol: 'https',
normalizeProtocol: true,
forceHttps: false,
stripHash: false,
stripTextFragment: true,
removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
removeTrailingSlash: true,
removeSingleSlash: true,
removeExplicitPort: true,
sortQueryParameters: true,
...options
} as Required<NormalizeUrlOptions>
const optionsHash = hashObject(opts)
const cacheKey = `${url}-${optionsHash}`
try {
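// An empty string is cached for URLs that fail to normalize, so repeat
// lookups of invalid URLs are also served from the cache.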
normalizedUrl = normalizedUrlCache.get(cacheKey)
if (normalizedUrl !== undefined) {
return normalizedUrl || undefined
}
normalizedUrl = normalizeUrlImpl(url, opts)
if (!normalizedUrl) {
normalizedUrl = ''
}
} catch {
// ignore invalid urls
normalizedUrl = ''
}
normalizedUrlCache.set(cacheKey, normalizedUrl!)
return normalizedUrl || undefined
}

View file

@@ -1,5 +1,4 @@
import dedent from 'dedent'
import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'
import type * as types from './types'
@@ -253,13 +252,6 @@ export function cleanStringForModel(text: string): string {
return dedenter(text).trim()
}
export function hashObject(
object: Record<string, any>,
options?: HashObjectOptions
): string {
return hashObjectImpl(object, { algorithm: 'sha256', ...options })
}
export function isAIFunction(obj: any): obj is types.AIFunction {
if (!obj) return false
if (typeof obj !== 'function') return false
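If downstream code still needs deterministic object hashing after this removal, a small stand-in on top of Node's built-in crypto module could look like the following. This is a hypothetical sketch, not part of this commit; like `hash-object`, it canonicalizes key order so logically equal objects produce the same digest:

import { createHash } from 'node:crypto'

// Recursively serialize with sorted keys so key order doesn't affect the hash.
function canonicalize(value: unknown): string {
  if (value instanceof RegExp) return value.toString()
  if (Array.isArray(value)) return `[${value.map(canonicalize).join(',')}]`
  if (value && typeof value === 'object') {
    const entries = Object.entries(value as Record<string, unknown>)
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${JSON.stringify(k)}:${canonicalize(v)}`)
    return `{${entries.join(',')}}`
  }
  return JSON.stringify(value) ?? 'undefined'
}

export function hashObject(object: Record<string, any>): string {
  return createHash('sha256').update(canonicalize(object)).digest('hex')
}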

View file

@@ -1,29 +0,0 @@
# @agentic/search-and-crawl
## 7.1.0
### Minor Changes
- 33bcbe0: Update deps
### Patch Changes
- Updated dependencies [33bcbe0]
  - @agentic/core@7.1.0
  - @agentic/diffbot@7.1.0
  - @agentic/serpapi@7.1.0
## 7.0.0
### Major Changes
- cba1cc7: Move to monorepo and multiple packages
  See https://github.com/transitive-bullshit/agentic/issues/654 and https://github.com/transitive-bullshit/agentic/pull/657 for more info.
### Patch Changes
- Updated dependencies [cba1cc7]
  - @agentic/diffbot@7.0.0
  - @agentic/serpapi@7.0.0
  - @agentic/core@7.0.0

View file

@@ -1,48 +0,0 @@
{
"name": "@agentic/search-and-crawl",
"version": "7.1.0",
"description": "Agentic SDK for Google search and crawling the top results.",
"author": "Travis Fischer <travis@transitivebullsh.it>",
"license": "MIT",
"repository": {
"type": "git",
"url": "git+https://github.com/transitive-bullshit/agentic.git"
},
"type": "module",
"source": "./src/index.ts",
"types": "./dist/index.d.ts",
"sideEffects": false,
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js",
"default": "./dist/index.js"
}
},
"files": [
"dist"
],
"scripts": {
"build": "tsup --config ../../tsup.config.ts",
"dev": "tsup --config ../../tsup.config.ts --watch",
"clean": "del dist",
"test": "run-s test:*",
"test:lint": "eslint .",
"test:typecheck": "tsc --noEmit"
},
"dependencies": {
"@agentic/core": "workspace:*",
"@agentic/diffbot": "workspace:*",
"@agentic/serpapi": "workspace:*",
"p-map": "^7.0.2"
},
"peerDependencies": {
"zod": "^3.23.8"
},
"devDependencies": {
"@agentic/tsconfig": "workspace:*"
},
"publishConfig": {
"access": "public"
}
}

View file

@@ -1 +0,0 @@
export * from './search-and-crawl'

View file

@@ -1,142 +0,0 @@
import {
aiFunction,
AIFunctionsProvider,
isValidCrawlableUrl,
normalizeUrl,
omit,
pick
} from '@agentic/core'
import { type diffbot, DiffbotClient } from '@agentic/diffbot'
import { SerpAPIClient } from '@agentic/serpapi'
import pMap from 'p-map'
import { z } from 'zod'
// TODO: allow `search` tool to support other search clients
// (e.g. Bing, Exa, Searxng, Serper, Tavily)
export class SearchAndCrawl extends AIFunctionsProvider {
readonly serpapi: SerpAPIClient
readonly diffbot: DiffbotClient
constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
super()
this.serpapi = opts.serpapi ?? new SerpAPIClient()
this.diffbot = opts.diffbot ?? new DiffbotClient()
}
@aiFunction({
name: 'search_and_crawl',
description:
'Uses Google to search the web, crawls the results, and then summarizes the most relevant results. Useful for creating in-depth summaries of topics along with sources.',
inputSchema: z.object({
query: z.string().describe('search query')
})
})
async searchAndCrawl({
query,
numSearchResults = 3,
maxCrawlDepth = 1,
maxListItems = 3
}: {
query: string
numSearchResults?: number
maxCrawlDepth?: number
maxListItems?: number
}) {
const crawledUrls = new Set<string>()
const crawlAndScrape = async (
url: string | undefined,
{
depth = 0
}: {
depth?: number
}
): Promise<diffbot.ExtractAnalyzeResponse[]> => {
try {
if (!url) return []
if (!isValidCrawlableUrl(url)) return []
if (crawledUrls.has(url)) return []
const normalizedUrl = normalizeUrl(url)
if (!normalizedUrl) return []
if (crawledUrls.has(normalizedUrl)) return []
crawledUrls.add(url)
crawledUrls.add(normalizedUrl)
console.log('\n\n')
const scrapeResult = await this.diffbot.analyzeUrl({ url })
console.log(
`SearchAndCrawl depth ${depth} - "${url}"`,
pick(scrapeResult, 'type', 'title')
)
if (scrapeResult.type !== 'list') {
return [scrapeResult]
}
if (depth >= maxCrawlDepth) {
return [scrapeResult]
}
const object = scrapeResult.objects?.[0]
if (!object) return [scrapeResult]
const items = object.items
?.filter((item) => item.link)
.slice(0, maxListItems)
if (!items?.length) return [scrapeResult]
const innerScrapeResults = (
await pMap(
items,
async (item) => {
const innerScrapeResult = await crawlAndScrape(item.link, {
depth: depth + 1
})
return innerScrapeResult
},
{
concurrency: 4
}
)
).flat()
return innerScrapeResults
} catch (err) {
console.warn('crawlAndScrape error', url, err)
return []
}
}
const searchResponse = await this.serpapi.search({
q: query,
num: numSearchResults
})
console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
const scrapeResults = (
await pMap(
(searchResponse.organic_results || []).slice(0, numSearchResults),
async (searchResult) => {
return crawlAndScrape(searchResult.link, {
depth: 0
})
},
{
concurrency: 5
}
)
).flat()
const output = {
...omit(searchResponse, 'organic_results'),
scrape_results: scrapeResults
}
console.log(`SearchAndCrawl response for query "${query}"`, output)
return output
}
}
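For reference, the removed provider could also be called directly, outside any LLM runner. A sketch based on the deleted code above:

import { SearchAndCrawl } from '@agentic/search-and-crawl'

// Search Google via SerpAPI, crawl the top results with Diffbot, and
// return the search metadata plus the scraped pages.
const searchAndCrawl = new SearchAndCrawl()
const output = await searchAndCrawl.searchAndCrawl({
  query: 'the 2024 olympics',
  numSearchResults: 3
})
console.log(output.scrape_results.length)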

View file

@@ -1,5 +0,0 @@
{
"extends": "@agentic/tsconfig/base.json",
"include": ["src"],
"exclude": ["node_modules", "dist"]
}

View file

@@ -56,7 +56,6 @@
"@agentic/polygon": "workspace:*",
"@agentic/predict-leads": "workspace:*",
"@agentic/proxycurl": "workspace:*",
"@agentic/search-and-crawl": "workspace:*",
"@agentic/searxng": "workspace:*",
"@agentic/serpapi": "workspace:*",
"@agentic/serper": "workspace:*",

View file

@@ -17,7 +17,6 @@ export * from '@agentic/perigon'
export * from '@agentic/polygon'
export * from '@agentic/predict-leads'
export * from '@agentic/proxycurl'
export * from '@agentic/search-and-crawl'
export * from '@agentic/searxng'
export * from '@agentic/serpapi'
export * from '@agentic/serper'

Diff file is too large. Load Diff