Mirror of https://github.com/transitive-bullshit/chatgpt-api
fix: remove hash-object and search-and-crawl package
parent e0e4bbbfe7
commit 7ce3acca89
@@ -1,29 +0,0 @@
#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SearchAndCrawl, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  const serpapi = new SerpAPIClient()
  const diffbot = new DiffbotClient()

  const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
      // debug: true
    }),
    functions: createDexterFunctions(searchAndCrawl),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const result = await runner(`Summarize the latest news on ${topic}`)
  console.log(result)
}

await main()
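The deleted example above wired SearchAndCrawl into a Dexter runner. As an illustration only (not part of this commit), a comparable example could pass the remaining SerpAPIClient and DiffbotClient tools to the runner directly; this sketch assumes createDexterFunctions accepts multiple AIFunctionsProvider instances.

#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { DiffbotClient, SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  // Without SearchAndCrawl, hand the search and extraction tools to the
  // runner individually (assumes createDexterFunctions is variadic).
  const serpapi = new SerpAPIClient()
  const diffbot = new DiffbotClient()

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
    }),
    functions: createDexterFunctions(serpapi, diffbot),
    systemMessage:
      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
  })

  const topic = 'the 2024 olympics'
  const result = await runner(`Summarize the latest news on ${topic}`)
  console.log(result)
}

await main()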
@@ -34,15 +34,11 @@
  "dependencies": {
    "dedent": "^1.5.3",
    "delay": "^6.0.0",
    "hash-object": "^5.0.1",
    "is-relative-url": "^4.0.0",
    "jsonrepair": "^3.9.0",
    "ky": "^1.7.2",
    "normalize-url": "^8.0.1",
    "openai-zod-to-json-schema": "^1.0.3",
    "p-map": "^7.0.2",
    "p-throttle": "^6.2.0",
    "quick-lru": "^7.0.0",
    "type-fest": "^4.26.1",
    "zod-validation-error": "^3.4.0"
  },
@@ -9,6 +9,5 @@ export * from './message'
export * from './parse-structured-output'
export * from './schema'
export type * from './types'
export * from './url-utils'
export * from './utils'
export * from './zod-to-json-schema'
@@ -1,34 +0,0 @@
import { describe, expect, test } from 'vitest'

import { normalizeUrl } from './url-utils'

describe('normalizeUrl', () => {
  test('valid urls', async () => {
    expect(normalizeUrl('https://www.google.com')).toBe(
      'https://www.google.com'
    )
    expect(normalizeUrl('//www.google.com')).toBe('https://www.google.com')
    expect(normalizeUrl('https://www.google.com/foo?')).toBe(
      'https://www.google.com/foo'
    )
    expect(normalizeUrl('https://www.google.com/?foo=bar&dog=cat')).toBe(
      'https://www.google.com/?dog=cat&foo=bar'
    )
    expect(normalizeUrl('https://google.com/abc/123//')).toBe(
      'https://google.com/abc/123'
    )
    expect(normalizeUrl('//google.com')).toBe('https://google.com')
    expect(normalizeUrl('google.com')).toBe('https://google.com')
    expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
  })

  test('invalid urls', async () => {
    expect(normalizeUrl('/foo')).toBe(undefined)
    expect(normalizeUrl('/foo/bar/baz')).toBe(undefined)
    expect(normalizeUrl('://foo.com')).toBe(undefined)
    expect(normalizeUrl('foo')).toBe(undefined)
    expect(normalizeUrl('')).toBe(undefined)
    expect(normalizeUrl(undefined as unknown as string)).toBe(undefined)
    expect(normalizeUrl(null as unknown as string)).toBe(undefined)
  })
})
@@ -1,108 +0,0 @@
import isRelativeUrlImpl from 'is-relative-url'
import normalizeUrlImpl, {
  type Options as NormalizeUrlImplOptions
} from 'normalize-url'
import QuickLRU from 'quick-lru'

import { hashObject } from './utils'

const protocolAllowList = new Set(['https:', 'http:'])
const normalizedUrlCache = new QuickLRU<string, string>({
  maxSize: 4000
})

export function isValidCrawlableUrl(url: string): boolean {
  try {
    if (!url || isRelativeUrl(url)) {
      return false
    }

    const parsedUrl = new URL(url)
    if (!protocolAllowList.has(parsedUrl.protocol)) {
      return false
    }

    const normalizedUrl = normalizeUrl(url)
    if (!normalizedUrl) {
      return false
    }

    return true
  } catch {
    return false
  }
}

export function isRelativeUrl(url: string): boolean {
  if (!url || typeof url !== 'string') return false

  return isRelativeUrlImpl(url) && !url.startsWith('//')
}

export type NormalizeUrlOptions = NormalizeUrlImplOptions & {
  allowSloppyUris?: boolean
}

export function normalizeUrl(
  url?: string,
  { allowSloppyUris = true, ...options }: NormalizeUrlOptions = {}
): string | undefined {
  let normalizedUrl: string | undefined

  if (!url || typeof url !== 'string') {
    return undefined
  }

  if (isRelativeUrl(url)) {
    if (allowSloppyUris && !/^[#./]/.test(url) && url.indexOf('.') > 0) {
      url = `https://${url}`
    } else {
      return undefined
    }
  }

  const opts = {
    stripWWW: false,
    defaultProtocol: 'https',
    normalizeProtocol: true,
    forceHttps: false,
    stripHash: false,
    stripTextFragment: true,
    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
    removeTrailingSlash: true,
    removeSingleSlash: true,
    removeExplicitPort: true,
    sortQueryParameters: true,
    ...options
  } as Required<NormalizeUrlOptions>

  const optionsHash = hashObject(opts)
  const cacheKey = `${url}-${optionsHash}`

  try {
    normalizedUrl = normalizedUrlCache.get(cacheKey)

    if (normalizedUrl !== undefined) {
      if (normalizedUrl) {
        return normalizedUrl
      } else {
        return undefined
      }
    }

    normalizedUrl = normalizeUrlImpl(url, opts)
    if (!normalizedUrl) {
      normalizedUrl = ''
    }
  } catch {
    // ignore invalid urls
    normalizedUrl = ''
  }

  normalizedUrlCache.set(cacheKey, normalizedUrl!)
  if (normalizedUrl) {
    return normalizedUrl
  } else {
    return undefined
  }
}
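The removed normalizeUrl implementation keyed its QuickLRU cache on hashObject(opts). As an illustration only, the same kind of `${url}-${optionsHash}` cache key can be derived without the hash-object dependency this commit removes; optionsCacheKey below is a hypothetical helper, not code from the repo, and it assumes the flat options shape used above.

import { createHash } from 'node:crypto'

// Hypothetical stand-in for hashObject(opts) when building the cache key.
// Keys are sorted so equivalent option objects produce the same key, and
// array values (e.g. removeQueryParameters, which mixes strings and RegExps)
// are stringified explicitly.
function optionsCacheKey(url: string, opts: Record<string, unknown>): string {
  const stable = Object.keys(opts)
    .sort()
    .map((key) => {
      const value = opts[key]
      const serialized = Array.isArray(value)
        ? value.map(String).join(',')
        : String(value)
      return `${key}=${serialized}`
    })
    .join('&')

  const optionsHash = createHash('sha256').update(stable).digest('hex')
  return `${url}-${optionsHash}`
}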
@@ -1,5 +1,4 @@
import dedent from 'dedent'
import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'

import type * as types from './types'
@@ -253,13 +252,6 @@ export function cleanStringForModel(text: string): string {
  return dedenter(text).trim()
}

export function hashObject(
  object: Record<string, any>,
  options?: HashObjectOptions
): string {
  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
}

export function isAIFunction(obj: any): obj is types.AIFunction {
  if (!obj) return false
  if (typeof obj !== 'function') return false
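The deleted hashObject helper wrapped the hash-object package with a sha256 default. If equivalent behavior is still needed after this commit, a rough substitute can be built on node:crypto alone; the sketch below is an assumption rather than repo code, and unlike hash-object it only canonicalizes top-level key order.

import { createHash } from 'node:crypto'

// Hypothetical replacement for the removed hashObject helper. It hashes a
// JSON serialization of the object's entries with sorted top-level keys;
// nested objects are not canonicalized the way the hash-object package does.
export function hashObject(
  object: Record<string, any>,
  algorithm = 'sha256'
): string {
  const entries = Object.keys(object)
    .sort()
    .map((key) => [key, object[key]])
  return createHash(algorithm).update(JSON.stringify(entries)).digest('hex')
}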
@@ -1,29 +0,0 @@
# @agentic/search-and-crawl

## 7.1.0

### Minor Changes

- 33bcbe0: Update deps

### Patch Changes

- Updated dependencies [33bcbe0]
  - @agentic/core@7.1.0
  - @agentic/diffbot@7.1.0
  - @agentic/serpapi@7.1.0

## 7.0.0

### Major Changes

- cba1cc7: Move to monorepo and multiple packages

  See https://github.com/transitive-bullshit/agentic/issues/654 and https://github.com/transitive-bullshit/agentic/pull/657 for more info.

### Patch Changes

- Updated dependencies [cba1cc7]
  - @agentic/diffbot@7.0.0
  - @agentic/serpapi@7.0.0
  - @agentic/core@7.0.0
@@ -1,48 +0,0 @@
{
  "name": "@agentic/search-and-crawl",
  "version": "7.1.0",
  "description": "Agentic SDK for Google search and crawling the top results.",
  "author": "Travis Fischer <travis@transitivebullsh.it>",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/transitive-bullshit/agentic.git"
  },
  "type": "module",
  "source": "./src/index.ts",
  "types": "./dist/index.d.ts",
  "sideEffects": false,
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js",
      "default": "./dist/index.js"
    }
  },
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsup --config ../../tsup.config.ts",
    "dev": "tsup --config ../../tsup.config.ts --watch",
    "clean": "del dist",
    "test": "run-s test:*",
    "test:lint": "eslint .",
    "test:typecheck": "tsc --noEmit"
  },
  "dependencies": {
    "@agentic/core": "workspace:*",
    "@agentic/diffbot": "workspace:*",
    "@agentic/serpapi": "workspace:*",
    "p-map": "^7.0.2"
  },
  "peerDependencies": {
    "zod": "^3.23.8"
  },
  "devDependencies": {
    "@agentic/tsconfig": "workspace:*"
  },
  "publishConfig": {
    "access": "public"
  }
}
@@ -1 +0,0 @@
export * from './search-and-crawl'
@@ -1,142 +0,0 @@
import {
  aiFunction,
  AIFunctionsProvider,
  isValidCrawlableUrl,
  normalizeUrl,
  omit,
  pick
} from '@agentic/core'
import { type diffbot, DiffbotClient } from '@agentic/diffbot'
import { SerpAPIClient } from '@agentic/serpapi'
import pMap from 'p-map'
import { z } from 'zod'

// TODO: allow `search` tool to support other search clients
// (e.g. Bing, Exa, Searxng, Serper, Tavily)

export class SearchAndCrawl extends AIFunctionsProvider {
  readonly serpapi: SerpAPIClient
  readonly diffbot: DiffbotClient

  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
    super()

    this.serpapi = opts.serpapi ?? new SerpAPIClient()
    this.diffbot = opts.diffbot ?? new DiffbotClient()
  }

  @aiFunction({
    name: 'search_and_crawl',
    description:
      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results. Useful for creating in-depth summaries of topics along with sources.',
    inputSchema: z.object({
      query: z.string().describe('search query')
    })
  })
  async searchAndCrawl({
    query,
    numSearchResults = 3,
    maxCrawlDepth = 1,
    maxListItems = 3
  }: {
    query: string
    numSearchResults?: number
    maxCrawlDepth?: number
    maxListItems?: number
  }) {
    const crawledUrls = new Set<string>()

    const crawlAndScrape = async (
      url: string | undefined,
      {
        depth = 0
      }: {
        depth?: number
      }
    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
      try {
        if (!url) return []
        if (!isValidCrawlableUrl(url)) return []
        if (crawledUrls.has(url)) return []

        const normalizedUrl = normalizeUrl(url)
        if (!normalizedUrl) return []
        if (crawledUrls.has(normalizedUrl)) return []

        crawledUrls.add(url)
        crawledUrls.add(normalizedUrl)

        console.log('\n\n')
        const scrapeResult = await this.diffbot.analyzeUrl({ url })
        console.log(
          `SearchAndCrawl depth ${depth} - "${url}"`,
          pick(scrapeResult, 'type', 'title')
        )

        if (scrapeResult.type !== 'list') {
          return [scrapeResult]
        }

        if (depth >= maxCrawlDepth) {
          return [scrapeResult]
        }

        const object = scrapeResult.objects?.[0]
        if (!object) return [scrapeResult]

        const items = object.items
          ?.filter((item) => item.link)
          .slice(0, maxListItems)
        if (!items?.length) return [scrapeResult]

        const innerScrapeResults = (
          await pMap(
            items,
            async (item) => {
              const innerScrapeResult = await crawlAndScrape(item.link, {
                depth: depth + 1
              })
              return innerScrapeResult
            },
            {
              concurrency: 4
            }
          )
        ).flat()

        return innerScrapeResults
      } catch (err) {
        console.warn('crawlAndScrape error', url, err)
        return []
      }
    }

    const searchResponse = await this.serpapi.search({
      q: query,
      num: numSearchResults
    })

    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
    const scrapeResults = (
      await pMap(
        (searchResponse.organic_results || []).slice(0, numSearchResults),
        async (searchResult) => {
          return crawlAndScrape(searchResult.link, {
            depth: 0
          })
        },
        {
          concurrency: 5
        }
      )
    ).flat()

    const output = {
      ...omit(searchResponse, 'organic_results'),
      scrape_results: scrapeResults
    }

    console.log(`SearchAndCrawl response for query "${query}"`, output)
    return output
  }
}
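For context, the removed SearchAndCrawl tool could also be invoked directly rather than through an LLM runner. A minimal usage sketch against the class as it existed before this commit, assuming API keys are supplied via the environment as in the deleted example at the top of this diff:

import 'dotenv/config'

import { DiffbotClient } from '@agentic/diffbot'
import { SerpAPIClient } from '@agentic/serpapi'

import { SearchAndCrawl } from './search-and-crawl'

async function main() {
  // Both clients read their API keys from the environment by default.
  const searchAndCrawl = new SearchAndCrawl({
    serpapi: new SerpAPIClient(),
    diffbot: new DiffbotClient()
  })

  // Searches Google via SerpAPI, crawls up to numSearchResults results with
  // Diffbot, and returns the search response plus scrape_results.
  const output = await searchAndCrawl.searchAndCrawl({
    query: 'latest news on the 2024 olympics',
    numSearchResults: 3,
    maxCrawlDepth: 1
  })

  console.log(JSON.stringify(output, null, 2))
}

await main()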
@@ -1,5 +0,0 @@
{
  "extends": "@agentic/tsconfig/base.json",
  "include": ["src"],
  "exclude": ["node_modules", "dist"]
}
@@ -56,7 +56,6 @@
    "@agentic/polygon": "workspace:*",
    "@agentic/predict-leads": "workspace:*",
    "@agentic/proxycurl": "workspace:*",
    "@agentic/search-and-crawl": "workspace:*",
    "@agentic/searxng": "workspace:*",
    "@agentic/serpapi": "workspace:*",
    "@agentic/serper": "workspace:*",
@@ -17,7 +17,6 @@ export * from '@agentic/perigon'
export * from '@agentic/polygon'
export * from '@agentic/predict-leads'
export * from '@agentic/proxycurl'
export * from '@agentic/search-and-crawl'
export * from '@agentic/searxng'
export * from '@agentic/serpapi'
export * from '@agentic/serper'
pnpm-lock.yaml: 3875 changes; the diff is too large to display.