Mirror of https://github.com/transitive-bullshit/chatgpt-api
fix: remove hash-object and search-and-crawl package
parent e0e4bbbfe7
commit 7ce3acca89
@@ -1,29 +0,0 @@
-#!/usr/bin/env node
-import 'dotenv/config'
-
-import { createDexterFunctions } from '@agentic/dexter'
-import { DiffbotClient, SearchAndCrawl, SerpAPIClient } from '@agentic/stdlib'
-import { ChatModel, createAIRunner } from '@dexaai/dexter'
-
-async function main() {
-  const serpapi = new SerpAPIClient()
-  const diffbot = new DiffbotClient()
-
-  const searchAndCrawl = new SearchAndCrawl({ serpapi, diffbot })
-
-  const runner = createAIRunner({
-    chatModel: new ChatModel({
-      params: { model: 'gpt-4o-mini', temperature: 0 }
-      // debug: true
-    }),
-    functions: createDexterFunctions(searchAndCrawl),
-    systemMessage:
-      'You are a McKinsey analyst who is an expert at writing executive summaries. Always cite your sources and respond using Markdown.'
-  })
-
-  const topic = 'the 2024 olympics'
-  const result = await runner(`Summarize the latest news on ${topic}`)
-  console.log(result)
-}
-
-await main()
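Since the commit deletes this example outright, anyone still wanting a runnable variant has to drop the SearchAndCrawl tool. A minimal search-only sketch under that assumption, keeping the rest of the wiring intact (it assumes SerpAPIClient remains exported from @agentic/stdlib and that createDexterFunctions accepts any AIFunctionsProvider, as it does elsewhere in this repo):

#!/usr/bin/env node
import 'dotenv/config'

import { createDexterFunctions } from '@agentic/dexter'
import { SerpAPIClient } from '@agentic/stdlib'
import { ChatModel, createAIRunner } from '@dexaai/dexter'

async function main() {
  // Search-only variant: the crawling half of the deleted tool has no
  // drop-in replacement in this commit.
  const serpapi = new SerpAPIClient()

  const runner = createAIRunner({
    chatModel: new ChatModel({
      params: { model: 'gpt-4o-mini', temperature: 0 }
    }),
    functions: createDexterFunctions(serpapi),
    systemMessage:
      'You are a research assistant. Always cite your sources and respond using Markdown.'
  })

  const result = await runner('Summarize the latest news on the 2024 olympics')
  console.log(result)
}

await main()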
@@ -34,15 +34,11 @@
   "dependencies": {
     "dedent": "^1.5.3",
     "delay": "^6.0.0",
-    "hash-object": "^5.0.1",
-    "is-relative-url": "^4.0.0",
     "jsonrepair": "^3.9.0",
     "ky": "^1.7.2",
-    "normalize-url": "^8.0.1",
     "openai-zod-to-json-schema": "^1.0.3",
     "p-map": "^7.0.2",
     "p-throttle": "^6.2.0",
-    "quick-lru": "^7.0.0",
     "type-fest": "^4.26.1",
     "zod-validation-error": "^3.4.0"
   },
@@ -9,6 +9,5 @@ export * from './message'
 export * from './parse-structured-output'
 export * from './schema'
 export type * from './types'
-export * from './url-utils'
 export * from './utils'
 export * from './zod-to-json-schema'
@@ -1,34 +0,0 @@
-import { describe, expect, test } from 'vitest'
-
-import { normalizeUrl } from './url-utils'
-
-describe('normalizeUrl', () => {
-  test('valid urls', async () => {
-    expect(normalizeUrl('https://www.google.com')).toBe(
-      'https://www.google.com'
-    )
-    expect(normalizeUrl('//www.google.com')).toBe('https://www.google.com')
-    expect(normalizeUrl('https://www.google.com/foo?')).toBe(
-      'https://www.google.com/foo'
-    )
-    expect(normalizeUrl('https://www.google.com/?foo=bar&dog=cat')).toBe(
-      'https://www.google.com/?dog=cat&foo=bar'
-    )
-    expect(normalizeUrl('https://google.com/abc/123//')).toBe(
-      'https://google.com/abc/123'
-    )
-    expect(normalizeUrl('//google.com')).toBe('https://google.com')
-    expect(normalizeUrl('google.com')).toBe('https://google.com')
-    expect(normalizeUrl('abc.foo.com')).toBe('https://abc.foo.com')
-  })
-
-  test('invalid urls', async () => {
-    expect(normalizeUrl('/foo')).toBe(undefined)
-    expect(normalizeUrl('/foo/bar/baz')).toBe(undefined)
-    expect(normalizeUrl('://foo.com')).toBe(undefined)
-    expect(normalizeUrl('foo')).toBe(undefined)
-    expect(normalizeUrl('')).toBe(undefined)
-    expect(normalizeUrl(undefined as unknown as string)).toBe(undefined)
-    expect(normalizeUrl(null as unknown as string)).toBe(undefined)
-  })
-})
@@ -1,108 +0,0 @@
-import isRelativeUrlImpl from 'is-relative-url'
-import normalizeUrlImpl, {
-  type Options as NormalizeUrlImplOptions
-} from 'normalize-url'
-import QuickLRU from 'quick-lru'
-
-import { hashObject } from './utils'
-
-const protocolAllowList = new Set(['https:', 'http:'])
-const normalizedUrlCache = new QuickLRU<string, string>({
-  maxSize: 4000
-})
-
-export function isValidCrawlableUrl(url: string): boolean {
-  try {
-    if (!url || isRelativeUrl(url)) {
-      return false
-    }
-
-    const parsedUrl = new URL(url)
-    if (!protocolAllowList.has(parsedUrl.protocol)) {
-      return false
-    }
-
-    const normalizedUrl = normalizeUrl(url)
-    if (!normalizedUrl) {
-      return false
-    }
-
-    return true
-  } catch {
-    return false
-  }
-}
-
-export function isRelativeUrl(url: string): boolean {
-  if (!url || typeof url !== 'string') return false
-
-  return isRelativeUrlImpl(url) && !url.startsWith('//')
-}
-
-export type NormalizeUrlOptions = NormalizeUrlImplOptions & {
-  allowSloppyUris?: boolean
-}
-
-export function normalizeUrl(
-  url?: string,
-  { allowSloppyUris = true, ...options }: NormalizeUrlOptions = {}
-): string | undefined {
-  let normalizedUrl: string | undefined
-
-  if (!url || typeof url !== 'string') {
-    return undefined
-  }
-
-  if (isRelativeUrl(url)) {
-    if (allowSloppyUris && !/^[#./]/.test(url) && url.indexOf('.') > 0) {
-      url = `https://${url}`
-    } else {
-      return undefined
-    }
-  }
-
-  const opts = {
-    stripWWW: false,
-    defaultProtocol: 'https',
-    normalizeProtocol: true,
-    forceHttps: false,
-    stripHash: false,
-    stripTextFragment: true,
-    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
-    removeTrailingSlash: true,
-    removeSingleSlash: true,
-    removeExplicitPort: true,
-    sortQueryParameters: true,
-    ...options
-  } as Required<NormalizeUrlOptions>
-
-  const optionsHash = hashObject(opts)
-  const cacheKey = `${url}-${optionsHash}`
-
-  try {
-    normalizedUrl = normalizedUrlCache.get(cacheKey)
-
-    if (normalizedUrl !== undefined) {
-      if (normalizedUrl) {
-        return normalizedUrl
-      } else {
-        return undefined
-      }
-    }
-
-    normalizedUrl = normalizeUrlImpl(url, opts)
-    if (!normalizeUrl) {
-      normalizedUrl = ''
-    }
-  } catch {
-    // ignore invalid urls
-    normalizedUrl = ''
-  }
-
-  normalizedUrlCache.set(cacheKey, normalizedUrl!)
-  if (normalizedUrl) {
-    return normalizedUrl
-  } else {
-    return undefined
-  }
-}
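Consumers that imported normalizeUrl from @agentic/core lose it with this deletion. A minimal uncached stand-in, assuming a direct dependency on the normalize-url package and dropping the QuickLRU cache and allowSloppyUris fixup of the deleted helper:

import normalizeUrlImpl from 'normalize-url'

export function normalizeUrl(url?: string): string | undefined {
  if (!url || typeof url !== 'string') return undefined

  try {
    // Same defaults as the deleted helper; no LRU cache or sloppy-URI fixup.
    return normalizeUrlImpl(url, {
      stripWWW: false,
      defaultProtocol: 'https',
      normalizeProtocol: true,
      forceHttps: false,
      stripHash: false,
      stripTextFragment: true,
      removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
      removeTrailingSlash: true,
      removeSingleSlash: true,
      removeExplicitPort: true,
      sortQueryParameters: true
    })
  } catch {
    // Invalid URLs normalize to undefined, matching the old behavior.
    return undefined
  }
}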
@@ -1,5 +1,4 @@
 import dedent from 'dedent'
-import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'
 
 import type * as types from './types'
 
@@ -253,13 +252,6 @@ export function cleanStringForModel(text: string): string {
   return dedenter(text).trim()
 }
 
-export function hashObject(
-  object: Record<string, any>,
-  options?: HashObjectOptions
-): string {
-  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
-}
-
 export function isAIFunction(obj: any): obj is types.AIFunction {
   if (!obj) return false
   if (typeof obj !== 'function') return false
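The hashObject helper is removed from @agentic/core here without a replacement. Downstream code can inline an equivalent by depending on hash-object directly; this sketch mirrors the deleted implementation:

import hashObjectImpl, { type Options as HashObjectOptions } from 'hash-object'

// Mirrors the helper removed from @agentic/core in this commit:
// sha256 by default, with caller-supplied options taking precedence.
export function hashObject(
  object: Record<string, any>,
  options?: HashObjectOptions
): string {
  return hashObjectImpl(object, { algorithm: 'sha256', ...options })
}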
@@ -1,29 +0,0 @@
-# @agentic/search-and-crawl
-
-## 7.1.0
-
-### Minor Changes
-
-- 33bcbe0: Update deps
-
-### Patch Changes
-
-- Updated dependencies [33bcbe0]
-  - @agentic/core@7.1.0
-  - @agentic/diffbot@7.1.0
-  - @agentic/serpapi@7.1.0
-
-## 7.0.0
-
-### Major Changes
-
-- cba1cc7: Move to monorepo and multiple packages
-
-  See https://github.com/transitive-bullshit/agentic/issues/654 and https://github.com/transitive-bullshit/agentic/pull/657 for more info.
-
-### Patch Changes
-
-- Updated dependencies [cba1cc7]
-  - @agentic/diffbot@7.0.0
-  - @agentic/serpapi@7.0.0
-  - @agentic/core@7.0.0
@@ -1,48 +0,0 @@
-{
-  "name": "@agentic/search-and-crawl",
-  "version": "7.1.0",
-  "description": "Agentic SDK for Google search and crawling the top results.",
-  "author": "Travis Fischer <travis@transitivebullsh.it>",
-  "license": "MIT",
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/transitive-bullshit/agentic.git"
-  },
-  "type": "module",
-  "source": "./src/index.ts",
-  "types": "./dist/index.d.ts",
-  "sideEffects": false,
-  "exports": {
-    ".": {
-      "types": "./dist/index.d.ts",
-      "import": "./dist/index.js",
-      "default": "./dist/index.js"
-    }
-  },
-  "files": [
-    "dist"
-  ],
-  "scripts": {
-    "build": "tsup --config ../../tsup.config.ts",
-    "dev": "tsup --config ../../tsup.config.ts --watch",
-    "clean": "del dist",
-    "test": "run-s test:*",
-    "test:lint": "eslint .",
-    "test:typecheck": "tsc --noEmit"
-  },
-  "dependencies": {
-    "@agentic/core": "workspace:*",
-    "@agentic/diffbot": "workspace:*",
-    "@agentic/serpapi": "workspace:*",
-    "p-map": "^7.0.2"
-  },
-  "peerDependencies": {
-    "zod": "^3.23.8"
-  },
-  "devDependencies": {
-    "@agentic/tsconfig": "workspace:*"
-  },
-  "publishConfig": {
-    "access": "public"
-  }
-}
@@ -1 +0,0 @@
-export * from './search-and-crawl'
@@ -1,142 +0,0 @@
-import {
-  aiFunction,
-  AIFunctionsProvider,
-  isValidCrawlableUrl,
-  normalizeUrl,
-  omit,
-  pick
-} from '@agentic/core'
-import { type diffbot, DiffbotClient } from '@agentic/diffbot'
-import { SerpAPIClient } from '@agentic/serpapi'
-import pMap from 'p-map'
-import { z } from 'zod'
-
-// TODO: allow `search` tool to support other search clients
-// (e.g. Bing, Exa, Searxng, Serper, Tavily)
-
-export class SearchAndCrawl extends AIFunctionsProvider {
-  readonly serpapi: SerpAPIClient
-  readonly diffbot: DiffbotClient
-
-  constructor(opts: { serpapi?: SerpAPIClient; diffbot?: DiffbotClient } = {}) {
-    super()
-
-    this.serpapi = opts.serpapi ?? new SerpAPIClient()
-    this.diffbot = opts.diffbot ?? new DiffbotClient()
-  }
-
-  @aiFunction({
-    name: 'search_and_crawl',
-    description:
-      'Uses Google to search the web, crawls the results, and then summarizes the most relevant results. Useful for creating in-depth summaries of topics along with sources.',
-    inputSchema: z.object({
-      query: z.string().describe('search query')
-    })
-  })
-  async searchAndCrawl({
-    query,
-    numSearchResults = 3,
-    maxCrawlDepth = 1,
-    maxListItems = 3
-  }: {
-    query: string
-    numSearchResults?: number
-    maxCrawlDepth?: number
-    maxListItems?: number
-  }) {
-    const crawledUrls = new Set<string>()
-
-    const crawlAndScrape = async (
-      url: string | undefined,
-      {
-        depth = 0
-      }: {
-        depth?: number
-      }
-    ): Promise<diffbot.ExtractAnalyzeResponse[]> => {
-      try {
-        if (!url) return []
-        if (!isValidCrawlableUrl(url)) return []
-        if (crawledUrls.has(url)) return []
-
-        const normalizedUrl = normalizeUrl(url)
-        if (!normalizedUrl) return []
-        if (crawledUrls.has(normalizedUrl)) return []
-
-        crawledUrls.add(url)
-        crawledUrls.add(normalizedUrl)
-
-        console.log('\n\n')
-        const scrapeResult = await this.diffbot.analyzeUrl({ url })
-        console.log(
-          `SearchAndCrawl depth ${depth} - "${url}"`,
-          pick(scrapeResult, 'type', 'title')
-        )
-
-        if (scrapeResult.type !== 'list') {
-          return [scrapeResult]
-        }
-
-        if (depth >= maxCrawlDepth) {
-          return [scrapeResult]
-        }
-
-        const object = scrapeResult.objects?.[0]
-        if (!object) return [scrapeResult]
-
-        const items = object.items
-          ?.filter((item) => item.link)
-          .slice(0, maxListItems)
-        if (!items?.length) return [scrapeResult]
-
-        const innerScrapeResults = (
-          await pMap(
-            items,
-            async (item) => {
-              const innerScrapeResult = await crawlAndScrape(item.link, {
-                depth: depth + 1
-              })
-              return innerScrapeResult
-            },
-            {
-              concurrency: 4
-            }
-          )
-        ).flat()
-
-        return innerScrapeResults
-      } catch (err) {
-        console.warn('crawlAndScrape error', url, err)
-        return []
-      }
-    }
-
-    const searchResponse = await this.serpapi.search({
-      q: query,
-      num: numSearchResults
-    })
-
-    console.log(`SearchAndCrawl search results "${query}"`, searchResponse)
-    const scrapeResults = (
-      await pMap(
-        (searchResponse.organic_results || []).slice(0, numSearchResults),
-        async (searchResult) => {
-          return crawlAndScrape(searchResult.link, {
-            depth: 0
-          })
-        },
-        {
-          concurrency: 5
-        }
-      )
-    ).flat()
-
-    const output = {
-      ...omit(searchResponse, 'organic_results'),
-      scrape_results: scrapeResults
-    }
-
-    console.log(`SearchAndCrawl response for query "${query}"`, output)
-    return output
-  }
-}
@@ -1,5 +0,0 @@
-{
-  "extends": "@agentic/tsconfig/base.json",
-  "include": ["src"],
-  "exclude": ["node_modules", "dist"]
-}
@@ -56,7 +56,6 @@
     "@agentic/polygon": "workspace:*",
    "@agentic/predict-leads": "workspace:*",
     "@agentic/proxycurl": "workspace:*",
-    "@agentic/search-and-crawl": "workspace:*",
     "@agentic/searxng": "workspace:*",
     "@agentic/serpapi": "workspace:*",
     "@agentic/serper": "workspace:*",
@@ -17,7 +17,6 @@ export * from '@agentic/perigon'
 export * from '@agentic/polygon'
 export * from '@agentic/predict-leads'
 export * from '@agentic/proxycurl'
-export * from '@agentic/search-and-crawl'
 export * from '@agentic/searxng'
 export * from '@agentic/serpapi'
 export * from '@agentic/serper'
pnpm-lock.yaml: 3875 changes (diff too large to display)