diff --git a/bin/scratch.ts b/bin/scratch.ts index 0d7b58d..5e86a48 100644 --- a/bin/scratch.ts +++ b/bin/scratch.ts @@ -20,7 +20,8 @@ import restoreCursor from 'restore-cursor' // import { BingClient } from '../src/index.js' // import { TavilyClient } from '../src/index.js' // import { SocialDataClient } from '../src/index.js' -import { HunterClient } from '../src/index.js' +// import { HunterClient } from '../src/index.js' +import { JinaClient } from '../src/index.js' /** * Scratch pad for testing. @@ -125,15 +126,28 @@ async function main() { // const res = await socialData.getUserByUsername('transitive_bs') // console.log(JSON.stringify(res, null, 2)) - const hunter = new HunterClient() - // const res = await hunter.emailVerifier({ - // email: 'travis@transitivebullsh.it' + // const hunter = new HunterClient() + // // const res = await hunter.emailVerifier({ + // // email: 'travis@transitivebullsh.it' + // // }) + // const res = await hunter.emailFinder({ + // domain: 'aomni.com', + // first_name: 'David', + // last_name: 'Zhang' // }) - const res = await hunter.emailFinder({ - domain: 'aomni.com', - first_name: 'David', - last_name: 'Zhang' + // console.log(JSON.stringify(res, null, 2)) + + const jina = new JinaClient() + const res = await jina.readUrl({ + url: 'https://news.ycombinator.com' + // returnFormat: 'screenshot' + // json: true }) + // const res = await jina.search({ + // query: 'trump assassination attempt', + // // returnFormat: 'screenshot', + // json: true + // }) console.log(JSON.stringify(res, null, 2)) } diff --git a/readme.md b/readme.md index fb1b5d8..d0e232c 100644 --- a/readme.md +++ b/readme.md @@ -140,6 +140,7 @@ Depending on the AI SDK and tool you want to use, you'll also need to install th | [Firecrawl](https://www.firecrawl.dev) | `FirecrawlClient` | Website scraping and sanitization. | | [HackerNews](https://github.com/HackerNews/API) | `HackerNewsClient` | Official HackerNews API. 
|
 | [Hunter](https://hunter.io) | `HunterClient` | Email finder, verifier, and enrichment. |
+| [Jina](https://jina.ai/reader) | `JinaClient` | Clean URL reader and web search + URL top result reading with a generous free tier. |
 | [Midjourney](https://www.imagineapi.dev) | `MidjourneyClient` | Unofficial Midjourney client for generative images. |
 | [Novu](https://novu.co) | `NovuClient` | Sending notifications (email, SMS, in-app, push, etc). |
 | [People Data Labs](https://www.peopledatalabs.com) | `PeopleDataLabsClient` | People & company data (WIP). |
diff --git a/src/services/index.ts b/src/services/index.ts
index 549b2e8..bbe43f1 100644
--- a/src/services/index.ts
+++ b/src/services/index.ts
@@ -6,6 +6,7 @@ export * from './exa-client.js'
 export * from './firecrawl-client.js'
 export * from './hacker-news-client.js'
 export * from './hunter-client.js'
+export * from './jina-client.js'
 export * from './midjourney-client.js'
 export * from './novu-client.js'
 export * from './people-data-labs-client.js'
diff --git a/src/services/jina-client.ts b/src/services/jina-client.ts
new file mode 100644
index 0000000..e2408c8
--- /dev/null
+++ b/src/services/jina-client.ts
@@ -0,0 +1,260 @@
+import defaultKy, { type KyInstance } from 'ky'
+import pThrottle from 'p-throttle'
+import { z } from 'zod'
+
+import { aiFunction, AIFunctionsProvider } from '../fns.js'
+import { getEnv, pruneNullOrUndefined, throttleKy } from '../utils.js'
+
+/** Zod input schemas and response types used by `JinaClient`. */
+export namespace jina {
+  /** Output formats that can be requested via `returnFormat`. */
+  export const ReaderFormatSchema = z.enum([
+    'text',
+    'html',
+    'markdown',
+    'screenshot'
+  ])
+  export type ReaderFormat = z.infer<typeof ReaderFormatSchema>
+
+  /** Input options accepted by `JinaClient.readUrl`. */
+  export const ReaderOptionsSchema = z.object({
+    url: z.string().describe('URL to read'),
+    timeout: z.number().optional().describe('Optional timeout in seconds'),
+    targetSelector: z
+      .string()
+      .optional()
+      .describe(
+        "Optional CSS selector to focus on a more specific part of the page. Useful when your desired content doesn't show under the default settings."
+      ),
+    waitForSelector: z
+      .string()
+      .optional()
+      .describe(
+        "Optional CSS selector to wait for before returning. Useful when your desired content doesn't show under the default settings."
+      ),
+    withGeneratedAlt: z.boolean().optional(),
+    withLinksSummary: z.boolean().optional(),
+    withImagesSummary: z.boolean().optional(),
+    setCookie: z.string().optional(),
+    proxyUrl: z.string().optional(),
+    noCache: z.boolean().optional(),
+    returnFormat: ReaderFormatSchema.optional(),
+    json: z.boolean().optional()
+  })
+  export type ReaderOptions = z.infer<typeof ReaderOptionsSchema>
+
+  /** Input options accepted by `JinaClient.search`; no `'screenshot'` format. */
+  export const SearchOptionsSchema = z.object({
+    query: z.string().describe('Search query'),
+    site: z
+      .string()
+      .optional()
+      .describe(
+        'Returns the search results only from the specified website or domain. By default it searches the entire web.'
+      ),
+    withGeneratedAlt: z.boolean().optional(),
+    withLinksSummary: z.boolean().optional(),
+    withImagesSummary: z.boolean().optional(),
+    setCookie: z.string().optional(),
+    proxyUrl: z.string().optional(),
+    noCache: z.boolean().optional(),
+    returnFormat: ReaderFormatSchema.exclude(['screenshot']).optional(),
+    json: z.boolean().optional()
+  })
+  export type SearchOptions = z.infer<typeof SearchOptionsSchema>
+
+  /** Envelope shared by the JSON responses; subtypes narrow `data`. */
+  export interface JinaResponse {
+    code: number
+    status: number
+    data: unknown
+  }
+
+  /** JSON result type of `readUrl` for non-screenshot formats. */
+  export interface ReaderResponse extends JinaResponse {
+    data: ReaderData
+  }
+
+  /** JSON result type of `readUrl` when `returnFormat` is `'screenshot'`. */
+  export interface ReaderResponseScreenshot extends JinaResponse {
+    data: {
+      screenshotUrl: string
+    }
+  }
+
+  /** JSON result type of `search`: an array of `ReaderData` entries. */
+  export interface SearchResponse extends JinaResponse {
+    data: ReaderData[]
+  }
+
+  /** Content and metadata fields describing a single page. */
+  export interface ReaderData {
+    url: string
+    title: string
+    content: string
+    description?: string
+    publishedTime?: string
+  }
+}
+
+/**
+ * LLM-friendly URL reader and search client by Jina AI.
+ *
+ * - Includes a very generous free tier.
+ * - Does not support "stream mode".
+ * - Results default to markdown text format.
+ * - To return JSON (especially useful for `search`), set `json: true` in the
+ *   options.
+ *
+ * @see https://jina.ai/reader
+ */
+export class JinaClient extends AIFunctionsProvider {
+  protected readonly kyReader: KyInstance
+  protected readonly kySearch: KyInstance
+  protected readonly apiKey?: string
+
+  constructor({
+    apiKey = getEnv('JINA_API_KEY'),
+    throttle = true,
+    ky = defaultKy
+  }: {
+    apiKey?: string
+    throttle?: boolean
+    ky?: KyInstance
+  } = {}) {
+    super()
+    this.apiKey = apiKey
+
+    if (apiKey) {
+      ky = ky.extend({ headers: { Authorization: `Bearer ${apiKey}` } })
+    }
+
+    // Per-hour client-side limits; each call builds an independent throttle.
+    const withHourlyLimit = (limit: number) =>
+      throttle
+        ? throttleKy(ky, pThrottle({ limit, interval: 60 * 60 * 1000 }))
+        : ky
+
+    this.kyReader = withHourlyLimit(apiKey ? 200 : 20).extend({
+      prefixUrl: 'https://r.jina.ai'
+    })
+    this.kySearch = withHourlyLimit(apiKey ? 40 : 5).extend({
+      prefixUrl: 'https://s.jina.ai'
+    })
+  }
+
+  /**
+   * Fetches `url` through `https://r.jina.ai`. Accepts a URL string or reader
+   * options; resolves to parsed JSON when `json` is set, to an `ArrayBuffer`
+   * for `returnFormat: 'screenshot'`, and to the response text otherwise.
+   */
+  @aiFunction({
+    name: 'readUrl',
+    description:
+      "Reads the contents of the given URL and returns it's main contents in a clean, LLM-friendly format.",
+    inputSchema: jina.ReaderOptionsSchema
+  })
+  async readUrl<T extends string | jina.ReaderOptions>(
+    urlOrOptions: T
+  ): Promise<
+    T extends string
+      ? string
+      : T extends jina.ReaderOptions
+        ? T['json'] extends true
+          ? T['returnFormat'] extends 'screenshot'
+            ? jina.ReaderResponseScreenshot
+            : jina.ReaderResponse
+          : T['returnFormat'] extends 'screenshot'
+            ? ArrayBuffer
+            : string
+        : never
+  > {
+    const { url, ...opts } =
+      typeof urlOrOptions === 'string'
+        ? { url: urlOrOptions }
+        : jina.ReaderOptionsSchema.parse(pruneNullOrUndefined(urlOrOptions))
+    const headers = this._getHeadersFromOptions(opts)
+
+    const res = this.kyReader.get(url, { headers })
+
+    if (opts.json) {
+      return res.json() as any
+    } else if (opts.returnFormat === 'screenshot') {
+      return res.arrayBuffer() as any
+    } else {
+      return res.text() as any
+    }
+  }
+
+  /**
+   * Runs a web search through `https://s.jina.ai`. Accepts a query string or
+   * search options; resolves to parsed JSON when `json` is set, else to text.
+   */
+  @aiFunction({
+    name: 'search',
+    description:
+      'Searches the web for the given query and returns the top-5 results including their page contents in a clean, LLM-friendly format.',
+    inputSchema: jina.SearchOptionsSchema
+  })
+  async search<T extends string | jina.SearchOptions>(
+    queryOrOptions: T
+  ): Promise<
+    T extends string
+      ? string
+      : T extends jina.SearchOptions
+        ? T['json'] extends true
+          ? jina.SearchResponse
+          : string
+        : never
+  > {
+    const { query, ...opts } =
+      typeof queryOrOptions === 'string'
+        ? { query: queryOrOptions }
+        : jina.SearchOptionsSchema.parse(pruneNullOrUndefined(queryOrOptions))
+    const headers = this._getHeadersFromOptions(opts)
+
+    const res = this.kySearch.get(query, { headers })
+
+    if (opts.json) {
+      return res.json() as any
+    } else {
+      return res.text() as any
+    }
+  }
+
+  /** Maps option names to Jina request headers and picks the Accept header. */
+  protected _getHeadersFromOptions(
+    options: Record<string, string | number | boolean>
+  ) {
+    const { json, ...rest } = options
+
+    const headerMap: Record<string, string> = {
+      site: 'site',
+      timeout: 'x-timeout',
+      targetSelector: 'x-target-selector',
+      waitForSelector: 'x-wait-for-selector',
+      withGeneratedAlt: 'x-with-generated-alt',
+      withLinksSummary: 'x-with-links-summary',
+      withImagesSummary: 'x-with-images-summary',
+      setCookie: 'x-set-cookie',
+      proxyUrl: 'x-proxy-url',
+      noCache: 'x-no-cache',
+      returnFormat: 'x-return-format'
+    }
+    // Assumes every key left in `rest` has an entry in `headerMap`.
+    const headers = Object.fromEntries(
+      Object.entries(rest).map(([key, value]) => [
+        headerMap[key as string]!,
+        String(value)
+      ])
+    )
+    // No explicit Accept header is set for non-JSON screenshot requests.
+    if (json) {
+      headers.accept = 'application/json'
+    } else if (options.returnFormat !== 'screenshot') {
+      headers.accept = 'text/plain'
+    }
+
+    return headers
+  }
+}