import defaultKy, { type KyInstance } from 'ky'
import pThrottle from 'p-throttle'
import { z } from 'zod'

import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { assert, getEnv, throttleKy } from '../utils.js'

/**
 * Types, constants, and schemas for the Diffbot Extract and Knowledge Graph
 * APIs used by `DiffbotClient`.
 */
export namespace diffbot {
  export const API_BASE_URL = 'https://api.diffbot.com'
  export const KNOWLEDGE_GRAPH_API_BASE_URL = 'https://kg.diffbot.com'

  // Allow up to 5 requests per second by default.
  // https://docs.diffbot.com/reference/rate-limits
  export const throttle = pThrottle({
    limit: 5,
    interval: 1000,
    strict: true
  })

  /** Options shared by all Extract API endpoints. */
  export interface ExtractOptions {
    /**
     * Specify optional fields to be returned from any fully-extracted pages,
     * e.g.: &fields=querystring,links. See available fields within each API's
     * individual documentation pages.
     *
     * @see https://docs.diffbot.com/reference/extract-optional-fields
     */
    fields?: string[]

    /**
     * (*Undocumented*) Pass paging=false to disable automatic concatenation of
     * multiple-page articles. (By default, Diffbot will concatenate up to 20
     * pages of a single article.)
     */
    paging?: boolean

    /**
     * Pass discussion=false to disable automatic extraction of comments or
     * reviews from pages identified as articles or products. This will not
     * affect pages identified as discussions.
     */
    discussion?: boolean

    /**
     * Sets a value in milliseconds to wait for the retrieval/fetch of content
     * from the requested URL. The default timeout for the third-party response
     * is 30 seconds (30000).
     */
    timeout?: number

    /**
     * Used to specify the IP address of a custom proxy that will be used to
     * fetch the target page, instead of Diffbot's default IPs/proxies.
     * (Ex: &proxy=168.212.226.204)
     */
    proxy?: string

    /**
     * Used to specify the authentication parameters that will be used with the
     * proxy specified in the &proxy parameter.
     * (Ex: &proxyAuth=username:password)
     */
    proxyAuth?: string

    /**
     * `none` will instruct Extract to not use proxies, even if proxies have
     * been enabled for this particular URL globally.
     */
    useProxy?: string

    /** @see https://docs.diffbot.com/reference/extract-custom-javascript */
    customJs?: string

    /** @see https://docs.diffbot.com/reference/extract-custom-headers */
    customHeaders?: Record<string, string>
  }

  /** Options for the Analyze endpoint (`v3/analyze`). */
  export interface ExtractAnalyzeOptions extends ExtractOptions {
    /** URL of the web page to process */
    url: string

    /**
     * By default the Analyze API will fully extract all pages that match an
     * existing Automatic API -- articles, products or image pages. Set mode to
     * a specific page-type (e.g., mode=article) to extract content only from
     * that specific page-type. All other pages will simply return the default
     * Analyze fields.
     */
    mode?: string

    /**
     * Force any non-extracted pages (those with a type of "other") through a
     * specific API. For example, to route all "other" pages through the
     * Article API, pass &fallback=article. Pages that utilize this
     * functionality will return a fallbackType field at the top-level of the
     * response and a originalType field within each extracted object, both of
     * which will indicate the fallback API used.
     */
    fallback?: string
  }

  /** Options for the Article endpoint (`v3/article`). */
  export interface ExtractArticleOptions extends ExtractOptions {
    /** URL of the web page to process */
    url: string

    /**
     * Set the maximum number of automatically-generated tags to return. By
     * default a maximum of ten tags will be returned.
     */
    maxTags?: number

    /**
     * Set the minimum relevance score of tags to return, between 0.0 and 1.0.
     * By default only tags with a score equal to or above 0.5 will be
     * returned.
     */
    tagConfidence?: number

    /**
     * Used to request the output of the Diffbot Natural Language API in the
     * field naturalLanguage.
     * Example: &naturalLanguage=entities,facts,categories,sentiment.
     */
    naturalLanguage?: string[]
  }

  /** Common envelope returned by the Extract endpoints. */
  export interface ExtractResponse {
    request: DiffbotRequest
    objects: DiffbotObject[]
  }

  export type ExtractArticleResponse = ExtractResponse

  export interface ExtractAnalyzeResponse extends ExtractResponse {
    type: string
    title: string
    humanLanguage: string
  }

  /** A single extracted object inside an `ExtractResponse`. */
  export interface DiffbotObject {
    date: string
    sentiment: number
    images: Image[]
    author: string
    estimatedDate: string
    publisherRegion: string
    icon: string
    diffbotUri: string
    siteName: string
    type: string
    title: string
    tags: Tag[]
    publisherCountry: string
    humanLanguage: string
    authorUrl: string
    pageUrl: string
    html: string
    text: string
    categories?: ObjectCategory[]
    authors: Author[]
    breadcrumb?: Breadcrumb[]
    items?: ListItem[]
    meta?: any
  }

  export interface ListItem {
    title: string
    link: string
    summary: string
    image?: string
  }

  export interface Author {
    name: string
    link: string
  }

  export interface ObjectCategory {
    score: number
    name: string
    id: string
  }

  export interface Breadcrumb {
    link: string
    name: string
  }

  /**
   * Image metadata. Referenced by both extracted objects and knowledge-graph
   * entities.
   *
   * NOTE(review): this interface used to be declared twice in this namespace;
   * the second declaration (`url`, `primary?`) was a strict subset of this
   * one, so the merged type is unchanged by keeping only this declaration.
   */
  export interface Image {
    url: string
    diffbotUri: string

    naturalWidth: number
    naturalHeight: number
    width: number
    height: number

    isCached?: boolean
    primary?: boolean
  }

  export interface Tag {
    score: number
    sentiment: number
    count: number
    label: string
    uri: string
    rdfTypes: string[]
  }

  export interface DiffbotRequest {
    pageUrl: string
    api: string
    version: number
  }

  /** Query options for the Knowledge Graph DQL endpoint (`kg/v3/dql`). */
  export interface KnowledgeGraphSearchOptions {
    type?: 'query' | 'text' | 'queryTextFallback' | 'crawl'
    query: string
    col?: string
    from?: number
    size?: number

    // NOTE: we only support `json`, so these options are not needed
    // We can always convert from json to another format if needed.
    // format?: 'json' | 'jsonl' | 'csv' | 'xls' | 'xlsx'
    // exportspec?: string
    // exportseparator?: string
    // exportfile?: string

    filter?: string
    jsonmode?: 'extended' | 'id'
    nonCanonicalFacts?: boolean
    noDedupArticles?: boolean
    cluster?: 'all' | 'best' | 'dedupe'
    report?: boolean
  }

  /** Query options for the Knowledge Graph Enhance endpoint (`kg/v3/enhance`). */
  export interface KnowledgeGraphEnhanceOptions {
    type: EntityType

    id?: string
    name?: string
    url?: string
    phone?: string
    email?: string
    employer?: string
    title?: string
    school?: string
    location?: string
    ip?: string
    customId?: string

    size?: number
    threshold?: number

    refresh?: boolean
    search?: boolean
    useCache?: boolean

    filter?: string
    jsonmode?: 'extended' | 'id'
    nonCanonicalFacts?: boolean
  }

  export interface KnowledgeGraphResponse {
    data: KnowledgeGraphNode[]
    version: number
    hits: number
    results: number
    kgversion: string
    diffbot_type: string
    facet?: boolean
    errors?: any[]
  }

  export interface KnowledgeGraphNode {
    score: number
    esscore?: number
    entity: KnowledgeGraphEntity
    entity_ctx: any
    errors: string[]
    callbackQuery: string
    upperBound: number
    lowerBound: number
    count: number
    value: string
    uri: string
  }

  export interface KnowledgeGraphEntity {
    id: string
    diffbotUri: string
    type?: string
    name: string
    images: Image[]
    origins: string[]
    nbOrigins?: number

    gender?: Gender
    githubUri?: string
    importance?: number
    description?: string
    homepageUri?: string
    allNames?: string[]
    skills?: Skill[]
    crawlTimestamp?: number
    summary?: string
    image?: string
    types?: string[]
    nbIncomingEdges?: number
    allUris?: string[]
    employments?: Employment[]
    locations?: Location[]
    location?: Location
    allOriginHashes?: string[]
    nameDetail?: NameDetail
  }

  /**
   * Entity types understood by this client.
   *
   * Includes `'Person'` so that it agrees with `EnhanceEntityOptionsSchema`,
   * whose `type` enum is `['Person', 'Organization']`.
   */
  export type EntityType = 'Person' | 'Organization' | 'Place'

  /** Zod schema describing the input accepted by `DiffbotClient.enhanceEntity`. */
  export const EnhanceEntityOptionsSchema = z.object({
    type: z.enum(['Person', 'Organization']),

    id: z
      .string()
      .optional()
      .describe('Diffbot ID of the entity to enhance if known'),

    name: z
      .union([z.string(), z.array(z.string())])
      .optional()
      .describe('Name of the entity'),

    url: z
      .array(z.string())
      .optional()
      .describe('Origin or homepage URL of the entity'),

    phone: z.string().optional().describe('Phone number of the entity'),

    email: z.string().optional().describe('Email of the entity'),

    employer: z
      .string()
      .optional()
      .describe("Name of the entity's employer (for Person entities)"),

    title: z
      .string()
      .optional()
      .describe('Title of the entity (for Person entities)'),

    school: z
      .string()
      .optional()
      .describe('School of the entity (for Person entities)'),

    location: z.string().optional().describe('Location of the entity'),

    ip: z.string().optional().describe('IP address of the entity'),

    customId: z.string().optional().describe('User-defined ID for correlation'),

    threshold: z.number().optional().describe('Similarity threshold'),

    refresh: z
      .boolean()
      .optional()
      .describe(
        'If set, will attempt to refresh the entity data by recrawling the source URLs.'
      ),

    search: z
      .boolean()
      .optional()
      .describe(
        'If set, will attempt to search the web for the entity and merge the results into its knowledge base.'
      ),

    size: z
      .number()
      .int()
      .positive()
      .max(100)
      .optional()
      .describe('Number of results to return')
  })
  export type EnhanceEntityOptions = z.infer<typeof EnhanceEntityOptionsSchema>

  export interface EnhanceEntityResponse {
    version: number
    hits: number
    kgversion: string
    request_ctx: RequestCtx
    data: EnhanceEntityResponseDatum[]
    errors: any[]
  }

  export interface RequestCtx {
    query: Query
    query_ctx: QueryCtx
  }

  export interface Query {
    type: string
    name: string[]
  }

  export interface QueryCtx {
    search: string
  }

  export interface EnhanceEntityResponseDatum {
    score: number
    esscore: number
    entity: Entity
    errors: any[]
  }

  export interface Entity {
    name: string
    type: EntityType
    id: string
    summary?: string
    description?: string
    homepageUri?: string
    twitterUri?: string
    linkedInUri?: string
    githubUri?: string
    crunchbaseUri?: string
    googlePlusUri?: string
    diffbotUri?: string
    educations?: Education[]
    nationalities?: Nationality[]
    allNames?: string[]
    skills?: Skill[]
    children?: Children[]
    nbOrigins?: number
    height?: number
    image?: string
    images?: Image[]
    nbIncomingEdges?: number
    nbFollowers?: number
    allOriginHashes?: string[]
    nameDetail?: NameDetail
    parents?: Parent[]
    gender?: Gender
    importance?: number
    origin?: string
    wikipediaUri: string
    wikipediaPageviewsLastQuarterGrowth?: number
    wikipediaPageviewsLastYear?: number
    wikipediaPageviewsLastYearGrowth?: number
    wikipediaPageviews?: number
    wikipediaPageviewsLastQuarter?: number
    wikipediaPageviewsGrowth?: number
    birthPlace?: BirthPlace
    origins: string[]
    crawlTimestamp: number
    types?: string[]
    unions?: Union[]
    languages?: Language[]
    allUris?: string[]
    employments?: Employment[]
    birthDate?: DateTime
    religion?: Religion
    awards?: Award[]
    netWorth?: NetWorth
    allDescriptions?: string[]
    locations?: Location[]
    location?: Location
    interests?: Interest[]
    age?: number
  }

  export interface Education {
    institution: Institution
    isCurrent?: boolean
    major?: Major
    degree?: Degree
    from?: DateTime
    to?: DateTime
  }

  export interface Institution {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Major {}

  export interface Degree {
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface DateTime {
    str: string
    precision: number
    timestamp: number
  }

  export interface Nationality {
    name: string
    type: string
  }

  export interface Skill {
    name: string
    diffbotUri?: string
    targetDiffbotId?: string
  }

  export interface Children {
    summary: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface NameDetail {
    firstName: string
    lastName: string
    middleName?: string[]
  }

  export interface Parent {
    summary: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
    image?: string
  }

  export interface Gender {
    normalizedValue: string
  }

  export interface BirthPlace {
    country: Country
    isCurrent: boolean
    address: string
    city: City
    subregion: Subregion
    latitude: number
    precision: number
    surfaceForm: string
    region: Region
    longitude: number
  }

  export interface Country {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface City {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Subregion {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Region {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Union {
    person: Person
    from?: DateTime
    to?: DateTime
    type?: string
  }

  export interface Person {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Language {
    str: string
    normalizedValue: string
  }

  export interface Employment {
    isCurrent?: boolean
    employer?: Employer
    from?: DateTime
    categories?: EmploymentCategory[]
    title?: string
    to?: DateTime
    location?: Location
  }

  export interface Employer {
    summary?: string
    image?: string
    types?: string[]
    name: string
    diffbotUri?: string
    targetDiffbotId?: string
    type: string
  }

  export interface EmploymentCategory {
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Location {
    country?: Country
    isCurrent: boolean
    address: string
    city: City
    street: string
    metroArea: MetroArea
    subregion: Subregion
    latitude: number
    precision: number
    postalCode: string
    region?: Region
    longitude: number
  }

  export interface MetroArea {
    summary: string
    image: string
    types: string[]
    name: string
    diffbotUri: string
    targetDiffbotId: string
    type: string
  }

  export interface Religion {
    str: string
  }

  export interface Award {
    title: string
    date?: DateTime
  }

  export interface NetWorth {
    currency: string
    value: number
  }

  export interface Interest {
    name: string
    type: string
  }
}

/**
 * Client for the Diffbot Extract and Knowledge Graph HTTP APIs.
 *
 * Holds two `ky` instances: one prefixed with the Extract base URL and one
 * prefixed with the Knowledge Graph base URL. Both share the same timeout and
 * (unless disabled) the same request throttle.
 *
 * Response bodies are typed but not validated at runtime.
 */
export class DiffbotClient extends AIFunctionsProvider {
  readonly ky: KyInstance
  readonly kyKnowledgeGraph: KyInstance

  readonly apiKey: string
  readonly apiBaseUrl: string
  readonly apiKnowledgeGraphBaseUrl: string

  /**
   * @param opts.apiKey Diffbot API token; defaults to the `DIFFBOT_API_KEY` env var.
   * @param opts.apiBaseUrl Base URL for the Extract API.
   * @param opts.apiKnowledgeGraphBaseUrl Base URL for the Knowledge Graph API.
   * @param opts.timeoutMs Request timeout passed to both `ky` instances.
   * @param opts.throttle When true, requests go through `diffbot.throttle`.
   * @param opts.ky Base `ky` instance to extend.
   * @throws If no API key is available (via `assert`).
   */
  constructor({
    apiKey = getEnv('DIFFBOT_API_KEY'),
    apiBaseUrl = diffbot.API_BASE_URL,
    apiKnowledgeGraphBaseUrl = diffbot.KNOWLEDGE_GRAPH_API_BASE_URL,
    timeoutMs = 30_000,
    throttle = true,
    ky = defaultKy
  }: {
    apiKey?: string
    apiBaseUrl?: string
    apiKnowledgeGraphBaseUrl?: string
    timeoutMs?: number
    throttle?: boolean
    ky?: KyInstance
  } = {}) {
    assert(
      apiKey,
      `DiffbotClient missing required "apiKey" (defaults to "DIFFBOT_API_KEY")`
    )
    super()

    this.apiKey = apiKey
    this.apiBaseUrl = apiBaseUrl
    this.apiKnowledgeGraphBaseUrl = apiKnowledgeGraphBaseUrl

    // Both API instances derive from the same (optionally throttled) base.
    const throttledKy = throttle ? throttleKy(ky, diffbot.throttle) : ky

    this.ky = throttledKy.extend({
      prefixUrl: apiBaseUrl,
      timeout: timeoutMs
    })

    this.kyKnowledgeGraph = throttledKy.extend({
      prefixUrl: apiKnowledgeGraphBaseUrl,
      timeout: timeoutMs
    })
  }

  /**
   * Calls the Analyze endpoint (`v3/analyze`) for the given URL.
   *
   * @param options Analyze options; `url` is required.
   * @returns The parsed JSON response.
   */
  @aiFunction({
    name: 'diffbot_analyze_url',
    description:
      'Scrapes and extracts structured data from a web page. Also classifies the web page as one of several types (article, product, discussion, job, image, video, list, event, or other).',
    inputSchema: z.object({
      url: z.string().url().describe('The URL to process.')
    })
  })
  async analyzeUrl(
    options: diffbot.ExtractAnalyzeOptions
  ): Promise<diffbot.ExtractAnalyzeResponse> {
    return this._extract<diffbot.ExtractAnalyzeResponse>('v3/analyze', options)
  }

  /**
   * Calls the Article endpoint (`v3/article`) for the given URL.
   *
   * @param options Article options; `url` is required.
   * @returns The parsed JSON response.
   */
  @aiFunction({
    name: 'diffbot_extract_article_from_url',
    description:
      'Scrapes and extracts clean article text from news articles, blog posts, and other text-heavy web pages.',
    inputSchema: z.object({
      url: z.string().url().describe('The URL to process.')
    })
  })
  async extractArticleFromUrl(
    options: diffbot.ExtractArticleOptions
  ): Promise<diffbot.ExtractArticleResponse> {
    return this._extract<diffbot.ExtractArticleResponse>('v3/article', options)
  }

  /**
   * Calls the Knowledge Graph Enhance endpoint (`kg/v3/enhance`).
   *
   * `name` (string or string[]) and `url` (string[]) are sent as repeated
   * query parameters. Every other option is stringified and sent once;
   * options whose value is `undefined` or `null` are omitted rather than
   * being sent as the literal text "undefined"/"null".
   *
   * @param opts Enhance options matching `diffbot.EnhanceEntityOptionsSchema`.
   * @returns The parsed JSON response.
   */
  @aiFunction({
    name: 'diffbot_enhance_entity',
    description:
      'Enriches a person or organization entity given partial data. Enhance is an enrichment API to find a person or organization using partial data as input. Enhance scores several candidates against the submitted query and returns the best match. More information in the query helps Enhance models estimate with more confidence and will typically result in better matches and a higher score for the matches.',
    inputSchema: diffbot.EnhanceEntityOptionsSchema.omit({
      refresh: true,
      search: true,
      customId: true,
      threshold: true
    })
  })
  async enhanceEntity(
    opts: diffbot.EnhanceEntityOptions
  ): Promise<diffbot.EnhanceEntityResponse> {
    const { name, url, ...params } = opts

    // A falsy name (undefined or '') contributes no `name` parameters.
    const names = name ? (Array.isArray(name) ? name : [name]) : []

    const searchParams = new URLSearchParams()
    for (const value of names) {
      searchParams.append('name', value)
    }
    for (const value of url ?? []) {
      searchParams.append('url', value)
    }
    for (const [key, value] of Object.entries(params)) {
      // Skip absent options so they are not serialized as "undefined".
      if (value === undefined || value === null) continue
      searchParams.append(key, String(value))
    }
    searchParams.append('token', this.apiKey)

    return this.kyKnowledgeGraph
      .get('kg/v3/enhance', { searchParams })
      .json<diffbot.EnhanceEntityResponse>()
  }

  /**
   * Calls the Knowledge Graph DQL endpoint (`kg/v3/dql`) with the given
   * options plus the API token as query parameters.
   */
  async searchKnowledgeGraph(
    options: diffbot.KnowledgeGraphSearchOptions
  ): Promise<diffbot.KnowledgeGraphResponse> {
    return this.kyKnowledgeGraph
      .get('kg/v3/dql', {
        searchParams: {
          ...options,
          token: this.apiKey
        }
      })
      .json<diffbot.KnowledgeGraphResponse>()
  }

  /**
   * Calls the Knowledge Graph Enhance endpoint (`kg/v3/enhance`) with the
   * given options plus the API token as query parameters.
   */
  async enhanceKnowledgeGraph(
    options: diffbot.KnowledgeGraphEnhanceOptions
  ): Promise<diffbot.KnowledgeGraphResponse> {
    return this.kyKnowledgeGraph
      .get('kg/v3/enhance', {
        searchParams: {
          ...options,
          token: this.apiKey
        }
      })
      .json<diffbot.KnowledgeGraphResponse>()
  }

  /**
   * Shared GET helper for the Extract endpoints.
   *
   * `customJs` is sent as the `X-Forward-X-Evaluate` header and
   * `customHeaders` are sent as-is; entries in `customHeaders` take
   * precedence over the `customJs` header. All remaining options become
   * query parameters, with array values joined by commas.
   *
   * @param endpoint Path relative to the Extract base URL.
   * @param options Extract options for the request.
   * @returns The parsed JSON response, typed as `T`.
   */
  protected async _extract<
    T extends diffbot.ExtractResponse = diffbot.ExtractResponse
  >(endpoint: string, options: diffbot.ExtractOptions): Promise<T> {
    const { customJs, customHeaders, ...rest } = options

    const searchParams: Record<string, string | number | boolean> = {}
    for (const [key, value] of Object.entries(rest)) {
      if (value === undefined || value === null) continue
      searchParams[key] = Array.isArray(value) ? value.join(',') : value
    }
    // Assigned last so the client's token always wins.
    searchParams.token = this.apiKey

    const headers: Record<string, string> = {
      ...(customJs ? { 'X-Forward-X-Evaluate': customJs } : {}),
      ...customHeaders
    }

    // console.log(`DiffbotClient._extract: ${endpoint}`, searchParams)

    return this.ky
      .get(endpoint, {
        searchParams,
        headers,
        retry: 1
      })
      .json<T>()
  }
}