2023-06-13 08:09:26 +00:00
import defaultKy from 'ky'
2023-06-17 00:09:24 +00:00
import { AbortError } from 'p-retry'
2023-06-16 06:49:56 +00:00
import pThrottle from 'p-throttle'
2023-06-16 22:50:38 +00:00
import { P } from 'pino'
2023-06-16 06:49:56 +00:00
import { throttleKy } from '@/utils'
2023-06-13 08:09:26 +00:00
/** Default base URL of the Diffbot Extract API; `DiffbotClient` uses it as the default `apiBaseUrl`. */
export const DIFFBOT_API_BASE_URL = 'https://api.diffbot.com'
/** Default base URL of the Diffbot Knowledge Graph API; `DiffbotClient` uses it as the default `apiKnowledgeGraphBaseUrl`. */
export const DIFFBOT_KNOWLEDGE_GRAPH_API_BASE_URL = 'https://kg.diffbot.com'
/**
 * Options shared by the Diffbot Extract endpoints.
 *
 * `DiffbotClient._extract` sends `customJs` and `customHeaders` as HTTP
 * headers; every other property is sent as a query-string parameter, with
 * array values joined by commas.
 */
export interface DiffbotExtractOptions {
  /**
   * Specify optional fields to be returned from any fully-extracted pages, e.g.: &fields=querystring,links. See available fields within each API's individual documentation pages.
   * @see https://docs.diffbot.com/reference/extract-optional-fields
   */
  fields?: string[]

  /** (*Undocumented*) Pass paging=false to disable automatic concatenation of multiple-page articles. (By default, Diffbot will concatenate up to 20 pages of a single article.) */
  paging?: boolean

  /** Pass discussion=false to disable automatic extraction of comments or reviews from pages identified as articles or products. This will not affect pages identified as discussions. */
  discussion?: boolean

  /** Sets a value in milliseconds to wait for the retrieval/fetch of content from the requested URL. The default timeout for the third-party response is 30 seconds (30000). */
  timeout?: number

  /** Used to specify the IP address of a custom proxy that will be used to fetch the target page, instead of Diffbot's default IPs/proxies. (Ex: &proxy=168.212.226.204) */
  proxy?: string

  /** Used to specify the authentication parameters that will be used with the proxy specified in the &proxy parameter. (Ex: &proxyAuth=username:password) */
  proxyAuth?: string

  /** `none` will instruct Extract to not use proxies, even if proxies have been enabled for this particular URL globally. */
  useProxy?: string

  /** @see https://docs.diffbot.com/reference/extract-custom-javascript */
  customJs?: string

  /** @see https://docs.diffbot.com/reference/extract-custom-headers */
  customHeaders?: Record<string, string>
}
/**
 * Options accepted by `DiffbotClient.extractAnalyze` (endpoint `v3/analyze`),
 * in addition to the shared `DiffbotExtractOptions`.
 */
export interface DiffbotExtractAnalyzeOptions extends DiffbotExtractOptions {
  /** Web page URL to process with the Analyze API. */
  url: string

  /**
   * By default the Analyze API will fully extract all pages that match an
   * existing Automatic API -- articles, products or image pages. Set mode to a
   * specific page-type (e.g., mode=article) to extract content only from that
   * specific page-type. All other pages will simply return the default Analyze
   * fields.
   */
  mode?: string

  /**
   * Force any non-extracted pages (those with a type of "other") through a
   * specific API. For example, to route all "other" pages through the Article
   * API, pass &fallback=article. Pages that utilize this functionality will
   * return a fallbackType field at the top-level of the response and a
   * originalType field within each extracted object, both of which will
   * indicate the fallback API used.
   */
  fallback?: string
}
/**
 * Options accepted by `DiffbotClient.extractArticle` (endpoint `v3/article`),
 * in addition to the shared `DiffbotExtractOptions`.
 */
export interface DiffbotExtractArticleOptions extends DiffbotExtractOptions {
  /** Web page URL of the article to process. */
  url: string

  /**
   * Set the maximum number of automatically-generated tags to return. By
   * default a maximum of ten tags will be returned.
   */
  maxTags?: number

  /**
   * Set the minimum relevance score of tags to return, between 0.0 and 1.0. By
   * default only tags with a score equal to or above 0.5 will be returned.
   */
  tagConfidence?: number

  /**
   * Used to request the output of the Diffbot Natural Language API in the
   * field naturalLanguage. Example:
   * &naturalLanguage=entities,facts,categories,sentiment.
   */
  naturalLanguage?: string[]
}
/**
 * Response body shape shared by the Extract endpoints; this is the default
 * type returned by `DiffbotClient._extract`.
 */
export interface DiffbotExtractResponse {
  /** Request descriptor included in the response. */
  request: DiffbotRequest
  /** Extracted objects. */
  objects: DiffbotObject[]
}
/** Response type of `DiffbotClient.extractArticle`; currently identical to the shared `DiffbotExtractResponse`. */
export type DiffbotExtractArticleResponse = DiffbotExtractResponse
/**
 * Response type of `DiffbotClient.extractAnalyze`: the shared extract response
 * plus three top-level string fields.
 */
export interface DiffbotExtractAnalyzeResponse extends DiffbotExtractResponse {
  title: string
  type: string
  humanLanguage: string
}
/**
 * A single extracted object, as listed in `DiffbotExtractResponse.objects`.
 *
 * Only `categories`, `breadcrumb`, `items` and `meta` are declared optional;
 * NOTE(review): whether the API really populates every other field for every
 * page type is not verifiable from this file -- confirm before relying on it.
 */
export interface DiffbotObject {
  date: string
  sentiment: number
  images: DiffbotImage[]
  author: string
  estimatedDate: string
  publisherRegion: string
  icon: string
  diffbotUri: string
  siteName: string
  type: string
  title: string
  tags: DiffbotTag[]
  publisherCountry: string
  humanLanguage: string
  authorUrl: string
  pageUrl: string
  html: string
  text: string
  categories?: DiffbotCategory[]
  authors: DiffbotAuthor[]
  breadcrumb?: DiffbotBreadcrumb[]
  items?: DiffbotListItem[]
  /** Untyped payload; its shape is not described anywhere in this file. */
  meta?: any
}
2023-06-16 06:49:56 +00:00
/** Element type of `DiffbotObject.items`. */
interface DiffbotListItem {
  title: string
  summary: string
  link: string
  /** Optional; typed as a plain string. */
  image?: string
}
2023-06-13 08:09:26 +00:00
/** Element type of `DiffbotObject.authors`. */
interface DiffbotAuthor {
  link: string
  name: string
}
/** Element type of `DiffbotObject.categories`. */
interface DiffbotCategory {
  id: string
  name: string
  score: number
}
/** Element type of `DiffbotObject.breadcrumb`. */
export interface DiffbotBreadcrumb {
  name: string
  link: string
}
/**
 * Image record used by both `DiffbotObject.images` and
 * `DiffbotKnowledgeGraphEntity.images`.
 */
interface DiffbotImage {
  url: string
  diffbotUri: string

  // Dimensions: "natural" and plain width/height are declared separately.
  naturalWidth: number
  naturalHeight: number
  width: number
  height: number

  // Optional flags.
  primary?: boolean
  isCached?: boolean
}
/** Element type of `DiffbotObject.tags`. */
interface DiffbotTag {
  label: string
  uri: string
  rdfTypes: string[]
  count: number
  score: number
  sentiment: number
}
/** Type of `DiffbotExtractResponse.request`. */
interface DiffbotRequest {
  api: string
  version: number
  pageUrl: string
}
/**
 * Exported image shape. It declares a subset of the fields of the
 * non-exported `DiffbotImage`, with `primary` required here.
 * NOTE(review): nothing in the visible part of this file references it.
 */
export interface Image {
  url: string
  diffbotUri: string
  naturalWidth: number
  naturalHeight: number
  primary: boolean
}
/**
 * Exported tag shape; it declares the same fields as the non-exported
 * `DiffbotTag`.
 * NOTE(review): nothing in the visible part of this file references it.
 */
export interface Tag {
  label: string
  uri: string
  rdfTypes: string[]
  count: number
  score: number
  sentiment: number
}
/**
 * Exported request descriptor; it declares the same fields as the
 * non-exported `DiffbotRequest`.
 * NOTE(review): nothing in the visible part of this file references it.
 */
export interface Request {
  api: string
  version: number
  pageUrl: string
}
2023-06-13 09:06:00 +00:00
/**
 * Query-string options for `DiffbotClient.knowledgeGraphSearch`
 * (endpoint `kg/v3/dql`). Only `query` is required.
 */
export interface DiffbotKnowledgeGraphSearchOptions {
  type?: 'query' | 'text' | 'queryTextFallback' | 'crawl'
  query: string
  col?: string
  from?: number
  size?: number

  // NOTE: we only support `json`, so these options are not needed
  // We can always convert from json to another format if needed.
  // format?: 'json' | 'jsonl' | 'csv' | 'xls' | 'xlsx'
  // exportspec?: string
  // exportseparator?: string
  // exportfile?: string

  filter?: string
  jsonmode?: 'extended' | 'id'
  nonCanonicalFacts?: boolean
  noDedupArticles?: boolean
  cluster?: 'all' | 'best' | 'dedupe'
  report?: boolean
}
2023-06-13 09:06:00 +00:00
/**
 * Query-string options for `DiffbotClient.knowledgeGraphEnhance`
 * (endpoint `kg/v3/enhance`). Only `type` is required.
 */
export interface DiffbotKnowledgeGraphEnhanceOptions {
  /** Entity type to enhance. */
  type: 'Person' | 'Organization'

  // Optional string attributes describing the entity.
  id?: string
  name?: string
  url?: string
  phone?: string
  email?: string
  employer?: string
  title?: string
  school?: string
  location?: string
  ip?: string
  customId?: string

  // Optional numeric settings.
  size?: number
  threshold?: number

  // Optional boolean switches.
  refresh?: boolean
  search?: boolean
  useCache?: boolean

  // Options that share their names and types with the search options.
  filter?: string
  jsonmode?: 'extended' | 'id'
  nonCanonicalFacts?: boolean
}
2023-06-13 09:06:40 +00:00
/**
 * Response body type used for both `DiffbotClient.knowledgeGraphSearch` and
 * `DiffbotClient.knowledgeGraphEnhance`.
 */
export interface DiffbotKnowledgeGraphResponse {
  data: DiffbotKnowledgeGraphNode[]
  version: number
  hits: number
  results: number
  kgversion: string
  diffbot_type: string
  facet?: boolean
  /** Untyped error entries; their shape is not described in this file. */
  errors?: any[]
}
/** Element type of `DiffbotKnowledgeGraphResponse.data`. */
export interface DiffbotKnowledgeGraphNode {
  score: number
  esscore?: number
  entity: DiffbotKnowledgeGraphEntity
  /** Untyped; its shape is not described in this file. */
  entity_ctx: any
  errors: string[]
  callbackQuery: string
  upperBound: number
  lowerBound: number
  count: number
  value: string
  uri: string
}
/**
 * Type of `DiffbotKnowledgeGraphNode.entity`.
 *
 * `id`, `diffbotUri`, `name`, `images` and `origins` are the only required
 * fields; everything else is declared optional.
 */
export interface DiffbotKnowledgeGraphEntity {
  id: string
  diffbotUri: string
  type?: string
  name: string
  images: DiffbotImage[]
  origins: string[]
  nbOrigins?: number

  gender?: DiffbotGender
  githubUri?: string

  importance?: number
  description?: string
  homepageUri?: string
  allNames?: string[]
  skills?: DiffbotSkill[]
  crawlTimestamp?: number
  summary?: string
  image?: string
  types?: string[]
  nbIncomingEdges?: number
  allUris?: string[]
  employments?: DiffbotEmployment[]
  locations?: DiffbotLocation[]
  location?: DiffbotLocation
  allOriginHashes?: string[]
  nameDetail?: DiffbotNameDetail
}
/** Element type of `DiffbotKnowledgeGraphEntity.employments`. */
interface DiffbotEmployment {
  /** Linked entity summary for the employer. */
  employer: Entity
}
/**
 * Compact entity reference used by `DiffbotEmployment.employer` and by
 * `DiffbotLocation.country` / `DiffbotLocation.region`.
 */
interface Entity {
  name: string
  type: EntityType
  types?: string[]
  diffbotUri?: string
  summary?: string
  image?: string
}
/** Allowed values of `Entity.type`. */
type EntityType = 'Organization' | 'Place'
/** Type of `DiffbotKnowledgeGraphEntity.gender`. */
interface DiffbotGender {
  /** NOTE(review): the set of possible values is not visible in this file. */
  normalizedValue: string
}
/**
 * Type of `DiffbotKnowledgeGraphEntity.location` and element type of
 * `DiffbotKnowledgeGraphEntity.locations`.
 */
interface DiffbotLocation {
  // Textual forms.
  address: string
  surfaceForm: string

  // Linked entities.
  country: Entity
  region: Entity

  // Coordinates. NOTE(review): units of `precision` are not visible here.
  latitude: number
  longitude: number
  precision: number

  isCurrent: boolean
}
/** Type of `DiffbotKnowledgeGraphEntity.nameDetail`. */
interface DiffbotNameDetail {
  lastName: string
  firstName: string
}
/** Element type of `DiffbotKnowledgeGraphEntity.skills`. */
interface DiffbotSkill {
  name: string
  diffbotUri: string
}
2023-06-16 06:49:56 +00:00
// Module-level throttle handed to `throttleKy` by every `DiffbotClient`
// instance, so all clients created from this module share one rate limit.
// NOTE(review): per p-throttle's options this reads as "at most 5 calls per
// 1000 ms, strictly enforced" -- confirm against the p-throttle docs and the
// Diffbot plan's actual rate limit.
const throttle = pThrottle ( {
limit : 5 ,
interval : 1000 ,
strict : true
} )
2023-06-13 08:09:26 +00:00
/**
 * Client for the Diffbot Extract and Knowledge Graph HTTP APIs.
 *
 * Requests are issued through `ky` instances wrapped by `throttleKy` with the
 * module-level `throttle`, and the API key is sent as the `token`
 * query-string parameter on every request.
 */
export class DiffbotClient {
  /** Throttled `ky` instance bound to `apiBaseUrl` (Extract API). */
  api: typeof defaultKy
  /** Throttled `ky` instance bound to `apiKnowledgeGraphBaseUrl`. */
  apiKnowledgeGraph: typeof defaultKy

  apiKey: string
  apiBaseUrl: string
  apiKnowledgeGraphBaseUrl: string

  /**
   * @param opts.apiKey - Diffbot API token; defaults to `process.env.DIFFBOT_API_KEY`.
   * @param opts.apiBaseUrl - Base URL of the Extract API.
   * @param opts.apiKnowledgeGraphBaseUrl - Base URL of the Knowledge Graph API.
   * @param opts.timeoutMs - Request timeout handed to `ky` for both instances.
   * @param opts.ky - `ky` instance to wrap; lets callers inject a customized one.
   * @throws Error if no API key is provided or found in the environment.
   */
  constructor({
    apiKey = process.env.DIFFBOT_API_KEY,
    apiBaseUrl = DIFFBOT_API_BASE_URL,
    apiKnowledgeGraphBaseUrl = DIFFBOT_KNOWLEDGE_GRAPH_API_BASE_URL,
    timeoutMs = 30_000,
    ky = defaultKy
  }: {
    apiKey?: string
    apiBaseUrl?: string
    apiKnowledgeGraphBaseUrl?: string
    timeoutMs?: number
    ky?: typeof defaultKy
  } = {}) {
    if (!apiKey) {
      throw new Error(`Error DiffbotClient missing required "apiKey"`)
    }

    this.apiKey = apiKey
    this.apiBaseUrl = apiBaseUrl
    this.apiKnowledgeGraphBaseUrl = apiKnowledgeGraphBaseUrl

    // Both API instances derive from the same throttled ky, so they share
    // the module-level rate limit.
    const throttledKy = throttleKy(ky, throttle)

    this.api = throttledKy.extend({
      prefixUrl: apiBaseUrl,
      timeout: timeoutMs
    })

    this.apiKnowledgeGraph = throttledKy.extend({
      prefixUrl: apiKnowledgeGraphBaseUrl,
      timeout: timeoutMs
    })
  }

  /**
   * Converts an options object into query-string parameters.
   *
   * - `undefined` / `null` values are dropped so they are never serialized
   *   as the literal strings "undefined" / "null".
   * - Array values are joined with commas.
   * - `token` is always set to the client's API key and overrides any
   *   `token` key present in `params`.
   */
  protected _buildSearchParams(params: object): Record<string, any> {
    const searchParams: Record<string, any> = {}

    for (const [key, value] of Object.entries(params)) {
      if (value === undefined || value === null) {
        continue
      }

      searchParams[key] = Array.isArray(value) ? value.join(',') : value
    }

    searchParams.token = this.apiKey
    return searchParams
  }

  /**
   * Shared implementation of the Extract endpoints.
   *
   * `customJs` is sent as the `X-Forward-X-Evaluate` header (only when
   * non-empty) and `customHeaders` are merged on top of it; all remaining
   * options become query-string parameters.
   *
   * @param endpoint - Path relative to the Extract API base URL.
   * @param options - Extract options; endpoint-specific extras such as `url`
   *   are passed through as query parameters.
   * @returns The parsed JSON response body.
   * @throws AbortError if `url` points at `theguardian.com` or one of its
   *   subdomains, which this client treats as unsupported.
   * @throws TypeError if `url` is present but cannot be parsed by `URL`.
   */
  protected async _extract<
    T extends DiffbotExtractResponse = DiffbotExtractResponse
  >(endpoint: string, options: DiffbotExtractOptions): Promise<T> {
    const { customJs, customHeaders, ...rest } = options
    const searchParams = this._buildSearchParams(rest)

    const headers: Record<string, string> = {
      ...(customJs ? { 'X-Forward-X-Evaluate': customJs } : {}),
      ...customHeaders
    }

    // TODO (carried over from the original without further detail):
    // the unsupported-domain list is hard-coded to a single site.
    const { url } = searchParams
    if (url) {
      const { hostname } = new URL(url)

      // Match the domain itself or a subdomain only; a plain substring test
      // would also reject unrelated hosts such as "nottheguardian.com".
      const isUnsupported =
        hostname === 'theguardian.com' || hostname.endsWith('.theguardian.com')

      if (isUnsupported) {
        throw new AbortError(
          `Diffbot does not support URLs from domain "${hostname}"`
        )
      }
    }

    // console.log(`DiffbotClient._extract: ${endpoint}`, searchParams)

    return this.api
      .get(endpoint, {
        searchParams,
        headers,
        retry: 1
      })
      .json<T>()
  }

  /** Calls the `v3/analyze` Extract endpoint. */
  async extractAnalyze(
    options: DiffbotExtractAnalyzeOptions
  ): Promise<DiffbotExtractAnalyzeResponse> {
    return this._extract<DiffbotExtractAnalyzeResponse>('v3/analyze', options)
  }

  /** Calls the `v3/article` Extract endpoint. */
  async extractArticle(
    options: DiffbotExtractArticleOptions
  ): Promise<DiffbotExtractArticleResponse> {
    return this._extract<DiffbotExtractArticleResponse>('v3/article', options)
  }

  /** Calls the Knowledge Graph `kg/v3/dql` endpoint with the given options. */
  async knowledgeGraphSearch(
    options: DiffbotKnowledgeGraphSearchOptions
  ): Promise<DiffbotKnowledgeGraphResponse> {
    return this.apiKnowledgeGraph
      .get('kg/v3/dql', {
        searchParams: this._buildSearchParams(options)
      })
      .json<DiffbotKnowledgeGraphResponse>()
  }

  /** Calls the Knowledge Graph `kg/v3/enhance` endpoint with the given options. */
  async knowledgeGraphEnhance(
    options: DiffbotKnowledgeGraphEnhanceOptions
  ): Promise<DiffbotKnowledgeGraphResponse> {
    return this.apiKnowledgeGraph
      .get('kg/v3/enhance', {
        searchParams: this._buildSearchParams(options)
      })
      .json<DiffbotKnowledgeGraphResponse>()
  }
}