From f8cd52221d081ff5a6462b94c1461c7b499d5e15 Mon Sep 17 00:00:00 2001 From: Travis Fischer Date: Wed, 19 Jun 2024 02:10:11 -0700 Subject: [PATCH] feat: improvements to clearbit, diffbot, proxycurl, scraper clients --- src/services/clearbit-client.ts | 117 ++++++++++++++++++------------- src/services/diffbot-client.ts | 7 ++ src/services/proxycurl-client.ts | 30 ++++++-- src/services/scraper-client.ts | 25 ++++++- src/utils.ts | 23 ++++++ 5 files changed, 143 insertions(+), 59 deletions(-) diff --git a/src/services/clearbit-client.ts b/src/services/clearbit-client.ts index ee32adb..5631cca 100644 --- a/src/services/clearbit-client.ts +++ b/src/services/clearbit-client.ts @@ -1,11 +1,12 @@ import defaultKy from 'ky' import pThrottle from 'p-throttle' -import type { DeepNullable, KyInstance } from '../types.js' +import type { KyInstance } from '../types.js' import { assert, delay, getEnv, + pruneNullOrUndefinedDeep, sanitizeSearchParams, throttleKy } from '../utils.js' @@ -37,7 +38,7 @@ export namespace clearbit { phoneNumbers: string[] emailAddresses: string[] } - category: { + category: Partial<{ sector: string industryGroup: string industry: string @@ -48,14 +49,14 @@ export namespace clearbit { naicsCode: string naics6Codes: string[] naics6Codes2022: string[] - } + }> tags: string[] description: string foundedYear: number location: string timeZone: string utcOffset: number - geo: { + geo: Partial<{ streetNumber: string streetName: string subPremise: string @@ -68,16 +69,16 @@ export namespace clearbit { countryCode: string lat: number lng: number - } + }> logo: string - facebook: { + facebook: Partial<{ handle: string likes: number - } + }> linkedin: { handle: string } - twitter: { + twitter: Partial<{ handle: string id: string bio: string @@ -86,30 +87,30 @@ export namespace clearbit { location: string site: string avatar: string - } + }> crunchbase: { handle: string } emailProvider: boolean type: string ticker: string - identifiers: { + identifiers: Partial<{ 
usEIN: string usCIK: string - } + }> phone: string - metrics: { + metrics: Partial<{ alexaUsRank: number alexaGlobalRank: number trafficRank: string employees: number employeesRange: string - marketCap: string + marketCap: number raised: number annualRevenue: string estimatedAnnualRevenue: string - fiscalYearEnd: string - } + fiscalYearEnd: number + }> indexedAt: string tech: string[] techCategories: string[] @@ -121,18 +122,18 @@ export namespace clearbit { } } - export type EmailLookupResponse = DeepNullable<{ + export type EmailLookupResponse = Partial<{ id: string - name: { + name: Partial<{ fullName: string givenName: string familyName: string - } + }> email: string location: string timeZone: string utcOffset: number - geo: { + geo: Partial<{ city: string state: string stateCode: string @@ -140,22 +141,22 @@ export namespace clearbit { countryCode: string lat: number lng: number - } + }> bio: string site: string avatar: string - employment: { + employment: Partial<{ domain: string name: string title: string role: string subRole: string seniority: string - } + }> facebook: { handle: string } - github: { + github: Partial<{ handle: string id: string avatar: string @@ -163,8 +164,8 @@ export namespace clearbit { blog: string followers: number following: number - } - twitter: { + }> + twitter: Partial<{ handle: string id: string bio: string @@ -175,14 +176,14 @@ export namespace clearbit { location: string site: string avatar: string - } + }> linkedin: { handle: string } googleplus: { handle: null } - gravatar: { + gravatar: Partial<{ handle: string urls: { value: string @@ -193,7 +194,7 @@ export namespace clearbit { url: string type: string }[] - } + }> fuzzy: boolean emailProvider: boolean indexedAt: string @@ -204,7 +205,7 @@ export namespace clearbit { export type CompanyResponse = { id: string - } & DeepNullable + } & Partial export interface CompanySearchOptions { /** @@ -260,17 +261,17 @@ export namespace clearbit { } export interface EmploymentAttributes 
{ - company: string - domain: string - linkedin: string - title: string - role: string - subRole: string - seniority: string - startDate: string - endDate: string - present: boolean - highlight: boolean + company?: string + domain?: string + linkedin?: string + title?: string + role?: string + subRole?: string + seniority?: string + startDate?: string + endDate?: string + present?: boolean + highlight?: boolean } export interface EmailAttributes { @@ -291,7 +292,7 @@ export namespace clearbit { export type PersonAttributesV2 = { id: string - } & DeepNullable<{ + } & Partial<{ name: Name avatar: string location: string @@ -554,31 +555,37 @@ export class ClearbitClient { } async companyEnrichment(options: clearbit.CompanyEnrichmentOptions) { - return this.ky + const res = await this.ky .get('https://company-stream.clearbit.com/v2/companies/find', { searchParams: sanitizeSearchParams(options) }) .json() + + return pruneNullOrUndefinedDeep(res) } async companySearch(options: clearbit.CompanySearchOptions) { - return this.ky + const res = await this.ky .get('https://discovery.clearbit.com/v1/companies/search', { searchParams: sanitizeSearchParams(options) }) .json() + + return pruneNullOrUndefinedDeep(res) } async companyAutocomplete(name: string) { - return this.ky + const res = await this.ky .get('https://autocomplete.clearbit.com/v1/companies/suggest', { searchParams: { query: name } }) .json() + + return pruneNullOrUndefinedDeep(res) } async prospectorPeopleV2(options: clearbit.PeopleSearchOptionsV2) { - return this.ky + const res = await this.ky .get('https://prospector.clearbit.com/v2/people/search', { searchParams: sanitizeSearchParams({ ...options, @@ -589,10 +596,12 @@ export class ClearbitClient { }) }) .json() + + return pruneNullOrUndefinedDeep(res) } async prospectorPeopleV1(options: clearbit.PeopleSearchOptionsV1) { - return this.ky + const res = await this.ky .get('https://prospector.clearbit.com/v1/people/search', { searchParams: sanitizeSearchParams({ 
email: false, @@ -604,6 +613,8 @@ export class ClearbitClient { }) }) .json() + + return pruneNullOrUndefinedDeep(res) } // TODO Status code = 202 means the response was queued. @@ -622,7 +633,8 @@ export class ClearbitClient { }) if (response.status !== 202 || !maxRetries) { - return response.json() + const res = await response.json() + return pruneNullOrUndefinedDeep(res) } if (maxRetries && response.status === 202) { @@ -637,7 +649,8 @@ export class ClearbitClient { count++ running = response.status === 202 } - return response.json() + const res = await response.json() + return pruneNullOrUndefinedDeep(res) } throw new Error('clearbit email lookup error 202', { cause: response }) @@ -653,17 +666,21 @@ export class ClearbitClient { } async revealCompanyFromIP(ip: string) { - return this.ky + const res = await this.ky .get('https://reveal.clearbit.com/v1/companies/find', { searchParams: { ip } }) .json() .catch((_) => undefined) + + if (res) { + return pruneNullOrUndefinedDeep(res) + } } static filterEmploymentProspectorV2( companyName: string, - employments: Array | null> | null + employments?: Array> ) { if (employments && employments.length > 0) { // We filter by employment endDate because some people could have multiple diff --git a/src/services/diffbot-client.ts b/src/services/diffbot-client.ts index 33dba1c..e21d2ca 100644 --- a/src/services/diffbot-client.ts +++ b/src/services/diffbot-client.ts @@ -384,12 +384,14 @@ export namespace diffbot { nbIncomingEdges?: number nbFollowers?: number nbLocations?: number + nbEmployees?: number nbEmployeesMin?: number nbEmployeesMax?: number nbActiveEmployeeEdges?: number nbUniqueInvestors?: number educations?: Education[] nationalities?: Nationality[] + fullName?: string allNames?: string[] skills?: Partial[] children?: BasicEntity[] @@ -401,6 +403,8 @@ export namespace diffbot { parents?: BasicEntity[] gender?: Gender importance?: number + monthlyTraffic?: number + monthlyTrafficGrowth?: number wikipediaPageviews?: 
number wikipediaPageviewsLastQuarterGrowth?: number wikipediaPageviewsLastYear?: number @@ -459,6 +463,9 @@ export namespace diffbot { stock?: Stock companiesHouseIds?: string[] yearlyRevenues?: AnnualRevenue[] + revenue?: Amount + parentCompany?: BasicEntity + legalEntities?: BasicEntity[] } export interface AnnualRevenue { diff --git a/src/services/proxycurl-client.ts b/src/services/proxycurl-client.ts index a1aea3e..cd8d016 100644 --- a/src/services/proxycurl-client.ts +++ b/src/services/proxycurl-client.ts @@ -1935,6 +1935,7 @@ export namespace proxycurl { export type SearchResult = z.infer export const ResultProfileSchema = z.object({ + linkedin_url: z.string().optional(), acquisitions: PurpleAcquisitionSchema.optional(), affiliated_companies: z.array(PurpleAffiliatedCompanySchema).optional(), background_cover_image_url: z.string().optional(), @@ -1963,7 +1964,12 @@ export namespace proxycurl { updates: z.array(PurpleCompanyUpdateSchema).optional(), website: z.string().optional() }) - export type ResultProfile = z.infer + export type CompanyProfile = z.infer + export type ResolvedCompanyProfile = { + url: string + last_updated: string + profile: CompanyProfile + } export const CompanyUrlEnrichResultProfileSchema = z.object({ acquisitions: FluffyAcquisitionSchema.optional(), @@ -2087,8 +2093,8 @@ export class ProxycurlClient extends AIFunctionsProvider { }) async getLinkedInCompany( opts: proxycurl.CompanyProfileEndpointParamsQueryClass - ) { - return this.ky + ): Promise { + const res = await this.ky .get('api/linkedin/company', { searchParams: sanitizeSearchParams({ funding_data: 'include', @@ -2097,7 +2103,12 @@ export class ProxycurlClient extends AIFunctionsProvider { ...opts }) }) - .json() + .json() + + return { + linkedin_url: opts.url, + ...res + } } @aiFunction({ @@ -2181,15 +2192,20 @@ export class ProxycurlClient extends AIFunctionsProvider { }) async resolveLinkedInCompany( opts: proxycurl.CompanyLookupEndpointParamsQueryClass - ) { - return 
this.ky + ): Promise { + const res = await this.ky .get('api/linkedin/company/resolve', { searchParams: sanitizeSearchParams({ enrich_profile: 'enrich', ...opts }) }) - .json() + .json() + + return { + linkedin_url: res.url, + ...res.profile + } } @aiFunction({ diff --git a/src/services/scraper-client.ts b/src/services/scraper-client.ts index 8f5eb07..2f7e6e5 100644 --- a/src/services/scraper-client.ts +++ b/src/services/scraper-client.ts @@ -1,10 +1,18 @@ import defaultKy, { type KyInstance } from 'ky' +import pThrottle from 'p-throttle' import { z } from 'zod' import { aiFunction, AIFunctionsProvider } from '../fns.js' -import { assert, getEnv, omit } from '../utils.js' +import { assert, getEnv, omit, throttleKy } from '../utils.js' export namespace scraper { + // Allow up to 1 request per second by default. + export const throttle = pThrottle({ + limit: 1, + interval: 1000, + strict: true + }) + export type ScrapeResult = { author: string byline: string @@ -47,10 +55,12 @@ export class ScraperClient extends AIFunctionsProvider { constructor({ apiBaseUrl = getEnv('SCRAPER_API_BASE_URL'), + throttle = true, ky = defaultKy }: { apiKey?: string apiBaseUrl?: string + throttle?: boolean ky?: KyInstance } = {}) { assert( @@ -60,7 +70,9 @@ export class ScraperClient extends AIFunctionsProvider { super() this.apiBaseUrl = apiBaseUrl - this.ky = ky.extend({ prefixUrl: this.apiBaseUrl }) + + const throttledKy = throttle ? 
throttleKy(ky, scraper.throttle) : ky
+    this.ky = throttledKy.extend({ prefixUrl: this.apiBaseUrl })
   }
 
   @aiFunction({
@@ -99,6 +111,15 @@ export class ScraperClient extends AIFunctionsProvider {
       })
       .json()
 
+    if (res.length <= 40) {
+      let detail = '' // stays empty when textContent is not a JSON error body
+      try {
+        const body = JSON.parse(res.textContent as string) as any
+        if (body?.message) detail = `: ${body.message}`
+      } catch {} // parse failure: fall through to the generic error below
+      throw new Error(`Failed to scrape URL "${opts.url}"${detail}`)
+    }
+
     switch (format) {
       case 'html':
         return omit(res, 'markdownContent', 'textContent', 'rawHtml')
diff --git a/src/utils.ts b/src/utils.ts
index 7da30cb..e5270c4 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -67,6 +67,29 @@ export function pruneNullOrUndefined>(
   ) as NonNullable
 }
 
+/**
+ * Recursively removes `null` and `undefined` values from plain objects and
+ * arrays. Top-level arrays and arrays nested inside arrays are pruned too;
+ * primitives are returned unchanged.
+ */
+export function pruneNullOrUndefinedDeep<T extends Record<string, any>>(
+  obj: T
+): NonNullable<{ [K in keyof T]: Exclude<T[K], undefined | null> }> {
+  if (!obj || typeof obj !== 'object') return obj
+
+  if (Array.isArray(obj)) {
+    return obj
+      .filter((value) => value !== undefined && value !== null)
+      .map((value) => pruneNullOrUndefinedDeep(value)) as any
+  }
+
+  return Object.fromEntries(
+    Object.entries(obj)
+      .filter(([, value]) => value !== undefined && value !== null)
+      .map(([key, value]) => [key, pruneNullOrUndefinedDeep(value)])
+  ) as NonNullable<T>
+}
+
 export function getEnv(name: string): string | undefined {
   try {
     return typeof process !== 'undefined'