feat: improvements to clearbit, diffbot, proxycurl, scraper clients

pull/659/head
Travis Fischer 2024-06-19 02:10:11 -07:00
rodzic cecc1a3a0e
commit f8cd52221d
5 zmienionych plików z 143 dodań i 59 usunięć

Wyświetl plik

@ -1,11 +1,12 @@
import defaultKy from 'ky'
import pThrottle from 'p-throttle'
import type { DeepNullable, KyInstance } from '../types.js'
import type { KyInstance } from '../types.js'
import {
assert,
delay,
getEnv,
pruneNullOrUndefinedDeep,
sanitizeSearchParams,
throttleKy
} from '../utils.js'
@ -37,7 +38,7 @@ export namespace clearbit {
phoneNumbers: string[]
emailAddresses: string[]
}
category: {
category: Partial<{
sector: string
industryGroup: string
industry: string
@ -48,14 +49,14 @@ export namespace clearbit {
naicsCode: string
naics6Codes: string[]
naics6Codes2022: string[]
}
}>
tags: string[]
description: string
foundedYear: number
location: string
timeZone: string
utcOffset: number
geo: {
geo: Partial<{
streetNumber: string
streetName: string
subPremise: string
@ -68,16 +69,16 @@ export namespace clearbit {
countryCode: string
lat: number
lng: number
}
}>
logo: string
facebook: {
facebook: Partial<{
handle: string
likes: number
}
}>
linkedin: {
handle: string
}
twitter: {
twitter: Partial<{
handle: string
id: string
bio: string
@ -86,30 +87,30 @@ export namespace clearbit {
location: string
site: string
avatar: string
}
}>
crunchbase: {
handle: string
}
emailProvider: boolean
type: string
ticker: string
identifiers: {
identifiers: Partial<{
usEIN: string
usCIK: string
}
}>
phone: string
metrics: {
metrics: Partial<{
alexaUsRank: number
alexaGlobalRank: number
trafficRank: string
employees: number
employeesRange: string
marketCap: string
marketCap: number
raised: number
annualRevenue: string
estimatedAnnualRevenue: string
fiscalYearEnd: string
}
fiscalYearEnd: number
}>
indexedAt: string
tech: string[]
techCategories: string[]
@ -121,18 +122,18 @@ export namespace clearbit {
}
}
export type EmailLookupResponse = DeepNullable<{
export type EmailLookupResponse = Partial<{
id: string
name: {
name: Partial<{
fullName: string
givenName: string
familyName: string
}
}>
email: string
location: string
timeZone: string
utcOffset: number
geo: {
geo: Partial<{
city: string
state: string
stateCode: string
@ -140,22 +141,22 @@ export namespace clearbit {
countryCode: string
lat: number
lng: number
}
}>
bio: string
site: string
avatar: string
employment: {
employment: Partial<{
domain: string
name: string
title: string
role: string
subRole: string
seniority: string
}
}>
facebook: {
handle: string
}
github: {
github: Partial<{
handle: string
id: string
avatar: string
@ -163,8 +164,8 @@ export namespace clearbit {
blog: string
followers: number
following: number
}
twitter: {
}>
twitter: Partial<{
handle: string
id: string
bio: string
@ -175,14 +176,14 @@ export namespace clearbit {
location: string
site: string
avatar: string
}
}>
linkedin: {
handle: string
}
googleplus: {
handle: null
}
gravatar: {
gravatar: Partial<{
handle: string
urls: {
value: string
@ -193,7 +194,7 @@ export namespace clearbit {
url: string
type: string
}[]
}
}>
fuzzy: boolean
emailProvider: boolean
indexedAt: string
@ -204,7 +205,7 @@ export namespace clearbit {
export type CompanyResponse = {
id: string
} & DeepNullable<CompanyNullableProps>
} & Partial<CompanyNullableProps>
export interface CompanySearchOptions {
/**
@ -260,17 +261,17 @@ export namespace clearbit {
}
export interface EmploymentAttributes {
company: string
domain: string
linkedin: string
title: string
role: string
subRole: string
seniority: string
startDate: string
endDate: string
present: boolean
highlight: boolean
company?: string
domain?: string
linkedin?: string
title?: string
role?: string
subRole?: string
seniority?: string
startDate?: string
endDate?: string
present?: boolean
highlight?: boolean
}
export interface EmailAttributes {
@ -291,7 +292,7 @@ export namespace clearbit {
export type PersonAttributesV2 = {
id: string
} & DeepNullable<{
} & Partial<{
name: Name
avatar: string
location: string
@ -554,31 +555,37 @@ export class ClearbitClient {
}
async companyEnrichment(options: clearbit.CompanyEnrichmentOptions) {
return this.ky
const res = await this.ky
.get('https://company-stream.clearbit.com/v2/companies/find', {
searchParams: sanitizeSearchParams(options)
})
.json<clearbit.CompanyResponse>()
return pruneNullOrUndefinedDeep(res)
}
async companySearch(options: clearbit.CompanySearchOptions) {
return this.ky
const res = await this.ky
.get('https://discovery.clearbit.com/v1/companies/search', {
searchParams: sanitizeSearchParams(options)
})
.json<clearbit.CompanySearchResponse>()
return pruneNullOrUndefinedDeep(res)
}
async companyAutocomplete(name: string) {
return this.ky
const res = await this.ky
.get('https://autocomplete.clearbit.com/v1/companies/suggest', {
searchParams: { query: name }
})
.json<clearbit.BasicCompanyResponse[]>()
return pruneNullOrUndefinedDeep(res)
}
async prospectorPeopleV2(options: clearbit.PeopleSearchOptionsV2) {
return this.ky
const res = await this.ky
.get('https://prospector.clearbit.com/v2/people/search', {
searchParams: sanitizeSearchParams({
...options,
@ -589,10 +596,12 @@ export class ClearbitClient {
})
})
.json<clearbit.ProspectorResponseV2>()
return pruneNullOrUndefinedDeep(res)
}
async prospectorPeopleV1(options: clearbit.PeopleSearchOptionsV1) {
return this.ky
const res = await this.ky
.get('https://prospector.clearbit.com/v1/people/search', {
searchParams: sanitizeSearchParams({
email: false,
@ -604,6 +613,8 @@ export class ClearbitClient {
})
})
.json<clearbit.ProspectorResponseV1>()
return pruneNullOrUndefinedDeep(res)
}
// TODO Status code = 202 means the response was queued.
@ -622,7 +633,8 @@ export class ClearbitClient {
})
if (response.status !== 202 || !maxRetries) {
return response.json<clearbit.EmailLookupResponse>()
const res = await response.json<clearbit.EmailLookupResponse>()
return pruneNullOrUndefinedDeep(res)
}
if (maxRetries && response.status === 202) {
@ -637,7 +649,8 @@ export class ClearbitClient {
count++
running = response.status === 202
}
return response.json<clearbit.EmailLookupResponse>()
const res = await response.json<clearbit.EmailLookupResponse>()
return pruneNullOrUndefinedDeep(res)
}
throw new Error('clearbit email lookup error 202', { cause: response })
@ -653,17 +666,21 @@ export class ClearbitClient {
}
async revealCompanyFromIP(ip: string) {
return this.ky
const res = await this.ky
.get('https://reveal.clearbit.com/v1/companies/find', {
searchParams: { ip }
})
.json<clearbit.CompanyRevealResponse>()
.catch((_) => undefined)
if (res) {
return pruneNullOrUndefinedDeep(res)
}
}
static filterEmploymentProspectorV2(
companyName: string,
employments: Array<DeepNullable<clearbit.EmploymentAttributes> | null> | null
employments?: Array<Partial<clearbit.EmploymentAttributes>>
) {
if (employments && employments.length > 0) {
// We filter by employment endDate because some people could have multiple

Wyświetl plik

@ -384,12 +384,14 @@ export namespace diffbot {
nbIncomingEdges?: number
nbFollowers?: number
nbLocations?: number
nbEmployees?: number
nbEmployeesMin?: number
nbEmployeesMax?: number
nbActiveEmployeeEdges?: number
nbUniqueInvestors?: number
educations?: Education[]
nationalities?: Nationality[]
fullName?: string
allNames?: string[]
skills?: Partial<BasicEntity>[]
children?: BasicEntity[]
@ -401,6 +403,8 @@ export namespace diffbot {
parents?: BasicEntity[]
gender?: Gender
importance?: number
monthlyTraffic?: number
monthlyTrafficGrowth?: number
wikipediaPageviews?: number
wikipediaPageviewsLastQuarterGrowth?: number
wikipediaPageviewsLastYear?: number
@ -459,6 +463,9 @@ export namespace diffbot {
stock?: Stock
companiesHouseIds?: string[]
yearlyRevenues?: AnnualRevenue[]
revenue?: Amount
parentCompany?: BasicEntity
legalEntities?: BasicEntity[]
}
export interface AnnualRevenue {

Wyświetl plik

@ -1935,6 +1935,7 @@ export namespace proxycurl {
export type SearchResult = z.infer<typeof SearchResultSchema>
export const ResultProfileSchema = z.object({
linkedin_url: z.string().optional(),
acquisitions: PurpleAcquisitionSchema.optional(),
affiliated_companies: z.array(PurpleAffiliatedCompanySchema).optional(),
background_cover_image_url: z.string().optional(),
@ -1963,7 +1964,12 @@ export namespace proxycurl {
updates: z.array(PurpleCompanyUpdateSchema).optional(),
website: z.string().optional()
})
export type ResultProfile = z.infer<typeof ResultProfileSchema>
export type CompanyProfile = z.infer<typeof ResultProfileSchema>
/**
 * Response envelope parsed by `resolveLinkedInCompany` from the
 * `api/linkedin/company/resolve` endpoint.
 *
 * The client flattens this envelope into a `CompanyProfile`, copying `url`
 * into the profile's `linkedin_url` field.
 */
export type ResolvedCompanyProfile = {
/** URL of the resolved company; surfaced to callers as `linkedin_url`. */
url: string
/** NOTE(review): presumably a timestamp of the last profile refresh — confirm format against the API docs. */
last_updated: string
/** The enriched company profile nested inside the envelope. */
profile: CompanyProfile
}
export const CompanyUrlEnrichResultProfileSchema = z.object({
acquisitions: FluffyAcquisitionSchema.optional(),
@ -2087,8 +2093,8 @@ export class ProxycurlClient extends AIFunctionsProvider {
})
async getLinkedInCompany(
opts: proxycurl.CompanyProfileEndpointParamsQueryClass
) {
return this.ky
): Promise<proxycurl.CompanyProfile> {
const res = await this.ky
.get('api/linkedin/company', {
searchParams: sanitizeSearchParams({
funding_data: 'include',
@ -2097,7 +2103,12 @@ export class ProxycurlClient extends AIFunctionsProvider {
...opts
})
})
.json<proxycurl.ResultProfile>()
.json<proxycurl.CompanyProfile>()
return {
linkedin_url: opts.url,
...res
}
}
@aiFunction({
@ -2181,15 +2192,20 @@ export class ProxycurlClient extends AIFunctionsProvider {
})
async resolveLinkedInCompany(
opts: proxycurl.CompanyLookupEndpointParamsQueryClass
) {
return this.ky
): Promise<proxycurl.CompanyProfile> {
const res = await this.ky
.get('api/linkedin/company/resolve', {
searchParams: sanitizeSearchParams({
enrich_profile: 'enrich',
...opts
})
})
.json<proxycurl.ResultProfile>()
.json<proxycurl.ResolvedCompanyProfile>()
return {
linkedin_url: res.url,
...res.profile
}
}
@aiFunction({

Wyświetl plik

@ -1,10 +1,18 @@
import defaultKy, { type KyInstance } from 'ky'
import pThrottle from 'p-throttle'
import { z } from 'zod'
import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { assert, getEnv, omit } from '../utils.js'
import { assert, getEnv, omit, throttleKy } from '../utils.js'
export namespace scraper {
/**
 * Default rate limit for scraper requests: at most 1 request per 1000ms
 * interval (i.e. one request per second).
 *
 * `ScraperClient` wraps its ky instance with this throttle via `throttleKy`
 * unless it is constructed with `throttle: false`.
 */
export const throttle = pThrottle({
limit: 1,
interval: 1000,
// Opts into p-throttle's `strict` mode; see the p-throttle docs for the
// exact scheduling guarantees.
strict: true
})
export type ScrapeResult = {
author: string
byline: string
@ -47,10 +55,12 @@ export class ScraperClient extends AIFunctionsProvider {
constructor({
apiBaseUrl = getEnv('SCRAPER_API_BASE_URL'),
throttle = true,
ky = defaultKy
}: {
apiKey?: string
apiBaseUrl?: string
throttle?: boolean
ky?: KyInstance
} = {}) {
assert(
@ -60,7 +70,9 @@ export class ScraperClient extends AIFunctionsProvider {
super()
this.apiBaseUrl = apiBaseUrl
this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
const throttledKy = throttle ? throttleKy(ky, scraper.throttle) : ky
this.ky = throttledKy.extend({ prefixUrl: this.apiBaseUrl })
}
@aiFunction({
@ -99,6 +111,15 @@ export class ScraperClient extends AIFunctionsProvider {
})
.json<scraper.ScrapeResult>()
// A very short result is treated as a failed scrape. In that case the
// service may have put a JSON error payload (`{ "message": ... }`) into
// `textContent`; surface that message when it is available.
if (res.length <= 40) {
  let message: string | undefined

  // Only the parsing is guarded. The error itself is thrown outside the
  // `try`, otherwise the `catch` below would swallow the detailed error and
  // replace it with the generic one.
  try {
    const parsed = JSON.parse(res.textContent as string) as {
      message?: unknown
    } | null
    if (typeof parsed?.message === 'string' && parsed.message) {
      message = parsed.message
    }
  } catch {
    // `textContent` is not JSON; fall back to the generic error below.
  }

  throw new Error(
    message
      ? `Failed to scrape URL "${opts.url}": ${message}`
      : `Failed to scrape URL "${opts.url}"`
  )
}
switch (format) {
case 'html':
return omit(res, 'markdownContent', 'textContent', 'rawHtml')

Wyświetl plik

@ -67,6 +67,29 @@ export function pruneNullOrUndefined<T extends Record<string, any>>(
) as NonNullable<T>
}
/**
 * Recursively removes `null` and `undefined` values from a JSON-like value.
 *
 * - Object properties whose value is `null` or `undefined` are dropped.
 * - `null` / `undefined` array elements are dropped, and the remaining
 *   elements are pruned recursively. This also applies when `obj` itself is
 *   an array and when arrays are nested directly inside other arrays.
 * - Primitives and non-plain objects (e.g. `Date` or class instances) are
 *   returned as-is rather than being rebuilt from their enumerable entries.
 *
 * The input is never mutated; pruned objects and arrays are fresh copies.
 *
 * @param obj - The value to prune.
 * @returns A copy of `obj` without any `null` or `undefined` members.
 */
export function pruneNullOrUndefinedDeep<T extends Record<string, any>>(
  obj: T
): NonNullable<{ [K in keyof T]: Exclude<T[K], undefined | null> }> {
  return pruneNullishValueDeep(obj) as NonNullable<T>
}

/** Prunes a single value of unknown shape; arrays and plain objects recurse. */
function pruneNullishValueDeep(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value
      .filter((item) => item !== undefined && item !== null)
      .map((item) => pruneNullishValueDeep(item))
  }

  if (!isPlainRecord(value)) return value

  return Object.fromEntries(
    Object.entries(value)
      .filter(([, entry]) => entry !== undefined && entry !== null)
      .map(([key, entry]) => [key, pruneNullishValueDeep(entry)])
  )
}

/** True for object literals and null-prototype objects; false for everything else. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== 'object' || value === null) return false
  const proto = Object.getPrototypeOf(value)
  return proto === null || proto === Object.prototype
}
export function getEnv(name: string): string | undefined {
try {
return typeof process !== 'undefined'