feat: improvements to clearbit, diffbot, proxycurl, scraper clients

old-agentic
Travis Fischer 2024-06-19 02:10:11 -07:00
rodzic 1fce6eec58
commit a851965722
5 zmienionych plików z 143 dodań i 59 usunięć

Wyświetl plik

@ -1,11 +1,12 @@
import defaultKy from 'ky' import defaultKy from 'ky'
import pThrottle from 'p-throttle' import pThrottle from 'p-throttle'
import type { DeepNullable, KyInstance } from '../types.js' import type { KyInstance } from '../types.js'
import { import {
assert, assert,
delay, delay,
getEnv, getEnv,
pruneNullOrUndefinedDeep,
sanitizeSearchParams, sanitizeSearchParams,
throttleKy throttleKy
} from '../utils.js' } from '../utils.js'
@ -37,7 +38,7 @@ export namespace clearbit {
phoneNumbers: string[] phoneNumbers: string[]
emailAddresses: string[] emailAddresses: string[]
} }
category: { category: Partial<{
sector: string sector: string
industryGroup: string industryGroup: string
industry: string industry: string
@ -48,14 +49,14 @@ export namespace clearbit {
naicsCode: string naicsCode: string
naics6Codes: string[] naics6Codes: string[]
naics6Codes2022: string[] naics6Codes2022: string[]
} }>
tags: string[] tags: string[]
description: string description: string
foundedYear: number foundedYear: number
location: string location: string
timeZone: string timeZone: string
utcOffset: number utcOffset: number
geo: { geo: Partial<{
streetNumber: string streetNumber: string
streetName: string streetName: string
subPremise: string subPremise: string
@ -68,16 +69,16 @@ export namespace clearbit {
countryCode: string countryCode: string
lat: number lat: number
lng: number lng: number
} }>
logo: string logo: string
facebook: { facebook: Partial<{
handle: string handle: string
likes: number likes: number
} }>
linkedin: { linkedin: {
handle: string handle: string
} }
twitter: { twitter: Partial<{
handle: string handle: string
id: string id: string
bio: string bio: string
@ -86,30 +87,30 @@ export namespace clearbit {
location: string location: string
site: string site: string
avatar: string avatar: string
} }>
crunchbase: { crunchbase: {
handle: string handle: string
} }
emailProvider: boolean emailProvider: boolean
type: string type: string
ticker: string ticker: string
identifiers: { identifiers: Partial<{
usEIN: string usEIN: string
usCIK: string usCIK: string
} }>
phone: string phone: string
metrics: { metrics: Partial<{
alexaUsRank: number alexaUsRank: number
alexaGlobalRank: number alexaGlobalRank: number
trafficRank: string trafficRank: string
employees: number employees: number
employeesRange: string employeesRange: string
marketCap: string marketCap: number
raised: number raised: number
annualRevenue: string annualRevenue: string
estimatedAnnualRevenue: string estimatedAnnualRevenue: string
fiscalYearEnd: string fiscalYearEnd: number
} }>
indexedAt: string indexedAt: string
tech: string[] tech: string[]
techCategories: string[] techCategories: string[]
@ -121,18 +122,18 @@ export namespace clearbit {
} }
} }
export type EmailLookupResponse = DeepNullable<{ export type EmailLookupResponse = Partial<{
id: string id: string
name: { name: Partial<{
fullName: string fullName: string
givenName: string givenName: string
familyName: string familyName: string
} }>
email: string email: string
location: string location: string
timeZone: string timeZone: string
utcOffset: number utcOffset: number
geo: { geo: Partial<{
city: string city: string
state: string state: string
stateCode: string stateCode: string
@ -140,22 +141,22 @@ export namespace clearbit {
countryCode: string countryCode: string
lat: number lat: number
lng: number lng: number
} }>
bio: string bio: string
site: string site: string
avatar: string avatar: string
employment: { employment: Partial<{
domain: string domain: string
name: string name: string
title: string title: string
role: string role: string
subRole: string subRole: string
seniority: string seniority: string
} }>
facebook: { facebook: {
handle: string handle: string
} }
github: { github: Partial<{
handle: string handle: string
id: string id: string
avatar: string avatar: string
@ -163,8 +164,8 @@ export namespace clearbit {
blog: string blog: string
followers: number followers: number
following: number following: number
} }>
twitter: { twitter: Partial<{
handle: string handle: string
id: string id: string
bio: string bio: string
@ -175,14 +176,14 @@ export namespace clearbit {
location: string location: string
site: string site: string
avatar: string avatar: string
} }>
linkedin: { linkedin: {
handle: string handle: string
} }
googleplus: { googleplus: {
handle: null handle: null
} }
gravatar: { gravatar: Partial<{
handle: string handle: string
urls: { urls: {
value: string value: string
@ -193,7 +194,7 @@ export namespace clearbit {
url: string url: string
type: string type: string
}[] }[]
} }>
fuzzy: boolean fuzzy: boolean
emailProvider: boolean emailProvider: boolean
indexedAt: string indexedAt: string
@ -204,7 +205,7 @@ export namespace clearbit {
export type CompanyResponse = { export type CompanyResponse = {
id: string id: string
} & DeepNullable<CompanyNullableProps> } & Partial<CompanyNullableProps>
export interface CompanySearchOptions { export interface CompanySearchOptions {
/** /**
@ -260,17 +261,17 @@ export namespace clearbit {
} }
export interface EmploymentAttributes { export interface EmploymentAttributes {
company: string company?: string
domain: string domain?: string
linkedin: string linkedin?: string
title: string title?: string
role: string role?: string
subRole: string subRole?: string
seniority: string seniority?: string
startDate: string startDate?: string
endDate: string endDate?: string
present: boolean present?: boolean
highlight: boolean highlight?: boolean
} }
export interface EmailAttributes { export interface EmailAttributes {
@ -291,7 +292,7 @@ export namespace clearbit {
export type PersonAttributesV2 = { export type PersonAttributesV2 = {
id: string id: string
} & DeepNullable<{ } & Partial<{
name: Name name: Name
avatar: string avatar: string
location: string location: string
@ -554,31 +555,37 @@ export class ClearbitClient {
} }
async companyEnrichment(options: clearbit.CompanyEnrichmentOptions) { async companyEnrichment(options: clearbit.CompanyEnrichmentOptions) {
return this.ky const res = await this.ky
.get('https://company-stream.clearbit.com/v2/companies/find', { .get('https://company-stream.clearbit.com/v2/companies/find', {
searchParams: sanitizeSearchParams(options) searchParams: sanitizeSearchParams(options)
}) })
.json<clearbit.CompanyResponse>() .json<clearbit.CompanyResponse>()
return pruneNullOrUndefinedDeep(res)
} }
async companySearch(options: clearbit.CompanySearchOptions) { async companySearch(options: clearbit.CompanySearchOptions) {
return this.ky const res = await this.ky
.get('https://discovery.clearbit.com/v1/companies/search', { .get('https://discovery.clearbit.com/v1/companies/search', {
searchParams: sanitizeSearchParams(options) searchParams: sanitizeSearchParams(options)
}) })
.json<clearbit.CompanySearchResponse>() .json<clearbit.CompanySearchResponse>()
return pruneNullOrUndefinedDeep(res)
} }
async companyAutocomplete(name: string) { async companyAutocomplete(name: string) {
return this.ky const res = await this.ky
.get('https://autocomplete.clearbit.com/v1/companies/suggest', { .get('https://autocomplete.clearbit.com/v1/companies/suggest', {
searchParams: { query: name } searchParams: { query: name }
}) })
.json<clearbit.BasicCompanyResponse[]>() .json<clearbit.BasicCompanyResponse[]>()
return pruneNullOrUndefinedDeep(res)
} }
async prospectorPeopleV2(options: clearbit.PeopleSearchOptionsV2) { async prospectorPeopleV2(options: clearbit.PeopleSearchOptionsV2) {
return this.ky const res = await this.ky
.get('https://prospector.clearbit.com/v2/people/search', { .get('https://prospector.clearbit.com/v2/people/search', {
searchParams: sanitizeSearchParams({ searchParams: sanitizeSearchParams({
...options, ...options,
@ -589,10 +596,12 @@ export class ClearbitClient {
}) })
}) })
.json<clearbit.ProspectorResponseV2>() .json<clearbit.ProspectorResponseV2>()
return pruneNullOrUndefinedDeep(res)
} }
async prospectorPeopleV1(options: clearbit.PeopleSearchOptionsV1) { async prospectorPeopleV1(options: clearbit.PeopleSearchOptionsV1) {
return this.ky const res = await this.ky
.get('https://prospector.clearbit.com/v1/people/search', { .get('https://prospector.clearbit.com/v1/people/search', {
searchParams: sanitizeSearchParams({ searchParams: sanitizeSearchParams({
email: false, email: false,
@ -604,6 +613,8 @@ export class ClearbitClient {
}) })
}) })
.json<clearbit.ProspectorResponseV1>() .json<clearbit.ProspectorResponseV1>()
return pruneNullOrUndefinedDeep(res)
} }
// TODO Status code = 202 means the response was queued. // TODO Status code = 202 means the response was queued.
@ -622,7 +633,8 @@ export class ClearbitClient {
}) })
if (response.status !== 202 || !maxRetries) { if (response.status !== 202 || !maxRetries) {
return response.json<clearbit.EmailLookupResponse>() const res = await response.json<clearbit.EmailLookupResponse>()
return pruneNullOrUndefinedDeep(res)
} }
if (maxRetries && response.status === 202) { if (maxRetries && response.status === 202) {
@ -637,7 +649,8 @@ export class ClearbitClient {
count++ count++
running = response.status === 202 running = response.status === 202
} }
return response.json<clearbit.EmailLookupResponse>() const res = await response.json<clearbit.EmailLookupResponse>()
return pruneNullOrUndefinedDeep(res)
} }
throw new Error('clearbit email lookup error 202', { cause: response }) throw new Error('clearbit email lookup error 202', { cause: response })
@ -653,17 +666,21 @@ export class ClearbitClient {
} }
async revealCompanyFromIP(ip: string) { async revealCompanyFromIP(ip: string) {
return this.ky const res = await this.ky
.get('https://reveal.clearbit.com/v1/companies/find', { .get('https://reveal.clearbit.com/v1/companies/find', {
searchParams: { ip } searchParams: { ip }
}) })
.json<clearbit.CompanyRevealResponse>() .json<clearbit.CompanyRevealResponse>()
.catch((_) => undefined) .catch((_) => undefined)
if (res) {
return pruneNullOrUndefinedDeep(res)
}
} }
static filterEmploymentProspectorV2( static filterEmploymentProspectorV2(
companyName: string, companyName: string,
employments: Array<DeepNullable<clearbit.EmploymentAttributes> | null> | null employments?: Array<Partial<clearbit.EmploymentAttributes>>
) { ) {
if (employments && employments.length > 0) { if (employments && employments.length > 0) {
// We filter by employment endDate because some people could have multiple // We filter by employment endDate because some people could have multiple

Wyświetl plik

@ -384,12 +384,14 @@ export namespace diffbot {
nbIncomingEdges?: number nbIncomingEdges?: number
nbFollowers?: number nbFollowers?: number
nbLocations?: number nbLocations?: number
nbEmployees?: number
nbEmployeesMin?: number nbEmployeesMin?: number
nbEmployeesMax?: number nbEmployeesMax?: number
nbActiveEmployeeEdges?: number nbActiveEmployeeEdges?: number
nbUniqueInvestors?: number nbUniqueInvestors?: number
educations?: Education[] educations?: Education[]
nationalities?: Nationality[] nationalities?: Nationality[]
fullName?: string
allNames?: string[] allNames?: string[]
skills?: Partial<BasicEntity>[] skills?: Partial<BasicEntity>[]
children?: BasicEntity[] children?: BasicEntity[]
@ -401,6 +403,8 @@ export namespace diffbot {
parents?: BasicEntity[] parents?: BasicEntity[]
gender?: Gender gender?: Gender
importance?: number importance?: number
monthlyTraffic?: number
monthlyTrafficGrowth?: number
wikipediaPageviews?: number wikipediaPageviews?: number
wikipediaPageviewsLastQuarterGrowth?: number wikipediaPageviewsLastQuarterGrowth?: number
wikipediaPageviewsLastYear?: number wikipediaPageviewsLastYear?: number
@ -459,6 +463,9 @@ export namespace diffbot {
stock?: Stock stock?: Stock
companiesHouseIds?: string[] companiesHouseIds?: string[]
yearlyRevenues?: AnnualRevenue[] yearlyRevenues?: AnnualRevenue[]
revenue?: Amount
parentCompany?: BasicEntity
legalEntities?: BasicEntity[]
} }
export interface AnnualRevenue { export interface AnnualRevenue {

Wyświetl plik

@ -1935,6 +1935,7 @@ export namespace proxycurl {
export type SearchResult = z.infer<typeof SearchResultSchema> export type SearchResult = z.infer<typeof SearchResultSchema>
export const ResultProfileSchema = z.object({ export const ResultProfileSchema = z.object({
linkedin_url: z.string().optional(),
acquisitions: PurpleAcquisitionSchema.optional(), acquisitions: PurpleAcquisitionSchema.optional(),
affiliated_companies: z.array(PurpleAffiliatedCompanySchema).optional(), affiliated_companies: z.array(PurpleAffiliatedCompanySchema).optional(),
background_cover_image_url: z.string().optional(), background_cover_image_url: z.string().optional(),
@ -1963,7 +1964,12 @@ export namespace proxycurl {
updates: z.array(PurpleCompanyUpdateSchema).optional(), updates: z.array(PurpleCompanyUpdateSchema).optional(),
website: z.string().optional() website: z.string().optional()
}) })
export type ResultProfile = z.infer<typeof ResultProfileSchema> export type CompanyProfile = z.infer<typeof ResultProfileSchema>
/**
 * JSON shape read from the `api/linkedin/company/resolve` endpoint by
 * `resolveLinkedInCompany`.
 *
 * The client flattens this envelope into a `CompanyProfile`: `url` is exposed
 * as `linkedin_url` and the fields of `profile` are spread next to it.
 * `last_updated` is not carried over into that flattened result.
 */
export type ResolvedCompanyProfile = {
url: string
last_updated: string
profile: CompanyProfile
}
export const CompanyUrlEnrichResultProfileSchema = z.object({ export const CompanyUrlEnrichResultProfileSchema = z.object({
acquisitions: FluffyAcquisitionSchema.optional(), acquisitions: FluffyAcquisitionSchema.optional(),
@ -2087,8 +2093,8 @@ export class ProxycurlClient extends AIFunctionsProvider {
}) })
async getLinkedInCompany( async getLinkedInCompany(
opts: proxycurl.CompanyProfileEndpointParamsQueryClass opts: proxycurl.CompanyProfileEndpointParamsQueryClass
) { ): Promise<proxycurl.CompanyProfile> {
return this.ky const res = await this.ky
.get('api/linkedin/company', { .get('api/linkedin/company', {
searchParams: sanitizeSearchParams({ searchParams: sanitizeSearchParams({
funding_data: 'include', funding_data: 'include',
@ -2097,7 +2103,12 @@ export class ProxycurlClient extends AIFunctionsProvider {
...opts ...opts
}) })
}) })
.json<proxycurl.ResultProfile>() .json<proxycurl.CompanyProfile>()
return {
linkedin_url: opts.url,
...res
}
} }
@aiFunction({ @aiFunction({
@ -2181,15 +2192,20 @@ export class ProxycurlClient extends AIFunctionsProvider {
}) })
async resolveLinkedInCompany( async resolveLinkedInCompany(
opts: proxycurl.CompanyLookupEndpointParamsQueryClass opts: proxycurl.CompanyLookupEndpointParamsQueryClass
) { ): Promise<proxycurl.CompanyProfile> {
return this.ky const res = await this.ky
.get('api/linkedin/company/resolve', { .get('api/linkedin/company/resolve', {
searchParams: sanitizeSearchParams({ searchParams: sanitizeSearchParams({
enrich_profile: 'enrich', enrich_profile: 'enrich',
...opts ...opts
}) })
}) })
.json<proxycurl.ResultProfile>() .json<proxycurl.ResolvedCompanyProfile>()
return {
linkedin_url: res.url,
...res.profile
}
} }
@aiFunction({ @aiFunction({

Wyświetl plik

@ -1,10 +1,18 @@
import defaultKy, { type KyInstance } from 'ky' import defaultKy, { type KyInstance } from 'ky'
import pThrottle from 'p-throttle'
import { z } from 'zod' import { z } from 'zod'
import { aiFunction, AIFunctionsProvider } from '../fns.js' import { aiFunction, AIFunctionsProvider } from '../fns.js'
import { assert, getEnv, omit } from '../utils.js' import { assert, getEnv, omit, throttleKy } from '../utils.js'
export namespace scraper { export namespace scraper {
// Default rate limit for scraper requests: at most 1 call (`limit`) per
// 1000 ms (`interval`). `ScraperClient` wraps its ky instance with this
// throttle via `throttleKy` unless it is constructed with `throttle: false`.
export const throttle = pThrottle({
limit: 1,
interval: 1000,
strict: true
})
export type ScrapeResult = { export type ScrapeResult = {
author: string author: string
byline: string byline: string
@ -47,10 +55,12 @@ export class ScraperClient extends AIFunctionsProvider {
constructor({ constructor({
apiBaseUrl = getEnv('SCRAPER_API_BASE_URL'), apiBaseUrl = getEnv('SCRAPER_API_BASE_URL'),
throttle = true,
ky = defaultKy ky = defaultKy
}: { }: {
apiKey?: string apiKey?: string
apiBaseUrl?: string apiBaseUrl?: string
throttle?: boolean
ky?: KyInstance ky?: KyInstance
} = {}) { } = {}) {
assert( assert(
@ -60,7 +70,9 @@ export class ScraperClient extends AIFunctionsProvider {
super() super()
this.apiBaseUrl = apiBaseUrl this.apiBaseUrl = apiBaseUrl
this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
const throttledKy = throttle ? throttleKy(ky, scraper.throttle) : ky
this.ky = throttledKy.extend({ prefixUrl: this.apiBaseUrl })
} }
@aiFunction({ @aiFunction({
@ -99,6 +111,15 @@ export class ScraperClient extends AIFunctionsProvider {
}) })
.json<scraper.ScrapeResult>() .json<scraper.ScrapeResult>()
if (res.length <= 40) {
try {
const message = (JSON.parse(res.textContent as string) as any).message
throw new Error(`Failed to scrape URL "${opts.url}": ${message}`)
} catch {
throw new Error(`Failed to scrape URL "${opts.url}"`)
}
}
switch (format) { switch (format) {
case 'html': case 'html':
return omit(res, 'markdownContent', 'textContent', 'rawHtml') return omit(res, 'markdownContent', 'textContent', 'rawHtml')

Wyświetl plik

@ -67,6 +67,29 @@ export function pruneNullOrUndefined<T extends Record<string, any>>(
) as NonNullable<T> ) as NonNullable<T>
} }
/**
 * Recursively strips `null` and `undefined` from a JSON-like value.
 *
 * - Object properties whose value is `null` or `undefined` are dropped.
 * - Array elements that are `null` or `undefined` are removed, and the
 *   remaining elements are pruned recursively.
 * - Primitives (including falsy ones such as `0`, `''` and `false`) are kept
 *   as-is.
 *
 * Arrays are handled at every depth, including when the array is the
 * top-level argument or sits directly inside another array, so the runtime
 * result agrees with the declared return type for array inputs.
 *
 * The input is never mutated; pruned objects and arrays are fresh copies.
 * Objects are rebuilt from their own enumerable entries, so this is meant for
 * plain data such as parsed JSON (class instances come back as plain objects).
 *
 * @param obj - The value to prune, typically a parsed JSON response.
 * @returns A copy of `obj` without any `null` / `undefined` members.
 */
export function pruneNullOrUndefinedDeep<T extends Record<string, any>>(
  obj: T
): NonNullable<{ [K in keyof T]: Exclude<T[K], undefined | null> }> {
  const prune = (value: unknown): unknown => {
    if (Array.isArray(value)) {
      return value
        .filter((item) => item !== undefined && item !== null)
        .map((item) => prune(item))
    }

    // Primitives (and a top-level `null` / `undefined`) pass through untouched.
    if (value === null || typeof value !== 'object') return value

    return Object.fromEntries(
      Object.entries(value)
        .filter(([, entry]) => entry !== undefined && entry !== null)
        .map(([key, entry]) => [key, prune(entry)])
    )
  }

  // The helper works on `unknown`; the public signature carries the precise type.
  return prune(obj) as NonNullable<{
    [K in keyof T]: Exclude<T[K], undefined | null>
  }>
}
export function getEnv(name: string): string | undefined { export function getEnv(name: string): string | undefined {
try { try {
return typeof process !== 'undefined' return typeof process !== 'undefined'