kopia lustrzana https://github.com/transitive-bullshit/chatgpt-api
feat: improvements to clearbit, diffbot, proxycurl, scraper clients
rodzic
cecc1a3a0e
commit
f8cd52221d
|
@ -1,11 +1,12 @@
|
|||
import defaultKy from 'ky'
|
||||
import pThrottle from 'p-throttle'
|
||||
|
||||
import type { DeepNullable, KyInstance } from '../types.js'
|
||||
import type { KyInstance } from '../types.js'
|
||||
import {
|
||||
assert,
|
||||
delay,
|
||||
getEnv,
|
||||
pruneNullOrUndefinedDeep,
|
||||
sanitizeSearchParams,
|
||||
throttleKy
|
||||
} from '../utils.js'
|
||||
|
@ -37,7 +38,7 @@ export namespace clearbit {
|
|||
phoneNumbers: string[]
|
||||
emailAddresses: string[]
|
||||
}
|
||||
category: {
|
||||
category: Partial<{
|
||||
sector: string
|
||||
industryGroup: string
|
||||
industry: string
|
||||
|
@ -48,14 +49,14 @@ export namespace clearbit {
|
|||
naicsCode: string
|
||||
naics6Codes: string[]
|
||||
naics6Codes2022: string[]
|
||||
}
|
||||
}>
|
||||
tags: string[]
|
||||
description: string
|
||||
foundedYear: number
|
||||
location: string
|
||||
timeZone: string
|
||||
utcOffset: number
|
||||
geo: {
|
||||
geo: Partial<{
|
||||
streetNumber: string
|
||||
streetName: string
|
||||
subPremise: string
|
||||
|
@ -68,16 +69,16 @@ export namespace clearbit {
|
|||
countryCode: string
|
||||
lat: number
|
||||
lng: number
|
||||
}
|
||||
}>
|
||||
logo: string
|
||||
facebook: {
|
||||
facebook: Partial<{
|
||||
handle: string
|
||||
likes: number
|
||||
}
|
||||
}>
|
||||
linkedin: {
|
||||
handle: string
|
||||
}
|
||||
twitter: {
|
||||
twitter: Partial<{
|
||||
handle: string
|
||||
id: string
|
||||
bio: string
|
||||
|
@ -86,30 +87,30 @@ export namespace clearbit {
|
|||
location: string
|
||||
site: string
|
||||
avatar: string
|
||||
}
|
||||
}>
|
||||
crunchbase: {
|
||||
handle: string
|
||||
}
|
||||
emailProvider: boolean
|
||||
type: string
|
||||
ticker: string
|
||||
identifiers: {
|
||||
identifiers: Partial<{
|
||||
usEIN: string
|
||||
usCIK: string
|
||||
}
|
||||
}>
|
||||
phone: string
|
||||
metrics: {
|
||||
metrics: Partial<{
|
||||
alexaUsRank: number
|
||||
alexaGlobalRank: number
|
||||
trafficRank: string
|
||||
employees: number
|
||||
employeesRange: string
|
||||
marketCap: string
|
||||
marketCap: number
|
||||
raised: number
|
||||
annualRevenue: string
|
||||
estimatedAnnualRevenue: string
|
||||
fiscalYearEnd: string
|
||||
}
|
||||
fiscalYearEnd: number
|
||||
}>
|
||||
indexedAt: string
|
||||
tech: string[]
|
||||
techCategories: string[]
|
||||
|
@ -121,18 +122,18 @@ export namespace clearbit {
|
|||
}
|
||||
}
|
||||
|
||||
export type EmailLookupResponse = DeepNullable<{
|
||||
export type EmailLookupResponse = Partial<{
|
||||
id: string
|
||||
name: {
|
||||
name: Partial<{
|
||||
fullName: string
|
||||
givenName: string
|
||||
familyName: string
|
||||
}
|
||||
}>
|
||||
email: string
|
||||
location: string
|
||||
timeZone: string
|
||||
utcOffset: number
|
||||
geo: {
|
||||
geo: Partial<{
|
||||
city: string
|
||||
state: string
|
||||
stateCode: string
|
||||
|
@ -140,22 +141,22 @@ export namespace clearbit {
|
|||
countryCode: string
|
||||
lat: number
|
||||
lng: number
|
||||
}
|
||||
}>
|
||||
bio: string
|
||||
site: string
|
||||
avatar: string
|
||||
employment: {
|
||||
employment: Partial<{
|
||||
domain: string
|
||||
name: string
|
||||
title: string
|
||||
role: string
|
||||
subRole: string
|
||||
seniority: string
|
||||
}
|
||||
}>
|
||||
facebook: {
|
||||
handle: string
|
||||
}
|
||||
github: {
|
||||
github: Partial<{
|
||||
handle: string
|
||||
id: string
|
||||
avatar: string
|
||||
|
@ -163,8 +164,8 @@ export namespace clearbit {
|
|||
blog: string
|
||||
followers: number
|
||||
following: number
|
||||
}
|
||||
twitter: {
|
||||
}>
|
||||
twitter: Partial<{
|
||||
handle: string
|
||||
id: string
|
||||
bio: string
|
||||
|
@ -175,14 +176,14 @@ export namespace clearbit {
|
|||
location: string
|
||||
site: string
|
||||
avatar: string
|
||||
}
|
||||
}>
|
||||
linkedin: {
|
||||
handle: string
|
||||
}
|
||||
googleplus: {
|
||||
handle: null
|
||||
}
|
||||
gravatar: {
|
||||
gravatar: Partial<{
|
||||
handle: string
|
||||
urls: {
|
||||
value: string
|
||||
|
@ -193,7 +194,7 @@ export namespace clearbit {
|
|||
url: string
|
||||
type: string
|
||||
}[]
|
||||
}
|
||||
}>
|
||||
fuzzy: boolean
|
||||
emailProvider: boolean
|
||||
indexedAt: string
|
||||
|
@ -204,7 +205,7 @@ export namespace clearbit {
|
|||
|
||||
export type CompanyResponse = {
|
||||
id: string
|
||||
} & DeepNullable<CompanyNullableProps>
|
||||
} & Partial<CompanyNullableProps>
|
||||
|
||||
export interface CompanySearchOptions {
|
||||
/**
|
||||
|
@ -260,17 +261,17 @@ export namespace clearbit {
|
|||
}
|
||||
|
||||
export interface EmploymentAttributes {
|
||||
company: string
|
||||
domain: string
|
||||
linkedin: string
|
||||
title: string
|
||||
role: string
|
||||
subRole: string
|
||||
seniority: string
|
||||
startDate: string
|
||||
endDate: string
|
||||
present: boolean
|
||||
highlight: boolean
|
||||
company?: string
|
||||
domain?: string
|
||||
linkedin?: string
|
||||
title?: string
|
||||
role?: string
|
||||
subRole?: string
|
||||
seniority?: string
|
||||
startDate?: string
|
||||
endDate?: string
|
||||
present?: boolean
|
||||
highlight?: boolean
|
||||
}
|
||||
|
||||
export interface EmailAttributes {
|
||||
|
@ -291,7 +292,7 @@ export namespace clearbit {
|
|||
|
||||
export type PersonAttributesV2 = {
|
||||
id: string
|
||||
} & DeepNullable<{
|
||||
} & Partial<{
|
||||
name: Name
|
||||
avatar: string
|
||||
location: string
|
||||
|
@ -554,31 +555,37 @@ export class ClearbitClient {
|
|||
}
|
||||
|
||||
async companyEnrichment(options: clearbit.CompanyEnrichmentOptions) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://company-stream.clearbit.com/v2/companies/find', {
|
||||
searchParams: sanitizeSearchParams(options)
|
||||
})
|
||||
.json<clearbit.CompanyResponse>()
|
||||
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
async companySearch(options: clearbit.CompanySearchOptions) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://discovery.clearbit.com/v1/companies/search', {
|
||||
searchParams: sanitizeSearchParams(options)
|
||||
})
|
||||
.json<clearbit.CompanySearchResponse>()
|
||||
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
async companyAutocomplete(name: string) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://autocomplete.clearbit.com/v1/companies/suggest', {
|
||||
searchParams: { query: name }
|
||||
})
|
||||
.json<clearbit.BasicCompanyResponse[]>()
|
||||
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
async prospectorPeopleV2(options: clearbit.PeopleSearchOptionsV2) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://prospector.clearbit.com/v2/people/search', {
|
||||
searchParams: sanitizeSearchParams({
|
||||
...options,
|
||||
|
@ -589,10 +596,12 @@ export class ClearbitClient {
|
|||
})
|
||||
})
|
||||
.json<clearbit.ProspectorResponseV2>()
|
||||
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
async prospectorPeopleV1(options: clearbit.PeopleSearchOptionsV1) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://prospector.clearbit.com/v1/people/search', {
|
||||
searchParams: sanitizeSearchParams({
|
||||
email: false,
|
||||
|
@ -604,6 +613,8 @@ export class ClearbitClient {
|
|||
})
|
||||
})
|
||||
.json<clearbit.ProspectorResponseV1>()
|
||||
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
// TODO Status code = 202 means the response was queued.
|
||||
|
@ -622,7 +633,8 @@ export class ClearbitClient {
|
|||
})
|
||||
|
||||
if (response.status !== 202 || !maxRetries) {
|
||||
return response.json<clearbit.EmailLookupResponse>()
|
||||
const res = await response.json<clearbit.EmailLookupResponse>()
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
if (maxRetries && response.status === 202) {
|
||||
|
@ -637,7 +649,8 @@ export class ClearbitClient {
|
|||
count++
|
||||
running = response.status === 202
|
||||
}
|
||||
return response.json<clearbit.EmailLookupResponse>()
|
||||
const res = await response.json<clearbit.EmailLookupResponse>()
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
|
||||
throw new Error('clearbit email lookup error 202', { cause: response })
|
||||
|
@ -653,17 +666,21 @@ export class ClearbitClient {
|
|||
}
|
||||
|
||||
async revealCompanyFromIP(ip: string) {
|
||||
return this.ky
|
||||
const res = await this.ky
|
||||
.get('https://reveal.clearbit.com/v1/companies/find', {
|
||||
searchParams: { ip }
|
||||
})
|
||||
.json<clearbit.CompanyRevealResponse>()
|
||||
.catch((_) => undefined)
|
||||
|
||||
if (res) {
|
||||
return pruneNullOrUndefinedDeep(res)
|
||||
}
|
||||
}
|
||||
|
||||
static filterEmploymentProspectorV2(
|
||||
companyName: string,
|
||||
employments: Array<DeepNullable<clearbit.EmploymentAttributes> | null> | null
|
||||
employments?: Array<Partial<clearbit.EmploymentAttributes>>
|
||||
) {
|
||||
if (employments && employments.length > 0) {
|
||||
// We filter by employment endDate because some people could have multiple
|
||||
|
|
|
@ -384,12 +384,14 @@ export namespace diffbot {
|
|||
nbIncomingEdges?: number
|
||||
nbFollowers?: number
|
||||
nbLocations?: number
|
||||
nbEmployees?: number
|
||||
nbEmployeesMin?: number
|
||||
nbEmployeesMax?: number
|
||||
nbActiveEmployeeEdges?: number
|
||||
nbUniqueInvestors?: number
|
||||
educations?: Education[]
|
||||
nationalities?: Nationality[]
|
||||
fullName?: string
|
||||
allNames?: string[]
|
||||
skills?: Partial<BasicEntity>[]
|
||||
children?: BasicEntity[]
|
||||
|
@ -401,6 +403,8 @@ export namespace diffbot {
|
|||
parents?: BasicEntity[]
|
||||
gender?: Gender
|
||||
importance?: number
|
||||
monthlyTraffic?: number
|
||||
monthlyTrafficGrowth?: number
|
||||
wikipediaPageviews?: number
|
||||
wikipediaPageviewsLastQuarterGrowth?: number
|
||||
wikipediaPageviewsLastYear?: number
|
||||
|
@ -459,6 +463,9 @@ export namespace diffbot {
|
|||
stock?: Stock
|
||||
companiesHouseIds?: string[]
|
||||
yearlyRevenues?: AnnualRevenue[]
|
||||
revenue?: Amount
|
||||
parentCompany?: BasicEntity
|
||||
legalEntities?: BasicEntity[]
|
||||
}
|
||||
|
||||
export interface AnnualRevenue {
|
||||
|
|
|
@ -1935,6 +1935,7 @@ export namespace proxycurl {
|
|||
export type SearchResult = z.infer<typeof SearchResultSchema>
|
||||
|
||||
export const ResultProfileSchema = z.object({
|
||||
linkedin_url: z.string().optional(),
|
||||
acquisitions: PurpleAcquisitionSchema.optional(),
|
||||
affiliated_companies: z.array(PurpleAffiliatedCompanySchema).optional(),
|
||||
background_cover_image_url: z.string().optional(),
|
||||
|
@ -1963,7 +1964,12 @@ export namespace proxycurl {
|
|||
updates: z.array(PurpleCompanyUpdateSchema).optional(),
|
||||
website: z.string().optional()
|
||||
})
|
||||
export type ResultProfile = z.infer<typeof ResultProfileSchema>
|
||||
export type CompanyProfile = z.infer<typeof ResultProfileSchema>
|
||||
export type ResolvedCompanyProfile = {
|
||||
url: string
|
||||
last_updated: string
|
||||
profile: CompanyProfile
|
||||
}
|
||||
|
||||
export const CompanyUrlEnrichResultProfileSchema = z.object({
|
||||
acquisitions: FluffyAcquisitionSchema.optional(),
|
||||
|
@ -2087,8 +2093,8 @@ export class ProxycurlClient extends AIFunctionsProvider {
|
|||
})
|
||||
async getLinkedInCompany(
|
||||
opts: proxycurl.CompanyProfileEndpointParamsQueryClass
|
||||
) {
|
||||
return this.ky
|
||||
): Promise<proxycurl.CompanyProfile> {
|
||||
const res = await this.ky
|
||||
.get('api/linkedin/company', {
|
||||
searchParams: sanitizeSearchParams({
|
||||
funding_data: 'include',
|
||||
|
@ -2097,7 +2103,12 @@ export class ProxycurlClient extends AIFunctionsProvider {
|
|||
...opts
|
||||
})
|
||||
})
|
||||
.json<proxycurl.ResultProfile>()
|
||||
.json<proxycurl.CompanyProfile>()
|
||||
|
||||
return {
|
||||
linkedin_url: opts.url,
|
||||
...res
|
||||
}
|
||||
}
|
||||
|
||||
@aiFunction({
|
||||
|
@ -2181,15 +2192,20 @@ export class ProxycurlClient extends AIFunctionsProvider {
|
|||
})
|
||||
async resolveLinkedInCompany(
|
||||
opts: proxycurl.CompanyLookupEndpointParamsQueryClass
|
||||
) {
|
||||
return this.ky
|
||||
): Promise<proxycurl.CompanyProfile> {
|
||||
const res = await this.ky
|
||||
.get('api/linkedin/company/resolve', {
|
||||
searchParams: sanitizeSearchParams({
|
||||
enrich_profile: 'enrich',
|
||||
...opts
|
||||
})
|
||||
})
|
||||
.json<proxycurl.ResultProfile>()
|
||||
.json<proxycurl.ResolvedCompanyProfile>()
|
||||
|
||||
return {
|
||||
linkedin_url: res.url,
|
||||
...res.profile
|
||||
}
|
||||
}
|
||||
|
||||
@aiFunction({
|
||||
|
|
|
@ -1,10 +1,18 @@
|
|||
import defaultKy, { type KyInstance } from 'ky'
|
||||
import pThrottle from 'p-throttle'
|
||||
import { z } from 'zod'
|
||||
|
||||
import { aiFunction, AIFunctionsProvider } from '../fns.js'
|
||||
import { assert, getEnv, omit } from '../utils.js'
|
||||
import { assert, getEnv, omit, throttleKy } from '../utils.js'
|
||||
|
||||
export namespace scraper {
|
||||
// Allow up to 1 request per second by default.
|
||||
export const throttle = pThrottle({
|
||||
limit: 1,
|
||||
interval: 1000,
|
||||
strict: true
|
||||
})
|
||||
|
||||
export type ScrapeResult = {
|
||||
author: string
|
||||
byline: string
|
||||
|
@ -47,10 +55,12 @@ export class ScraperClient extends AIFunctionsProvider {
|
|||
|
||||
constructor({
|
||||
apiBaseUrl = getEnv('SCRAPER_API_BASE_URL'),
|
||||
throttle = true,
|
||||
ky = defaultKy
|
||||
}: {
|
||||
apiKey?: string
|
||||
apiBaseUrl?: string
|
||||
throttle?: boolean
|
||||
ky?: KyInstance
|
||||
} = {}) {
|
||||
assert(
|
||||
|
@ -60,7 +70,9 @@ export class ScraperClient extends AIFunctionsProvider {
|
|||
super()
|
||||
|
||||
this.apiBaseUrl = apiBaseUrl
|
||||
this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
|
||||
|
||||
const throttledKy = throttle ? throttleKy(ky, scraper.throttle) : ky
|
||||
this.ky = throttledKy.extend({ prefixUrl: this.apiBaseUrl })
|
||||
}
|
||||
|
||||
@aiFunction({
|
||||
|
@ -99,6 +111,15 @@ export class ScraperClient extends AIFunctionsProvider {
|
|||
})
|
||||
.json<scraper.ScrapeResult>()
|
||||
|
||||
// A result whose `length` is 40 or less is treated as a failed scrape.
// NOTE(review): in that case `textContent` presumably carries a JSON error
// payload with a `message` field — confirm against the scraper service.
if (res.length <= 40) {
  let message: string | undefined

  try {
    // Only the parse is guarded here. Throwing the detailed error from inside
    // this `try` would be intercepted by its own `catch` and replaced with the
    // generic error, so the service's message would never reach the caller.
    const payload = JSON.parse(res.textContent as string) as {
      message?: unknown
    } | null

    if (typeof payload?.message === 'string') {
      message = payload.message
    }
  } catch {
    // `textContent` is not valid JSON; fall back to the generic error below.
  }

  throw new Error(
    message
      ? `Failed to scrape URL "${opts.url}": ${message}`
      : `Failed to scrape URL "${opts.url}"`
  )
}
|
||||
|
||||
switch (format) {
|
||||
case 'html':
|
||||
return omit(res, 'markdownContent', 'textContent', 'rawHtml')
|
||||
|
|
23
src/utils.ts
23
src/utils.ts
|
@ -67,6 +67,29 @@ export function pruneNullOrUndefined<T extends Record<string, any>>(
|
|||
) as NonNullable<T>
|
||||
}
|
||||
|
||||
/**
 * Recursively removes `null` and `undefined` from a JSON-like value.
 *
 * - Object properties whose value is `null` or `undefined` are dropped.
 * - Array elements that are `null` or `undefined` are dropped, and the
 *   remaining elements are pruned recursively. This applies at every depth,
 *   including when the top-level input is itself an array or when arrays are
 *   nested directly inside other arrays.
 * - Primitives (including falsy ones such as `0`, `''` and `false`) are
 *   returned unchanged.
 *
 * The input is never mutated; new objects and arrays are returned.
 *
 * @param obj - The value to prune.
 * @returns A pruned copy of `obj`.
 */
export function pruneNullOrUndefinedDeep<T extends Record<string, any>>(
  obj: T
): NonNullable<{ [K in keyof T]: Exclude<T[K], undefined | null> }> {
  return pruneNullishDeepValue(obj) as NonNullable<{
    [K in keyof T]: Exclude<T[K], undefined | null>
  }>
}

/** Untyped worker for `pruneNullOrUndefinedDeep`; handles any JSON-like value. */
function pruneNullishDeepValue(value: unknown): unknown {
  if (Array.isArray(value)) {
    // Use an explicit arrow so `map`'s index/array arguments are not forwarded.
    return value
      .filter((item) => item !== undefined && item !== null)
      .map((item) => pruneNullishDeepValue(item))
  }

  // Primitives (and a nullish top-level input) pass through untouched.
  if (value === null || typeof value !== 'object') {
    return value
  }

  return Object.fromEntries(
    Object.entries(value)
      .filter(([, entry]) => entry !== undefined && entry !== null)
      .map(([key, entry]) => [key, pruneNullishDeepValue(entry)])
  )
}
|
||||
|
||||
export function getEnv(name: string): string | undefined {
|
||||
try {
|
||||
return typeof process !== 'undefined'
|
||||
|
|
Ładowanie…
Reference in New Issue