import {
  aiFunction,
  AIFunctionsProvider,
  pruneEmpty,
  sanitizeSearchParams
} from '@agentic/core'
import { XMLParser } from 'fast-xml-parser'
import defaultKy, { type KyInstance } from 'ky'
import { z } from 'zod'

import { castArray, getProp } from './utils'

export namespace arxiv {
  /** Default base URL that the client's `query` requests are resolved against. */
  export const API_BASE_URL = 'https://export.arxiv.org/api'

  /** Values accepted for the `sortBy` query parameter. */
  export const SortType = {
    RELEVANCE: 'relevance',
    LAST_UPDATED_DATE: 'lastUpdatedDate',
    SUBMITTED_DATE: 'submittedDate'
  } as const

  /** Values accepted for the `sortOrder` query parameter. */
  export const SortOrder = {
    ASCENDING: 'ascending',
    DESCENDING: 'descending'
  } as const

  /** Human-readable names of the fields a structured search filter can target. */
  export const FilterType = {
    ALL: 'all',
    TITLE: 'title',
    AUTHOR: 'author',
    ABSTRACT: 'abstract',
    COMMENT: 'comment',
    JOURNAL_REFERENCE: 'journal_reference',
    SUBJECT_CATEGORY: 'subject_category',
    REPORT_NUMBER: 'report_number'
  } as const

  /** Union of the value types of an object type. */
  export type ValueOf<T extends NonNullable<unknown>> = T[keyof T]

  /**
   * Maps each `FilterType` value to the short field prefix that is placed in
   * front of the `:` when a structured query is serialized.
   */
  export const FilterTypeMapping: Record<ValueOf<typeof FilterType>, string> = {
    all: 'all',
    title: 'ti',
    author: 'au',
    abstract: 'abs',
    comment: 'co',
    journal_reference: 'jr',
    subject_category: 'cat',
    report_number: 'rn'
  }

  /** Boolean operators used to glue serialized query terms together. */
  export const Separators = {
    AND: '+AND+',
    OR: '+OR+',
    ANDNOT: '+ANDNOT+'
  } as const

  export interface ArXivResponse {
    totalResults: number
    startIndex: number
    itemsPerPage: number
    entries: {
      id: string
      title: string
      summary: string
      published: string
      updated: string
      authors: { name: string; affiliation: string[] }[]
      doi: string
      comment: string
      journalReference: string
      primaryCategory: string
      categories: string[]
      links: string[]
    }[]
  }

  /**
   * Reduces an arXiv abstract / PDF URL (or an already-bare identifier) to the
   * plain article id, dropping any trailing version suffix.
   *
   * Both `http://` and `https://` URLs are accepted (the arXiv API manual's
   * sample responses use `http://` for entry ids), as is an optional `www.` or
   * `export.` host prefix. Version suffixes of any length are removed, so
   * `2101.00001v12` becomes `2101.00001`.
   *
   * @param value - An arXiv URL or identifier.
   * @returns The identifier without URL prefix or version suffix.
   */
  export const extractId = (value: string) =>
    value
      .replace(/https?:\/\/(?:www\.|export\.)?arxiv\.org\/abs\//, '')
      .replace(/https?:\/\/(?:www\.|export\.)?arxiv\.org\/pdf\//, '')
      .replace(/v\d+$/, '')

  /** A single `field:value` filter of a structured search query. */
  const EntrySchema = z.object({
    field: z.nativeEnum(FilterType).default(FilterType.ALL),
    value: z.string().min(1)
  })

  /**
   * Input accepted by `ArXivClient.search`: explicit article ids and/or a
   * search query given either as a raw string or as include / exclude filters,
   * plus paging controls.
   */
  export const SearchParamsSchema = z
    .object({
      ids: z.array(z.string().min(1)).optional(),
      searchQuery: z
        .union([
          z.string(),
          z.object({
            include: z
              .array(EntrySchema)
              .nonempty()
              .describe('Filters to include results.'),
            exclude: z
              .array(EntrySchema)
              .optional()
              .describe('Filters to exclude results.')
          })
        ])
        .optional(),
      start: z.number().int().min(0).default(0),
      maxResults: z.number().int().min(1).max(100).default(5)
    })
    .describe('Sorting by date is not supported.')

  export type SearchParams = z.infer<typeof SearchParamsSchema>
}

/**
 * Lightweight wrapper around ArXiv for academic / scholarly research articles.
 *
 * @see https://arxiv.org
 */
export class ArXivClient extends AIFunctionsProvider {
  protected readonly ky: KyInstance
  protected readonly apiBaseUrl: string

  /**
   * @param opts - Optional overrides. `apiBaseUrl` defaults to
   * `arxiv.API_BASE_URL` and `ky` to the default ky instance. `apiKey` is
   * accepted for interface compatibility but is not read by this client.
   */
  constructor({
    apiBaseUrl = arxiv.API_BASE_URL,
    ky = defaultKy
  }: {
    apiKey?: string
    apiBaseUrl?: string
    ky?: KyInstance
    // Defaulting to `{}` lets callers write `new ArXivClient()`.
  } = {}) {
    super()

    this.apiBaseUrl = apiBaseUrl
    this.ky = ky.extend({ prefixUrl: this.apiBaseUrl })
  }

  /**
   * Searches for research articles published on arXiv.
   *
   * @param queryOrOpts - Either a raw search query string or a full
   * `arxiv.SearchParams` object.
   * @returns Paging metadata from the feed plus one pruned record per entry.
   * @throws Error if neither a non-empty `ids` list nor a `searchQuery` is
   * given.
   */
  @aiFunction({
    name: 'arxiv_search',
    description: 'Searches for research articles published on arXiv.',
    inputSchema: arxiv.SearchParamsSchema
  })
  async search(queryOrOpts: string | arxiv.SearchParams) {
    const opts =
      typeof queryOrOpts === 'string'
        ? ({ searchQuery: queryOrOpts } as arxiv.SearchParams)
        : queryOrOpts

    if (!opts.ids?.length && !opts.searchQuery) {
      throw new Error(
        `The 'searchQuery' property must be non-empty if the 'ids' property is not provided.`
      )
    }

    // Serializes one structured filter as `<prefix>:<value>`.
    const toTerm = (tag: {
      field: arxiv.ValueOf<typeof arxiv.FilterType>
      value: string
    }) => `${arxiv.FilterTypeMapping[tag.field]}:${tag.value}`

    let searchQuery: string | undefined
    if (opts.searchQuery) {
      if (typeof opts.searchQuery === 'string') {
        searchQuery = opts.searchQuery
      } else {
        // Included terms are AND-ed together; every excluded term is then
        // chained on with ANDNOT. An empty exclude list contributes nothing.
        const included = opts.searchQuery.include
          .map(toTerm)
          .join(arxiv.Separators.AND)
        const excluded = (opts.searchQuery.exclude ?? [])
          .map(toTerm)
          .join(arxiv.Separators.ANDNOT)

        searchQuery = [included, excluded]
          .filter(Boolean)
          .join(arxiv.Separators.ANDNOT)
      }
    }

    // arXiv documents `id_list` as a single comma-delimited value, so join the
    // ids explicitly instead of relying on how arrays get serialized.
    const ids = opts.ids?.map((id) => arxiv.extractId(id))

    const searchParams = sanitizeSearchParams({
      start: opts.start,
      max_results: opts.maxResults,
      id_list: ids?.length ? ids.join(',') : undefined,
      search_query: searchQuery,
      sortBy: arxiv.SortType.RELEVANCE,
      sortOrder: arxiv.SortOrder.DESCENDING
    })

    const responseText = await this.ky.get('query', { searchParams }).text()

    // NOTE(review): with `ignoreAttributes: true`, elements that carry their
    // data only in attributes (presumably `<link href>` / `<category term>` in
    // the Atom feed) would parse to empty values and be dropped by the
    // `filter(Boolean)` calls below — confirm against a live response.
    const parser = new XMLParser({
      allowBooleanAttributes: true,
      alwaysCreateTextNode: false,
      attributeNamePrefix: '@_',
      attributesGroupName: false,
      cdataPropName: '#cdata',
      ignoreAttributes: true,
      numberParseOptions: { hex: false, leadingZeros: true },
      parseAttributeValue: false,
      parseTagValue: true,
      preserveOrder: false,
      removeNSPrefix: true,
      textNodeName: '#text',
      trimValues: true,
      ignoreDeclaration: true
    })

    const parsedData = parser.parse(responseText)

    // `feed.entry` may be a single value or a list (hence `castArray`); it
    // falls back to an empty list when absent.
    let entries: Record<string, any>[] = getProp(
      parsedData,
      ['feed', 'entry'],
      []
    )
    entries = castArray(entries)

    return {
      // Never report fewer results than were actually returned.
      totalResults: Math.max(
        getProp(parsedData, ['feed', 'totalResults'], 0),
        entries.length
      ),
      startIndex: getProp(parsedData, ['feed', 'startIndex'], 0),
      itemsPerPage: getProp(parsedData, ['feed', 'itemsPerPage'], 0),
      entries: entries.map((entry) =>
        pruneEmpty({
          id: arxiv.extractId(entry.id),
          url: entry.id,
          title: entry.title,
          summary: entry.summary,
          published: entry.published,
          updated: entry.updated,
          authors: castArray(entry.author)
            .filter(Boolean)
            .map((author: any) => ({
              name: author.name,
              affiliation: castArray(author.affiliation ?? [])
            })),
          doi: entry.doi,
          comment: entry.comment,
          journalReference: entry.journal_ref,
          primaryCategory: entry.primary_category,
          categories: castArray(entry.category).filter(Boolean),
          links: castArray(entry.link).filter(Boolean)
        })
      )
    }
  }
}