import isRelativeUrlImpl from 'is-relative-url'
import normalizeUrlImpl, {
  type Options as NormalizeUrlOptions
} from 'normalize-url'
import QuickLRU from 'quick-lru'

import { hashObject } from './utils.js'

/** URL protocols (in `URL#protocol` form, with trailing colon) accepted as crawlable. */
const protocolAllowList = new Set(['https:', 'http:'])

/**
 * LRU cache of normalization results, keyed by `${url}-${optionsHash}`.
 *
 * An empty string is stored as a negative-cache marker for URLs that failed
 * to normalize, so repeated invalid inputs do not re-run the normalizer.
 */
const normalizedUrlCache = new QuickLRU<string, string>({ maxSize: 4000 })

/**
 * Returns `true` when `url` is a non-empty, absolute URL that parses with the
 * WHATWG `URL` constructor, uses an allow-listed protocol (`http:` or
 * `https:`), and normalizes successfully via {@link normalizeUrl}.
 *
 * Never throws; any parsing failure results in `false`.
 */
export function isValidCrawlableUrl(url: string): boolean {
  try {
    if (!url || isRelativeUrl(url)) {
      return false
    }

    const parsedUrl = new URL(url)
    if (!protocolAllowList.has(parsedUrl.protocol)) {
      return false
    }

    return !!normalizeUrl(url)
  } catch {
    return false
  }
}

/**
 * Returns `true` when `url` is a relative URL.
 *
 * Empty and non-string inputs yield `false`. Inputs starting with `//`
 * (protocol-relative URLs) are explicitly treated as NOT relative.
 */
export function isRelativeUrl(url: string): boolean {
  if (!url || typeof url !== 'string') return false

  return isRelativeUrlImpl(url) && !url.startsWith('//')
}

/**
 * Normalizes `url` with `normalize-url`, applying this module's defaults
 * (which callers may override through `options`).
 *
 * Results are memoized in an LRU cache keyed by the URL plus a hash of the
 * effective options; failures are cached as well.
 *
 * @param url - The URL to normalize.
 * @param options - Optional overrides merged on top of the defaults.
 * @returns The normalized URL, or `undefined` if `url` is empty, relative, or
 * could not be normalized.
 */
export function normalizeUrl(
  url: string,
  options?: NormalizeUrlOptions
): string | undefined {
  if (!url || isRelativeUrl(url)) {
    return undefined
  }

  const opts: NormalizeUrlOptions = {
    stripWWW: false,
    defaultProtocol: 'https',
    normalizeProtocol: true,
    forceHttps: false,
    stripHash: false,
    stripTextFragment: true,
    removeQueryParameters: [/^utm_\w+/i, 'ref', 'ref_src'],
    removeTrailingSlash: true,
    removeSingleSlash: true,
    removeExplicitPort: true,
    sortQueryParameters: true,
    ...options
  }

  const optionsHash = hashObject(opts)
  const cacheKey = `${url}-${optionsHash}`

  // '' means "previously failed"; only `undefined` signals a cache miss.
  let normalizedUrl: string
  try {
    const cachedUrl = normalizedUrlCache.get(cacheKey)
    if (cachedUrl !== undefined) {
      return cachedUrl || undefined
    }

    // Coerce any falsy result to the negative-cache marker.
    normalizedUrl = normalizeUrlImpl(url, opts) || ''
  } catch {
    // Ignore invalid urls, but remember the failure.
    normalizedUrl = ''
  }

  normalizedUrlCache.set(cacheKey, normalizedUrl)
  return normalizedUrl || undefined
}