fedicrawl/application/src/Fediverse/RobotsTxt/fetchRobotsTxt.ts

46 wiersze
1.6 KiB
TypeScript
Czysty Zwykły widok Historia

2022-11-22 15:37:11 +00:00
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
import robotsParser from 'robots-parser'
2022-11-29 13:11:27 +00:00
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
2022-11-22 15:37:11 +00:00
import RobotsTxt from './RobotsTxt.js'
import { RobotsTxtError } from './RobotsTxtError.js'
// Default User-Agent header value for requests made by this module; also the
// agent name passed to the robots.txt parser when checking whether a URL is allowed.
const userAgent = 'FediCrawl/1.0'
/**
 * Downloads `https://<domain>/robots.txt`, parses it, and returns helpers
 * that check or enforce its rules for the crawler's user agent.
 *
 * Any error thrown while fetching the file is logged and the file is treated
 * as empty, so the parser is given no rules for that domain.
 *
 * @param domain Host part used to build the robots.txt URL.
 * @returns An object exposing:
 *   - `isAllowed(url)`: whether the parsed rules allow the URL; defaults to
 *     `true` when the parser gives no answer for that URL.
 *   - `getIfAllowed(url, config)`: `axios.get` guarded by `isAllowed`.
 *   - `postIfAllowed(url, data, config)`: `axios.post` guarded by `isAllowed`.
 *   Both guarded helpers throw `RobotsTxtError` when the URL is disallowed.
 */
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
  console.info('Fetching robots.txt', { domain })
  const url = new URL(`https://${domain}/robots.txt`)
  let content = ''
  try {
    const robotsTxt = await axios.get(url.toString(), {
      headers: { 'User-Agent': userAgent },
      timeout: getTimeoutMilliseconds(domain),
      // Ask for the raw body so a JSON-looking response is not parsed into an
      // object (which String() would turn into "[object Object]").
      responseType: 'text'
    })
    content = String(robotsTxt.data)
  } catch (error) {
    // Best effort: a failed fetch leaves `content` empty.
    console.info('Robots.txt not found', { error, url })
  }
  const robots = robotsParser(url.toString(), content)
  const isAllowed = (target: URL): boolean => robots.isAllowed(target.toString(), userAgent) ?? true

  // Merge the caller's config with the crawler's User-Agent. Headers are
  // merged key by key so that passing `config.headers` (e.g. an Accept header)
  // no longer drops the User-Agent; a caller-provided value still wins.
  const withUserAgent = <D>(config?: AxiosRequestConfig<D>): AxiosRequestConfig<D> => ({
    ...config,
    headers: { 'User-Agent': userAgent, ...config?.headers }
  })

  return {
    isAllowed,
    getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(target: URL, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(target)) {
        throw new RobotsTxtError(target)
      }
      return await axios.get<T, R, D>(target.toString(), withUserAgent(config))
    },
    postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(target: URL, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(target)) {
        throw new RobotsTxtError(target)
      }
      return await axios.post<T, R, D>(target.toString(), data, withUserAgent(config))
    }
  }
}