From 425abd5af0196c481d312c59852ac0423f3d7c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20=C5=A0korpil?= Date: Sat, 10 Dec 2022 15:11:56 +0100 Subject: [PATCH] Added domain validation --- application/src/Jobs/Nodes/findNewNodesOnPage.ts | 5 ++++- application/src/Jobs/processNextNode.ts | 9 ++++++++- application/src/Storage/Nodes/isDomainValid.ts | 10 ++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 application/src/Storage/Nodes/isDomainValid.ts diff --git a/application/src/Jobs/Nodes/findNewNodesOnPage.ts b/application/src/Jobs/Nodes/findNewNodesOnPage.ts index a987b16..1473a8c 100644 --- a/application/src/Jobs/Nodes/findNewNodesOnPage.ts +++ b/application/src/Jobs/Nodes/findNewNodesOnPage.ts @@ -4,6 +4,7 @@ import { NodeProvider } from '../../Fediverse/Providers/NodeProvider' import Node from '../../Storage/Definitions/Node' import { ElasticClient } from '../../Storage/ElasticClient' import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned' +import isDomainValid from '../../Storage/Nodes/isDomainValid.js' export const findNewNodesOnPage = async ( elastic: ElasticClient, @@ -13,7 +14,9 @@ export const findNewNodesOnPage = async ( robotsTxt: RobotsTxt ): Promise => { let domains = await provider.retrieveNodes(node.domain, page, robotsTxt) - domains = domains.filter(isDomainNotBanned) + domains = domains.filter( + (domain: string): boolean => isDomainValid(domain) && isDomainNotBanned(domain) + ) console.log('Found nodes', { count: domains.length, domain: node.domain, diff --git a/application/src/Jobs/processNextNode.ts b/application/src/Jobs/processNextNode.ts index e9cb0cf..8e61c7f 100644 --- a/application/src/Jobs/processNextNode.ts +++ b/application/src/Jobs/processNextNode.ts @@ -1,6 +1,7 @@ import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js' import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess' import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry' +import isDomainValid from '../Storage/Nodes/isDomainValid.js' import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed' import batchPromises from '../Utils/batchPromises.js' import { refreshNodeInfo } from './NodeInfo/refreshNodeInfo' @@ -13,6 +14,7 @@ import { deleteOldFeeds } from '../Storage/Feeds/deleteOldFeeds' import refreshNodeIps from './Dns/refreshNodeIps' import { ElasticClient } from '../Storage/ElasticClient' import updateNodeFeedStats from './Nodes/updateNodeFeedStats' +import deleteDomains from './Seed/deleteBannedNodes.js' export const processNextNode = async ( elastic: ElasticClient, @@ -21,7 +23,12 @@ export const processNextNode = async ( console.info('#############################################') let node = await fetchNodeToProcess(elastic) node = await setNodeRefreshAttempted(elastic, node) - + // TODO remove check later + if (!isDomainValid(node.domain)) { + console.info('Node domain is invalid, deleting node', { domain: node.domain }) + await deleteDomains(elastic, [node.domain]) + return + } node = await refreshNodeIps(elastic, node) const robotsTxt = await fetchRobotsTxt(node.domain) node = await refreshNodeInfo(elastic, node, robotsTxt) diff --git a/application/src/Storage/Nodes/isDomainValid.ts b/application/src/Storage/Nodes/isDomainValid.ts new file mode 100644 index 0000000..836516c --- /dev/null +++ b/application/src/Storage/Nodes/isDomainValid.ts @@ -0,0 +1,10 @@ +export default function isDomainValid (domain: string): boolean { + try { + // eslint-disable-next-line no-new + new URL(`https://${domain}/`) + } catch (e) { + console.info('Domain is invalid', { domain }) + return false + } + return true +}