fedicrawl/application/src/Jobs/Nodes/findNewNodes.ts

38 wiersze
1.2 KiB
TypeScript
Czysty Zwykły widok Historia

2022-11-23 23:39:17 +00:00
import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
2021-12-23 14:14:06 +00:00
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
2022-11-22 15:37:11 +00:00
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
2021-12-23 14:14:06 +00:00
import { findNewNodesOnPage } from './findNewNodesOnPage'
2022-09-14 19:16:04 +00:00
import Node from '../../Storage/Definitions/Node'
import { ElasticClient } from '../../Storage/ElasticClient'
2021-12-23 14:14:06 +00:00
2022-09-18 11:32:25 +00:00
/**
 * Walks a provider's node listing page by page, delegating each page to
 * `findNewNodesOnPage`.
 *
 * Nothing is fetched when a maximum crawling depth is configured
 * (`getMaxCrawlingDepth()` returns a value other than `undefined`) and
 * `node.crawlingDepth` has already reached it.
 *
 * Pages are requested sequentially, starting at 0. The loop has no exit
 * condition of its own: it ends only when something inside it throws (or the
 * awaited `findNewNodesOnPage` promise rejects). That error is logged at info
 * level as "Node search finished" and is not rethrown.
 *
 * @param elastic - Storage client, passed through to `findNewNodesOnPage`.
 * @param provider - Provider whose node pages are walked; its key is logged.
 * @param node - Node being processed; `domain` and `crawlingDepth` are read here.
 * @param robotsTxt - Passed through to `findNewNodesOnPage`.
 * @returns A promise that resolves once paging has stopped or been skipped.
 */
export const findNewNodes = async (
  elastic: ElasticClient,
  provider: NodeProvider,
  node: Node,
  robotsTxt: RobotsTxt
): Promise<void> => {
  const maxCrawlingDepth = getMaxCrawlingDepth()
  if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
    console.info('Skipping finding nodes, max crawling depth reached', {
      maxCrawlingDepth
    })
    return
  }

  try {
    // Intentionally unbounded: an error thrown while processing a page is the
    // only way out of this loop.
    // NOTE(review): presumably findNewNodesOnPage throws once a provider has no
    // further pages - confirm against its implementation.
    // noinspection InfiniteLoopJS
    for (let page = 0; true; page++) {
      console.info('Retrieve node page', {
        domain: node.domain,
        provider: provider.getKey(),
        page
      })
      await findNewNodesOnPage(elastic, provider, node, page, robotsTxt)
    }
  } catch (error) {
    // Every error ends the search and is reported at info level; genuine
    // failures are not distinguished from a normal end of paging here.
    console.info('Node search finished', {
      error,
      domain: node.domain,
      provider: provider.getKey()
    })
  }
}