diff --git a/Dockerfile b/Dockerfile
index f010a84..9f5e93a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
     WAIT_FOR_JOB_MINUTES='60' \
     DEFAULT_TIMEOUT_MILLISECONDS='10000' \
     BANNED_DOMAINS='' \
+    MAX_CRAWLING_DEPTH='' \
+    CRAWLING_VERSION='0' \
     TZ='UTC'
 FROM prebuild AS build
 WORKDIR /srv
diff --git a/README.md b/README.md
index 4f9c0b6..51349ce 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,8 @@ Configuration is done using environmental variables:
 | `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
 | `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
 | `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
+| `CRAWLING_VERSION` | _Optional_, Increasing this number forces a recrawl of the whole index | `0` |
+| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far the fediverse is indexed from the seed nodes | _empty_ |
 | `TZ` | _Optional_, Timezone | `UTC` |
 ## Deploy
 App is designed to be run in docker container and deployed using docker-compose.
diff --git a/application/src/Fediverse/getCrawlingVersion.ts b/application/src/Fediverse/getCrawlingVersion.ts
new file mode 100644
index 0000000..91abee6
--- /dev/null
+++ b/application/src/Fediverse/getCrawlingVersion.ts
@@ -0,0 +1,5 @@
+const CRAWLING_VERSION = 0
+
+export default function getCrawlingVersion (): number {
+  return CRAWLING_VERSION + parseInt(process.env.CRAWLING_VERSION ?? '0')
+}
diff --git a/application/src/Fediverse/getMaxCrawlingDepth.ts b/application/src/Fediverse/getMaxCrawlingDepth.ts
new file mode 100644
index 0000000..f1793ab
--- /dev/null
+++ b/application/src/Fediverse/getMaxCrawlingDepth.ts
@@ -0,0 +1,10 @@
+export default function getMaxCrawlingDepth (): number | undefined {
+  if (process.env.MAX_CRAWLING_DEPTH === undefined || process.env.MAX_CRAWLING_DEPTH === '') {
+    return undefined
+  }
+  const depth = parseInt(process.env.MAX_CRAWLING_DEPTH)
+  if (depth >= 0) {
+    return depth
+  }
+  return undefined
+}
diff --git a/application/src/Jobs/Nodes/findNewNodes.ts b/application/src/Jobs/Nodes/findNewNodes.ts
index 0bebd53..0704248 100644
--- a/application/src/Jobs/Nodes/findNewNodes.ts
+++ b/application/src/Jobs/Nodes/findNewNodes.ts
@@ -1,3 +1,4 @@
+import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
 import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
 import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
 import { findNewNodesOnPage } from './findNewNodesOnPage'
@@ -10,6 +11,13 @@ export const findNewNodes = async (
   node: Node,
   robotsTxt: RobotsTxt
 ): Promise<void> => {
+  const maxCrawlingDepth = getMaxCrawlingDepth()
+  if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
+    console.info('Skipping finding nodes, max crawling depth reached', {
+      maxCrawlingDepth
+    })
+    return
+  }
   try {
     // noinspection InfiniteLoopJS
     for (let page = 0; true; page++) {
diff --git a/application/src/Jobs/Nodes/findNewNodesOnPage.ts b/application/src/Jobs/Nodes/findNewNodesOnPage.ts
index a80e404..a987b16 100644
--- a/application/src/Jobs/Nodes/findNewNodesOnPage.ts
+++ b/application/src/Jobs/Nodes/findNewNodesOnPage.ts
@@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
     provider: provider.getKey(),
     page
   })
-  return await createMissingNodes(elastic, domains, node.domain)
+  return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
 }
diff --git a/application/src/Jobs/Seed/addNodeSeed.ts b/application/src/Jobs/Seed/addNodeSeed.ts
index de93514..1a1fe75 100644
--- a/application/src/Jobs/Seed/addNodeSeed.ts
+++ b/application/src/Jobs/Seed/addNodeSeed.ts
@@ -6,6 +6,6 @@ export const addNodeSeed = async (
   domains: string[]
 ): Promise<boolean> => {
   console.info('Trying to add seed domain nodes', { domains })
-  const result = await createMissingNodes(elastic, domains, undefined)
+  const result = await createMissingNodes(elastic, domains, undefined, 0)
   return result > 0
 }
diff --git a/application/src/Storage/Definitions/Node.ts b/application/src/Storage/Definitions/Node.ts
index de57d2c..1f68b4c 100644
--- a/application/src/Storage/Definitions/Node.ts
+++ b/application/src/Storage/Definitions/Node.ts
@@ -20,6 +20,8 @@ interface Node {
   discoveredByDomain?: string
   accountFeedCount?: number
   channelFeedCount?: number
+  crawlingDepth: number
+  crawlingVersion: number
 }
 
 export default Node
diff --git a/application/src/Storage/Nodes/assertNodeIndex.ts b/application/src/Storage/Nodes/assertNodeIndex.ts
index 4ec0a39..64896ac 100644
--- a/application/src/Storage/Nodes/assertNodeIndex.ts
+++ b/application/src/Storage/Nodes/assertNodeIndex.ts
@@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
           }
         }
       },
       accountFeedCount: { type: 'integer' },
-      channelFeedCount: { type: 'integer' }
+      channelFeedCount: { type: 'integer' },
+      crawlingDepth: { type: 'integer' },
+      crawlingVersion: { type: 'integer' }
     }
   })
 
+  await elastic.indices.refresh({ index: nodeIndex })
 }
diff --git a/application/src/Storage/Nodes/createMissingNodes.ts b/application/src/Storage/Nodes/createMissingNodes.ts
index 6040378..3a81e5e 100644
--- a/application/src/Storage/Nodes/createMissingNodes.ts
+++ b/application/src/Storage/Nodes/createMissingNodes.ts
@@ -1,9 +1,11 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 
 export const createMissingNodes = async (
   elastic: ElasticClient,
   domains: string[],
-  discoveredByDomain: string | undefined
+  discoveredByDomain: string | undefined,
+  crawlingDepth: number
 ): Promise<number> => {
   const response = await elastic.bulk({
@@ -14,6 +16,8 @@ export const createMissingNodes = async (
       {
         domain,
         discoveredByDomain,
+        crawlingDepth,
+        crawlingVersion: getCrawlingVersion(),
         foundAt: new Date().getTime()
       }
     ])
diff --git a/application/src/Storage/Nodes/setNodeRefreshed.ts b/application/src/Storage/Nodes/setNodeRefreshed.ts
index 8126873..af0213f 100644
--- a/application/src/Storage/Nodes/setNodeRefreshed.ts
+++ b/application/src/Storage/Nodes/setNodeRefreshed.ts
@@ -1,3 +1,4 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 import Node from '../Definitions/Node'
@@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
     index: nodeIndex,
     id: node.domain,
     doc: {
-      refreshedAt: date.getTime()
+      refreshedAt: date.getTime(),
+      crawlingVersion: getCrawlingVersion()
     }
   })
   return assertDefined(
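
A quick reviewer's note on the semantics of the two new helpers. The sketch below is illustrative only and not part of the patch; the import paths assume the repository root, and the environment values are made up:

```ts
import getCrawlingVersion from './application/src/Fediverse/getCrawlingVersion.js'
import getMaxCrawlingDepth from './application/src/Fediverse/getMaxCrawlingDepth.js'

// The env var is added to the constant baked into the build (currently 0),
// so bumping either one raises the effective crawling version; per the
// README this is what forces a recrawl of nodes stored under older versions.
process.env.CRAWLING_VERSION = '2'
console.log(getCrawlingVersion()) // 2

// An empty MAX_CRAWLING_DEPTH (the Dockerfile default) disables the limit.
process.env.MAX_CRAWLING_DEPTH = ''
console.log(getMaxCrawlingDepth()) // undefined

// With a limit of 2, seed nodes (depth 0) and their discoveries (depth 1)
// are still expanded, but findNewNodes skips nodes at depth >= 2; negative
// or non-numeric values also fall back to undefined (NaN >= 0 is false).
process.env.MAX_CRAWLING_DEPTH = '2'
console.log(getMaxCrawlingDepth()) // 2
```

One caveat: `parseInt('')` is `NaN`, so `getCrawlingVersion()` relies on `CRAWLING_VERSION` always being a numeric string; the Dockerfile default of `'0'` guarantees that inside the container.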