Added crawling depth limit

main
Štěpán Škorpil 2022-11-24 00:39:17 +01:00
parent 0723c2508d
commit 6a11d19781
11 changed files with 43 additions and 5 deletions

View file

@@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
 WAIT_FOR_JOB_MINUTES='60' \
 DEFAULT_TIMEOUT_MILLISECONDS='10000' \
 BANNED_DOMAINS='' \
+MAX_CRAWLING_DEPTH='' \
+CRAWLING_VERSION='0' \
 TZ='UTC'
 FROM prebuild AS build
 WORKDIR /srv

View file

@@ -35,6 +35,8 @@ Configuration is done using environmental variables:
 | `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
 | `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
 | `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
+| `CRAWLING_VERSION` | _Optional_, Increasing this number forces a recrawl of the whole index | `0` |
+| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far the fediverse is indexed from the seed nodes | _empty_ |
 | `TZ` | _Optional_, Timezone | `UTC` |
 ## Deploy
 App is designed to be run in docker container and deployed using docker-compose.
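Depth counts hops from the seed nodes: seeds are created at depth 0, and every node a depth-n node discovers is stored at depth n + 1 (see the changes below). A node is only expanded while its depth is strictly below `MAX_CRAWLING_DEPTH`, so for example `MAX_CRAWLING_DEPTH=1` indexes the seeds and their direct discoveries, but never expands the discoveries.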

View file

@@ -0,0 +1,5 @@
+const CRAWLING_VERSION = 0
+
+export default function getCrawlingVersion (): number {
+  return CRAWLING_VERSION + parseInt(process.env.CRAWLING_VERSION ?? '0')
+}
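The effective version is the constant compiled into the source plus whatever the `CRAWLING_VERSION` environment variable adds, so an operator can trigger a recrawl without a rebuild. A minimal sketch of the arithmetic (illustrative values, import path assumed):

import getCrawlingVersion from './getCrawlingVersion.js'

delete process.env.CRAWLING_VERSION
console.log(getCrawlingVersion()) // 0 + parseInt('0') = 0
process.env.CRAWLING_VERSION = '2'
console.log(getCrawlingVersion()) // 0 + 2 = 2
// Caveat: a non-numeric value makes parseInt return NaN, so the sum is NaN.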

View file

@@ -0,0 +1,10 @@
+export default function getMaxCrawlingDepth (): number | undefined {
+  if (process.env.MAX_CRAWLING_DEPTH === undefined || process.env.MAX_CRAWLING_DEPTH === '') {
+    return undefined
+  }
+  const depth = parseInt(process.env.MAX_CRAWLING_DEPTH)
+  if (depth >= 0) {
+    return depth
+  }
+  return undefined
+}
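Anything that does not parse to a non-negative integer falls through to `undefined`, meaning "no limit". A quick sketch of the edge cases (standalone illustration, import path assumed):

import getMaxCrawlingDepth from './getMaxCrawlingDepth.js'

for (const value of ['', '3', '3.9', '-1', 'abc']) {
  process.env.MAX_CRAWLING_DEPTH = value
  console.log(JSON.stringify(value), '->', getMaxCrawlingDepth())
}
// '' -> undefined (unlimited), '3' -> 3, '3.9' -> 3 (parseInt truncates),
// '-1' -> undefined, 'abc' -> undefined (parseInt yields NaN, NaN >= 0 is false)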

View file

@@ -1,3 +1,4 @@
+import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
 import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
 import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
 import { findNewNodesOnPage } from './findNewNodesOnPage'
@@ -10,6 +11,13 @@ export const findNewNodes = async (
   node: Node,
   robotsTxt: RobotsTxt
 ): Promise<void> => {
+  const maxCrawlingDepth = getMaxCrawlingDepth()
+  if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
+    console.info('Skipping finding nodes, max crawling depth reached', {
+      maxCrawlingDepth
+    })
+    return
+  }
   try {
     // noinspection InfiniteLoopJS
     for (let page = 0; true; page++) {
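Because the guard runs before any page is fetched, a node sitting at the cut-off depth is still refreshed itself; it just stops contributing new domains. A stripped-down model of the expansion rule (hypothetical helper, not part of the commit):

interface KnownNode { domain: string, crawlingDepth: number }

// Mirrors the guard in findNewNodes: undefined means unlimited.
function shouldExpand (node: KnownNode, maxDepth: number | undefined): boolean {
  return maxDepth === undefined || node.crawlingDepth < maxDepth
}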

View file

@@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
     provider: provider.getKey(),
     page
   })
-  return await createMissingNodes(elastic, domains, node.domain)
+  return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
 }

View file

@@ -6,6 +6,6 @@ export const addNodeSeed = async (
   domains: string[]
 ): Promise<boolean> => {
   console.info('Trying to add seed domain nodes', { domains })
-  const result = await createMissingNodes(elastic, domains, undefined)
+  const result = await createMissingNodes(elastic, domains, undefined, 0)
   return result > 0
 }

View file

@@ -20,6 +20,8 @@ interface Node {
   discoveredByDomain?: string
   accountFeedCount?: number
   channelFeedCount?: number
+  crawlingDepth: number
+  crawlingVersion: number
 }
 
 export default Node
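Note that both new fields are declared required, unlike their optional neighbours; documents indexed before this change will not carry them, which is presumably acceptable because every write path touched by this commit (creation and refresh, below) now populates them.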

View file

@@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
         }
       },
       accountFeedCount: { type: 'integer' },
-      channelFeedCount: { type: 'integer' }
+      channelFeedCount: { type: 'integer' },
+      crawlingDepth: { type: 'integer' },
+      crawlingVersion: { type: 'integer' }
     }
   })
   await elastic.indices.refresh({ index: nodeIndex })
 }

View file

@@ -1,9 +1,11 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 
 export const createMissingNodes = async (
   elastic: ElasticClient,
   domains: string[],
-  discoveredByDomain: string | undefined
+  discoveredByDomain: string | undefined,
+  crawlingDepth: number
 ): Promise<number> => {
   const response = await elastic.bulk({
     index: nodeIndex,
@@ -14,6 +16,8 @@ export const createMissingNodes = async (
       {
         domain,
         discoveredByDomain,
+        crawlingDepth,
+        crawlingVersion: getCrawlingVersion(),
         foundAt: new Date().getTime()
       }
     ])
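Putting the pieces together, a domain discovered by a node at depth 2 would be created roughly as the following document (all values invented for illustration):

const exampleDoc = {
  domain: 'newnode.example',            // the discovered domain
  discoveredByDomain: 'parent.example', // the node whose page listed it
  crawlingDepth: 3,                     // discoverer's crawlingDepth (2) + 1
  crawlingVersion: 0,                   // getCrawlingVersion() at creation time
  foundAt: 1669245557000                // new Date().getTime()
}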

View file

@@ -1,3 +1,4 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 import Node from '../Definitions/Node'
@@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
     index: nodeIndex,
     id: node.domain,
     doc: {
-      refreshedAt: date.getTime()
+      refreshedAt: date.getTime(),
+      crawlingVersion: getCrawlingVersion()
     }
   })
   return assertDefined(
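setNodeRefreshed now stamps the current crawling version on every successful refresh. The selection side is not part of this commit, but the intended effect of bumping `CRAWLING_VERSION` is presumably that the scheduler treats nodes stamped with an older version as due for a recrawl. A hypothetical query in that spirit (names assumed, `elastic` and `nodeIndex` in scope as above; not from this repository):

const stale = await elastic.search({
  index: nodeIndex,
  query: { range: { crawlingVersion: { lt: getCrawlingVersion() } } }
})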