mirror of https://github.com/Stopka/fedicrawl
Added crawling depth limit
parent 0723c2508d
commit 6a11d19781
@@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
     WAIT_FOR_JOB_MINUTES='60' \
     DEFAULT_TIMEOUT_MILLISECONDS='10000' \
     BANNED_DOMAINS='' \
+    MAX_CRAWLING_DEPTH='' \
+    CRAWLING_VERSION='0' \
     TZ='UTC'
 FROM prebuild AS build
 WORKDIR /srv
@@ -35,6 +35,8 @@ Configuration is done using environmental variables:
 | `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes the thread should sleep when there are no nodes to refresh | `60` |
 | `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds HTTP requests should wait for a node API response on refresh | `10000` |
 | `BANNED_DOMAINS` | _Optional_, Domains not to index (including their subdomains) | _empty_ |
+| `CRAWLING_VERSION` | _Optional_, Increasing this number forces a recrawl of the whole index | `0` |
+| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far the fediverse is indexed from the seed nodes | _empty_ |
 | `TZ` | _Optional_, Timezone | `UTC` |
 ## Deploy
 The app is designed to run in a Docker container and be deployed using docker-compose.
@@ -0,0 +1,5 @@
+const CRAWLING_VERSION = 0
+
+export default function getCrawlingVersion (): number {
+  return CRAWLING_VERSION + parseInt(process.env.CRAWLING_VERSION ?? '0')
+}
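The first new file (presumably `src/Fediverse/getCrawlingVersion.ts`, judging by the import paths further down) resolves the effective crawling version as an in-code base plus the `CRAWLING_VERSION` environment override. A minimal usage sketch, assuming a Node.js runtime:

```typescript
import getCrawlingVersion from './getCrawlingVersion.js'

process.env.CRAWLING_VERSION = '2'
console.log(getCrawlingVersion()) // 0 (in-code base) + 2 (env override) = 2

delete process.env.CRAWLING_VERSION
console.log(getCrawlingVersion()) // 0; the ?? fallback applies only when unset
```

Note that `??` does not cover an empty string: `CRAWLING_VERSION=''` would yield `NaN`, which is presumably why the Dockerfile above defaults the variable to `'0'` rather than `''`.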
@@ -0,0 +1,10 @@
+export default function getMaxCrawlingDepth (): number | undefined {
+  if (process.env.MAX_CRAWLING_DEPTH === undefined || process.env.MAX_CRAWLING_DEPTH === '') {
+    return undefined
+  }
+  const depth = parseInt(process.env.MAX_CRAWLING_DEPTH)
+  if (depth >= 0) {
+    return depth
+  }
+  return undefined
+}
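The second new file turns `MAX_CRAWLING_DEPTH` into an optional non-negative limit, where `undefined` means "no limit". A sketch of the edge cases; a non-numeric value parses to `NaN`, which fails the `>= 0` check and is therefore also treated as unlimited:

```typescript
import getMaxCrawlingDepth from './getMaxCrawlingDepth.js'

process.env.MAX_CRAWLING_DEPTH = '3'
console.log(getMaxCrawlingDepth()) // 3

process.env.MAX_CRAWLING_DEPTH = ''   // the Dockerfile default
console.log(getMaxCrawlingDepth()) // undefined (no limit)

process.env.MAX_CRAWLING_DEPTH = '-1' // negative limits are rejected
console.log(getMaxCrawlingDepth()) // undefined

process.env.MAX_CRAWLING_DEPTH = 'abc' // parseInt gives NaN; NaN >= 0 is false
console.log(getMaxCrawlingDepth()) // undefined
```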
@@ -1,3 +1,4 @@
+import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
 import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
 import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
 import { findNewNodesOnPage } from './findNewNodesOnPage'
@@ -10,6 +11,13 @@ export const findNewNodes = async (
   node: Node,
   robotsTxt: RobotsTxt
 ): Promise<void> => {
+  const maxCrawlingDepth = getMaxCrawlingDepth()
+  if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
+    console.info('Skipping finding nodes, max crawling depth reached', {
+      maxCrawlingDepth
+    })
+    return
+  }
   try {
     // noinspection InfiniteLoopJS
     for (let page = 0; true; page++) {
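The guard means `MAX_CRAWLING_DEPTH='N'` stops expansion at depth `N`: under a limit of 1, a seed node (depth 0) still gets its peers listed, but those peers (depth 1) are indexed without being asked for further neighbours. A sketch of the predicate in isolation, using a hypothetical trimmed-down node shape:

```typescript
// Hypothetical minimal shape for illustration; the full Node interface
// (with crawlingDepth added) appears in a later hunk of this commit.
interface NodeLike { domain: string, crawlingDepth: number }

const shouldExpand = (node: NodeLike, maxCrawlingDepth: number | undefined): boolean =>
  maxCrawlingDepth === undefined || node.crawlingDepth < maxCrawlingDepth

console.log(shouldExpand({ domain: 'seed.example', crawlingDepth: 0 }, 1))        // true
console.log(shouldExpand({ domain: 'peer.example', crawlingDepth: 1 }, 1))        // false
console.log(shouldExpand({ domain: 'far.example', crawlingDepth: 9 }, undefined)) // true: no limit
```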
@@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
     provider: provider.getKey(),
     page
   })
-  return await createMissingNodes(elastic, domains, node.domain)
+  return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
 }
@@ -6,6 +6,6 @@ export const addNodeSeed = async (
   domains: string[]
 ): Promise<boolean> => {
   console.info('Trying to add seed domain nodes', { domains })
-  const result = await createMissingNodes(elastic, domains, undefined)
+  const result = await createMissingNodes(elastic, domains, undefined, 0)
   return result > 0
 }
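Together, the two call-site changes above define how depth propagates through the index: `addNodeSeed` stores seed domains at depth 0, and `findNewNodesOnPage` stores every domain discovered while crawling a node at that node's depth plus one. A sketch of the resulting bookkeeping, with a hypothetical in-memory map standing in for Elasticsearch:

```typescript
// Hypothetical stand-in for the node index, just to trace depth assignment.
const depths = new Map<string, number>()

const recordNodes = (domains: string[], crawlingDepth: number): void => {
  for (const domain of domains) {
    // mirrors createMissingNodes: only previously unknown domains are created
    if (!depths.has(domain)) depths.set(domain, crawlingDepth)
  }
}

recordNodes(['seed.example'], 0)           // addNodeSeed: seeds enter at depth 0
recordNodes(['a.example', 'b.example'], 1) // found while crawling seed.example: 0 + 1
recordNodes(['c.example'], 2)              // found while crawling a.example: 1 + 1
```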
@@ -20,6 +20,8 @@ interface Node {
   discoveredByDomain?: string
   accountFeedCount?: number
   channelFeedCount?: number
+  crawlingDepth: number
+  crawlingVersion: number
 }
 
 export default Node
@@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
         }
       },
       accountFeedCount: { type: 'integer' },
-      channelFeedCount: { type: 'integer' }
+      channelFeedCount: { type: 'integer' },
+      crawlingDepth: { type: 'integer' },
+      crawlingVersion: { type: 'integer' }
     }
   })
 
   await elastic.indices.refresh({ index: nodeIndex })
 }
@@ -1,9 +1,11 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 export const createMissingNodes = async (
   elastic: ElasticClient,
   domains: string[],
-  discoveredByDomain: string | undefined
+  discoveredByDomain: string | undefined,
+  crawlingDepth: number
 ): Promise<number> => {
   const response = await elastic.bulk({
     index: nodeIndex,
@@ -14,6 +16,8 @@ export const createMissingNodes = async (
       {
         domain,
         discoveredByDomain,
+        crawlingDepth,
+        crawlingVersion: getCrawlingVersion(),
         foundAt: new Date().getTime()
       }
     ])
@@ -1,3 +1,4 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 import Node from '../Definitions/Node'
@@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
     index: nodeIndex,
     id: node.domain,
     doc: {
-      refreshedAt: date.getTime()
+      refreshedAt: date.getTime(),
+      crawlingVersion: getCrawlingVersion()
     }
   })
   return assertDefined(
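Stamping nodes with the current version both at creation (`createMissingNodes`) and on every refresh (`setNodeRefreshed`) is what turns `CRAWLING_VERSION` into a recrawl switch: once the number is bumped, every existing document carries a version lower than the one `getCrawlingVersion()` now returns. The selection side is not part of this commit, but a hedged sketch of how a scheduler might pick such stale nodes (the index name and query are illustrative assumptions, not code from the repository):

```typescript
import { Client } from '@elastic/elasticsearch'

// Illustrative only: finds documents whose stored crawlingVersion predates
// the currently configured one, making them due for a recrawl.
const findStaleNodes = async (elastic: Client, currentVersion: number) =>
  await elastic.search({
    index: 'nodes', // assumed name; the real code uses the nodeIndex constant
    query: {
      range: { crawlingVersion: { lt: currentVersion } }
    }
  })
```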