Mirror of https://github.com/Stopka/fedicrawl

Compare commits: 3 commits, 8f42791ee2 ... 6a11d19781

Author | SHA1 | Date |
---|---|---|
Štěpán Škorpil | 6a11d19781 | |
Štěpán Škorpil | 0723c2508d | |
Štěpán Škorpil | 4064db521f | |
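Across the three commits, nodes gain crawling-depth and crawling-version tracking, and provider requests are throttled through a new `batchPromises` utility.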
`Dockerfile`:

```diff
@@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
     WAIT_FOR_JOB_MINUTES='60' \
     DEFAULT_TIMEOUT_MILLISECONDS='10000' \
     BANNED_DOMAINS='' \
+    MAX_CRAWLING_DEPTH='' \
+    CRAWLING_VERSION='0' \
     TZ='UTC'
 FROM prebuild AS build
 WORKDIR /srv
```
`README.md`:

```diff
@@ -35,6 +35,8 @@ Configuration is done using environmental variables:
 | `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
 | `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
 | `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
+| `CRAWLING_VERSION` | _Optional_, Increasing this number can enforce recrawling of the whole index | 0 |
+| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far is fediverse indexed from seed nodes | _empty_ |
 | `TZ` | _Optional_, Timezone | `UTC` |
 ## Deploy
 App is designed to be run in docker container and deployed using docker-compose.
```
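As a sketch of how the two new variables slot into such a deployment (the service and image names here are assumptions for illustration, not taken from the repository):

```yaml
# Hypothetical docker-compose.yml fragment; service and image names are assumed.
services:
  fedicrawl:
    image: stopka/fedicrawl        # assumed image name
    environment:
      ELASTIC_URL: 'http://elastic:9200'
      MAX_CRAWLING_DEPTH: '2'      # expand at most 2 hops out from the seed nodes
      CRAWLING_VERSION: '1'        # bump to force a recrawl of the whole index
```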
`fetchRobotsTxt.ts`:

```diff
@@ -15,7 +15,7 @@ export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt
       headers: { 'User-Agent': userAgent },
       timeout: getDefaultTimeoutMilliseconds()
     })
-    content = robotsTxt.data
+    content = String(robotsTxt.data)
   } catch (error) {
     console.info('Robots.txt not found', { error, url })
   }
```
```diff
@@ -11,8 +11,8 @@ export const assertSuccessJsonResponse = (
     throw new UnexpectedResponseStatusError(expectedStatus, actualStatus)
   }
   const expectedContentType = 'application/json'
-  const actualContentType = response.headers['content-type']
+  const actualContentType = String(response.headers['content-type'])
   if (!actualContentType.startsWith(expectedContentType)) {
-    throw new UnexpectedContentTypeError(expectedContentType, actualContentType)
+    throw new UnexpectedContentTypeError(actualContentType, expectedContentType)
   }
 }
```
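In both of the hunks above, the raw HTTP client value is funneled through `String(...)`, presumably because the response body and the `content-type` header are not guaranteed to be strings; without the coercion, the later `startsWith` call could throw on a non-string value. The second change also swaps the arguments to `UnexpectedContentTypeError`, apparently to match the constructor's actual-then-expected parameter order.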
`getCrawlingVersion.ts` (new file):

```diff
@@ -0,0 +1,5 @@
+const CRAWLING_VERSION = 0
+
+export default function getCrawlingVersion (): number {
+  return CRAWLING_VERSION + parseInt(process.env.CRAWLING_VERSION ?? '0')
+}
```
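The effective version is the hard-coded base plus whatever the `CRAWLING_VERSION` environment variable adds, so raising the variable changes the value stamped onto every stored node (see `createMissingNodes` and `setNodeRefreshed` below) and, per the README entry above, can force a recrawl of the whole index.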
`getMaxCrawlingDepth.ts` (new file):

```diff
@@ -0,0 +1,10 @@
+export default function getMaxCrawlingDepth (): number | undefined {
+  if (process.env.MAX_CRAWLING_DEPTH === undefined || process.env.MAX_CRAWLING_DEPTH === '') {
+    return undefined
+  }
+  const depth = parseInt(process.env.MAX_CRAWLING_DEPTH)
+  if (depth >= 0) {
+    return depth
+  }
+  return undefined
+}
```
```diff
@@ -1,6 +1,6 @@
 import { FeedData } from '../../Fediverse/Providers/FeedData'
-import { extractTags } from '../../StringTools/extractTags'
-import { extractEmails } from '../../StringTools/extractEmails'
+import { extractTags } from '../../Utils/extractTags'
+import { extractEmails } from '../../Utils/extractEmails'
 import { createFeed } from '../../Storage/Feeds/createFeed'
 import prepareFulltext from './prepareFulltext'
 import Feed from '../../Storage/Definitions/Feed'
```
```diff
@@ -1,6 +1,6 @@
 import { FeedData } from '../../Fediverse/Providers/FeedData'
-import { extractTags } from '../../StringTools/extractTags'
-import { extractEmails } from '../../StringTools/extractEmails'
+import { extractTags } from '../../Utils/extractTags'
+import { extractEmails } from '../../Utils/extractEmails'
 import { updateFeed } from '../../Storage/Feeds/updateFeed'
 import Feed from '../../Storage/Definitions/Feed'
 import Node from '../../Storage/Definitions/Node'
```
```diff
@@ -1,4 +1,5 @@
 import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
+import batchPromises from '../../Utils/batchPromises.js'
 import { refreshOrAddFeed } from './refreshOrAddFeed'
 import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
 import Node from '../../Storage/Definitions/Node'
@@ -21,10 +22,12 @@ export const refreshFeedsOnPage = async (
     provider: provider.getKey(),
     page
   })
-  return await Promise.all(
+  return await batchPromises(
     indexableFeedData.map(
-      async (feedDataItem) =>
-        await refreshOrAddFeed(elastic, node, feedDataItem)
-    )
+      (feedDataItem) => {
+        return async () => await refreshOrAddFeed(elastic, node, feedDataItem)
+      }
+    ),
+    Number(process.env.STORAGE_BATCH_SIZE ?? 5)
   )
 }
```
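Note the shape change: instead of mapping each feed to an already-running promise, the map now yields a factory (`() => Promise`). A promise begins executing the moment it is created, so only factories let `batchPromises` (shown at the end of this comparison) defer the work and keep at most `STORAGE_BATCH_SIZE` refreshes in flight at once.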
```diff
@@ -1,3 +1,4 @@
+import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
 import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
 import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
 import { findNewNodesOnPage } from './findNewNodesOnPage'
@@ -10,6 +11,13 @@ export const findNewNodes = async (
   node: Node,
   robotsTxt: RobotsTxt
 ): Promise<void> => {
+  const maxCrawlingDepth = getMaxCrawlingDepth()
+  if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
+    console.info('Skipping finding nodes, max crawling depth reached', {
+      maxCrawlingDepth
+    })
+    return
+  }
   try {
     // noinspection InfiniteLoopJS
     for (let page = 0; true; page++) {
```
```diff
@@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
     provider: provider.getKey(),
     page
   })
-  return await createMissingNodes(elastic, domains, node.domain)
+  return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
 }
```
```diff
@@ -6,6 +6,6 @@ export const addNodeSeed = async (
   domains: string[]
 ): Promise<boolean> => {
   console.info('Trying to add seed domain nodes', { domains })
-  const result = await createMissingNodes(elastic, domains, undefined)
+  const result = await createMissingNodes(elastic, domains, undefined, 0)
   return result > 0
 }
```
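Taken together with the previous two hunks, the depth bookkeeping works like this: seed nodes enter at depth 0, every node discovered from them is stored with its discoverer's depth plus one, and `findNewNodes` stops expanding a node once its depth reaches `MAX_CRAWLING_DEPTH`. For example, with `MAX_CRAWLING_DEPTH=2`, nodes at depth 0 and 1 are expanded, while nodes found at depth 2 are still indexed but no longer mined for further nodes.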
```diff
@@ -2,6 +2,7 @@ import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
 import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
 import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
 import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
+import batchPromises from '../Utils/batchPromises.js'
 import { refreshNodeInfo } from './NodeInfo/refreshNodeInfo'
 import { setNodeRefreshAttempted } from '../Storage/Nodes/setNodeRefreshAttempted'
 import { findNewNodes } from './Nodes/findNewNodes'
@@ -37,24 +38,30 @@ export const processNextNode = async (
   }
   const provider = providerRegistry.getProviderByKey(softwareName)
 
-  await Promise.all(
-    provider.getNodeProviders().map(async (nodeProvider: NodeProvider) => {
-      console.info('Searching for nodes', {
-        domain: node.domain,
-        provider: nodeProvider.getKey()
-      })
-      return await findNewNodes(elastic, nodeProvider, node, robotsTxt)
-    })
+  await batchPromises(
+    provider.getNodeProviders().map((nodeProvider: NodeProvider) => {
+      return async () => {
+        console.info('Searching for nodes', {
+          domain: node.domain,
+          provider: nodeProvider.getKey()
+        })
+        return await findNewNodes(elastic, nodeProvider, node, robotsTxt)
+      }
+    }),
+    Number(process.env.NODE_PROVIDER_BATCH_SIZE ?? 5)
   )
 
-  await Promise.all(
-    provider.getFeedProviders().map(async (feedProvider: FeedProvider) => {
-      console.info('Searching for feeds', {
-        domain: node.domain,
-        provider: feedProvider.getKey()
-      })
-      return await refreshFeeds(elastic, feedProvider, node, robotsTxt)
-    })
+  await batchPromises(
+    provider.getFeedProviders().map((feedProvider: FeedProvider) => {
+      return async () => {
+        console.info('Searching for feeds', {
+          domain: node.domain,
+          provider: feedProvider.getKey()
+        })
+        return await refreshFeeds(elastic, feedProvider, node, robotsTxt)
+      }
+    }),
+    Number(process.env.FEED_PROVIDER_BATCH_SIZE ?? 5)
   )
 
   await deleteOldFeeds(elastic, node)
```
`Node.ts`:

```diff
@@ -20,6 +20,8 @@ interface Node {
   discoveredByDomain?: string
   accountFeedCount?: number
   channelFeedCount?: number
+  crawlingDepth: number
+  crawlingVersion: number
 }
 
 export default Node
```
```diff
@@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
         }
       },
       accountFeedCount: { type: 'integer' },
-      channelFeedCount: { type: 'integer' }
+      channelFeedCount: { type: 'integer' },
+      crawlingDepth: { type: 'integer' },
+      crawlingVersion: { type: 'integer' }
     }
   })
 
   await elastic.indices.refresh({ index: nodeIndex })
 }
```
```diff
@@ -1,9 +1,11 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 export const createMissingNodes = async (
   elastic: ElasticClient,
   domains: string[],
-  discoveredByDomain: string | undefined
+  discoveredByDomain: string | undefined,
+  crawlingDepth: number
 ): Promise<number> => {
   const response = await elastic.bulk({
     index: nodeIndex,
@@ -14,6 +16,8 @@ export const createMissingNodes = async (
       {
         domain,
         discoveredByDomain,
+        crawlingDepth,
+        crawlingVersion: getCrawlingVersion(),
         foundAt: new Date().getTime()
       }
     ])
```
```diff
@@ -1,3 +1,4 @@
+import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
 import { ElasticClient } from '../ElasticClient'
 import nodeIndex from '../Definitions/nodeIndex'
 import Node from '../Definitions/Node'
@@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
     index: nodeIndex,
     id: node.domain,
     doc: {
-      refreshedAt: date.getTime()
+      refreshedAt: date.getTime(),
+      crawlingVersion: getCrawlingVersion()
     }
   })
   return assertDefined(
```
`PromiseFactory.ts` (new file):

```diff
@@ -0,0 +1,3 @@
+type PromiseFactory<TResult> = () => Promise<TResult>
+
+export default PromiseFactory
```
`batchPromises.ts` (new file):

```diff
@@ -0,0 +1,18 @@
+import PromiseFactory from './PromiseFactory.js'
+
+export default async function batchPromises<TResult> (
+  promiseFactories: Array<PromiseFactory<TResult>>,
+  batchSize: number
+): Promise<TResult[]> {
+  const results: TResult[] = []
+
+  do {
+    const batch = promiseFactories.splice(0, batchSize)
+    results.push(
+      ...await Promise.all(
+        batch.map(async promiseFactory => await promiseFactory())
+      )
+    )
+  } while (promiseFactories.length > 0)
+  return results
+}
```
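As a usage sketch (the domain list and the `fetch` target are illustrative assumptions, not code from the repository), the utility is called with an array of factories and a batch size:

```typescript
import batchPromises from './batchPromises.js'

// Hypothetical example: the domains and the URL path are made up.
const domains = ['a.example', 'b.example', 'c.example', 'd.example', 'e.example']

const factories = domains.map((domain) => {
  // Return a factory rather than a promise: the request only starts
  // once batchPromises invokes the factory as part of a batch.
  return async () => await fetch(`https://${domain}/.well-known/nodeinfo`)
})

// At most 2 requests are in flight at any time.
const responses = await batchPromises(factories, 2)
console.info('Fetched', responses.length, 'responses')
```

One design consequence worth noting: `batchPromises` drains the input array with `splice`, so the caller's `factories` array is empty afterwards and should not be reused. And because the loop is `do`/`while`, an empty input still runs one empty batch, which harmlessly resolves to `[]`.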