kopia lustrzana https://github.com/Stopka/fedicrawl
Porównaj commity
5 Commity
29acce3906
...
c0b10f2e7a
Autor | SHA1 | Data |
---|---|---|
Štěpán Škorpil | c0b10f2e7a | |
Štěpán Škorpil | 4113d78a17 | |
Štěpán Škorpil | 704c7c066e | |
Štěpán Škorpil | 8d0452d16b | |
Štěpán Škorpil | 45cafbe94b |
|
@ -26,6 +26,7 @@
|
|||
"geoip-lite": "^1.4.6",
|
||||
"npmlog": "^6.0.0",
|
||||
"rimraf": "^3.0.2",
|
||||
"robots-parser": "^3.0.0",
|
||||
"striptags": "^3.2.0",
|
||||
"typescript-collections": "^1.3.3",
|
||||
"zod": "^3.19.1"
|
||||
|
@ -76,6 +77,5 @@
|
|||
"eslintIgnore": [
|
||||
"dist",
|
||||
"node_modules"
|
||||
],
|
||||
"prettier": "prettier-config-standard"
|
||||
]
|
||||
}
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||
import { retrieveWellKnown } from './retrieveWellKnown'
|
||||
import { retrieveNodeInfo, NodeInfo } from './retrieveNodeInfo'
|
||||
import { NoSupportedLinkError } from './NoSupportedLinkError'
|
||||
|
||||
export const retrieveDomainNodeInfo = async (
|
||||
domain: string
|
||||
domain: string,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<NodeInfo> => {
|
||||
const wellKnown = await retrieveWellKnown(domain)
|
||||
const wellKnown = await retrieveWellKnown(domain, robotsTxt)
|
||||
const link = wellKnown.links.find(
|
||||
(link) => link.rel === 'http://nodeinfo.diaspora.software/ns/schema/2.0'
|
||||
)
|
||||
if (typeof link === 'undefined') {
|
||||
throw new NoSupportedLinkError(domain)
|
||||
}
|
||||
return await retrieveNodeInfo(link.href)
|
||||
return await retrieveNodeInfo(link.href, robotsTxt)
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { z } from 'zod'
|
||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||
|
||||
const schema = z.object({
|
||||
name: z.string().optional(),
|
||||
|
@ -27,9 +27,9 @@ const schema = z.object({
|
|||
|
||||
export type NodeInfo = z.infer<typeof schema>
|
||||
|
||||
export const retrieveNodeInfo = async (url: string): Promise<NodeInfo> => {
|
||||
export const retrieveNodeInfo = async (url: string, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
|
||||
console.info('Retrieving node info', { url })
|
||||
const nodeInfoResponse = await axios.get(url, {
|
||||
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(nodeInfoResponse)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||
|
||||
const wellKnownSchema = z.object({
|
||||
links: z.array(
|
||||
|
@ -14,10 +14,10 @@ const wellKnownSchema = z.object({
|
|||
|
||||
export type WellKnown = z.infer<typeof wellKnownSchema>
|
||||
|
||||
export const retrieveWellKnown = async (domain: string): Promise<WellKnown> => {
|
||||
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
|
||||
console.info('Retrieving well known', { domain })
|
||||
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
||||
const wellKnownResponse = await axios.get(wellKnownUrl, {
|
||||
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
|
||||
timeout: getDefaultTimeoutMilliseconds(),
|
||||
maxContentLength: 5000
|
||||
})
|
||||
|
|
|
@ -19,4 +19,5 @@ export interface FeedData {
|
|||
name: string
|
||||
hostDomain: string
|
||||
}
|
||||
indexable: boolean
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||
import { FeedData } from './FeedData'
|
||||
|
||||
export type FeedProviderMethod = (
|
||||
domain: string,
|
||||
page: number
|
||||
page: number,
|
||||
robotsTxt: RobotsTxt
|
||||
) => Promise<FeedData[]>
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
@ -6,7 +5,7 @@ import { FeedProviderMethod } from '../FeedProviderMethod'
|
|||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||
import { FeedData } from '../FeedData'
|
||||
|
||||
const limit = 500
|
||||
const limit = 40
|
||||
|
||||
const emojiSchema = z.object({
|
||||
shortcode: z.string(),
|
||||
|
@ -35,7 +34,8 @@ const schema = z.array(
|
|||
value: z.string(),
|
||||
verified_at: z.nullable(z.string())
|
||||
})
|
||||
)
|
||||
),
|
||||
noindex: z.boolean().optional().nullable()
|
||||
})
|
||||
)
|
||||
|
||||
|
@ -53,9 +53,10 @@ const replaceEmojis = (text: string, emojis: Emoji[]): string => {
|
|||
|
||||
export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
||||
domain,
|
||||
page
|
||||
page,
|
||||
robotsTxt
|
||||
): Promise<FeedData[]> => {
|
||||
const response = await axios.get('https://' + domain + '/api/v1/directory', {
|
||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/directory`, {
|
||||
params: {
|
||||
limit,
|
||||
offset: page * limit,
|
||||
|
@ -94,7 +95,8 @@ export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
|||
}
|
||||
}),
|
||||
type: 'account',
|
||||
parentFeed: undefined
|
||||
parentFeed: undefined,
|
||||
indexable: !(item.noindex ?? false)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
@ -7,12 +6,12 @@ import { NoMoreNodesError } from '../NoMoreNodesError'
|
|||
|
||||
const schema = z.array(z.string())
|
||||
|
||||
export const retrievePeers: NodeProviderMethod = async (domain, page) => {
|
||||
export const retrievePeers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
||||
if (page !== 0) {
|
||||
throw new NoMoreNodesError('peer')
|
||||
}
|
||||
const response = await axios.get(
|
||||
'https://' + domain + '/api/v1/instance/peers',
|
||||
const response = await robotsTxt.getIfAllowed(
|
||||
`https://${domain}/api/v1/instance/peers`,
|
||||
{
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
@ -15,10 +14,11 @@ const schema = z.array(
|
|||
|
||||
export const retrieveInstancesPage: NodeProviderMethod = async (
|
||||
domain,
|
||||
page
|
||||
page,
|
||||
robotsTxt
|
||||
) => {
|
||||
const response = await axios.post(
|
||||
'https://' + domain + '/api/federation/instances',
|
||||
const response = await robotsTxt.postIfAllowed(
|
||||
`https://${domain}/api/federation/instances`,
|
||||
{
|
||||
host: null,
|
||||
blocked: null,
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
@ -68,10 +67,11 @@ const parseDescription = (description: string | null): string => {
|
|||
|
||||
export const retrieveUsersPage: FeedProviderMethod = async (
|
||||
domain,
|
||||
page
|
||||
page,
|
||||
robotsTxt
|
||||
): Promise<FeedData[]> => {
|
||||
const response = await axios.post(
|
||||
'https://' + domain + '/api/users',
|
||||
const response = await robotsTxt.postIfAllowed(
|
||||
`https://${domain}/api/users`,
|
||||
{
|
||||
state: 'all',
|
||||
origin: 'local',
|
||||
|
@ -121,7 +121,8 @@ export const retrieveUsersPage: FeedProviderMethod = async (
|
|||
].filter((field) => field.value !== null) as FieldData[])
|
||||
],
|
||||
type: 'account',
|
||||
parentFeed: undefined
|
||||
parentFeed: undefined,
|
||||
indexable: true
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||
|
||||
export type NodeProviderMethod = (
|
||||
domain: string,
|
||||
page: number
|
||||
page: number,
|
||||
robotsTxt: RobotsTxt
|
||||
) => Promise<string[]>
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import { FeedData } from '../FeedData'
|
||||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { avatarSchema } from './Avatar'
|
||||
|
@ -29,8 +28,8 @@ const schema = z.object({
|
|||
)
|
||||
})
|
||||
|
||||
export const retrieveAccounts: FeedProviderMethod = async (domain, page) => {
|
||||
const response = await axios.get(`https://${domain}/api/v1/accounts`, {
|
||||
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
|
||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/accounts`, {
|
||||
params: {
|
||||
count: limit,
|
||||
sort: 'createdAt',
|
||||
|
@ -61,7 +60,8 @@ export const retrieveAccounts: FeedProviderMethod = async (domain, page) => {
|
|||
lastStatusAt: undefined,
|
||||
statusesCount: undefined,
|
||||
type: 'account',
|
||||
parentFeed: undefined
|
||||
parentFeed: undefined,
|
||||
indexable: true
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
@ -21,8 +20,8 @@ const schema = z.object({
|
|||
)
|
||||
})
|
||||
|
||||
export const retrieveFollowers: NodeProviderMethod = async (domain, page) => {
|
||||
const response = await axios.get(
|
||||
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
||||
const response = await robotsTxt.getIfAllowed(
|
||||
`https://${domain}/api/v1/server/followers`,
|
||||
{
|
||||
params: {
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import { FeedData } from '../FeedData'
|
||||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { FieldData } from '../FieldData'
|
||||
|
@ -38,9 +37,10 @@ const schema = z.object({
|
|||
|
||||
export const retrieveVideoChannels: FeedProviderMethod = async (
|
||||
domain,
|
||||
page
|
||||
page,
|
||||
robotsTxt
|
||||
) => {
|
||||
const response = await axios.get(`https://${domain}/api/v1/video-channels`, {
|
||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/video-channels`, {
|
||||
params: {
|
||||
count: limit,
|
||||
sort: 'createdAt',
|
||||
|
@ -78,7 +78,8 @@ export const retrieveVideoChannels: FeedProviderMethod = async (
|
|||
parentFeed: {
|
||||
name: item.ownerAccount.name,
|
||||
hostDomain: item.ownerAccount.host
|
||||
}
|
||||
},
|
||||
indexable: true
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
import { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||
|
||||
/**
 * Request helpers bound to the robots.txt rules of a single domain.
 */
export default interface RobotsTxt {
  /**
   * Tells whether the given URL may be requested under the loaded rules.
   */
  isAllowed: (url: string) => boolean

  /**
   * GET request counterpart of `axios.get` that is guarded by `isAllowed`.
   * The implementation returned by `fetchRobotsTxt` rejects with
   * `RobotsTxtError` when the URL is not allowed.
   */
  getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(
    url: string,
    config?: AxiosRequestConfig<D>
  ) => Promise<R>

  /**
   * POST request counterpart of `axios.post` that is guarded by `isAllowed`.
   * The implementation returned by `fetchRobotsTxt` rejects with
   * `RobotsTxtError` when the URL is not allowed.
   */
  postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(
    url: string,
    data?: D,
    config?: AxiosRequestConfig<D>
  ) => Promise<R>
}
|
|
@ -0,0 +1,7 @@
|
|||
/**
 * Error raised when a request is refused because robots.txt disallows its URL.
 */
export class RobotsTxtError extends Error {
  /** URL of the request that was blocked. */
  public readonly url: string

  /**
   * @param url - URL that was about to be requested when the block was detected.
   */
  public constructor (url: string) {
    super('Request was blocked by robots.txt')
    // Without an explicit name the instance reports itself as a plain "Error"
    // in logs and stack traces, hiding the reason the request failed.
    this.name = 'RobotsTxtError'
    this.url = url
  }
}
|
|
@ -0,0 +1,41 @@
|
|||
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||
import robotsParser from 'robots-parser'
|
||||
import RobotsTxt from './RobotsTxt.js'
|
||||
import { RobotsTxtError } from './RobotsTxtError.js'
|
||||
|
||||
const userAgent = 'FediCrawl/1.0'
|
||||
|
||||
/**
 * Downloads and parses `https://<domain>/robots.txt` and returns request
 * helpers that refuse to contact URLs the rules disallow for this crawler.
 *
 * A failed download is logged and handled as an empty rule set, which means
 * every URL is allowed in that case.
 *
 * @param domain - Host name whose robots.txt should be loaded.
 * @returns Object exposing `isAllowed`, `getIfAllowed` and `postIfAllowed`.
 */
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
  console.info('Fetching robots.txt', { domain })
  const url = `https://${domain}/robots.txt`
  let content = ''
  try {
    // Identify the crawler when asking for the rules as well, so the server
    // sees the same agent the rules are later evaluated for.
    const robotsTxt = await axios.get(url, {
      headers: { 'User-Agent': userAgent }
    })
    // axios may hand back an already parsed (non-string) body; only plain
    // text is passed on to the parser, anything else counts as "no rules".
    if (typeof robotsTxt.data === 'string') {
      content = robotsTxt.data
    }
  } catch (error) {
    console.info('Robots.txt not found', { error, url })
  }
  const robots = robotsParser(url, content)

  // The parser may give no verdict (undefined) for a URL; that is treated as allowed.
  const isAllowed = (url: string): boolean => robots.isAllowed(url, userAgent) ?? true

  // Merges the crawler's User-Agent into the caller's config. The spread order
  // matters: caller headers are kept, but the User-Agent always stays the one
  // the robots.txt rules were checked against.
  const withUserAgent = <D>(config?: AxiosRequestConfig<D>): AxiosRequestConfig<D> => ({
    ...config,
    headers: { ...config?.headers, 'User-Agent': userAgent }
  })

  return {
    isAllowed,
    getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(url)) {
        throw new RobotsTxtError(url)
      }
      return await axios.get(url, withUserAgent(config))
    },
    postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(url)) {
        throw new RobotsTxtError(url)
      }
      return await axios.post(url, data, withUserAgent(config))
    }
  }
}
|
|
@ -1,3 +1,4 @@
|
|||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { refreshFeedsOnPage } from './refreshFeedsOnPage'
|
||||
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
||||
import Node from '../../Storage/Definitions/Node'
|
||||
|
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
|
|||
export const refreshFeeds = async (
|
||||
elastic: ElasticClient,
|
||||
provider: FeedProvider,
|
||||
node: Node
|
||||
node: Node,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<void> => {
|
||||
try {
|
||||
// noinspection InfiniteLoopJS
|
||||
|
@ -16,7 +18,7 @@ export const refreshFeeds = async (
|
|||
provider: provider.getKey(),
|
||||
page
|
||||
})
|
||||
await refreshFeedsOnPage(elastic, provider, node, page)
|
||||
await refreshFeedsOnPage(elastic, provider, node, page, robotsTxt)
|
||||
}
|
||||
} catch (error) {
|
||||
console.info('Feed search finished', {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { refreshOrAddFeed } from './refreshOrAddFeed'
|
||||
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
||||
import Node from '../../Storage/Definitions/Node'
|
||||
|
@ -8,17 +9,20 @@ export const refreshFeedsOnPage = async (
|
|||
elastic: ElasticClient,
|
||||
provider: FeedProvider,
|
||||
node: Node,
|
||||
page: number
|
||||
page: number,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<Feed[]> => {
|
||||
const feedData = await provider.retrieveFeeds(node.domain, page)
|
||||
const feedData = await provider.retrieveFeeds(node.domain, page, robotsTxt)
|
||||
const indexableFeedData = feedData.filter(item => item.indexable && !item.description.includes('#noindex'))
|
||||
console.info('Retrieved feeds', {
|
||||
count: feedData.length,
|
||||
indexableCount: indexableFeedData.length,
|
||||
domain: node.domain,
|
||||
provider: provider.getKey(),
|
||||
page
|
||||
})
|
||||
return await Promise.all(
|
||||
feedData.map(
|
||||
indexableFeedData.map(
|
||||
async (feedDataItem) =>
|
||||
await refreshOrAddFeed(elastic, node, feedDataItem)
|
||||
)
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import { retrieveDomainNodeInfo } from '../../Fediverse/NodeInfo/retrieveDomainNodeInfo'
|
||||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { updateNodeInfo } from '../../Storage/Nodes/updateNodeInfo'
|
||||
import Node from '../../Storage/Definitions/Node'
|
||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||
|
||||
export const refreshNodeInfo = async (
|
||||
elastic: ElasticClient,
|
||||
node: Node
|
||||
node: Node,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<Node> => {
|
||||
console.info('Updating info of node', { nodeDomain: node.domain })
|
||||
try {
|
||||
const nodeInfo = await retrieveDomainNodeInfo(node.domain)
|
||||
const nodeInfo = await retrieveDomainNodeInfo(node.domain, robotsTxt)
|
||||
return await updateNodeInfo(elastic, node, nodeInfo)
|
||||
} catch (error) {
|
||||
console.warn('Failed to update node info', error)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { findNewNodesOnPage } from './findNewNodesOnPage'
|
||||
import Node from '../../Storage/Definitions/Node'
|
||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||
|
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
|
|||
export const findNewNodes = async (
|
||||
elastic: ElasticClient,
|
||||
provider: NodeProvider,
|
||||
node: Node
|
||||
node: Node,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<void> => {
|
||||
try {
|
||||
// noinspection InfiniteLoopJS
|
||||
|
@ -15,7 +17,7 @@ export const findNewNodes = async (
|
|||
domain: node.domain,
|
||||
provider: provider.getKey()
|
||||
})
|
||||
await findNewNodesOnPage(elastic, provider, node, page)
|
||||
await findNewNodesOnPage(elastic, provider, node, page, robotsTxt)
|
||||
}
|
||||
} catch (error) {
|
||||
console.info('Node search finished', {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
|
||||
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||
import Node from '../../Storage/Definitions/Node'
|
||||
|
@ -8,9 +9,10 @@ export const findNewNodesOnPage = async (
|
|||
elastic: ElasticClient,
|
||||
provider: NodeProvider,
|
||||
node: Node,
|
||||
page: number
|
||||
page: number,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<number> => {
|
||||
let domains = await provider.retrieveNodes(node.domain, page)
|
||||
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
|
||||
domains = domains.filter(isDomainNotBanned)
|
||||
console.log('Found nodes', {
|
||||
count: domains.length,
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
|
||||
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
||||
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
||||
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
||||
|
@ -21,7 +22,8 @@ export const processNextNode = async (
|
|||
node = await setNodeRefreshAttempted(elastic, node)
|
||||
|
||||
node = await refreshNodeIps(elastic, node)
|
||||
node = await refreshNodeInfo(elastic, node)
|
||||
const robotsTxt = await fetchRobotsTxt(node.domain)
|
||||
node = await refreshNodeInfo(elastic, node, robotsTxt)
|
||||
|
||||
const softwareName = node.softwareName ?? ''
|
||||
if (!providerRegistry.containsKey(softwareName)) {
|
||||
|
@ -41,7 +43,7 @@ export const processNextNode = async (
|
|||
domain: node.domain,
|
||||
provider: nodeProvider.getKey()
|
||||
})
|
||||
return await findNewNodes(elastic, nodeProvider, node)
|
||||
return await findNewNodes(elastic, nodeProvider, node, robotsTxt)
|
||||
})
|
||||
)
|
||||
|
||||
|
@ -51,7 +53,7 @@ export const processNextNode = async (
|
|||
domain: node.domain,
|
||||
provider: feedProvider.getKey()
|
||||
})
|
||||
return await refreshFeeds(elastic, feedProvider, node)
|
||||
return await refreshFeeds(elastic, feedProvider, node, robotsTxt)
|
||||
})
|
||||
)
|
||||
|
||||
|
|
|
@ -8,23 +8,6 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
|
|||
id: 'node',
|
||||
description: 'Default node pipeline',
|
||||
processors: [
|
||||
{
|
||||
// @ts-expect-error
|
||||
geoip: {
|
||||
ignore_missing: true,
|
||||
field: 'serverIps',
|
||||
properties: [
|
||||
'location',
|
||||
'continent_name',
|
||||
'country_name',
|
||||
'country_iso_code',
|
||||
'region_iso_code',
|
||||
'region_name',
|
||||
'city_name'
|
||||
],
|
||||
target_field: 'geoip'
|
||||
}
|
||||
},
|
||||
{
|
||||
grok: {
|
||||
ignore_missing: true,
|
||||
|
|
|
@ -3071,6 +3071,11 @@ rimraf@^3.0.2:
|
|||
dependencies:
|
||||
glob "^7.1.3"
|
||||
|
||||
robots-parser@^3.0.0:
|
||||
version "3.0.0"
|
||||
resolved "https://registry.yarnpkg.com/robots-parser/-/robots-parser-3.0.0.tgz#66af89306302ecd004455f2f24298310d0966631"
|
||||
integrity sha512-6xkze3WRdneibICBAzMKcXyTKQw5shA3GbwoEJy7RSvxpZNGF0GMuYKE1T0VMP4fwx/fQs0n0mtriOqRtk5L1w==
|
||||
|
||||
run-parallel@^1.1.9:
|
||||
version "1.2.0"
|
||||
resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"
|
||||
|
|
Ładowanie…
Reference in New Issue