kopia lustrzana https://github.com/Stopka/fedicrawl
Porównaj commity
5 commitów
29acce3906
...
c0b10f2e7a
| Autor | SHA1 | Data |
|---|---|---|
| Štěpán Škorpil | c0b10f2e7a | |
| Štěpán Škorpil | 4113d78a17 | |
| Štěpán Škorpil | 704c7c066e | |
| Štěpán Škorpil | 8d0452d16b | |
| Štěpán Škorpil | 45cafbe94b | |
|
@ -26,6 +26,7 @@
|
||||||
"geoip-lite": "^1.4.6",
|
"geoip-lite": "^1.4.6",
|
||||||
"npmlog": "^6.0.0",
|
"npmlog": "^6.0.0",
|
||||||
"rimraf": "^3.0.2",
|
"rimraf": "^3.0.2",
|
||||||
|
"robots-parser": "^3.0.0",
|
||||||
"striptags": "^3.2.0",
|
"striptags": "^3.2.0",
|
||||||
"typescript-collections": "^1.3.3",
|
"typescript-collections": "^1.3.3",
|
||||||
"zod": "^3.19.1"
|
"zod": "^3.19.1"
|
||||||
|
@ -76,6 +77,5 @@
|
||||||
"eslintIgnore": [
|
"eslintIgnore": [
|
||||||
"dist",
|
"dist",
|
||||||
"node_modules"
|
"node_modules"
|
||||||
],
|
]
|
||||||
"prettier": "prettier-config-standard"
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
import { retrieveWellKnown } from './retrieveWellKnown'
|
import { retrieveWellKnown } from './retrieveWellKnown'
|
||||||
import { retrieveNodeInfo, NodeInfo } from './retrieveNodeInfo'
|
import { retrieveNodeInfo, NodeInfo } from './retrieveNodeInfo'
|
||||||
import { NoSupportedLinkError } from './NoSupportedLinkError'
|
import { NoSupportedLinkError } from './NoSupportedLinkError'
|
||||||
|
|
||||||
export const retrieveDomainNodeInfo = async (
|
export const retrieveDomainNodeInfo = async (
|
||||||
domain: string
|
domain: string,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<NodeInfo> => {
|
): Promise<NodeInfo> => {
|
||||||
const wellKnown = await retrieveWellKnown(domain)
|
const wellKnown = await retrieveWellKnown(domain, robotsTxt)
|
||||||
const link = wellKnown.links.find(
|
const link = wellKnown.links.find(
|
||||||
(link) => link.rel === 'http://nodeinfo.diaspora.software/ns/schema/2.0'
|
(link) => link.rel === 'http://nodeinfo.diaspora.software/ns/schema/2.0'
|
||||||
)
|
)
|
||||||
if (typeof link === 'undefined') {
|
if (typeof link === 'undefined') {
|
||||||
throw new NoSupportedLinkError(domain)
|
throw new NoSupportedLinkError(domain)
|
||||||
}
|
}
|
||||||
return await retrieveNodeInfo(link.href)
|
return await retrieveNodeInfo(link.href, robotsTxt)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||||
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
name: z.string().optional(),
|
name: z.string().optional(),
|
||||||
|
@ -27,9 +27,9 @@ const schema = z.object({
|
||||||
|
|
||||||
export type NodeInfo = z.infer<typeof schema>
|
export type NodeInfo = z.infer<typeof schema>
|
||||||
|
|
||||||
export const retrieveNodeInfo = async (url: string): Promise<NodeInfo> => {
|
export const retrieveNodeInfo = async (url: string, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
|
||||||
console.info('Retrieving node info', { url })
|
console.info('Retrieving node info', { url })
|
||||||
const nodeInfoResponse = await axios.get(url, {
|
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(nodeInfoResponse)
|
assertSuccessJsonResponse(nodeInfoResponse)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||||
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
|
|
||||||
const wellKnownSchema = z.object({
|
const wellKnownSchema = z.object({
|
||||||
links: z.array(
|
links: z.array(
|
||||||
|
@ -14,10 +14,10 @@ const wellKnownSchema = z.object({
|
||||||
|
|
||||||
export type WellKnown = z.infer<typeof wellKnownSchema>
|
export type WellKnown = z.infer<typeof wellKnownSchema>
|
||||||
|
|
||||||
export const retrieveWellKnown = async (domain: string): Promise<WellKnown> => {
|
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
|
||||||
console.info('Retrieving well known', { domain })
|
console.info('Retrieving well known', { domain })
|
||||||
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
||||||
const wellKnownResponse = await axios.get(wellKnownUrl, {
|
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
|
||||||
timeout: getDefaultTimeoutMilliseconds(),
|
timeout: getDefaultTimeoutMilliseconds(),
|
||||||
maxContentLength: 5000
|
maxContentLength: 5000
|
||||||
})
|
})
|
||||||
|
|
|
@ -19,4 +19,5 @@ export interface FeedData {
|
||||||
name: string
|
name: string
|
||||||
hostDomain: string
|
hostDomain: string
|
||||||
}
|
}
|
||||||
|
indexable: boolean
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
import { FeedData } from './FeedData'
|
import { FeedData } from './FeedData'
|
||||||
|
|
||||||
export type FeedProviderMethod = (
|
export type FeedProviderMethod = (
|
||||||
domain: string,
|
domain: string,
|
||||||
page: number
|
page: number,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
) => Promise<FeedData[]>
|
) => Promise<FeedData[]>
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
@ -6,7 +5,7 @@ import { FeedProviderMethod } from '../FeedProviderMethod'
|
||||||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
|
|
||||||
const limit = 500
|
const limit = 40
|
||||||
|
|
||||||
const emojiSchema = z.object({
|
const emojiSchema = z.object({
|
||||||
shortcode: z.string(),
|
shortcode: z.string(),
|
||||||
|
@ -35,7 +34,8 @@ const schema = z.array(
|
||||||
value: z.string(),
|
value: z.string(),
|
||||||
verified_at: z.nullable(z.string())
|
verified_at: z.nullable(z.string())
|
||||||
})
|
})
|
||||||
)
|
),
|
||||||
|
noindex: z.boolean().optional().nullable()
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -53,9 +53,10 @@ const replaceEmojis = (text: string, emojis: Emoji[]): string => {
|
||||||
|
|
||||||
export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
||||||
domain,
|
domain,
|
||||||
page
|
page,
|
||||||
|
robotsTxt
|
||||||
): Promise<FeedData[]> => {
|
): Promise<FeedData[]> => {
|
||||||
const response = await axios.get('https://' + domain + '/api/v1/directory', {
|
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/directory`, {
|
||||||
params: {
|
params: {
|
||||||
limit,
|
limit,
|
||||||
offset: page * limit,
|
offset: page * limit,
|
||||||
|
@ -94,7 +95,8 @@ export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
type: 'account',
|
type: 'account',
|
||||||
parentFeed: undefined
|
parentFeed: undefined,
|
||||||
|
indexable: !(item.noindex ?? false)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
@ -7,12 +6,12 @@ import { NoMoreNodesError } from '../NoMoreNodesError'
|
||||||
|
|
||||||
const schema = z.array(z.string())
|
const schema = z.array(z.string())
|
||||||
|
|
||||||
export const retrievePeers: NodeProviderMethod = async (domain, page) => {
|
export const retrievePeers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
||||||
if (page !== 0) {
|
if (page !== 0) {
|
||||||
throw new NoMoreNodesError('peer')
|
throw new NoMoreNodesError('peer')
|
||||||
}
|
}
|
||||||
const response = await axios.get(
|
const response = await robotsTxt.getIfAllowed(
|
||||||
'https://' + domain + '/api/v1/instance/peers',
|
`https://${domain}/api/v1/instance/peers`,
|
||||||
{
|
{
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
@ -15,10 +14,11 @@ const schema = z.array(
|
||||||
|
|
||||||
export const retrieveInstancesPage: NodeProviderMethod = async (
|
export const retrieveInstancesPage: NodeProviderMethod = async (
|
||||||
domain,
|
domain,
|
||||||
page
|
page,
|
||||||
|
robotsTxt
|
||||||
) => {
|
) => {
|
||||||
const response = await axios.post(
|
const response = await robotsTxt.postIfAllowed(
|
||||||
'https://' + domain + '/api/federation/instances',
|
`https://${domain}/api/federation/instances`,
|
||||||
{
|
{
|
||||||
host: null,
|
host: null,
|
||||||
blocked: null,
|
blocked: null,
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
@ -68,10 +67,11 @@ const parseDescription = (description: string | null): string => {
|
||||||
|
|
||||||
export const retrieveUsersPage: FeedProviderMethod = async (
|
export const retrieveUsersPage: FeedProviderMethod = async (
|
||||||
domain,
|
domain,
|
||||||
page
|
page,
|
||||||
|
robotsTxt
|
||||||
): Promise<FeedData[]> => {
|
): Promise<FeedData[]> => {
|
||||||
const response = await axios.post(
|
const response = await robotsTxt.postIfAllowed(
|
||||||
'https://' + domain + '/api/users',
|
`https://${domain}/api/users`,
|
||||||
{
|
{
|
||||||
state: 'all',
|
state: 'all',
|
||||||
origin: 'local',
|
origin: 'local',
|
||||||
|
@ -121,7 +121,8 @@ export const retrieveUsersPage: FeedProviderMethod = async (
|
||||||
].filter((field) => field.value !== null) as FieldData[])
|
].filter((field) => field.value !== null) as FieldData[])
|
||||||
],
|
],
|
||||||
type: 'account',
|
type: 'account',
|
||||||
parentFeed: undefined
|
parentFeed: undefined,
|
||||||
|
indexable: true
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,7 @@
|
||||||
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
|
|
||||||
export type NodeProviderMethod = (
|
export type NodeProviderMethod = (
|
||||||
domain: string,
|
domain: string,
|
||||||
page: number
|
page: number,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
) => Promise<string[]>
|
) => Promise<string[]>
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { avatarSchema } from './Avatar'
|
import { avatarSchema } from './Avatar'
|
||||||
|
@ -29,8 +28,8 @@ const schema = z.object({
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
export const retrieveAccounts: FeedProviderMethod = async (domain, page) => {
|
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
|
||||||
const response = await axios.get(`https://${domain}/api/v1/accounts`, {
|
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/accounts`, {
|
||||||
params: {
|
params: {
|
||||||
count: limit,
|
count: limit,
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
|
@ -61,7 +60,8 @@ export const retrieveAccounts: FeedProviderMethod = async (domain, page) => {
|
||||||
lastStatusAt: undefined,
|
lastStatusAt: undefined,
|
||||||
statusesCount: undefined,
|
statusesCount: undefined,
|
||||||
type: 'account',
|
type: 'account',
|
||||||
parentFeed: undefined
|
parentFeed: undefined,
|
||||||
|
indexable: true
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
@ -21,8 +20,8 @@ const schema = z.object({
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
export const retrieveFollowers: NodeProviderMethod = async (domain, page) => {
|
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
||||||
const response = await axios.get(
|
const response = await robotsTxt.getIfAllowed(
|
||||||
`https://${domain}/api/v1/server/followers`,
|
`https://${domain}/api/v1/server/followers`,
|
||||||
{
|
{
|
||||||
params: {
|
params: {
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
import axios from 'axios'
|
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { FieldData } from '../FieldData'
|
import { FieldData } from '../FieldData'
|
||||||
|
@ -38,9 +37,10 @@ const schema = z.object({
|
||||||
|
|
||||||
export const retrieveVideoChannels: FeedProviderMethod = async (
|
export const retrieveVideoChannels: FeedProviderMethod = async (
|
||||||
domain,
|
domain,
|
||||||
page
|
page,
|
||||||
|
robotsTxt
|
||||||
) => {
|
) => {
|
||||||
const response = await axios.get(`https://${domain}/api/v1/video-channels`, {
|
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/video-channels`, {
|
||||||
params: {
|
params: {
|
||||||
count: limit,
|
count: limit,
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
|
@ -78,7 +78,8 @@ export const retrieveVideoChannels: FeedProviderMethod = async (
|
||||||
parentFeed: {
|
parentFeed: {
|
||||||
name: item.ownerAccount.name,
|
name: item.ownerAccount.name,
|
||||||
hostDomain: item.ownerAccount.host
|
hostDomain: item.ownerAccount.host
|
||||||
}
|
},
|
||||||
|
indexable: true
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
import { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||||
|
|
||||||
|
export default interface RobotsTxt {
|
||||||
|
isAllowed: (url: string) => boolean
|
||||||
|
getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>) => Promise<R>
|
||||||
|
postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>) => Promise<R>
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
export class RobotsTxtError extends Error {
|
||||||
|
public readonly url
|
||||||
|
public constructor (url: string) {
|
||||||
|
super('Request was blocked by robots.txt')
|
||||||
|
this.url = url
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||||
|
import robotsParser from 'robots-parser'
|
||||||
|
import RobotsTxt from './RobotsTxt.js'
|
||||||
|
import { RobotsTxtError } from './RobotsTxtError.js'
|
||||||
|
|
||||||
|
const userAgent = 'FediCrawl/1.0'
|
||||||
|
|
||||||
|
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
|
||||||
|
console.info('Fetching robots.txt', { domain })
|
||||||
|
const url = `https://${domain}/robots.txt`
|
||||||
|
let content = ''
|
||||||
|
try {
|
||||||
|
const robotsTxt = await axios.get(url)
|
||||||
|
content = robotsTxt.data
|
||||||
|
} catch (error) {
|
||||||
|
console.info('Robots.txt not found', { error, url })
|
||||||
|
}
|
||||||
|
const robots = robotsParser(url, content)
|
||||||
|
const isAllowed = (url: string): boolean => robots.isAllowed(url, userAgent) ?? true
|
||||||
|
return {
|
||||||
|
isAllowed,
|
||||||
|
getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>): Promise<R> => {
|
||||||
|
if (!isAllowed(url)) {
|
||||||
|
throw new RobotsTxtError(url)
|
||||||
|
}
|
||||||
|
return await axios.get(url, {
|
||||||
|
headers: { 'User-Agent': userAgent },
|
||||||
|
...config
|
||||||
|
})
|
||||||
|
},
|
||||||
|
postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
|
||||||
|
if (!isAllowed(url)) {
|
||||||
|
throw new RobotsTxtError(url)
|
||||||
|
}
|
||||||
|
return await axios.post(url, data, {
|
||||||
|
headers: { 'User-Agent': userAgent },
|
||||||
|
...config
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||||
import { refreshFeedsOnPage } from './refreshFeedsOnPage'
|
import { refreshFeedsOnPage } from './refreshFeedsOnPage'
|
||||||
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
|
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
|
||||||
export const refreshFeeds = async (
|
export const refreshFeeds = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
provider: FeedProvider,
|
provider: FeedProvider,
|
||||||
node: Node
|
node: Node,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<void> => {
|
): Promise<void> => {
|
||||||
try {
|
try {
|
||||||
// noinspection InfiniteLoopJS
|
// noinspection InfiniteLoopJS
|
||||||
|
@ -16,7 +18,7 @@ export const refreshFeeds = async (
|
||||||
provider: provider.getKey(),
|
provider: provider.getKey(),
|
||||||
page
|
page
|
||||||
})
|
})
|
||||||
await refreshFeedsOnPage(elastic, provider, node, page)
|
await refreshFeedsOnPage(elastic, provider, node, page, robotsTxt)
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.info('Feed search finished', {
|
console.info('Feed search finished', {
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||||
import { refreshOrAddFeed } from './refreshOrAddFeed'
|
import { refreshOrAddFeed } from './refreshOrAddFeed'
|
||||||
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
|
@ -8,17 +9,20 @@ export const refreshFeedsOnPage = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
provider: FeedProvider,
|
provider: FeedProvider,
|
||||||
node: Node,
|
node: Node,
|
||||||
page: number
|
page: number,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<Feed[]> => {
|
): Promise<Feed[]> => {
|
||||||
const feedData = await provider.retrieveFeeds(node.domain, page)
|
const feedData = await provider.retrieveFeeds(node.domain, page, robotsTxt)
|
||||||
|
const indexableFeedData = feedData.filter(item => item.indexable && !item.description.includes('#noindex'))
|
||||||
console.info('Retrieved feeds', {
|
console.info('Retrieved feeds', {
|
||||||
count: feedData.length,
|
count: feedData.length,
|
||||||
|
indexableCount: indexableFeedData.length,
|
||||||
domain: node.domain,
|
domain: node.domain,
|
||||||
provider: provider.getKey(),
|
provider: provider.getKey(),
|
||||||
page
|
page
|
||||||
})
|
})
|
||||||
return await Promise.all(
|
return await Promise.all(
|
||||||
feedData.map(
|
indexableFeedData.map(
|
||||||
async (feedDataItem) =>
|
async (feedDataItem) =>
|
||||||
await refreshOrAddFeed(elastic, node, feedDataItem)
|
await refreshOrAddFeed(elastic, node, feedDataItem)
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,15 +1,17 @@
|
||||||
import { retrieveDomainNodeInfo } from '../../Fediverse/NodeInfo/retrieveDomainNodeInfo'
|
import { retrieveDomainNodeInfo } from '../../Fediverse/NodeInfo/retrieveDomainNodeInfo'
|
||||||
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||||
import { updateNodeInfo } from '../../Storage/Nodes/updateNodeInfo'
|
import { updateNodeInfo } from '../../Storage/Nodes/updateNodeInfo'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||||
|
|
||||||
export const refreshNodeInfo = async (
|
export const refreshNodeInfo = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
node: Node
|
node: Node,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<Node> => {
|
): Promise<Node> => {
|
||||||
console.info('Updating info of node', { nodeDomain: node.domain })
|
console.info('Updating info of node', { nodeDomain: node.domain })
|
||||||
try {
|
try {
|
||||||
const nodeInfo = await retrieveDomainNodeInfo(node.domain)
|
const nodeInfo = await retrieveDomainNodeInfo(node.domain, robotsTxt)
|
||||||
return await updateNodeInfo(elastic, node, nodeInfo)
|
return await updateNodeInfo(elastic, node, nodeInfo)
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn('Failed to update node info', error)
|
console.warn('Failed to update node info', error)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||||
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||||
import { findNewNodesOnPage } from './findNewNodesOnPage'
|
import { findNewNodesOnPage } from './findNewNodesOnPage'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||||
|
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
|
||||||
export const findNewNodes = async (
|
export const findNewNodes = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
provider: NodeProvider,
|
provider: NodeProvider,
|
||||||
node: Node
|
node: Node,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<void> => {
|
): Promise<void> => {
|
||||||
try {
|
try {
|
||||||
// noinspection InfiniteLoopJS
|
// noinspection InfiniteLoopJS
|
||||||
|
@ -15,7 +17,7 @@ export const findNewNodes = async (
|
||||||
domain: node.domain,
|
domain: node.domain,
|
||||||
provider: provider.getKey()
|
provider: provider.getKey()
|
||||||
})
|
})
|
||||||
await findNewNodesOnPage(elastic, provider, node, page)
|
await findNewNodesOnPage(elastic, provider, node, page, robotsTxt)
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.info('Node search finished', {
|
console.info('Node search finished', {
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||||
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
|
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
|
||||||
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
|
@ -8,9 +9,10 @@ export const findNewNodesOnPage = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
provider: NodeProvider,
|
provider: NodeProvider,
|
||||||
node: Node,
|
node: Node,
|
||||||
page: number
|
page: number,
|
||||||
|
robotsTxt: RobotsTxt
|
||||||
): Promise<number> => {
|
): Promise<number> => {
|
||||||
let domains = await provider.retrieveNodes(node.domain, page)
|
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
|
||||||
domains = domains.filter(isDomainNotBanned)
|
domains = domains.filter(isDomainNotBanned)
|
||||||
console.log('Found nodes', {
|
console.log('Found nodes', {
|
||||||
count: domains.length,
|
count: domains.length,
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
|
||||||
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
||||||
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
||||||
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
||||||
|
@ -21,7 +22,8 @@ export const processNextNode = async (
|
||||||
node = await setNodeRefreshAttempted(elastic, node)
|
node = await setNodeRefreshAttempted(elastic, node)
|
||||||
|
|
||||||
node = await refreshNodeIps(elastic, node)
|
node = await refreshNodeIps(elastic, node)
|
||||||
node = await refreshNodeInfo(elastic, node)
|
const robotsTxt = await fetchRobotsTxt(node.domain)
|
||||||
|
node = await refreshNodeInfo(elastic, node, robotsTxt)
|
||||||
|
|
||||||
const softwareName = node.softwareName ?? ''
|
const softwareName = node.softwareName ?? ''
|
||||||
if (!providerRegistry.containsKey(softwareName)) {
|
if (!providerRegistry.containsKey(softwareName)) {
|
||||||
|
@ -41,7 +43,7 @@ export const processNextNode = async (
|
||||||
domain: node.domain,
|
domain: node.domain,
|
||||||
provider: nodeProvider.getKey()
|
provider: nodeProvider.getKey()
|
||||||
})
|
})
|
||||||
return await findNewNodes(elastic, nodeProvider, node)
|
return await findNewNodes(elastic, nodeProvider, node, robotsTxt)
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -51,7 +53,7 @@ export const processNextNode = async (
|
||||||
domain: node.domain,
|
domain: node.domain,
|
||||||
provider: feedProvider.getKey()
|
provider: feedProvider.getKey()
|
||||||
})
|
})
|
||||||
return await refreshFeeds(elastic, feedProvider, node)
|
return await refreshFeeds(elastic, feedProvider, node, robotsTxt)
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -8,23 +8,6 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
|
||||||
id: 'node',
|
id: 'node',
|
||||||
description: 'Default node pipeline',
|
description: 'Default node pipeline',
|
||||||
processors: [
|
processors: [
|
||||||
{
|
|
||||||
// @ts-expect-error
|
|
||||||
geoip: {
|
|
||||||
ignore_missing: true,
|
|
||||||
field: 'serverIps',
|
|
||||||
properties: [
|
|
||||||
'location',
|
|
||||||
'continent_name',
|
|
||||||
'country_name',
|
|
||||||
'country_iso_code',
|
|
||||||
'region_iso_code',
|
|
||||||
'region_name',
|
|
||||||
'city_name'
|
|
||||||
],
|
|
||||||
target_field: 'geoip'
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
grok: {
|
grok: {
|
||||||
ignore_missing: true,
|
ignore_missing: true,
|
||||||
|
|
|
@ -3071,6 +3071,11 @@ rimraf@^3.0.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
glob "^7.1.3"
|
glob "^7.1.3"
|
||||||
|
|
||||||
|
robots-parser@^3.0.0:
|
||||||
|
version "3.0.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/robots-parser/-/robots-parser-3.0.0.tgz#66af89306302ecd004455f2f24298310d0966631"
|
||||||
|
integrity sha512-6xkze3WRdneibICBAzMKcXyTKQw5shA3GbwoEJy7RSvxpZNGF0GMuYKE1T0VMP4fwx/fQs0n0mtriOqRtk5L1w==
|
||||||
|
|
||||||
run-parallel@^1.1.9:
|
run-parallel@^1.1.9:
|
||||||
version "1.2.0"
|
version "1.2.0"
|
||||||
resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"
|
resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"
|
||||||
|
|
Ładowanie…
Reference in New Issue