kopia lustrzana https://github.com/Stopka/fedicrawl
Added more env configs
rodzic
a83039e9ab
commit
0c2ade0b52
|
@ -1,6 +1,11 @@
|
|||
FROM node:16-bullseye AS build
|
||||
ENV POSTGRES_URL='postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public' \
|
||||
SEED_NODE_DOMAIN='mastodon.social'
|
||||
SEED_NODE_DOMAIN='mastodon.social' \
|
||||
REATTEMPT_MINUTES='60' \
|
||||
REFRESH_HOURS='120' \
|
||||
WAIT_FOR_JOB_MINUTES='60' \
|
||||
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
|
||||
TZ='UTC'
|
||||
WORKDIR /srv
|
||||
COPY application/package*.json ./
|
||||
COPY application/prisma ./prisma/
|
||||
|
|
14
README.md
14
README.md
|
@ -22,11 +22,15 @@ Data providers for more apps will be probably added soon (Pull requests are welc
|
|||
|
||||
Configuration is done using environment variables:
|
||||
|
||||
| Variable | Description | Value example |
|
||||
|--------------------|-------------------------------------------------------------|-------------------------------------------------------------------------|
|
||||
| `POSTGRES_URL` | Postgres database uri | `postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public` |
|
||||
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
|
||||
|
||||
| Variable | Description | Default value / Example value |
|
||||
|--------------------------------|--------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------|
|
||||
| `POSTGRES_URL` | Postgres database uri | `postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public` |
|
||||
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
|
||||
| `REATTEMPT_MINUTES` | _Optional_, How many minutes to wait before the next refresh attempt of a node whose refresh failed | `60` |
|
||||
| `REFRESH_HOURS` | _Optional_, How often (in hours) node info should be refreshed | `120` |
|
||||
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
|
||||
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds an HTTP request should wait for a node API response on refresh | `10000` |
|
||||
| `TZ` | _Optional_, Timezone | `UTC` |
|
||||
## Deploy
|
||||
App is designed to be run in docker container and deployed using docker-compose.
|
||||
More info can be found in [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { z } from 'zod'
|
||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const schema = z.object({
|
||||
software: z.object({
|
||||
|
@ -25,7 +26,7 @@ export type NodeInfo = z.infer<typeof schema>
|
|||
|
||||
export const retrieveNodeInfo = async (url:string):Promise<NodeInfo> => {
|
||||
console.info('Retrieving node info', { url: url })
|
||||
const nodeInfoResponse = await axios.get(url, { timeout: 10000 })
|
||||
const nodeInfoResponse = await axios.get(url, { timeout: getDefaultTimeoutMilliseconds() })
|
||||
assertSuccessJsonResponse(nodeInfoResponse)
|
||||
return schema.parse(nodeInfoResponse.data)
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const wellKnownSchema = z.object({
|
||||
links: z.array(
|
||||
|
@ -16,7 +17,7 @@ export type WellKnown = z.infer<typeof wellKnownSchema>
|
|||
export const retrieveWellKnown = async (domain:string):Promise<WellKnown> => {
|
||||
console.info('Retrieving well known', { domain: domain })
|
||||
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
||||
const wellKnownResponse = await axios.get(wellKnownUrl, { timeout: 10000 })
|
||||
const wellKnownResponse = await axios.get(wellKnownUrl, { timeout: getDefaultTimeoutMilliseconds() })
|
||||
assertSuccessJsonResponse(wellKnownResponse)
|
||||
return wellKnownSchema.parse(wellKnownResponse.data)
|
||||
}
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { FeedData } from '../FeedData'
|
||||
import { string, z } from 'zod'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const limit = 500
|
||||
|
||||
|
@ -56,7 +57,7 @@ export const retrieveLocalPublicUsersPage = async (domain: string, page: number)
|
|||
offset: page * limit,
|
||||
local: true
|
||||
},
|
||||
timeout: 10000
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(response)
|
||||
const responseData = schema.parse(response.data)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const schema = z.array(
|
||||
z.string()
|
||||
|
@ -12,7 +13,7 @@ export const retrievePeers = async (domain:string, page:number):Promise<string[]
|
|||
}
|
||||
try {
|
||||
const response = await axios.get('https://' + domain + '/api/v1/instance/peers', {
|
||||
timeout: 10000
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(response)
|
||||
return schema.parse(response.data)
|
||||
|
|
|
@ -4,6 +4,7 @@ import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
|||
import { z } from 'zod'
|
||||
import { avatarSchema } from './Avatar'
|
||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const limit = 100
|
||||
|
||||
|
@ -32,7 +33,7 @@ export const retrieveAccounts = async (domain: string, page: number): Promise<Fe
|
|||
sort: 'createdAt',
|
||||
start: page * limit
|
||||
},
|
||||
timeout: 10000
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(response)
|
||||
const responseData = schema.parse(response.data)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import axios from 'axios'
|
||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||
import { z } from 'zod'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const limit = 100
|
||||
|
||||
|
@ -25,7 +26,7 @@ export const retrieveFollowers = async (domain: string, page: number): Promise<s
|
|||
sort: 'createdAt',
|
||||
start: page * limit
|
||||
},
|
||||
timeout: 10000
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(response)
|
||||
const responseData = schema.parse(response.data)
|
||||
|
|
|
@ -5,6 +5,7 @@ import { z } from 'zod'
|
|||
import { FieldData } from '../FieldData'
|
||||
import { avatarSchema } from './Avatar'
|
||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||
|
||||
const limit = 100
|
||||
|
||||
|
@ -39,7 +40,7 @@ export const retrieveVideoChannels = async (domain: string, page: number): Promi
|
|||
sort: 'createdAt',
|
||||
start: page * limit
|
||||
},
|
||||
timeout: 10000
|
||||
timeout: getDefaultTimeoutMilliseconds()
|
||||
})
|
||||
assertSuccessJsonResponse(response)
|
||||
const responseData = schema.parse(response.data)
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
/**
 * Returns the timeout, in milliseconds, to use for HTTP requests.
 *
 * The value is read from the `DEFAULT_TIMEOUT_MILLISECONDS` environment
 * variable and parsed as a base-10 integer. When the variable is unset,
 * empty, not parsable as a number, or negative, the built-in default of
 * 10000 ms is returned instead, so callers never receive `NaN`.
 *
 * @returns Timeout in milliseconds as a non-negative integer.
 */
export const getDefaultTimeoutMilliseconds = (): number => {
  const fallbackMilliseconds = 10000
  // An explicit radix avoids surprises with prefixed input; an empty string
  // parses to NaN and is caught by the validity check below.
  const parsed = parseInt(process.env.DEFAULT_TIMEOUT_MILLISECONDS ?? '', 10)
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallbackMilliseconds
}
|
|
@ -1,34 +1,63 @@
|
|||
import { Node, PrismaClient } from '@prisma/client'
|
||||
|
||||
export const fetchNodeToProcess = async (prisma: PrismaClient): Promise<Node> => {
|
||||
console.log('Searching for not yet processed node')
|
||||
const currentTimestamp = Date.now()
|
||||
const attemptLimitMilliseconds = parseInt(process.env.REATTEMPT_MINUTES ?? '60') * 60 * 1000
|
||||
const attemptLimitDate = new Date(currentTimestamp - attemptLimitMilliseconds)
|
||||
console.log('Searching for not yet processed node not attempted before attemptLimit', { attemptLimitDate, attemptLimitMilliseconds })
|
||||
const newNode = await prisma.node.findFirst({
|
||||
orderBy: {
|
||||
foundAt: 'asc'
|
||||
},
|
||||
where: {
|
||||
refreshedAt: null
|
||||
refreshedAt: null,
|
||||
OR: [
|
||||
{
|
||||
refreshAttemptedAt: {
|
||||
lt: attemptLimitDate
|
||||
}
|
||||
},
|
||||
{
|
||||
refreshAttemptedAt: null
|
||||
}
|
||||
]
|
||||
|
||||
}
|
||||
})
|
||||
if (newNode) {
|
||||
console.log('Found not yet processed node', { domain: newNode.domain })
|
||||
return newNode
|
||||
}
|
||||
const date = new Date()
|
||||
date.setMonth(date.getMonth() - 1)
|
||||
console.log('Searching instance not refreshed for longest time and at least a month ago', { date: date })
|
||||
const refreshLimitMilliseconds = parseInt(process.env.REFRESH_HOURS ?? '168') * 60 * 60 * 1000
|
||||
const refreshLimitDate = new Date(currentTimestamp - refreshLimitMilliseconds)
|
||||
console.log('Searching instance not refreshed for longest time and before refreshLimit and attemptLimit', {
|
||||
refreshLimitMilliseconds,
|
||||
refreshLimitDate,
|
||||
attemptLimitDate,
|
||||
attemptLimitMilliseconds
|
||||
})
|
||||
const node = await prisma.node.findFirst({
|
||||
orderBy: {
|
||||
refreshedAt: 'asc'
|
||||
},
|
||||
where: {
|
||||
refreshedAt: {
|
||||
lt: date
|
||||
}
|
||||
lt: refreshLimitDate
|
||||
},
|
||||
OR: [
|
||||
{
|
||||
refreshAttemptedAt: {
|
||||
lt: attemptLimitDate
|
||||
}
|
||||
},
|
||||
{
|
||||
refreshAttemptedAt: null
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
if (node) {
|
||||
console.log('Found oldest node', { domain: newNode.domain })
|
||||
console.log('Found oldest node', { domain: node.domain })
|
||||
} else {
|
||||
throw new Error('No node found')
|
||||
}
|
||||
|
|
|
@ -9,10 +9,9 @@ const loop = async (): Promise<void> => {
|
|||
await processNextNode(prismaClient, providerRegistry)
|
||||
} catch (err) {
|
||||
console.warn(err)
|
||||
const milisecondsInMinute = 1000 * 60
|
||||
const timeout = 60 * milisecondsInMinute
|
||||
console.info('Delaying next node process', { timeoutMinutes: timeout / milisecondsInMinute, now: new Date() })
|
||||
setTimeout(loop, timeout)
|
||||
const waitForJobMilliseconds = parseInt(process.env.WAIT_FOR_JOB_MINUTES ?? '60') * 60 * 1000
|
||||
console.info('Delaying next node process', { timeoutMilliseconds: waitForJobMilliseconds, timeoutDate: new Date(Date.now() + waitForJobMilliseconds), now: new Date() })
|
||||
setTimeout(loop, waitForJobMilliseconds)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
|
Ładowanie…
Reference in New Issue