kopia lustrzana https://github.com/Stopka/fedicrawl
Added more env configs
rodzic
a83039e9ab
commit
0c2ade0b52
|
@ -1,6 +1,11 @@
|
||||||
FROM node:16-bullseye AS build
|
FROM node:16-bullseye AS build
|
||||||
ENV POSTGRES_URL='postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public' \
|
ENV POSTGRES_URL='postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public' \
|
||||||
SEED_NODE_DOMAIN='mastodon.social'
|
SEED_NODE_DOMAIN='mastodon.social' \
|
||||||
|
REATTEMPT_MINUTES='60' \
|
||||||
|
REFRESH_HOURS='120' \
|
||||||
|
WAIT_FOR_JOB_MINUTES='60' \
|
||||||
|
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
|
||||||
|
TZ='UTC'
|
||||||
WORKDIR /srv
|
WORKDIR /srv
|
||||||
COPY application/package*.json ./
|
COPY application/package*.json ./
|
||||||
COPY application/prisma ./prisma/
|
COPY application/prisma ./prisma/
|
||||||
|
|
14
README.md
14
README.md
|
@ -22,11 +22,15 @@ Data providers for more apps will be probably added soon (Pull requests are welc
|
||||||
|
|
||||||
Configuration is done using environmental variables:
|
Configuration is done using environmental variables:
|
||||||
|
|
||||||
| Variable | Description | Value example |
|
| Variable | Description | Default value / Example value |
|
||||||
|--------------------|-------------------------------------------------------------|-------------------------------------------------------------------------|
|
|--------------------------------|--------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------|
|
||||||
| `POSTGRES_URL` | Postgres database uri | `postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public` |
|
| `POSTGRES_URL` | Postgres database uri | `postgresql://fedisearch:passwd@postgres:5432/fedisearch?schema=public` |
|
||||||
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
|
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
|
||||||
|
| `REATTEMPT_MINUTES` | _Optional_, How many minutes to wait before the next refresh attempt of a node whose refresh failed | `60` |
|
||||||
|
| `REFRESH_HOURS` | _Optional_, How often (in hours) node info should be refreshed | `120` |
|
||||||
|
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
|
||||||
|
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds an HTTP request should wait for a node API response on refresh | `10000` |
|
||||||
|
| `TZ` | _Optional_, Timezone | `UTC` |
|
||||||
## Deploy
|
## Deploy
|
||||||
App is designed to be run in docker container and deployed using docker-compose.
|
App is designed to be run in docker container and deployed using docker-compose.
|
||||||
More info can be found in [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project
|
More info can be found in [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
software: z.object({
|
software: z.object({
|
||||||
|
@ -25,7 +26,7 @@ export type NodeInfo = z.infer<typeof schema>
|
||||||
|
|
||||||
export const retrieveNodeInfo = async (url:string):Promise<NodeInfo> => {
|
export const retrieveNodeInfo = async (url:string):Promise<NodeInfo> => {
|
||||||
console.info('Retrieving node info', { url: url })
|
console.info('Retrieving node info', { url: url })
|
||||||
const nodeInfoResponse = await axios.get(url, { timeout: 10000 })
|
const nodeInfoResponse = await axios.get(url, { timeout: getDefaultTimeoutMilliseconds() })
|
||||||
assertSuccessJsonResponse(nodeInfoResponse)
|
assertSuccessJsonResponse(nodeInfoResponse)
|
||||||
return schema.parse(nodeInfoResponse.data)
|
return schema.parse(nodeInfoResponse.data)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const wellKnownSchema = z.object({
|
const wellKnownSchema = z.object({
|
||||||
links: z.array(
|
links: z.array(
|
||||||
|
@ -16,7 +17,7 @@ export type WellKnown = z.infer<typeof wellKnownSchema>
|
||||||
export const retrieveWellKnown = async (domain:string):Promise<WellKnown> => {
|
export const retrieveWellKnown = async (domain:string):Promise<WellKnown> => {
|
||||||
console.info('Retrieving well known', { domain: domain })
|
console.info('Retrieving well known', { domain: domain })
|
||||||
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
||||||
const wellKnownResponse = await axios.get(wellKnownUrl, { timeout: 10000 })
|
const wellKnownResponse = await axios.get(wellKnownUrl, { timeout: getDefaultTimeoutMilliseconds() })
|
||||||
assertSuccessJsonResponse(wellKnownResponse)
|
assertSuccessJsonResponse(wellKnownResponse)
|
||||||
return wellKnownSchema.parse(wellKnownResponse.data)
|
return wellKnownSchema.parse(wellKnownResponse.data)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
import { string, z } from 'zod'
|
import { z } from 'zod'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const limit = 500
|
const limit = 500
|
||||||
|
|
||||||
|
@ -56,7 +57,7 @@ export const retrieveLocalPublicUsersPage = async (domain: string, page: number)
|
||||||
offset: page * limit,
|
offset: page * limit,
|
||||||
local: true
|
local: true
|
||||||
},
|
},
|
||||||
timeout: 10000
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const schema = z.array(
|
const schema = z.array(
|
||||||
z.string()
|
z.string()
|
||||||
|
@ -12,7 +13,7 @@ export const retrievePeers = async (domain:string, page:number):Promise<string[]
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const response = await axios.get('https://' + domain + '/api/v1/instance/peers', {
|
const response = await axios.get('https://' + domain + '/api/v1/instance/peers', {
|
||||||
timeout: 10000
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
return schema.parse(response.data)
|
return schema.parse(response.data)
|
||||||
|
|
|
@ -4,6 +4,7 @@ import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { avatarSchema } from './Avatar'
|
import { avatarSchema } from './Avatar'
|
||||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const limit = 100
|
const limit = 100
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ export const retrieveAccounts = async (domain: string, page: number): Promise<Fe
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: 10000
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const limit = 100
|
const limit = 100
|
||||||
|
|
||||||
|
@ -25,7 +26,7 @@ export const retrieveFollowers = async (domain: string, page: number): Promise<s
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: 10000
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -5,6 +5,7 @@ import { z } from 'zod'
|
||||||
import { FieldData } from '../FieldData'
|
import { FieldData } from '../FieldData'
|
||||||
import { avatarSchema } from './Avatar'
|
import { avatarSchema } from './Avatar'
|
||||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
||||||
|
|
||||||
const limit = 100
|
const limit = 100
|
||||||
|
|
||||||
|
@ -39,7 +40,7 @@ export const retrieveVideoChannels = async (domain: string, page: number): Promi
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: 10000
|
timeout: getDefaultTimeoutMilliseconds()
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
/**
 * Resolves the HTTP timeout (in milliseconds) used for requests to remote nodes.
 *
 * Reads the `DEFAULT_TIMEOUT_MILLISECONDS` environment variable and parses it
 * as a base-10 integer. When the variable is unset, empty, not numeric, or
 * negative, the built-in default of 10000 ms is returned instead, so a
 * misconfigured environment never yields `NaN`.
 *
 * @returns Timeout in milliseconds as a non-negative integer.
 */
export const getDefaultTimeoutMilliseconds = (): number => {
  const fallbackMilliseconds = 10000
  // An unset variable is parsed as '' so it takes the same NaN path as any
  // other unparsable value.
  const parsed = parseInt(process.env.DEFAULT_TIMEOUT_MILLISECONDS ?? '', 10)
  if (Number.isNaN(parsed) || parsed < 0) {
    return fallbackMilliseconds
  }
  return parsed
}
|
|
@ -1,34 +1,63 @@
|
||||||
import { Node, PrismaClient } from '@prisma/client'
|
import { Node, PrismaClient } from '@prisma/client'
|
||||||
|
|
||||||
export const fetchNodeToProcess = async (prisma: PrismaClient): Promise<Node> => {
|
export const fetchNodeToProcess = async (prisma: PrismaClient): Promise<Node> => {
|
||||||
console.log('Searching for not yet processed node')
|
const currentTimestamp = Date.now()
|
||||||
|
const attemptLimitMilliseconds = parseInt(process.env.REATTEMPT_MINUTES ?? '60') * 60 * 1000
|
||||||
|
const attemptLimitDate = new Date(currentTimestamp - attemptLimitMilliseconds)
|
||||||
|
console.log('Searching for not yet processed node not attempted before attemptLimit', { attemptLimitDate, attemptLimitMilliseconds })
|
||||||
const newNode = await prisma.node.findFirst({
|
const newNode = await prisma.node.findFirst({
|
||||||
orderBy: {
|
orderBy: {
|
||||||
foundAt: 'asc'
|
foundAt: 'asc'
|
||||||
},
|
},
|
||||||
where: {
|
where: {
|
||||||
refreshedAt: null
|
refreshedAt: null,
|
||||||
|
OR: [
|
||||||
|
{
|
||||||
|
refreshAttemptedAt: {
|
||||||
|
lt: attemptLimitDate
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
refreshAttemptedAt: null
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
if (newNode) {
|
if (newNode) {
|
||||||
console.log('Found not yet processed node', { domain: newNode.domain })
|
console.log('Found not yet processed node', { domain: newNode.domain })
|
||||||
return newNode
|
return newNode
|
||||||
}
|
}
|
||||||
const date = new Date()
|
const refreshLimitMilliseconds = parseInt(process.env.REFRESH_HOURS ?? '168') * 60 * 60 * 1000
|
||||||
date.setMonth(date.getMonth() - 1)
|
const refreshLimitDate = new Date(currentTimestamp - refreshLimitMilliseconds)
|
||||||
console.log('Searching instance not refreshed for longest time and at least a month ago', { date: date })
|
console.log('Searching instance not refreshed for longest time and before refreshLimit and attemptLimit', {
|
||||||
|
refreshLimitMilliseconds,
|
||||||
|
refreshLimitDate,
|
||||||
|
attemptLimitDate,
|
||||||
|
attemptLimitMilliseconds
|
||||||
|
})
|
||||||
const node = await prisma.node.findFirst({
|
const node = await prisma.node.findFirst({
|
||||||
orderBy: {
|
orderBy: {
|
||||||
refreshedAt: 'asc'
|
refreshedAt: 'asc'
|
||||||
},
|
},
|
||||||
where: {
|
where: {
|
||||||
refreshedAt: {
|
refreshedAt: {
|
||||||
lt: date
|
lt: refreshLimitDate
|
||||||
}
|
},
|
||||||
|
OR: [
|
||||||
|
{
|
||||||
|
refreshAttemptedAt: {
|
||||||
|
lt: attemptLimitDate
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
refreshAttemptedAt: null
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
if (node) {
|
if (node) {
|
||||||
console.log('Found oldest node', { domain: newNode.domain })
|
console.log('Found oldest node', { domain: node.domain })
|
||||||
} else {
|
} else {
|
||||||
throw new Error('No node found')
|
throw new Error('No node found')
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,10 +9,9 @@ const loop = async (): Promise<void> => {
|
||||||
await processNextNode(prismaClient, providerRegistry)
|
await processNextNode(prismaClient, providerRegistry)
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(err)
|
console.warn(err)
|
||||||
const milisecondsInMinute = 1000 * 60
|
const waitForJobMilliseconds = parseInt(process.env.WAIT_FOR_JOB_MINUTES ?? '60') * 60 * 1000
|
||||||
const timeout = 60 * milisecondsInMinute
|
console.info('Delaying next node process', { timeoutMilliseconds: waitForJobMilliseconds, timeoutDate: new Date(Date.now() + waitForJobMilliseconds), now: new Date() })
|
||||||
console.info('Delaying next node process', { timeoutMinutes: timeout / milisecondsInMinute, now: new Date() })
|
setTimeout(loop, waitForJobMilliseconds)
|
||||||
setTimeout(loop, timeout)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Ładowanie…
Reference in New Issue