refactor/elixir backend
parent ab8e2b09d0
commit a37452f138
@@ -0,0 +1,2 @@
backend=backend
gephi=gephi
@@ -1,93 +1,9 @@
*.csv
.idea/
backend/backend/static/
backend/static/
*.gexf
backend/whitelist.txt
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
./lib/
./lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py
.vscode/

# Environments
.env
@@ -99,15 +15,84 @@ ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject
# The directory Mix will write compiled artifacts to.
/backend/_build/

# Rope project settings
.ropeproject
# If you run "mix test --cover", coverage assets end up here.
/backend/cover/

# mkdocs documentation
/site
# The directory Mix downloads your dependencies sources to.
/backend/deps/

# mypy
.mypy_cache/
# Where 3rd-party dependencies like ExDoc output generated docs.
/backend/doc/

# Ignore .fetch files in case you like to edit your project deps locally.
/backend/.fetch

# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump

# Also ignore archive artifacts (built via "mix archive.build").
*.ez

# Ignore package tarball (built via "mix hex.build").
backend-*.tar

# Since we are building assets from assets/,
# we ignore priv/static. You may want to comment
# this depending on your deployment strategy.
/backend/priv/static/

# Files matching config/*.secret.exs pattern contain sensitive
# data and you should not commit them into version control.
#
# Alternatively, you may comment the line below and commit the
# secrets files as long as you replace their contents by environment
# variables.
/backend/config/*.secret.exs

/backend/.elixir_ls/

*.pot
*.po

# dependencies
/frontend/node_modules

# testing
/frontend/coverage

# production
/frontend/build

# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local

npm-debug.log*
yarn-debug.log*
yarn-error.log*

/gephi/.gradle/
/gephi/build/
/gephi/lib/*
/gephi/!lib/.gitkeep
# 64MB file but I don't have much faith that it'll remain available...
!/gephi/lib/gephi-toolkit-0.9.2.jar

*/.idea/

# Ignore Gradle GUI config
/gephi/gradle-app.setting

# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
!/gephi/gradle-wrapper.jar

# Cache of project
/gephi/.gradletasknamecache

*.javac
@@ -0,0 +1,9 @@
_build/
deps/
.git/
.gitignore
Dockerfile
Makefile
README*
test/
priv/static/
@@ -0,0 +1,5 @@
[
  import_deps: [:ecto, :phoenix],
  inputs: ["*.{ex,exs}", "priv/*/seeds.exs", "{config,lib,test}/**/*.{ex,exs}"],
  subdirectories: ["priv/*/migrations"]
]
@@ -1,12 +1,53 @@
FROM python:3
ENV PYTHONUNBUFFERED 1
FROM elixir:1.9.0-alpine as build

RUN apt-get update && \
    apt-get install -qqy --no-install-recommends \
    postgresql-client-9.6=9.6.10-0+deb9u1
# install build dependencies
RUN apk add --update git build-base

RUN mkdir /code
WORKDIR /code
COPY requirements.txt /code/
RUN pip install -r requirements.txt
COPY . /code/
# prepare build dir
RUN mkdir /app
WORKDIR /app

# install hex + rebar
RUN mix local.hex --force && \
    mix local.rebar --force

# set build ENV
ENV MIX_ENV=prod

# install mix dependencies
COPY mix.exs mix.lock ./
COPY config config
RUN mix deps.get
RUN mix deps.compile

# build assets
# COPY assets assets
# RUN cd assets && npm install && npm run deploy
# RUN mix phx.digest

# build project
COPY priv priv
COPY lib lib
RUN mix compile

# build release
COPY rel rel
RUN mix release

# prepare release image
FROM alpine:3.9 AS app
RUN apk add --update bash openssl

RUN mkdir /app
WORKDIR /app

ENV APP_NAME=backend

COPY --from=build /app/_build/prod/rel/${APP_NAME} ./
RUN chown -R nobody: /app
USER nobody

ENV HOME=/app

# The command to start the backend
CMD trap 'exit' INT; ${HOME}/bin/${APP_NAME} start
@@ -0,0 +1,33 @@
# fediverse.space backend

## Notes

- This project requires Elixir >= 1.9.
- Run with `SKIP_CRAWL=true` to just run the server (useful for working on the API without also crawling).

## Deployment

Deployment with Docker is handled as per the [Distillery docs](https://hexdocs.pm/distillery/guides/working_with_docker.html).

- To build a new version, run `make build` in this directory.
- To migrate a released version, run `./backend eval "Backend.Release.migrate"` (a sketch of this module follows below).
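
The `Backend.Release.migrate` task referenced above is not part of this diff. As a rough sketch, assuming the standard Ecto release-migration pattern described in the Phoenix/Distillery docs (the module body here is illustrative, not necessarily the project's actual code):

```elixir
defmodule Backend.Release do
  # Illustrative sketch only: run all pending Ecto migrations from inside a
  # release, where `mix` is not available.
  @app :backend

  def migrate do
    Application.load(@app)

    for repo <- Application.fetch_env!(@app, :ecto_repos) do
      {:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :up, all: true))
    end
  end
end
```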

# Default README

To start your Phoenix server:

- Install dependencies with `mix deps.get`
- Create and migrate your database with `mix ecto.setup`
- Start Phoenix endpoint with `mix phx.server`

Now you can visit [`localhost:4000`](http://localhost:4000) from your browser.

Ready to run in production? Please [check our deployment guides](https://hexdocs.pm/phoenix/deployment.html).

## Learn more

- Official website: http://www.phoenixframework.org/
- Guides: https://hexdocs.pm/phoenix/overview.html
- Docs: https://hexdocs.pm/phoenix
- Mailing list: http://groups.google.com/group/phoenix-talk
- Source: https://github.com/phoenixframework/phoenix
@ -1,8 +0,0 @@
|
|||
def to_representation(self, instance):
|
||||
"""
|
||||
Object instance -> Dict of primitive datatypes.
|
||||
We use a custom to_representation function to exclude empty fields in the serialized JSON.
|
||||
"""
|
||||
ret = super(InstanceListSerializer, self).to_representation(instance)
|
||||
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
|
||||
return ret
|
|
@ -1,5 +0,0 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class Apiv1Config(AppConfig):
|
||||
name = 'apiv1'
|
|
@@ -1,105 +0,0 @@
from rest_framework import serializers
import math
from collections import OrderedDict
from scraper.models import Instance, Edge


class InstanceListSerializer(serializers.ModelSerializer):
    """
    Minimal instance details used in the full list of instances.
    """
    class Meta:
        model = Instance
        fields = ('name', 'user_count')

    def to_representation(self, instance):
        """
        Object instance -> Dict of primitive datatypes.
        We use a custom to_representation function to exclude empty fields in the serialized JSON.
        """
        ret = super(InstanceListSerializer, self).to_representation(instance)
        ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
        return ret


class InstanceDetailSerializer(serializers.ModelSerializer):
    """
    Detailed instance view.
    """
    userCount = serializers.SerializerMethodField()
    statusCount = serializers.SerializerMethodField()
    domainCount = serializers.SerializerMethodField()
    lastUpdated = serializers.SerializerMethodField()
    peers = InstanceListSerializer(many=True, read_only=True)

    def get_userCount(self, obj):
        return obj.user_count

    def get_statusCount(self, obj):
        return obj.status_count

    def get_domainCount(self, obj):
        return obj.domain_count

    def get_lastUpdated(self, obj):
        return obj.last_updated

    class Meta:
        model = Instance
        fields = ('name', 'description', 'version', 'userCount',
                  'statusCount', 'domainCount', 'peers', 'lastUpdated',
                  'status')


class EdgeSerializer(serializers.ModelSerializer):
    """
    Used for displaying the graph.
    """
    id = serializers.SerializerMethodField('get_pk')
    size = serializers.SerializerMethodField('get_weight')

    class Meta:
        model = Edge
        fields = ('source', 'target', 'id', 'size')

    def get_pk(self, obj):
        return obj.pk

    def get_weight(self, obj):
        return obj.weight


class NodeSerializer(serializers.ModelSerializer):
    """
    Used for displaying the graph.
    """
    id = serializers.SerializerMethodField('get_name')
    label = serializers.SerializerMethodField('get_name')
    size = serializers.SerializerMethodField()
    x = serializers.SerializerMethodField()
    y = serializers.SerializerMethodField()

    class Meta:
        model = Instance
        fields = ('id', 'label', 'size', 'x', 'y')

    def get_name(self, obj):
        return obj.name

    def get_size(self, obj):
        return math.log(obj.user_count) if (obj.user_count and (obj.user_count > 1)) else 1

    def get_x(self, obj):
        return obj.x_coord

    def get_y(self, obj):
        return obj.y_coord

    def to_representation(self, instance):
        """
        Object instance -> Dict of primitive datatypes.
        We use a custom to_representation function to exclude empty fields in the serialized JSON.
        """
        ret = super(NodeSerializer, self).to_representation(instance)
        ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
        return ret
@@ -1,37 +0,0 @@
from rest_framework import viewsets
from scraper.models import Instance, Edge
from apiv1.serializers import InstanceListSerializer, InstanceDetailSerializer, NodeSerializer, EdgeSerializer


class InstanceViewSet(viewsets.ReadOnlyModelViewSet):
    """API endpoint to view stats for, and the peers of, an instance"""

    lookup_field = 'name'
    lookup_value_regex = '[a-zA-Z0-9-_\.]+'

    queryset = Instance.objects.all()
    serializer_class = InstanceListSerializer
    detail_serializer_class = InstanceDetailSerializer  # this serializer also includes stats and a list of peers

    def get_serializer_class(self):
        if self.action == 'retrieve':
            if hasattr(self, 'detail_serializer_class'):
                return self.detail_serializer_class
        return self.serializer_class


class EdgeView(viewsets.ReadOnlyModelViewSet):
    """
    Endpoint to get a list of the graph's edges in a SigmaJS-friendly format.
    """
    queryset = Edge.objects.all()
    serializer_class = EdgeSerializer


class NodeView(viewsets.ReadOnlyModelViewSet):
    """
    Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
    """
    queryset = Instance.objects.filter(status='success', x_coord__isnull=False, y_coord__isnull=False, user_count__isnull=False)\
        .exclude(sources__isnull=True, targets__isnull=True)
    serializer_class = NodeSerializer
@@ -1,124 +0,0 @@
"""
Django settings for backend project.

Generated by 'django-admin startproject' using Django 2.1.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""

import os
import json
from django.core.exceptions import ImproperlyConfigured

SECRET_KEY = os.getenv("SECRET_KEY")

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'rest_framework',
    'silk',
    'corsheaders',
    'scraper.apps.ScraperConfig',
    'apiv1.apps.Apiv1Config',
]

MIDDLEWARE = [
    'corsheaders.middleware.CorsMiddleware',
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
    'silk.middleware.SilkyMiddleware',
]

ROOT_URLCONF = 'backend.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [os.path.join(BASE_DIR, '../../frontend/build')],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'backend.wsgi.application'


# Database
# https://docs.djangoproject.com/en/2.1/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql',
        'NAME': os.getenv("POSTGRES_DB"),
        'USER': os.getenv("POSTGRES_USER"),
        'PASSWORD': os.getenv("POSTGRES_PASSWORD"),
        'HOST': 'db',
        'PORT': 5432,
    }
}


# Password validation
# https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = False


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.1/howto/static-files/

STATIC_URL = '/static/'
STATICFILES_DIRS = []
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
@@ -1,7 +0,0 @@
from .base import *

DEBUG = True

ALLOWED_HOSTS = ['localhost']

CORS_ORIGIN_ALLOW_ALL = True
@@ -1,10 +0,0 @@
from .base import *

DEBUG = False

ALLOWED_HOSTS = ['backend.fediverse.space']

CORS_ORIGIN_REGEX_WHITELIST = [
    r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse-space\.netlify\.com\/?$',
    r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse\.space\/?$',
]
@@ -1,37 +0,0 @@
"""backend URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/2.1/topics/http/urls/
Examples:
Function views
    1. Add an import: from my_app import views
    2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
    1. Add an import: from other_app.views import Home
    2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.urls import path, include
from django.views.generic import TemplateView
from rest_framework import routers
from apiv1 import views


class OptionalTrailingSlashRouter(routers.DefaultRouter):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.trailing_slash = r'/?'


router = OptionalTrailingSlashRouter()
router.register(r'instances', views.InstanceViewSet)
router.register(r'graph/nodes', views.NodeView)
router.register(r'graph/edges', views.EdgeView, base_name='edge')

urlpatterns = [
    path('api/v1/', include(router.urls)),
    path('silk/', include('silk.urls', namespace='silk')),
]
@@ -1,13 +0,0 @@
"""
WSGI config for backend project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""

import os
from django.core.wsgi import get_wsgi_application

application = get_wsgi_application()
@@ -0,0 +1,51 @@
# This file is responsible for configuring your application
# and its dependencies with the aid of the Mix.Config module.
#
# This configuration file is loaded before any dependency and
# is restricted to this project.

# General application configuration
import Config

config :backend,
  ecto_repos: [Backend.Repo]

# Configures the endpoint
config :backend, BackendWeb.Endpoint,
  url: [host: "localhost"],
  secret_key_base: "XL4NKGBN9lZMrQbMEI1KJOlwAt8S7younVJl90TdAgzmwyapr3g7BRYSNYvX0sZ9",
  render_errors: [view: BackendWeb.ErrorView, accepts: ~w(json)],
  pubsub: [name: Backend.PubSub, adapter: Phoenix.PubSub.PG2]

config :backend, Backend.Repo, queue_target: 5000

# Configures Elixir's Logger
config :logger, :console,
  format: "$time $metadata[$level] $message\n",
  metadata: [:request_id]

# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason

config :backend, :crawler,
  status_age_limit_days: 28,
  status_count_limit: 5000,
  personal_instance_threshold: 10,
  crawl_interval_mins: 30,
  crawl_workers: 50,
  blacklist: [
    "gab.best"
  ],
  user_agent: "fediverse.space crawler"

config :backend, Backend.Scheduler,
  jobs: [
    # At midnight every day
    {"@daily", {Backend.Scheduler, :prune_crawls, [1, "month"]}},
    # 00.15 daily
    {"15 0 * * *", {Backend.Scheduler, :generate_edges, []}}
  ]

# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"
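The `:crawler` values above are read elsewhere in this changeset via `Backend.Util.get_config/1` (for example `get_config(:crawl_workers)` in `Backend.Application`). That helper is not shown in this diff; a plausible minimal sketch, assuming it simply wraps `Application.get_env/2` (the real `Backend.Util` also contains other helpers used by the crawler):

```elixir
defmodule Backend.Util do
  # Sketch: look up a single key from the :crawler configuration block above.
  def get_config(key) do
    :backend
    |> Application.get_env(:crawler)
    |> Keyword.get(key)
  end
end
```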
@@ -0,0 +1,72 @@
import Config

# For development, we disable any cache and enable
# debugging and code reloading.
#
# The watchers configuration can be used to run external
# watchers to your application. For example, we use it
# with webpack to recompile .js and .css sources.
config :backend, BackendWeb.Endpoint,
  http: [port: 4000],
  debug_errors: true,
  code_reloader: true,
  check_origin: false,
  watchers: []

# ## SSL Support
#
# In order to use HTTPS in development, a self-signed
# certificate can be generated by running the following
# Mix task:
#
#     mix phx.gen.cert
#
# Note that this task requires Erlang/OTP 20 or later.
# Run `mix help phx.gen.cert` for more information.
#
# The `http:` config above can be replaced with:
#
#     https: [
#       port: 4001,
#       cipher_suite: :strong,
#       keyfile: "priv/cert/selfsigned_key.pem",
#       certfile: "priv/cert/selfsigned.pem"
#     ],
#
# If desired, both `http:` and `https:` keys can be
# configured to run both http and https servers on
# different ports.

# Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n"

# Set a higher stacktrace during development. Avoid configuring such
# in production as building large stacktraces may be expensive.
config :phoenix, :stacktrace_depth, 20

# Initialize plugs at runtime for faster development compilation
config :phoenix, :plug_init_mode, :runtime

# Configure your database
config :backend, Backend.Repo,
  username: "postgres",
  password: "postgres",
  database: "backend_dev",
  hostname: "localhost",
  pool_size: 10

config :backend, :crawler,
  status_age_limit_days: 28,
  status_count_limit: 100,
  personal_instance_threshold: 1,
  crawl_interval_mins: 1,
  crawl_workers: 10,
  blacklist: [
    "gab.best"
  ]

config :backend, Backend.Scheduler,
  jobs: [
    # Every 15 minutes
    {"*/15 * * * *", {Backend.Scheduler, :prune_crawls, [12, "hour"]}}
  ]
@@ -0,0 +1,57 @@
import Config

# Do not print debug messages in production
config :logger, level: :info

# ## SSL Support
#
# To get SSL working, you will need to add the `https` key
# to the previous section and set your `:url` port to 443:
#
#     config :backend, BackendWeb.Endpoint,
#       ...
#       url: [host: "example.com", port: 443],
#       https: [
#         :inet6,
#         port: 443,
#         cipher_suite: :strong,
#         keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"),
#         certfile: System.get_env("SOME_APP_SSL_CERT_PATH")
#       ]
#
# The `cipher_suite` is set to `:strong` to support only the
# latest and more secure SSL ciphers. This means old browsers
# and clients may not be supported. You can set it to
# `:compatible` for wider support.
#
# `:keyfile` and `:certfile` expect an absolute path to the key
# and cert in disk or a relative path inside priv, for example
# "priv/ssl/server.key". For all supported SSL configuration
# options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1
#
# We also recommend setting `force_ssl` in your endpoint, ensuring
# no data is ever sent via http, always redirecting to https:
#
#     config :backend, BackendWeb.Endpoint,
#       force_ssl: [hsts: true]
#
# Check `Plug.SSL` for all available options in `force_ssl`.

# ## Using releases (distillery)
#
# If you are doing OTP releases, you need to instruct Phoenix
# to start the server for all endpoints:
#
#     config :phoenix, :serve_endpoints, true
#
# Alternatively, you can configure exactly which server to
# start per endpoint:
#
#     config :backend, BackendWeb.Endpoint, server: true
#
# Note you can't rely on `System.get_env/1` when using releases.
# See the releases documentation accordingly.

# Finally import the config/prod.secret.exs which should be versioned
# separately.
# import_config "prod.secret.exs"
@@ -0,0 +1,27 @@
# This file is for *runtime configuration in releases* only.
# https://hexdocs.pm/phoenix/releases.html#runtime-configuration

import Config

# For production, don't forget to configure the url host
# to something meaningful, Phoenix uses this information
# when generating URLs.
config :backend, Backend.Repo,
  # username: System.get_env("POSTGRES_USER"),
  # password: System.get_env("POSTGRES_PASSWORD"),
  # database: System.get_env("POSTGRES_DB"),
  # hostname: System.get_env("POSTGRES_HOSTNAME"),
  url: "ecto://" <> System.get_env("DATABASE_URL"),
  pool_size: String.to_integer(System.get_env("POOL_SIZE") || "10"),
  ssl: true

# show_sensitive_data_on_connection_error: true

port = String.to_integer(System.get_env("PORT") || "4000")

config :backend, BackendWeb.Endpoint,
  http: [:inet6, port: port],
  url: [host: System.get_env("BACKEND_HOSTNAME"), port: port],
  root: ".",
  secret_key_base: System.get_env("SECRET_KEY_BASE"),
  server: true
@@ -0,0 +1,18 @@
import Config

# We don't run a server during test. If one is required,
# you can enable the server option below.
config :backend, BackendWeb.Endpoint,
  http: [port: 4002],
  server: false

# Print only warnings and errors during test
config :logger, level: :warn

# Configure your database
config :backend, Backend.Repo,
  username: "postgres",
  password: "postgres",
  database: "backend_test",
  hostname: "localhost",
  pool: Ecto.Adapters.SQL.Sandbox
@@ -0,0 +1,9 @@
defmodule Backend do
  @moduledoc """
  Backend keeps the contexts that define your domain
  and business logic.

  Contexts are also responsible for managing your data, regardless
  if it comes from the database, an external API or others.
  """
end
@@ -0,0 +1,68 @@
defmodule Backend.Api do
  alias Backend.{Crawl, Edge, Instance, Repo}
  import Ecto.Query

  @spec list_instances() :: [Instance.t()]
  def list_instances() do
    Instance
    |> Repo.all()
  end

  @spec get_instance!(String.t()) :: Instance.t()
  def get_instance!(domain) do
    Instance
    |> preload(:peers)
    |> Repo.get_by!(domain: domain)
  end

  @doc """
  Returns a list of instances that
  * have at least one successful crawl
  * have a user count (required to give the instance a size on the graph)
  """
  @spec list_nodes() :: [Instance.t()]
  def list_nodes() do
    crawl_subquery =
      Crawl
      |> select([c], %{
        instance_domain: c.instance_domain,
        crawl_count: count(c.id)
      })
      |> where([c], is_nil(c.error))
      |> group_by([c], c.instance_domain)

    Instance
    |> join(:inner, [i], c in subquery(crawl_subquery), on: i.domain == c.instance_domain)
    |> where(
      [i, c],
      c.crawl_count > 0 and not is_nil(i.user_count) and not is_nil(i.x) and not is_nil(i.y)
    )
    |> select([c], [:domain, :user_count, :x, :y])
    |> Repo.all()
  end

  @spec list_edges() :: [Edge.t()]
  def list_edges() do
    crawl_subquery =
      Crawl
      |> select([c], %{
        instance_domain: c.instance_domain,
        crawl_count: count(c.id)
      })
      |> where([c], is_nil(c.error))
      |> group_by([c], c.instance_domain)

    Edge
    |> join(:inner, [e], c1 in subquery(crawl_subquery), on: e.source_domain == c1.instance_domain)
    |> join(:inner, [e], c2 in subquery(crawl_subquery), on: e.target_domain == c2.instance_domain)
    |> join(:inner, [e], i1 in Instance, on: e.source_domain == i1.domain)
    |> join(:inner, [e], i2 in Instance, on: e.target_domain == i2.domain)
    |> select([e], [:id, :source_domain, :target_domain, :weight])
    |> where(
      [e, c1, c2, i1, i2],
      c1.crawl_count > 0 and c2.crawl_count > 0 and not is_nil(i1.x) and not is_nil(i1.y) and
        not is_nil(i2.x) and not is_nil(i2.y) and e.source_domain != e.target_domain
    )
    |> Repo.all()
  end
end
@@ -0,0 +1,46 @@
defmodule Backend.Application do
  # See https://hexdocs.pm/elixir/Application.html
  # for more information on OTP Applications
  @moduledoc false

  use Application
  require Logger
  import Backend.Util

  def start(_type, _args) do
    crawl_worker_count = get_config(:crawl_workers)

    children = [
      # Start the Ecto repository
      Backend.Repo,
      # Start the endpoint when the application starts
      BackendWeb.Endpoint,
      # Crawler children
      :hackney_pool.child_spec(:crawler, timeout: 15000, max_connections: crawl_worker_count),
      {Task,
       fn ->
         Honeydew.start_queue(:crawl_queue, failure_mode: Honeydew.FailureMode.Abandon)
         Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: crawl_worker_count)
       end},
      Backend.Scheduler
    ]

    children =
      case Enum.member?(["true", 1, "1"], System.get_env("SKIP_CRAWL")) do
        true -> children
        false -> children ++ [Backend.Crawler.StaleInstanceManager]
      end

    # See https://hexdocs.pm/elixir/Supervisor.html
    # for other strategies and supported options
    opts = [strategy: :one_for_one, name: Backend.Supervisor]
    Supervisor.start_link(children, opts)
  end

  # Tell Phoenix to update the endpoint configuration
  # whenever the application is updated.
  def config_change(changed, _new, removed) do
    BackendWeb.Endpoint.config_change(changed, removed)
    :ok
  end
end
@@ -0,0 +1,26 @@
defmodule Backend.Crawl do
  use Ecto.Schema
  import Ecto.Changeset

  schema "crawls" do
    belongs_to :instance, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :instance_domain

    field :interactions_seen, :integer
    field :statuses_seen, :integer

    # if something went wrong, otherwise null
    field :error, :string

    timestamps()
  end

  @doc false
  def changeset(crawl, attrs) do
    crawl
    |> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
    |> validate_required([:instance])
  end
end
@@ -0,0 +1,29 @@
defmodule Backend.CrawlInteraction do
  use Ecto.Schema
  import Ecto.Changeset

  schema "crawl_interactions" do
    belongs_to :crawl, Backend.Crawl

    belongs_to :source, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :source_domain

    belongs_to :target, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :target_domain

    field :mentions, :integer

    timestamps()
  end

  @doc false
  def changeset(crawl_interaction, attrs) do
    crawl_interaction
    |> cast(attrs, [:crawl, :source, :target, :mentions])
    |> validate_required([:crawl, :source, :target, :mentions])
  end
end
@@ -0,0 +1,45 @@
defmodule Backend.Crawler.ApiCrawler do
  @moduledoc """
  This module is a specification. Crawlers for all instance types must implement its behaviour.

  Make sure to respect the following:
  * You must adhere to the following configuration values:
    * `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
    * `:status_count_limit` specifies the max number of statuses to crawl in one go
    * `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
  * Users with the string "nobot" (case insensitive) in their profile must not be included in any stats
  * Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
  """

  # {domain_mentioned, count}
  @type instance_interactions :: %{String.t() => integer}

  defstruct [
    :version,
    :description,
    :user_count,
    :status_count,
    :peers,
    :interactions,
    :statuses_seen
  ]

  @type t() :: %__MODULE__{
          version: String.t(),
          description: String.t(),
          user_count: integer,
          status_count: integer,
          peers: [String.t()],
          interactions: instance_interactions,
          statuses_seen: integer
        }

  @doc """
  Check whether the instance at the given domain is of the type that this ApiCrawler implements.
  """
  @callback is_instance_type?(String.t()) :: boolean()

  @doc """
  Crawl the instance at the given domain.
  """
  @callback crawl(String.t()) :: t()
end
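For reference, a crawler for a new instance type would implement both callbacks of this behaviour. A bare-bones skeleton (the module name and probe logic are hypothetical; only the Mastodon crawler below actually exists in this changeset):

```elixir
defmodule Backend.Crawler.Crawlers.SomeNewApi do
  # Hypothetical skeleton showing the shape of an ApiCrawler implementation.
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  @impl ApiCrawler
  def is_instance_type?(_domain) do
    # Probe an endpoint that only this instance type exposes and return true/false.
    false
  end

  @impl ApiCrawler
  def crawl(_domain) do
    # Must return an ApiCrawler struct, respecting the configured limits
    # (:status_age_limit_days, :status_count_limit, :personal_instance_threshold).
    %ApiCrawler{
      version: nil,
      description: nil,
      user_count: 0,
      status_count: 0,
      peers: [],
      interactions: %{},
      statuses_seen: 0
    }
  end
end
```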
@@ -0,0 +1,196 @@
defmodule Backend.Crawler do
  @moduledoc """
  This module crawls instances. Run `run(domain)` to crawl a given domain.
  """

  alias __MODULE__
  alias Backend.Crawler.Crawlers.Mastodon
  alias Backend.Crawler.ApiCrawler
  alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
  import Ecto.Query
  import Backend.Util
  require Logger

  defstruct [
    # the instance domain (a string)
    :domain,
    # a list of ApiCrawlers that will be attempted
    :api_crawlers,
    :found_api?,
    :result,
    :error
  ]

  @type t() :: %__MODULE__{
          domain: String.t(),
          api_crawlers: [ApiCrawler.t()],
          found_api?: boolean,
          result: ApiCrawler.t() | nil,
          error: String.t() | nil
        }

  def run(domain) do
    Logger.info("Crawling #{domain}...")
    HTTPoison.start()
    state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}

    state
    # register APICrawlers here
    |> register(Mastodon)
    # go!
    |> crawl()
    |> save()
  end

  # Adds a new ApiCrawler that run/1 will check.
  defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
    Map.put(state, :api_crawlers, [api_crawler | crawlers])
  end

  # Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
  # If so, crawls it. If not, continues with the tail of the api_crawlers list.
  defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
    Logger.debug("Found no compatible API for #{domain}")
    Map.put(state, :found_api?, false)
  end

  defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
    if curr.is_instance_type?(domain) do
      Logger.debug("Found #{curr} instance")
      state = Map.put(state, :found_api?, true)

      try do
        %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
      rescue
        e in HTTPoison.Error ->
          Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))

        e in Jason.DecodeError ->
          Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))

        e in _ ->
          Map.put(state, :error, "Unknown error: " <> inspect(e))
      end
    else
      # Nothing found so check the next APICrawler
      Logger.debug("#{domain} is not an instance of #{curr}")
      crawl(%Crawler{state | api_crawlers: remaining_crawlers})
    end
  end

  # Save the state (after crawling) to the database.
  defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
    now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)

    ## Update the instance we crawled ##
    Repo.insert!(
      %Instance{
        domain: domain,
        description: result.description,
        version: result.version,
        user_count: result.user_count,
        status_count: result.status_count
      },
      on_conflict: [
        set: [
          description: result.description,
          version: result.version,
          user_count: result.user_count,
          status_count: result.status_count,
          updated_at: now
        ]
      ],
      conflict_target: :domain
    )

    # Save details of a new crawl
    curr_crawl =
      Repo.insert!(%Crawl{
        instance_domain: domain,
        interactions_seen:
          result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
        statuses_seen: result.statuses_seen
      })

    # We get a list of peers from two places:
    # * the official peers endpoint (which may be disabled)
    # * the interactions
    peers_domains =
      result.interactions
      |> Map.keys()
      |> list_union(result.peers)
      |> Enum.filter(fn domain -> not is_blacklisted?(domain) end)

    peers =
      peers_domains
      |> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})

    Instance
    |> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)

    Repo.transaction(fn ->
      ## Save peer relationships ##
      # get current peers (a list of strings)
      current_peers =
        InstancePeer
        |> where(source_domain: ^domain)
        |> select([p], p.target_domain)
        |> Repo.all()

      wanted_peers_set = MapSet.new(peers_domains)
      current_peers_set = MapSet.new(current_peers)

      # delete the peers we don't want
      dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()

      if length(dont_want) > 0 do
        InstancePeer
        |> where(source_domain: ^domain)
        |> where([p], p.target_domain in ^dont_want)
        |> Repo.delete_all([])
      end

      # insert the ones we don't have yet
      new_instance_peers =
        wanted_peers_set
        |> MapSet.difference(current_peers_set)
        |> MapSet.to_list()
        |> Enum.map(
          &%{
            source_domain: domain,
            target_domain: &1,
            inserted_at: now,
            updated_at: now
          }
        )

      InstancePeer
      |> Repo.insert_all(new_instance_peers)
    end)

    ## Save interactions ##
    interactions =
      result.interactions
      |> Enum.filter(fn {target_domain, _count} -> not is_blacklisted?(target_domain) end)
      |> Enum.map(fn {target_domain, count} ->
        %{
          crawl_id: curr_crawl.id,
          source_domain: domain,
          target_domain: target_domain,
          mentions: count,
          inserted_at: now,
          updated_at: now
        }
      end)

    CrawlInteraction
    |> Repo.insert_all(interactions)
  end

  defp save(%{domain: domain, error: error}) do
    Repo.insert!(%Crawl{
      instance_domain: domain,
      error: error
    })
  end
end
@@ -0,0 +1,193 @@
defmodule Backend.Crawler.Crawlers.Mastodon do
  require Logger
  import Backend.Crawler.Util
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  @impl ApiCrawler
  def is_instance_type?(domain) do
    case get("https://#{domain}/api/v1/instance") do
      {:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
      {:error, _error} -> false
    end
  end

  @impl ApiCrawler
  def crawl(domain) do
    instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)

    if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) do
      crawl_large_instance(domain, instance)
    else
      Map.merge(
        Map.merge(
          Map.take(instance, ["version", "description"]),
          Map.take(instance["stats"], ["user_count", "status_count"])
        )
        |> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
        %{peers: [], interactions: %{}, statuses_seen: 0}
      )
    end
  end

  @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
  defp crawl_large_instance(domain, instance) do
    # servers may not publish peers
    peers =
      case get("https://#{domain}/api/v1/instance/peers") do
        {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
        {:error, _error} -> []
      end

    Logger.debug("Found #{length(peers)} peers.")

    {interactions, statuses_seen} = get_interactions(domain)

    Logger.debug(
      "#{domain}: found #{
        interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end)
      } mentions in #{statuses_seen} statuses."
    )

    Map.merge(
      Map.merge(
        Map.take(instance, ["version", "description"]),
        Map.take(instance["stats"], ["user_count", "status_count"])
      )
      |> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
      %{peers: peers, interactions: interactions, statuses_seen: statuses_seen}
    )
  end

  @spec get_interactions(
          String.t(),
          String.t() | nil,
          Calendar.naive_datetime() | nil,
          ApiCrawler.instance_interactions(),
          integer
        ) :: {ApiCrawler.instance_interactions(), integer}
  defp get_interactions(
         domain,
         max_id \\ nil,
         min_timestamp \\ nil,
         interactions \\ %{},
         statuses_seen \\ 0
       ) do
    # If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
    # most recent status we have.
    min_timestamp =
      if statuses_seen == 0 do
        get_last_successful_crawl_timestamp(domain)
      else
        min_timestamp
      end

    endpoint = "https://#{domain}/api/v1/timelines/public?local=true"

    endpoint =
      if max_id do
        endpoint <> "&max_id=#{max_id}"
      else
        endpoint
      end

    Logger.debug("Crawling #{endpoint}")

    statuses =
      endpoint
      |> get!()
      |> Map.get(:body)
      |> Jason.decode!()

    filtered_statuses =
      statuses
      |> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)

    if length(filtered_statuses) > 0 do
      # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
      interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
      statuses_seen = statuses_seen + length(filtered_statuses)

      status_datetime_threshold =
        NaiveDateTime.utc_now()
        |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)

      oldest_status = Enum.at(filtered_statuses, -1)

      oldest_status_datetime =
        oldest_status
        |> (fn s -> s["created_at"] end).()
        |> NaiveDateTime.from_iso8601!()

      if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
           statuses_seen < get_config(:status_count_limit) and
           length(filtered_statuses) == length(statuses) do
        get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
      else
        {interactions, statuses_seen}
      end
    else
      {interactions, statuses_seen}
    end
  end

  # To check if the endpoint works as expected
  @spec has_title?(String.t()) :: boolean
  defp has_title?(body) do
    case Jason.decode(body) do
      {:ok, decoded} -> Map.has_key?(decoded, "title")
      {:error, _error} -> false
    end
  end

  # Checks whether the status contains one or more mentions
  defp is_mention?(status) do