refactor/elixir backend

pull/1/head
Tao Bojlén 2019-07-14 11:47:06 +00:00
parent ab8e2b09d0
commit a37452f138
109 changed files with 5339 additions and 3675 deletions

.dokku-monorepo 100644
View File

@@ -0,0 +1,2 @@
backend=backend
gephi=gephi

.gitignore vendored
View File

@@ -1,93 +1,9 @@
*.csv
.idea/
backend/backend/static/
backend/static/
*.gexf
backend/whitelist.txt
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
./lib/
./lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
.vscode/
# Environments
.env
@@ -99,15 +15,84 @@ ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# The directory Mix will write compiled artifacts to.
/backend/_build/
# Rope project settings
.ropeproject
# If you run "mix test --cover", coverage assets end up here.
/backend/cover/
# mkdocs documentation
/site
# The directory Mix downloads your dependencies sources to.
/backend/deps/
# mypy
.mypy_cache/
# Where 3rd-party dependencies like ExDoc output generated docs.
/backend/doc/
# Ignore .fetch files in case you like to edit your project deps locally.
/backend/.fetch
# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump
# Also ignore archive artifacts (built via "mix archive.build").
*.ez
# Ignore package tarball (built via "mix hex.build").
backend-*.tar
# Since we are building assets from assets/,
# we ignore priv/static. You may want to comment
# this depending on your deployment strategy.
/backend/priv/static/
# Files matching config/*.secret.exs pattern contain sensitive
# data and you should not commit them into version control.
#
# Alternatively, you may comment the line below and commit the
# secrets files as long as you replace their contents by environment
# variables.
/backend/config/*.secret.exs
/backend/.elixir_ls/
*.pot
*.po
# dependencies
/frontend/node_modules
# testing
/frontend/coverage
# production
/frontend/build
# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*
/gephi/.gradle/
/gephi/build/
/gephi/lib/*
!/gephi/lib/.gitkeep
# 64MB file but I don't have much faith that it'll remain available...
!/gephi/lib/gephi-toolkit-0.9.2.jar
*/.idea/
# Ignore Gradle GUI config
/gephi/gradle-app.setting
# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
!/gephi/gradle-wrapper.jar
# Cache of project
/gephi/.gradletasknamecache
*.javac

View File

@@ -0,0 +1,9 @@
_build/
deps/
.git/
.gitignore
Dockerfile
Makefile
README*
test/
priv/static/

View File

@@ -0,0 +1,5 @@
[
import_deps: [:ecto, :phoenix],
inputs: ["*.{ex,exs}", "priv/*/seeds.exs", "{config,lib,test}/**/*.{ex,exs}"],
subdirectories: ["priv/*/migrations"]
]

View File

@@ -1,12 +1,53 @@
FROM python:3
ENV PYTHONUNBUFFERED 1
FROM elixir:1.9.0-alpine as build
RUN apt-get update && \
apt-get install -qqy --no-install-recommends \
postgresql-client-9.6=9.6.10-0+deb9u1
# install build dependencies
RUN apk add --update git build-base
RUN mkdir /code
WORKDIR /code
COPY requirements.txt /code/
RUN pip install -r requirements.txt
COPY . /code/
# prepare build dir
RUN mkdir /app
WORKDIR /app
# install hex + rebar
RUN mix local.hex --force && \
mix local.rebar --force
# set build ENV
ENV MIX_ENV=prod
# install mix dependencies
COPY mix.exs mix.lock ./
COPY config config
RUN mix deps.get
RUN mix deps.compile
# build assets
# COPY assets assets
# RUN cd assets && npm install && npm run deploy
# RUN mix phx.digest
# build project
COPY priv priv
COPY lib lib
RUN mix compile
# build release
COPY rel rel
RUN mix release
# prepare release image
FROM alpine:3.9 AS app
RUN apk add --update bash openssl
RUN mkdir /app
WORKDIR /app
ENV APP_NAME=backend
COPY --from=build /app/_build/prod/rel/${APP_NAME} ./
RUN chown -R nobody: /app
USER nobody
ENV HOME=/app
# The command to start the backend
CMD trap 'exit' INT; ${HOME}/bin/${APP_NAME} start

backend/README.md 100644
View File

@@ -0,0 +1,33 @@
# fediverse.space backend
## Notes
- This project requires Elixir >= 1.9.
- Run with `SKIP_CRAWL=true` to just run the server (useful for working on the API without also crawling)
## Deployment
Deployment with Docker is handled as per the [Distillery docs](https://hexdocs.pm/distillery/guides/working_with_docker.html).
- To build a new version, run `make build` in this directory.
- To migrate a released version, run `./backend eval "Backend.Release.migrate"`
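The `Backend.Release` module referenced above is not part of this diff. A minimal sketch of what it presumably looks like, following the standard Ecto release-task pattern (the module body below is an assumption, not code from this commit):

defmodule Backend.Release do
  @app :backend

  # Runs all pending Ecto migrations; this is what the eval command above invokes.
  def migrate do
    for repo <- repos() do
      {:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :up, all: true))
    end
  end

  defp repos do
    Application.load(@app)
    Application.fetch_env!(@app, :ecto_repos)
  end
end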
# Default README
To start your Phoenix server:
- Install dependencies with `mix deps.get`
- Create and migrate your database with `mix ecto.setup`
- Start Phoenix endpoint with `mix phx.server`
Now you can visit [`localhost:4000`](http://localhost:4000) from your browser.
Ready to run in production? Please [check our deployment guides](https://hexdocs.pm/phoenix/deployment.html).
## Learn more
- Official website: http://www.phoenixframework.org/
- Guides: https://hexdocs.pm/phoenix/overview.html
- Docs: https://hexdocs.pm/phoenix
- Mailing list: http://groups.google.com/group/phoenix-talk
- Source: https://github.com/phoenixframework/phoenix

View File

@@ -1,8 +0,0 @@
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

View File

@@ -1,5 +0,0 @@
from django.apps import AppConfig
class Apiv1Config(AppConfig):
name = 'apiv1'

View File

@@ -1,105 +0,0 @@
from rest_framework import serializers
import math
from collections import OrderedDict
from scraper.models import Instance, Edge
class InstanceListSerializer(serializers.ModelSerializer):
"""
Minimal instance details used in the full list of instances.
"""
class Meta:
model = Instance
fields = ('name', 'user_count')
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret
class InstanceDetailSerializer(serializers.ModelSerializer):
"""
Detailed instance view.
"""
userCount = serializers.SerializerMethodField()
statusCount = serializers.SerializerMethodField()
domainCount = serializers.SerializerMethodField()
lastUpdated = serializers.SerializerMethodField()
peers = InstanceListSerializer(many=True, read_only=True)
def get_userCount(self, obj):
return obj.user_count
def get_statusCount(self, obj):
return obj.status_count
def get_domainCount(self, obj):
return obj.domain_count
def get_lastUpdated(self, obj):
return obj.last_updated
class Meta:
model = Instance
fields = ('name', 'description', 'version', 'userCount',
'statusCount', 'domainCount', 'peers', 'lastUpdated',
'status')
class EdgeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_pk')
size = serializers.SerializerMethodField('get_weight')
class Meta:
model = Edge
fields = ('source', 'target', 'id', 'size')
def get_pk(self, obj):
return obj.pk
def get_weight(self, obj):
return obj.weight
class NodeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_name')
label = serializers.SerializerMethodField('get_name')
size = serializers.SerializerMethodField()
x = serializers.SerializerMethodField()
y = serializers.SerializerMethodField()
class Meta:
model = Instance
fields = ('id', 'label', 'size', 'x', 'y')
def get_name(self, obj):
return obj.name
def get_size(self, obj):
return math.log(obj.user_count) if (obj.user_count and (obj.user_count > 1)) else 1
def get_x(self, obj):
return obj.x_coord
def get_y(self, obj):
return obj.y_coord
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(NodeSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

View File

@@ -1,37 +0,0 @@
from rest_framework import viewsets
from scraper.models import Instance, Edge
from apiv1.serializers import InstanceListSerializer, InstanceDetailSerializer, NodeSerializer, EdgeSerializer
class InstanceViewSet(viewsets.ReadOnlyModelViewSet):
"""API endpoint to view stats for, and the peers of, an instance"""
lookup_field = 'name'
lookup_value_regex = '[a-zA-Z0-9-_\.]+'
queryset = Instance.objects.all()
serializer_class = InstanceListSerializer
detail_serializer_class = InstanceDetailSerializer # this serializer also includes stats and a list of peers
def get_serializer_class(self):
if self.action == 'retrieve':
if hasattr(self, 'detail_serializer_class'):
return self.detail_serializer_class
return self.serializer_class
class EdgeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's edges in a SigmaJS-friendly format.
"""
queryset = Edge.objects.all()
serializer_class = EdgeSerializer
class NodeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
"""
queryset = Instance.objects.filter(status='success', x_coord__isnull=False, y_coord__isnull=False, user_count__isnull=False)\
.exclude(sources__isnull=True, targets__isnull=True)
serializer_class = NodeSerializer

View File

@@ -1,124 +0,0 @@
"""
Django settings for backend project.
Generated by 'django-admin startproject' using Django 2.1.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""
import os
import json
from django.core.exceptions import ImproperlyConfigured
SECRET_KEY = os.getenv("SECRET_KEY")
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'silk',
'corsheaders',
'scraper.apps.ScraperConfig',
'apiv1.apps.Apiv1Config',
]
MIDDLEWARE = [
'corsheaders.middleware.CorsMiddleware',
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'silk.middleware.SilkyMiddleware',
]
ROOT_URLCONF = 'backend.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [os.path.join(BASE_DIR, '../../frontend/build')],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'backend.wsgi.application'
# Database
# https://docs.djangoproject.com/en/2.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': os.getenv("POSTGRES_DB"),
'USER': os.getenv("POSTGRES_USER"),
'PASSWORD': os.getenv("POSTGRES_PASSWORD"),
'HOST': 'db',
'PORT': 5432,
}
}
# Password validation
# https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_L10N = True
USE_TZ = False
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.1/howto/static-files/
STATIC_URL = '/static/'
STATICFILES_DIRS = []
STATIC_ROOT = os.path.join(BASE_DIR, 'static')

View File

@@ -1,7 +0,0 @@
from .base import *
DEBUG = True
ALLOWED_HOSTS = ['localhost']
CORS_ORIGIN_ALLOW_ALL = True

View File

@@ -1,10 +0,0 @@
from .base import *
DEBUG = False
ALLOWED_HOSTS = ['backend.fediverse.space']
CORS_ORIGIN_REGEX_WHITELIST = [
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse-space\.netlify\.com\/?$',
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse\.space\/?$',
]

View File

@@ -1,37 +0,0 @@
"""backend URL Configuration
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/2.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.urls import path, include
from django.views.generic import TemplateView
from rest_framework import routers
from apiv1 import views
class OptionalTrailingSlashRouter(routers.DefaultRouter):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.trailing_slash = r'/?'
router = OptionalTrailingSlashRouter()
router.register(r'instances', views.InstanceViewSet)
router.register(r'graph/nodes', views.NodeView)
router.register(r'graph/edges', views.EdgeView, base_name='edge')
urlpatterns = [
path('api/v1/', include(router.urls)),
path('silk/', include('silk.urls', namespace='silk')),
]

View File

@@ -1,13 +0,0 @@
"""
WSGI config for backend project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()

View File

@@ -0,0 +1,51 @@
# This file is responsible for configuring your application
# and its dependencies with the aid of the Mix.Config module.
#
# This configuration file is loaded before any dependency and
# is restricted to this project.
# General application configuration
import Config
config :backend,
ecto_repos: [Backend.Repo]
# Configures the endpoint
config :backend, BackendWeb.Endpoint,
url: [host: "localhost"],
secret_key_base: "XL4NKGBN9lZMrQbMEI1KJOlwAt8S7younVJl90TdAgzmwyapr3g7BRYSNYvX0sZ9",
render_errors: [view: BackendWeb.ErrorView, accepts: ~w(json)],
pubsub: [name: Backend.PubSub, adapter: Phoenix.PubSub.PG2]
config :backend, Backend.Repo, queue_target: 5000
# Configures Elixir's Logger
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id]
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 5000,
personal_instance_threshold: 10,
crawl_interval_mins: 30,
crawl_workers: 50,
blacklist: [
"gab.best"
],
user_agent: "fediverse.space crawler"
config :backend, Backend.Scheduler,
jobs: [
# At midnight every day
{"@daily", {Backend.Scheduler, :prune_crawls, [1, "month"]}},
# 00.15 daily
{"15 0 * * *", {Backend.Scheduler, :generate_edges, []}}
]
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"
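The crawler code later in this commit reads the :crawler values above through a helper (see calls like get_config(:crawl_workers) and is_blacklisted?/1). The actual Backend.Util module is not shown in this section; a plausible sketch of how it wraps Application.get_env, with all names inferred from the call sites:

defmodule Backend.Util do
  # Hypothetical sketch: reads one key from the :crawler block configured above.
  @spec get_config(atom()) :: any()
  def get_config(key) do
    Application.get_env(:backend, :crawler)
    |> Keyword.get(key)
  end

  # Hypothetical sketch: true if a domain appears in the configured blacklist.
  @spec is_blacklisted?(String.t()) :: boolean()
  def is_blacklisted?(domain) do
    Enum.member?(get_config(:blacklist), domain)
  end
end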

View File

@@ -0,0 +1,72 @@
import Config
# For development, we disable any cache and enable
# debugging and code reloading.
#
# The watchers configuration can be used to run external
# watchers to your application. For example, we use it
# with webpack to recompile .js and .css sources.
config :backend, BackendWeb.Endpoint,
http: [port: 4000],
debug_errors: true,
code_reloader: true,
check_origin: false,
watchers: []
# ## SSL Support
#
# In order to use HTTPS in development, a self-signed
# certificate can be generated by running the following
# Mix task:
#
# mix phx.gen.cert
#
# Note that this task requires Erlang/OTP 20 or later.
# Run `mix help phx.gen.cert` for more information.
#
# The `http:` config above can be replaced with:
#
# https: [
# port: 4001,
# cipher_suite: :strong,
# keyfile: "priv/cert/selfsigned_key.pem",
# certfile: "priv/cert/selfsigned.pem"
# ],
#
# If desired, both `http:` and `https:` keys can be
# configured to run both http and https servers on
# different ports.
# Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n"
# Set a higher stacktrace during development. Avoid configuring such
# in production as building large stacktraces may be expensive.
config :phoenix, :stacktrace_depth, 20
# Initialize plugs at runtime for faster development compilation
config :phoenix, :plug_init_mode, :runtime
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_dev",
hostname: "localhost",
pool_size: 10
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 100,
personal_instance_threshold: 1,
crawl_interval_mins: 1,
crawl_workers: 10,
blacklist: [
"gab.best"
]
config :backend, Backend.Scheduler,
jobs: [
# Every 15 minutes
{"*/15 * * * *", {Backend.Scheduler, :prune_crawls, [12, "hour"]}}
]

View File

@@ -0,0 +1,57 @@
import Config
# Do not print debug messages in production
config :logger, level: :info
# ## SSL Support
#
# To get SSL working, you will need to add the `https` key
# to the previous section and set your `:url` port to 443:
#
# config :backend, BackendWeb.Endpoint,
# ...
# url: [host: "example.com", port: 443],
# https: [
# :inet6,
# port: 443,
# cipher_suite: :strong,
# keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"),
# certfile: System.get_env("SOME_APP_SSL_CERT_PATH")
# ]
#
# The `cipher_suite` is set to `:strong` to support only the
# latest and more secure SSL ciphers. This means old browsers
# and clients may not be supported. You can set it to
# `:compatible` for wider support.
#
# `:keyfile` and `:certfile` expect an absolute path to the key
# and cert in disk or a relative path inside priv, for example
# "priv/ssl/server.key". For all supported SSL configuration
# options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1
#
# We also recommend setting `force_ssl` in your endpoint, ensuring
# no data is ever sent via http, always redirecting to https:
#
# config :backend, BackendWeb.Endpoint,
# force_ssl: [hsts: true]
#
# Check `Plug.SSL` for all available options in `force_ssl`.
# ## Using releases (distillery)
#
# If you are doing OTP releases, you need to instruct Phoenix
# to start the server for all endpoints:
#
# config :phoenix, :serve_endpoints, true
#
# Alternatively, you can configure exactly which server to
# start per endpoint:
#
# config :backend, BackendWeb.Endpoint, server: true
#
# Note you can't rely on `System.get_env/1` when using releases.
# See the releases documentation accordingly.
# Finally import the config/prod.secret.exs which should be versioned
# separately.
# import_config "prod.secret.exs"

View File

@@ -0,0 +1,27 @@
# This file is for *runtime configuration in releases* only.
# https://hexdocs.pm/phoenix/releases.html#runtime-configuration
import Config
# For production, don't forget to configure the url host
# to something meaningful, Phoenix uses this information
# when generating URLs.
config :backend, Backend.Repo,
# username: System.get_env("POSTGRES_USER"),
# password: System.get_env("POSTGRES_PASSWORD"),
# database: System.get_env("POSTGRES_DB"),
# hostname: System.get_env("POSTGRES_HOSTNAME"),
url: System.get_env("ecto://" <> "DATABASE_URL"),
pool_size: String.to_integer(System.get_env("POOL_SIZE") || "10"),
ssl: true
# show_sensitive_data_on_connection_error: true
port = String.to_integer(System.get_env("PORT") || "4000")
config :backend, BackendWeb.Endpoint,
http: [:inet6, port: port],
url: [host: System.get_env("BACKEND_HOSTNAME"), port: port],
root: ".",
secret_key_base: System.get_env("SECRET_KEY_BASE"),
server: true

View File

@@ -0,0 +1,18 @@
import Config
# We don't run a server during test. If one is required,
# you can enable the server option below.
config :backend, BackendWeb.Endpoint,
http: [port: 4002],
server: false
# Print only warnings and errors during test
config :logger, level: :warn
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_test",
hostname: "localhost",
pool: Ecto.Adapters.SQL.Sandbox
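The Ecto.Adapters.SQL.Sandbox pool configured above is normally checked out per test by a case template. That template is not part of this section; a trimmed sketch of the usual Phoenix-style DataCase it implies (module name and contents are assumptions):

defmodule Backend.DataCase do
  use ExUnit.CaseTemplate

  using do
    quote do
      alias Backend.Repo
      import Ecto.Query
    end
  end

  setup _tags do
    # Each test gets its own transaction-wrapped connection from the sandbox pool.
    :ok = Ecto.Adapters.SQL.Sandbox.checkout(Backend.Repo)
    :ok
  end
end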

View File

@@ -0,0 +1,9 @@
defmodule Backend do
@moduledoc """
Backend keeps the contexts that define your domain
and business logic.
Contexts are also responsible for managing your data, regardless
if it comes from the database, an external API or others.
"""
end

View File

@@ -0,0 +1,68 @@
defmodule Backend.Api do
alias Backend.{Crawl, Edge, Instance, Repo}
import Ecto.Query
@spec list_instances() :: [Instance.t()]
def list_instances() do
Instance
|> Repo.all()
end
@spec get_instance!(String.t()) :: Instance.t()
def get_instance!(domain) do
Instance
|> preload(:peers)
|> Repo.get_by!(domain: domain)
end
@doc """
Returns a list of instances that
* have at least one successful crawl
* have a user count (required to give the instance a size on the graph)
"""
@spec list_nodes() :: [Instance.t()]
def list_nodes() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Instance
|> join(:inner, [i], c in subquery(crawl_subquery), on: i.domain == c.instance_domain)
|> where(
[i, c],
c.crawl_count > 0 and not is_nil(i.user_count) and not is_nil(i.x) and not is_nil(i.y)
)
|> select([c], [:domain, :user_count, :x, :y])
|> Repo.all()
end
@spec list_edges() :: [Edge.t()]
def list_edges() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Edge
|> join(:inner, [e], c1 in subquery(crawl_subquery), on: e.source_domain == c1.instance_domain)
|> join(:inner, [e], c2 in subquery(crawl_subquery), on: e.target_domain == c2.instance_domain)
|> join(:inner, [e], i1 in Instance, on: e.source_domain == i1.domain)
|> join(:inner, [e], i2 in Instance, on: e.target_domain == i2.domain)
|> select([e], [:id, :source_domain, :target_domain, :weight])
|> where(
[e, c1, c2, i1, i2],
c1.crawl_count > 0 and c2.crawl_count > 0 and not is_nil(i1.x) and not is_nil(i1.y) and
not is_nil(i2.x) and not is_nil(i2.y) and e.source_domain != e.target_domain
)
|> Repo.all()
end
end
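These query functions replace the Django NodeView/EdgeView deleted earlier in this commit. The Phoenix controllers that expose them are not included in this section; a hypothetical sketch of how one might call into Backend.Api (controller name, view name, and route are assumptions):

defmodule BackendWeb.GraphController do
  use BackendWeb, :controller

  alias Backend.Api

  # Hypothetical sketch: serves the nodes and edges consumed by the graph visualisation.
  def index(conn, _params) do
    render(conn, "index.json", nodes: Api.list_nodes(), edges: Api.list_edges())
  end
end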

View File

@@ -0,0 +1,46 @@
defmodule Backend.Application do
# See https://hexdocs.pm/elixir/Application.html
# for more information on OTP Applications
@moduledoc false
use Application
require Logger
import Backend.Util
def start(_type, _args) do
crawl_worker_count = get_config(:crawl_workers)
children = [
# Start the Ecto repository
Backend.Repo,
# Start the endpoint when the application starts
BackendWeb.Endpoint,
# Crawler children
:hackney_pool.child_spec(:crawler, timeout: 15000, max_connections: crawl_worker_count),
{Task,
fn ->
Honeydew.start_queue(:crawl_queue, failure_mode: Honeydew.FailureMode.Abandon)
Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: crawl_worker_count)
end},
Backend.Scheduler
]
children =
case Enum.member?(["true", 1, "1"], System.get_env("SKIP_CRAWL")) do
true -> children
false -> children ++ [Backend.Crawler.StaleInstanceManager]
end
# See https://hexdocs.pm/elixir/Supervisor.html
# for other strategies and supported options
opts = [strategy: :one_for_one, name: Backend.Supervisor]
Supervisor.start_link(children, opts)
end
# Tell Phoenix to update the endpoint configuration
# whenever the application is updated.
def config_change(changed, _new, removed) do
BackendWeb.Endpoint.config_change(changed, removed)
:ok
end
end
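With the Honeydew queue and workers started above, crawls are dispatched as jobs; in this commit that is the responsibility of Backend.Crawler.StaleInstanceManager, which is not shown here. Purely as an illustration of the queue API, enqueueing a single crawl would look roughly like this (the domain is an arbitrary example):

# Illustration only: enqueue Backend.Crawler.run/1 for one domain on the queue
# started in Backend.Application above.
Honeydew.async({:run, ["example.social"]}, :crawl_queue)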

View File

@@ -0,0 +1,26 @@
defmodule Backend.Crawl do
use Ecto.Schema
import Ecto.Changeset
schema "crawls" do
belongs_to :instance, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :instance_domain
field :interactions_seen, :integer
field :statuses_seen, :integer
# set if something went wrong, otherwise nil
field :error, :string
timestamps()
end
@doc false
def changeset(crawl, attrs) do
crawl
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
|> validate_required([:instance])
end
end

View File

@@ -0,0 +1,29 @@
defmodule Backend.CrawlInteraction do
use Ecto.Schema
import Ecto.Changeset
schema "crawl_interactions" do
belongs_to :crawl, Backend.Crawl
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
field :mentions, :integer
timestamps()
end
@doc false
def changeset(crawl_interaction, attrs) do
crawl_interaction
|> cast(attrs, [:crawl, :source, :target, :mentions])
|> validate_required([:crawl, :source, :target, :mentions])
end
end

View File

@@ -0,0 +1,45 @@
defmodule Backend.Crawler.ApiCrawler do
@moduledoc """
This module is a specification. Crawlers for all instance types must implement its behaviour.
Make sure to respect the following:
* You must adhere to the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* accounts with the string "nobot" (case insensitive) in their profile must not be included in any stats
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
# maps each mentioned domain to the number of times it was mentioned
@type instance_interactions :: %{String.t() => integer}
defstruct [
:version,
:description,
:user_count,
:status_count,
:peers,
:interactions,
:statuses_seen
]
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
user_count: integer,
status_count: integer,
peers: [String.t()],
interactions: instance_interactions,
statuses_seen: integer
}
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
"""
@callback is_instance_type?(String.t()) :: boolean()
@doc """
Crawl the instance at the given domain.
"""
@callback crawl(String.t()) :: t()
end
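For illustration, a minimal (hypothetical) module satisfying this behaviour could look like the following; the only real implementation in this commit is Backend.Crawler.Crawlers.Mastodon further down. The module name and stub values below are assumptions:

defmodule Backend.Crawler.Crawlers.Example do
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  @impl ApiCrawler
  def is_instance_type?(_domain), do: false

  @impl ApiCrawler
  def crawl(_domain) do
    # Return an empty result in the shape defined by the behaviour's struct.
    %ApiCrawler{
      version: nil,
      description: nil,
      user_count: 0,
      status_count: 0,
      peers: [],
      interactions: %{},
      statuses_seen: 0
    }
  end
end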

View File

@@ -0,0 +1,196 @@
defmodule Backend.Crawler do
@moduledoc """
This module crawls instances. Run `run(domain)` to crawl a given domain.
"""
alias __MODULE__
alias Backend.Crawler.Crawlers.Mastodon
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
import Ecto.Query
import Backend.Util
require Logger
defstruct [
# the instance domain (a string)
:domain,
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
:result,
:error
]
@type t() :: %__MODULE__{
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state
# register APICrawlers here
|> register(Mastodon)
# go!
|> crawl()
|> save()
end
# Adds a new ApiCrawler that run/1 will check.
defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
Map.put(state, :api_crawlers, [api_crawler | crawlers])
end
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
Logger.debug("Found no compatible API for #{domain}")
Map.put(state, :found_api?, false)
end
defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
if curr.is_instance_type?(domain) do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
try do
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
e in Jason.DecodeError ->
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
e ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{domain} is not an instance of #{curr}")
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
end
end
# Save the state (after crawling) to the database.
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
## Update the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
updated_at: now
]
],
conflict_target: :domain
)
# Save details of a new crawl
curr_crawl =
Repo.insert!(%Crawl{
instance_domain: domain,
interactions_seen:
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
statuses_seen: result.statuses_seen
})
# We get a list of peers from two places:
# * the official peers endpoint (which may be disabled)
# * the interactions
peers_domains =
result.interactions
|> Map.keys()
|> list_union(result.peers)
|> Enum.filter(fn domain -> not is_blacklisted?(domain) end)
peers =
peers_domains
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
Repo.transaction(fn ->
## Save peer relationships ##
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> select([p], p.target_domain)
|> Repo.all()
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
if length(dont_want) > 0 do
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
end
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: &1,
inserted_at: now,
updated_at: now
}
)
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
## Save interactions ##
interactions =
result.interactions
|> Enum.filter(fn {target_domain, _count} -> not is_blacklisted?(target_domain) end)
|> Enum.map(fn {target_domain, count} ->
%{
crawl_id: curr_crawl.id,
source_domain: domain,
target_domain: target_domain,
mentions: count,
inserted_at: now,
updated_at: now
}
end)
CrawlInteraction
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
Repo.insert!(%Crawl{
instance_domain: domain,
error: error
})
end
end

View File

@@ -0,0 +1,193 @@
defmodule Backend.Crawler.Crawlers.Mastodon do
require Logger
import Backend.Crawler.Util
alias Backend.Crawler.ApiCrawler
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
case get("https://#{domain}/api/v1/instance") do
{:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
{:error, _error} -> false
end
end
@impl ApiCrawler
def crawl(domain) do
instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) do
crawl_large_instance(domain, instance)
else
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: [], interactions: %{}, statuses_seen: 0}
)
end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
# servers may not publish peers
peers =
case get("https://#{domain}/api/v1/instance/peers") do
{:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
{:error, _error} -> []
end
Logger.debug("Found #{length(peers)} peers.")
{interactions, statuses_seen} = get_interactions(domain)
Logger.debug(
"#{domain}: found #{
interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end)
} mentions in #{statuses_seen} statuses."
)
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: peers, interactions: interactions, statuses_seen: statuses_seen}
)
end
@spec get_interactions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
ApiCrawler.instance_interactions(),
integer
) :: {ApiCrawler.instance_interactions(), integer}
defp get_interactions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
interactions \\ %{},
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
get_last_successful_crawl_timestamp(domain)
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
statuses =
endpoint
|> get!()
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> Map.get("created_at")
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
else
{interactions, statuses_seen}
end
else
{interactions, statuses_seen}
end
end
# Checks whether the /api/v1/instance response looks as expected (i.e. the decoded body has a "title" key)
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
case Jason.decode(body) do
{:ok, decoded} -> Map.has_key?(decoded, "title")
{:error, _error} -> false
end
end
# Checks whether the status contains one or more mentions
defp is_mention?(status) do