Nginx now bundled in image, does media caching

Also serves static files. Old media caching removed.
2022-12-19 04:26:42 +00:00 · 2022-12-19 04:26:42 +00:00 · a26263fb05
commit a26263fb05
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -9,6 +9,8 @@ RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libpq5 \
        libxslt1.1 \
+        nginx \
+        busybox \
    && rm -rf /var/lib/apt/lists/*

 COPY requirements.txt requirements.txt
@ -30,6 +32,8 @@ RUN apt-get update \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

+RUN mkdir -p /cache
+
 COPY . /takahe

 WORKDIR /takahe
@ -41,4 +45,4 @@ EXPOSE 8000
 # Set some sensible defaults
 ENV GUNICORN_CMD_ARGS="--workers 8"

-CMD ["gunicorn", "takahe.wsgi:application", "-b", "0.0.0.0:8000"]
+CMD ["bash", "docker/run.sh"]
--- a/docker/nginx.conf
+++ b/docker/nginx.conf
@ -0,0 +1,63 @@
+daemon off;  # stay in the foreground instead of daemonizing
+error_log /dev/stdout info;  # write the error log to stdout at "info" level
+
+events {
+  worker_connections 4096;
+}
+
+http {
+
+    include /etc/nginx/mime.types;
+    # Disk cache for proxied responses; docker/run.sh fills in max_size at startup.
+    proxy_cache_path /cache/nginx levels=1:2 keys_zone=takahe:20m inactive=14d max_size=__CACHESIZE__;
+
+    upstream takahe {  # the gunicorn app server started by docker/run.sh
+        server "127.0.0.1:8001";
+    }
+
+    server {
+        listen 8000;
+        listen [::]:8000;
+        server_name _;
+
+        root /takahe/static;
+        index index.html;
+
+        ignore_invalid_headers on;
+        proxy_connect_timeout 900;
+
+        proxy_headers_hash_max_size 1024;
+        proxy_headers_hash_bucket_size 128;
+
+        client_max_body_size 512M;
+        client_body_buffer_size 128k;
+        charset utf-8;
+
+        proxy_set_header Host $http_host;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_http_version 1.1;
+        proxy_cache takahe;  # use the "takahe" cache zone declared above
+
+        location /static/ {  # served directly from disk, not proxied
+            alias /takahe/static-collected/;
+        }
+
+        location ~* ^/(media|proxy) {  # case-insensitive: paths starting /media or /proxy
+            proxy_cache_key $host$uri;  # $uri carries no query string
+            proxy_cache_valid 200 304 720h;
+            proxy_cache_valid 301 307 12h;
+            proxy_cache_valid 500 502 503 504 0s;  # do not cache these upstream errors
+            proxy_cache_valid any 72h;  # all remaining status codes
+
+            add_header X-Cache $upstream_cache_status;  # expose cache status to clients
+
+            proxy_pass http://takahe;
+        }
+
+        location / {  # everything else goes to the app
+            proxy_redirect   off;
+            proxy_buffering  off;  # pass responses through without buffering
+            proxy_pass       http://takahe;
+        }
+    }
+}
--- a/docker/run.sh
+++ b/docker/run.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Set up cache size
+CACHE_SIZE="${TAKAHE_NGINX_CACHE_SIZE:-1g}"
+sed -i s/__CACHESIZE__/${CACHE_SIZE}/g /takahe/docker/nginx.conf
+
+# Run nginx and gunicorn
+nginx -c "/takahe/docker/nginx.conf" &
+gunicorn takahe.wsgi:application -b 0.0.0.0:8001 &
+
+# Wait for any process to exit
+wait -n
+
+# Exit with status of process that exited first
+exit $?
--- a/docs/installation.rst
+++ b/docs/installation.rst
@ -45,6 +45,10 @@ You'll need to run two copies of our `Docker image <https://hub.docker.com/r/joi

 * One with the arguments ``python3 manage.py runstator``, which will run the background worker

+These containers will need the ability to write at least 1GB of files out
+to their scratch disks. See the ``TAKAHE_NGINX_CACHE_SIZE`` environment
+variable for more.
+
 .. note::

    If you cannot run a background worker for some reason, you can instead
@ -59,7 +63,9 @@ project, so if you know what you're doing, go for it - but we won't be able
 to give you support.

 If you are running on Kubernetes, we recommend that you make one Deployment
-for the webserver and one Deployment for the background worker.
+for the webserver and one Deployment for the background worker. We also
+recommend that you mount an ``emptyDir`` to the ``/cache/`` path on the
+webserver containers, as this is where the media cache will be stored.


 Environment Variables
@ -113,6 +119,12 @@ be provided to the containers from the first boot.
  ``["andrew@aeracode.org"]`` (if you're doing this via shell, be careful
  about escaping!)

+In addition, there are some optional variables you can set:
+
+* ``TAKAHE_NGINX_CACHE_SIZE`` allows you to specify the size of the disk cache
+  that is used to cache proxied avatars, profile images and media. See
+  :doc:`tuning` for more.
+

 .. _media_configuration:

--- a/docs/tuning.rst
+++ b/docs/tuning.rst
@ -9,26 +9,6 @@ We recommend that all installations are run behind a CDN, and
 have caches configured. See below for more details on each.


-CDNs
----
-
-Takahē is *designed to be run behind a CDN*. It serves most static files directly
-from its main webservers, which is inefficient if called directly, but they
-have ``Cache-Control`` headers set so that the CDN can do the heavy lifting -
-more efficiently than offloading all files to something like S3.
-
-If you don't run behind a CDN, things will still work, but even a medium
-level of traffic might put the webservers under a lot of load.
-
-If you do run behind a CDN, ensure that your CDN is set to respect
-``Cache-Control`` headers from the origin. Some CDNs go purely off of file
-extensions by default, which will not capture all of the proxy views Takahē
-uses to show remote images without leaking user information.
-
-If you don't want to use a CDN but still want a performance improvement, a
-read-through cache that respects ``Cache-Control``, like Varnish, will
-also help if placed in front of Takahē.
-

 Scaling
 -------
@ -88,11 +68,7 @@ servers may consider it permanently unreachable and stop sending posts.
 Caching
 -------

-By default Takakē has caching disabled. The caching needs of a server can
-varying drastically based upon the number of users and how interconnected
-they are with other servers.
-
-There are multiple ways Takahē uses caches:
+There are two ways Takahē uses caches:

 * For caching rendered pages and responses, like user profile information.
  These caches reduce database load on your server and improve performance.
@ -101,32 +77,26 @@ There are multiple ways Takahē uses caches:
  proxied to protect your users' privacy; also caching these reduces
  your server's consumed bandwidth and improves users' loading times.

-The exact caches you can configure are:
+By default Takahē has Nginx inside its container image configured to perform
+read-through HTTP caching for the image and media files, and no cache
+configured for page rendering.

-* ``TAKAHE_CACHES_DEFAULT``: Rendered page and response caching
+Each cache can be adjusted to your needs; let's talk about both.

-* ``TAKAHE_CACHES_MEDIA``: Remote post images and user profile header pictures

-* ``TAKAHE_CACHES_AVATARS``: Remote user avatars ("icons") only
+Page Caching
+~~~~~~~~~~~~

-We recommend you set up ``TAKAHE_CACHES_MEDIA`` and ``TAKAHE_CACHES_AVATARS``
-at a bare minimum - proxying these all the time without caching will eat into
-your server's bandwidth.
+This caching helps Takahē avoid database hits by rendering complex pages or
+API endpoints only once, and turning it on will reduce your database load.
+There is no cache enabled for this by default.

-All caches are configured the same way - with a custom cache URI/URL. We
-support anything that is available as part of
+To configure it, set the ``TAKAHE_CACHES_DEFAULT`` environment variable.
+We support anything that is available as part of
 `django-cache-url <https://github.com/epicserve/django-cache-url>`_, but
 some cache backends will require additional Python packages not installed
-by default with Takahē. More discussion on backend is below.
+by default with Takahē. More discussion on some major backends is below.

-All items in the cache come with an expiry set - usually one week - but you
-can also configure a maximum cache size on dedicated cache datastores like
-Memcache. The key names used by the caches do not overlap, so there is
-no need to configure different key prefixes for each of Takahē's caches.
-
-
-Backends
-~~~~~~~~

 Redis
 #####
@ -140,11 +110,6 @@ Examples::
 A Redis-protocol server. Use ``redis://`` for unencrypted communication and
 ``rediss://`` for TLS.

-Redis has a large item size limit and is suitable for all caches. We recommend
-that you keep the DEFAULT cache separate from the MEDIA and AVATARS caches, and
-set the ``maxmemory`` on both to appropriate values (the proxying caches will
-need more memory than the DEFAULT cache).
-


 Memcache
@ -157,9 +122,6 @@ Examples::

 A remote Memcache-protocol server (or set of servers).

-Memcached has a 1MB limit per key by default, so this is only suitable for the
-DEFAULT cache and not the AVATARS or MEDIA cache.
-

 Filesystem
 ##########
@ -168,10 +130,8 @@ Examples::

  file:///var/cache/takahe/

-A cache on the local disk.
-
-This *will* work with any of the cache backends, but is probably more suitable
-for MEDIA and AVATARS.
+A cache on the local disk. Slower than other options, and only really useful
+if you have no other choice.

 Note that if you are running Takahē in a cluster, this cache will not be shared
 across different machines. This is not quite as bad as it first seems; it just
@ -187,4 +147,52 @@ Examples::
  locmem://default

 A local memory cache, inside the Python process. This will consume additional
-memory for the process, and should not be used with the MEDIA or AVATARS caches.
+memory for the process, and should be used with care.
+
+
+Image and Media Caching
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to protect your users' privacy and IP addresses, we can't just send
+them the remote URLs of user avatars and post images that aren't on your
+server; we instead need to proxy them through Takahē in order to obscure who
+is requesting them.
+
+Some other ActivityPub servers do this by downloading all media and images as
+soon as they see it, and storing it all locally with some sort of clean-up job;
+Takahē instead opts for using a read-through cache for this task, which uses
+a bit more bandwidth in the long run but which has much easier maintenance and
+better failure modes.
+
+Our Docker image comes with this cache built in, as without it you'll be making
+Python do a lot of file proxying on every page load (and it's not the best at
+that). It's set to 1GB of disk on each container by default, but you can adjust
+this by setting the ``TAKAHE_NGINX_CACHE_SIZE`` environment variable to a value
+Nginx understands, like ``10g``.
+
+The cache directory is ``/cache/``, and you can mount a different disk into
+this path if you'd like to give it faster or more ephemeral storage.
+
+If you have an external CDN or cache, you can also opt to add your own caching
+to these URLs; they all begin with ``/proxy/``, and have appropriate
+``Cache-Control`` headers set.
+
+
+CDNs
+----
+
+Takahē can be run behind a CDN if you want to offset some of the load from the
+webserver containers. Takahē has to proxy all remote user avatars and images in
+order to protect the privacy of your users, and has a built-in cache to help
+with this (see "Caching" above), but at large scale this might start to get
+strained.
+
+If you do run behind a CDN, ensure that your CDN is set to respect
+``Cache-Control`` headers from the origin. Some CDNs decide what to cache
+purely from file extensions by default, which will not capture all of the
+proxy views Takahē uses to show remote images without leaking user
+information.
+
+If you don't want to use a CDN but still want a performance improvement, a
+read-through cache that respects ``Cache-Control``, like Varnish, will
+also help if placed in front of Takahē.
--- a/mediaproxy/views.py
+++ b/mediaproxy/views.py
@ -1,6 +1,5 @@
 import httpx
 from django.conf import settings
-from django.core.cache import caches
 from django.http import Http404, HttpResponse
 from django.shortcuts import get_object_or_404
 from django.views.generic import View
@ -9,46 +8,31 @@ from activities.models import Emoji, PostAttachment
 from users.models import Identity


-class BaseCacheView(View):
+class BaseProxyView(View):
    """
-    Base class for caching remote content.
+    Base class for proxying remote content.
    """

-    cache_name = "media"
-    item_timeout: int | None = None
-
    def get(self, request, **kwargs):
        self.kwargs = kwargs
        remote_url = self.get_remote_url()
-        cache = caches[self.cache_name]
-        cache_key = "proxy_" + remote_url
-        # See if it's already cached
-        cached_content = cache.get(cache_key)
-        if not cached_content:
-            # OK, fetch and cache it
-            try:
-                remote_response = httpx.get(
-                    remote_url,
-                    headers={"User-Agent": settings.TAKAHE_USER_AGENT},
-                    follow_redirects=True,
-                    timeout=settings.SETUP.REMOTE_TIMEOUT,
-                )
-            except httpx.RequestError:
-                return HttpResponse(status=502)
-            if remote_response.status_code >= 400:
-                return HttpResponse(status=502)
-            # We got it - shove it into the cache
-            cached_content = {
-                "content": remote_response.content,
-                "mimetype": remote_response.headers.get(
+        try:
+            remote_response = httpx.get(
+                remote_url,
+                headers={"User-Agent": settings.TAKAHE_USER_AGENT},
+                follow_redirects=True,
+                timeout=settings.SETUP.REMOTE_TIMEOUT,
+            )
+        except httpx.RequestError:
+            return HttpResponse(status=502)
+        if remote_response.status_code >= 400:
+            return HttpResponse(status=502)
+        return HttpResponse(
+            remote_response.content,
+            headers={
+                "Content-Type": remote_response.headers.get(
                    "Content-Type", "application/octet-stream"
                ),
-            }
-            cache.set(cache_key, cached_content, timeout=self.item_timeout)
-        return HttpResponse(
-            cached_content["content"],
-            headers={
-                "Content-Type": cached_content["mimetype"],
                "Cache-Control": "public, max-age=3600",
            },
        )
@ -57,13 +41,11 @@ class BaseCacheView(View):
        raise NotImplementedError()


-class EmojiCacheView(BaseCacheView):
+class EmojiCacheView(BaseProxyView):
    """
-    Caches Emoji
+    Proxies Emoji
    """

-    item_timeout = 86400 * 7  # One week
-
    def get_remote_url(self):
        self.emoji = get_object_or_404(Emoji, pk=self.kwargs["emoji_id"])

@ -72,14 +54,11 @@ class EmojiCacheView(BaseCacheView):
        return self.emoji.remote_url


-class IdentityIconCacheView(BaseCacheView):
+class IdentityIconCacheView(BaseProxyView):
    """
-    Caches identity icons (avatars)
+    Proxies identity icons (avatars)
    """

-    cache_name = "avatars"
-    item_timeout = 86400 * 7  # One week
-
    def get_remote_url(self):
        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
        if self.identity.local or not self.identity.icon_uri:
@ -87,13 +66,11 @@ class IdentityIconCacheView(BaseCacheView):
        return self.identity.icon_uri


-class IdentityImageCacheView(BaseCacheView):
+class IdentityImageCacheView(BaseProxyView):
    """
-    Caches identity profile header images
+    Proxies identity profile header images
    """

-    item_timeout = 86400 * 7  # One week
-
    def get_remote_url(self):
        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
        if self.identity.local or not self.identity.image_uri:
@ -101,13 +78,11 @@ class IdentityImageCacheView(BaseCacheView):
        return self.identity.image_uri


-class PostAttachmentCacheView(BaseCacheView):
+class PostAttachmentCacheView(BaseProxyView):
    """
-    Caches post media (images only, videos should always be offloaded to remote)
+    Proxies post media (images only, videos should always be offloaded to remote)
    """

-    item_timeout = 86400 * 7  # One week
-
    def get_remote_url(self):
        self.post_attachment = get_object_or_404(
            PostAttachment, pk=self.kwargs["attachment_id"]
--- a/takahe/settings.py
+++ b/takahe/settings.py
@ -127,12 +127,6 @@ class Settings(BaseSettings):
    #: Default cache backend
    CACHES_DEFAULT: CacheBackendUrl | None = None

-    #: User icon (avatar) caching backend
-    CACHES_AVATARS: CacheBackendUrl | None = None
-
-    #: Media caching backend
-    CACHES_MEDIA: CacheBackendUrl | None = None
-
    PGHOST: str | None = None
    PGPORT: int | None = 5432
    PGNAME: str = "takahe"
@ -385,8 +379,6 @@ if SETUP.MEDIA_BACKEND:

 CACHES = {
    "default": django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://"),
-    "avatars": django_cache_url.parse(SETUP.CACHES_AVATARS or "dummy://"),
-    "media": django_cache_url.parse(SETUP.CACHES_MEDIA or "dummy://"),
 }

 if SETUP.ERROR_EMAILS: