From a26263fb05baf123ba47a39e52e749c1bce73264 Mon Sep 17 00:00:00 2001 From: Andrew Godwin Date: Mon, 19 Dec 2022 04:26:42 +0000 Subject: [PATCH] Nginx now bundled in image, does media caching Also serves static files. Old media caching removed. --- docker/Dockerfile | 6 ++- docker/nginx.conf | 63 ++++++++++++++++++++++ docker/run.sh | 15 ++++++ docs/installation.rst | 14 ++++- docs/tuning.rst | 120 ++++++++++++++++++++++-------------------- mediaproxy/views.py | 75 +++++++++----------------- takahe/settings.py | 8 --- 7 files changed, 185 insertions(+), 116 deletions(-) create mode 100644 docker/nginx.conf create mode 100644 docker/run.sh diff --git a/docker/Dockerfile b/docker/Dockerfile index 36f1820..44eed35 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,6 +9,8 @@ RUN apt-get update \ && apt-get install -y --no-install-recommends \ libpq5 \ libxslt1.1 \ + nginx \ + busybox \ && rm -rf /var/lib/apt/lists/* COPY requirements.txt requirements.txt @@ -30,6 +32,8 @@ RUN apt-get update \ zlib1g-dev \ && rm -rf /var/lib/apt/lists/* +RUN mkdir -p /cache + COPY . 
/takahe WORKDIR /takahe @@ -41,4 +45,4 @@ EXPOSE 8000 # Set some sensible defaults ENV GUNICORN_CMD_ARGS="--workers 8" -CMD ["gunicorn", "takahe.wsgi:application", "-b", "0.0.0.0:8000"] +CMD ["bash", "docker/run.sh"] diff --git a/docker/nginx.conf b/docker/nginx.conf new file mode 100644 index 0000000..a4ce277 --- /dev/null +++ b/docker/nginx.conf @@ -0,0 +1,63 @@ +daemon off; +error_log /dev/stdout info; + +events { + worker_connections 4096; +} + +http { + + include /etc/nginx/mime.types; + + proxy_cache_path /cache/nginx levels=1:2 keys_zone=takahe:20m inactive=14d max_size=__CACHESIZE__; + + upstream takahe { + server "127.0.0.1:8001"; + } + + server { + listen 8000; + listen [::]:8000; + server_name _; + + root /takahe/static; + index index.html; + + ignore_invalid_headers on; + proxy_connect_timeout 900; + + proxy_headers_hash_max_size 1024; + proxy_headers_hash_bucket_size 128; + + client_max_body_size 512M; + client_body_buffer_size 128k; + charset utf-8; + + proxy_set_header Host $http_host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_http_version 1.1; + proxy_cache takahe; + + location /static/ { + alias /takahe/static-collected/; + } + + location ~* ^/(media|proxy) { + proxy_cache_key $host$uri; + proxy_cache_valid 200 304 720h; + proxy_cache_valid 301 307 12h; + proxy_cache_valid 500 502 503 504 0s; + proxy_cache_valid any 72h; + + add_header X-Cache $upstream_cache_status; + + proxy_pass http://takahe; + } + + location / { + proxy_redirect off; + proxy_buffering off; + proxy_pass http://takahe; + } + } +} diff --git a/docker/run.sh b/docker/run.sh new file mode 100644 index 0000000..a825de6 --- /dev/null +++ b/docker/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Set up cache size +CACHE_SIZE="${TAKAHE_NGINX_CACHE_SIZE:-1g}" +sed -i s/__CACHESIZE__/${CACHE_SIZE}/g /takahe/docker/nginx.conf + +# Run nginx and gunicorn +nginx -c "/takahe/docker/nginx.conf" & +gunicorn takahe.wsgi:application -b 0.0.0.0:8001 & + +# Wait for any 
process to exit +wait -n + +# Exit with status of process that exited first +exit $? diff --git a/docs/installation.rst b/docs/installation.rst index 003da03..78f3797 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -45,6 +45,10 @@ You'll need to run two copies of our `Docker image `_, but some cache backends will require additional Python packages not installed -by default with Takahē. More discussion on backend is below. +by default with Takahē. More discussion on some major backends is below. -All items in the cache come with an expiry set - usually one week - but you -can also configure a maximum cache size on dedicated cache datastores like -Memcache. The key names used by the caches do not overlap, so there is -no need to configure different key prefixes for each of Takahē's caches. - - -Backends -~~~~~~~~ Redis ##### @@ -140,11 +110,6 @@ Examples:: A Redis-protocol server. Use ``redis://`` for unencrypted communication and ``rediss://`` for TLS. -Redis has a large item size limit and is suitable for all caches. We recommend -that you keep the DEFAULT cache separate from the MEDIA and AVATARS caches, and -set the ``maxmemory`` on both to appropriate values (the proxying caches will -need more memory than the DEFAULT cache). - Memcache @@ -157,9 +122,6 @@ Examples:: A remote Memcache-protocol server (or set of servers). -Memcached has a 1MB limit per key by default, so this is only suitable for the -DEFAULT cache and not the AVATARS or MEDIA cache. - Filesystem ########## @@ -168,10 +130,8 @@ Examples:: file:///var/cache/takahe/ -A cache on the local disk. - -This *will* work with any of the cache backends, but is probably more suitable -for MEDIA and AVATARS. +A cache on the local disk. Slower than other options, and only really useful +if you have no other choice. Note that if you are running Takahē in a cluster, this cache will not be shared across different machines. 
This is not quite as bad as it first seems; it just @@ -187,4 +147,52 @@ Examples:: locmem://default A local memory cache, inside the Python process. This will consume additional -memory for the process, and should not be used with the MEDIA or AVATARS caches. +memory for the process, and should be used with care. + + +Image and Media Caching +~~~~~~~~~~~~~~~~~~~~~~~ + +In order to protect your users' privacy and IP addresses, we can't just send +them the remote URLs of user avatars and post images that aren't on your +server; we instead need to proxy them through Takahē in order to obscure who +is requesting them. + +Some other ActivityPub servers do this by downloading all media and images as +soon as they see it, and storing it all locally with some sort of clean-up job; +Takahē instead opts for using a read-through cache for this task, which uses +a bit more bandwidth in the long run but which has much easier maintenance and +better failure modes. + +Our Docker image comes with this cache built in, as without it you'll be making +Python do a lot of file proxying on every page load (and it's not the best at +that). It's set to 1GB of disk on each container by default, but you can adjust +this by setting the ``TAKAHE_NGINX_CACHE_SIZE`` environment variable to a value +Nginx understands, like ``10g``. + +The cache directory is ``/cache/``, and you can mount a different disk into +this path if you'd like to give it faster or more ephemeral storage. + +If you have an external CDN or cache, you can also opt to add your own caching +to these URLs; they all begin with ``/proxy/``, and have appropriate +``Cache-Control`` headers set. + + +CDNs +---- + +Takahē can be run behind a CDN if you want to offset some of the load from the +webserver containers. 
Takahē has to proxy all remote user avatars and images in +order to protect the privacy of your users, and has a built-in cache to help +with this (see "Caching" above), but at large scale this might start to get +strained. + +If you do run behind a CDN, ensure that your CDN is set to respect +``Cache-Control`` headers from the origin rather than going purely off of file +extensions. Many CDNs cache based purely on file extension by default, +which will not capture all of the proxy views Takahē +uses to show remote images without leaking user information. + +If you don't want to use a CDN but still want a performance improvement, a +read-through cache that respects ``Cache-Control``, like Varnish, will +also help if placed in front of Takahē. diff --git a/mediaproxy/views.py b/mediaproxy/views.py index 7e16e04..c745e7f 100644 --- a/mediaproxy/views.py +++ b/mediaproxy/views.py @@ -1,6 +1,5 @@ import httpx from django.conf import settings -from django.core.cache import caches from django.http import Http404, HttpResponse from django.shortcuts import get_object_or_404 from django.views.generic import View @@ -9,46 +8,31 @@ from activities.models import Emoji, PostAttachment from users.models import Identity -class BaseCacheView(View): +class BaseProxyView(View): """ - Base class for caching remote content. + Base class for proxying remote content. 
""" - cache_name = "media" - item_timeout: int | None = None - def get(self, request, **kwargs): self.kwargs = kwargs remote_url = self.get_remote_url() - cache = caches[self.cache_name] - cache_key = "proxy_" + remote_url - # See if it's already cached - cached_content = cache.get(cache_key) - if not cached_content: - # OK, fetch and cache it - try: - remote_response = httpx.get( - remote_url, - headers={"User-Agent": settings.TAKAHE_USER_AGENT}, - follow_redirects=True, - timeout=settings.SETUP.REMOTE_TIMEOUT, - ) - except httpx.RequestError: - return HttpResponse(status=502) - if remote_response.status_code >= 400: - return HttpResponse(status=502) - # We got it - shove it into the cache - cached_content = { - "content": remote_response.content, - "mimetype": remote_response.headers.get( + try: + remote_response = httpx.get( + remote_url, + headers={"User-Agent": settings.TAKAHE_USER_AGENT}, + follow_redirects=True, + timeout=settings.SETUP.REMOTE_TIMEOUT, + ) + except httpx.RequestError: + return HttpResponse(status=502) + if remote_response.status_code >= 400: + return HttpResponse(status=502) + return HttpResponse( + remote_response.content, + headers={ + "Content-Type": remote_response.headers.get( "Content-Type", "application/octet-stream" ), - } - cache.set(cache_key, cached_content, timeout=self.item_timeout) - return HttpResponse( - cached_content["content"], - headers={ - "Content-Type": cached_content["mimetype"], "Cache-Control": "public, max-age=3600", }, ) @@ -57,13 +41,11 @@ class BaseCacheView(View): raise NotImplementedError() -class EmojiCacheView(BaseCacheView): +class EmojiCacheView(BaseProxyView): """ - Caches Emoji + Proxies Emoji """ - item_timeout = 86400 * 7 # One week - def get_remote_url(self): self.emoji = get_object_or_404(Emoji, pk=self.kwargs["emoji_id"]) @@ -72,14 +54,11 @@ class EmojiCacheView(BaseCacheView): return self.emoji.remote_url -class IdentityIconCacheView(BaseCacheView): +class IdentityIconCacheView(BaseProxyView): """ 
- Caches identity icons (avatars) + Proxies identity icons (avatars) """ - cache_name = "avatars" - item_timeout = 86400 * 7 # One week - def get_remote_url(self): self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"]) if self.identity.local or not self.identity.icon_uri: @@ -87,13 +66,11 @@ class IdentityIconCacheView(BaseCacheView): return self.identity.icon_uri -class IdentityImageCacheView(BaseCacheView): +class IdentityImageCacheView(BaseProxyView): """ - Caches identity profile header images + Proxies identity profile header images """ - item_timeout = 86400 * 7 # One week - def get_remote_url(self): self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"]) if self.identity.local or not self.identity.image_uri: @@ -101,13 +78,11 @@ class IdentityImageCacheView(BaseCacheView): return self.identity.image_uri -class PostAttachmentCacheView(BaseCacheView): +class PostAttachmentCacheView(BaseProxyView): """ - Caches post media (images only, videos should always be offloaded to remote) + Proxies post media (images only, videos should always be offloaded to remote) """ - item_timeout = 86400 * 7 # One week - def get_remote_url(self): self.post_attachment = get_object_or_404( PostAttachment, pk=self.kwargs["attachment_id"] diff --git a/takahe/settings.py b/takahe/settings.py index 19c2e5d..ef28512 100644 --- a/takahe/settings.py +++ b/takahe/settings.py @@ -127,12 +127,6 @@ class Settings(BaseSettings): #: Default cache backend CACHES_DEFAULT: CacheBackendUrl | None = None - #: User icon (avatar) caching backend - CACHES_AVATARS: CacheBackendUrl | None = None - - #: Media caching backend - CACHES_MEDIA: CacheBackendUrl | None = None - PGHOST: str | None = None PGPORT: int | None = 5432 PGNAME: str = "takahe" @@ -385,8 +379,6 @@ if SETUP.MEDIA_BACKEND: CACHES = { "default": django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://"), - "avatars": django_cache_url.parse(SETUP.CACHES_AVATARS or "dummy://"), - "media": 
django_cache_url.parse(SETUP.CACHES_MEDIA or "dummy://"), } if SETUP.ERROR_EMAILS: