diff --git a/core/views.py b/core/views.py
index d8fddd4..c782728 100644
--- a/core/views.py
+++ b/core/views.py
@@ -2,6 +2,7 @@ import json
 from typing import ClassVar
 
 import markdown_it
+from django.conf import settings
 from django.http import HttpResponse
 from django.shortcuts import redirect
 from django.templatetags.static import static
@@ -69,6 +70,23 @@ class StaticContentView(View):
         raise NotImplementedError()
 
 
+@method_decorator(cache_page(60 * 60), name="dispatch")
+class RobotsTxt(TemplateView):
+    """
+    Serves the robots.txt for Takahē
+
+    To specify additional user-agents to disallow, use TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS
+    """
+
+    template_name = "robots.txt"
+    content_type = "text/plain"
+
+    def get_context_data(self):
+        return {
+            "user_agents": getattr(settings, "ROBOTS_TXT_DISALLOWED_USER_AGENTS", []),
+        }
+
+
 @method_decorator(cache_control(max_age=60 * 15), name="dispatch")
 class AppManifest(StaticContentView):
     """
diff --git a/docker/nginx.conf.d/default.conf.tpl b/docker/nginx.conf.d/default.conf.tpl
index 3bcb2b8..36b73d0 100644
--- a/docker/nginx.conf.d/default.conf.tpl
+++ b/docker/nginx.conf.d/default.conf.tpl
@@ -29,11 +29,6 @@ server {
     proxy_hide_header X-Takahe-User;
     proxy_hide_header X-Takahe-Identity;
 
-    # Serve robots.txt from the non-collected dir as a special case. 
-    location /robots.txt {
-        alias /takahe/static/robots.txt;
-    }
-
     # Serves static files from the collected dir
     location /static/ {
         # Files in static have cache-busting hashes in the name, thus can be cached forever
diff --git a/takahe/settings.py b/takahe/settings.py
index 3b044ea..efd7861 100644
--- a/takahe/settings.py
+++ b/takahe/settings.py
@@ -105,6 +105,10 @@ class Settings(BaseSettings):
     AUTO_ADMIN_EMAIL: EmailStr | None = None
     ERROR_EMAILS: list[EmailStr] | None = None
 
+    #: If set, a list of user agents to completely disallow in robots.txt
+    #: List formatting must be a valid JSON list, such as `["Agent1", "Agent2"]`
+    ROBOTS_TXT_DISALLOWED_USER_AGENTS: list[str] = Field(default_factory=list)
+
     MEDIA_URL: str = "/media/"
     MEDIA_ROOT: str = str(BASE_DIR / "media")
     MEDIA_BACKEND: MediaBackendUrl | None = None
@@ -313,6 +317,8 @@
 STATOR_TOKEN = SETUP.STATOR_TOKEN
 STATOR_CONCURRENCY = SETUP.STATOR_CONCURRENCY
 STATOR_CONCURRENCY_PER_MODEL = SETUP.STATOR_CONCURRENCY_PER_MODEL
 
+ROBOTS_TXT_DISALLOWED_USER_AGENTS = SETUP.ROBOTS_TXT_DISALLOWED_USER_AGENTS
+
 CORS_ORIGIN_ALLOW_ALL = True  # Temporary
 CORS_ORIGIN_WHITELIST = SETUP.CORS_HOSTS
 CORS_ALLOW_CREDENTIALS = True
diff --git a/takahe/urls.py b/takahe/urls.py
index 97bb8d4..d437308 100644
--- a/takahe/urls.py
+++ b/takahe/urls.py
@@ -19,6 +19,7 @@ from users.views import (
 
 urlpatterns = [
     path("", core.homepage),
+    path("robots.txt", core.RobotsTxt.as_view()),
     path("manifest.json", core.AppManifest.as_view()),
     # Activity views
     path("notifications/", timelines.Notifications.as_view(), name="notifications"),
diff --git a/templates/robots.txt b/templates/robots.txt
new file mode 100644
index 0000000..3ea9112
--- /dev/null
+++ b/templates/robots.txt
@@ -0,0 +1,13 @@
+User-Agent: *
+
+# Don't allow any bot to crawl tags.
+Disallow: /tags/
+Disallow: /tags/*
+
+# Don't allow bots to crawl through the proxy
+Disallow: /proxy/*
+
+{% for user_agent in user_agents %}
+User-agent: {{user_agent}}
+Disallow: /
+{% endfor %}