From dad652a264aaed8fd4f13665fbb774334a45b0da Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Thu, 22 Dec 2022 14:18:48 +0000 Subject: [PATCH] rework requests caching. set a 2-hour expiration on cache entries except for jsonld term definitions. try to get remote profiles from the client app before fetching. --- federation/entities/activitypub/models.py | 43 ++++++++--------------- federation/fetchers.py | 3 +- federation/utils/activitypub.py | 6 ++-- federation/utils/diaspora.py | 5 +-- federation/utils/django.py | 9 +++++ federation/utils/network.py | 17 +++++---- 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 108b630..9151965 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -21,28 +21,12 @@ from federation.entities.utils import get_base_attributes, get_profile from federation.outbound import handle_send from federation.types import UserType, ReceiverVariant from federation.utils.activitypub import retrieve_and_parse_document, retrieve_and_parse_profile, get_profile_id_from_webfinger +from federation.utils.django import get_requests_cache_backend from federation.utils.text import with_slash, validate_handle import federation.entities.base as base logger = logging.getLogger("federation") -# Make django federation parameters globally available -# if possible -try: - from federation.utils.django import get_configuration - django_params = get_configuration() -except ImportError: - django_params = {} - -# try to obtain redis config from django and use as -# requests_cache backend if available -if django_params.get('redis'): - backend = rc.RedisCache(namespace='fed_cache', **django_params['redis']) -else: - backend = rc.SQLiteCache(db_path='fed_cache') -logger.info('Using %s for requests_cache', type(backend)) - - # This is required to workaround a bug in pyld that has the Accept header # accept other 
content types. From what I understand, precedence handling # is broken @@ -52,7 +36,7 @@ def get_loader(*args, **kwargs): def loader(url, options={}): options['headers']['Accept'] = 'application/ld+json' - with rc.enabled(cache_name='fed_cache', backend=backend): + with rc.enabled(cache_name='ld_cache', backend=get_requests_cache_backend('ld_cache')): return requests_loader(url, options) return loader @@ -63,8 +47,7 @@ jsonld.set_document_loader(get_loader()) def get_profile_or_entity(fid): obj = get_profile(fid=fid) if not obj: - with rc.enabled(cache_name='fed_cache', backend=backend): - obj = retrieve_and_parse_document(fid) + obj = retrieve_and_parse_document(fid) return obj @@ -606,6 +589,7 @@ class Person(Object, base.Profile): capabilities = CompactedDict(litepub.capabilities) suspended = fields.Boolean(toot.suspended) public = True + finger = None _cached_inboxes = None _cached_public_key = None _cached_image_urls = None @@ -624,15 +608,18 @@ class Person(Object, base.Profile): super().__init__(*args, **kwargs) self._allowed_children += (PropertyValue, IdentityProof) - # Set handle to username@host if not provided by the platform + # Set finger to username@host if not provided by the platform def post_receive(self): - if not self.finger: + profile = get_profile(fid=self.id) + if getattr(profile, 'finger', None): + self.finger = profile.finger + else: domain = urlparse(self.id).netloc finger = f'{self.username.lower()}@{domain}' - with rc.enabled(cache_name='fed_cache', backend=backend): - if get_profile_id_from_webfinger(finger) == self.id: - self.finger = finger - if self.guid and not self.handle: + if get_profile_id_from_webfinger(finger) == self.id: + self.finger = finger + # multi-protocol platform + if self.finger and self.guid and not self.handle: self.handle = self.finger def to_as2(self): @@ -1269,8 +1256,8 @@ def extract_receivers(entity): profile = None # don't care about receivers for payloads without an actor_id if getattr(entity, 'actor_id'): - 
with rc.enabled(cache_name='fed_cache', backend=backend): - profile = retrieve_and_parse_profile(entity.actor_id) + profile = get_profile(fid=entity.actor_id) + if not profile: profile = retrieve_and_parse_profile(entity.actor_id) if not profile: return receivers for attr in ("to", "cc"): diff --git a/federation/fetchers.py b/federation/fetchers.py index b4c2ed8..cf3229f 100644 --- a/federation/fetchers.py +++ b/federation/fetchers.py @@ -28,7 +28,8 @@ def retrieve_remote_content( protocol_name = identify_protocol_by_id(id).PROTOCOL_NAME utils = importlib.import_module("federation.utils.%s" % protocol_name) return utils.retrieve_and_parse_content( - id=id, guid=guid, handle=handle, entity_type=entity_type, sender_key_fetcher=sender_key_fetcher, + id=id, guid=guid, handle=handle, entity_type=entity_type, + cache=cache, sender_key_fetcher=sender_key_fetcher, ) diff --git a/federation/utils/activitypub.py b/federation/utils/activitypub.py index 33cf74a..114cce0 100644 --- a/federation/utils/activitypub.py +++ b/federation/utils/activitypub.py @@ -34,15 +34,15 @@ def get_profile_id_from_webfinger(handle: str) -> Optional[str]: def retrieve_and_parse_content(**kwargs) -> Optional[Any]: - return retrieve_and_parse_document(kwargs.get("id")) + return retrieve_and_parse_document(kwargs.get("id"), cache=kwargs.get('cache',True)) -def retrieve_and_parse_document(fid: str) -> Optional[Any]: +def retrieve_and_parse_document(fid: str, cache: bool=True) -> Optional[Any]: """ Retrieve remote document by ID and return the entity. 
""" from federation.entities.activitypub.models import element_to_objects # Circulars - document, status_code, ex = fetch_document(fid, extra_headers={'accept': 'application/activity+json'}, + document, status_code, ex = fetch_document(fid, extra_headers={'accept': 'application/activity+json'}, cache=cache, auth=get_http_authentication(federation_user.rsa_private_key,f'{federation_user.id}#main-key') if federation_user else None) if document: try: diff --git a/federation/utils/diaspora.py b/federation/utils/diaspora.py index c81ef5c..e56e00e 100644 --- a/federation/utils/diaspora.py +++ b/federation/utils/diaspora.py @@ -162,7 +162,8 @@ def parse_profile_from_hcard(hcard: str, handle: str): def retrieve_and_parse_content( - id: str, guid: str, handle: str, entity_type: str, sender_key_fetcher: Callable[[str], str]=None): + id: str, guid: str, handle: str, entity_type: str, cache: bool=True, + sender_key_fetcher: Callable[[str], str]=None): """Retrieve remote content and return an Entity class instance. This is basically the inverse of receiving an entity. Instead, we fetch it, then call "handle_receive". 
@@ -175,7 +176,7 @@ def retrieve_and_parse_content( return _username, domain = handle.split("@") url = get_fetch_content_endpoint(domain, entity_type.lower(), guid) - document, status_code, error = fetch_document(url) + document, status_code, error = fetch_document(url, cache=cache) if status_code == 200: request = RequestType(body=document) _sender, _protocol, entities = handle_receive(request, sender_key_fetcher=sender_key_fetcher) diff --git a/federation/utils/django.py b/federation/utils/django.py index 0d5a128..654d649 100644 --- a/federation/utils/django.py +++ b/federation/utils/django.py @@ -1,4 +1,5 @@ import importlib +from requests_cache import RedisCache, SQLiteCache from django.conf import settings from django.core.exceptions import ImproperlyConfigured @@ -59,3 +60,11 @@ def get_federation_user(): return UserType(id=config['federation_id'], private_key=key) +def get_requests_cache_backend(namespace): + """ + Use RedisCache if available, else fall back to SQLiteCache + """ + config = get_configuration() + if not config.get('redis'): return SQLiteCache() + + return RedisCache(namespace, **config['redis']) diff --git a/federation/utils/network.py b/federation/utils/network.py index 52a630b..765c9b6 100644 --- a/federation/utils/network.py +++ b/federation/utils/network.py @@ -8,30 +8,34 @@ from urllib.parse import quote from uuid import uuid4 import requests +from requests_cache import CachedSession, DO_NOT_CACHE from requests.exceptions import RequestException, HTTPError, SSLError from requests.exceptions import ConnectionError from requests.structures import CaseInsensitiveDict from federation import __version__ +from federation.utils.django import get_requests_cache_backend logger = logging.getLogger("federation") USER_AGENT = "python/federation/%s" % __version__ +session = CachedSession('fed_cache', backend=get_requests_cache_backend('fed_cache')) +EXPIRATION = datetime.timedelta(hours=2) def fetch_content_type(url: str) -> Optional[str]: """ Fetch 
the HEAD of the remote url to determine the content type. """ try: - response = requests.head(url, headers={'user-agent': USER_AGENT}, timeout=10) + response = session.head(url, headers={'user-agent': USER_AGENT}, timeout=10) except RequestException as ex: logger.warning("fetch_content_type - %s when fetching url %s", ex, url) else: return response.headers.get('Content-Type') -def fetch_document(url=None, host=None, path="/", timeout=10, raise_ssl_errors=True, extra_headers=None, **kwargs): +def fetch_document(url=None, host=None, path="/", timeout=10, raise_ssl_errors=True, extra_headers=None, cache=True, **kwargs): """Helper method to fetch remote document. Must be given either the ``url`` or ``host``. @@ -60,7 +64,8 @@ def fetch_document(url=None, host=None, path="/", timeout=10, raise_ssl_errors=T # Use url since it was given logger.debug("fetch_document: trying %s", url) try: - response = requests.get(url, timeout=timeout, headers=headers, **kwargs) + response = session.get(url, timeout=timeout, headers=headers, + expire_after=EXPIRATION if cache else DO_NOT_CACHE, **kwargs) logger.debug("fetch_document: found document, code %s", response.status_code) response.raise_for_status() return response.text, response.status_code, None @@ -73,7 +78,7 @@ def fetch_document(url=None, host=None, path="/", timeout=10, raise_ssl_errors=T url = "https://%s%s" % (host_string, path_string) logger.debug("fetch_document: trying %s", url) try: - response = requests.get(url, timeout=timeout, headers=headers) + response = session.get(url, timeout=timeout, headers=headers) logger.debug("fetch_document: found document, code %s", response.status_code) response.raise_for_status() return response.text, response.status_code, None @@ -85,7 +90,7 @@ def fetch_document(url=None, host=None, path="/", timeout=10, raise_ssl_errors=T url = url.replace("https://", "http://") logger.debug("fetch_document: trying %s", url) try: - response = requests.get(url, timeout=timeout, headers=headers) + 
response = session.get(url, timeout=timeout, headers=headers) logger.debug("fetch_document: found document, code %s", response.status_code) response.raise_for_status() return response.text, response.status_code, None @@ -116,7 +121,7 @@ def fetch_file(url: str, timeout: int = 30, extra_headers: Dict = None) -> str: headers = {'user-agent': USER_AGENT} if extra_headers: headers.update(extra_headers) - response = requests.get(url, timeout=timeout, headers=headers, stream=True) + response = session.get(url, timeout=timeout, headers=headers, stream=True) response.raise_for_status() name = f"/tmp/{str(uuid4())}" with open(name, "wb") as f: