"""JSON-LD helpers: context caching, expansion, dereferencing and a DRF
serializer base class that feeds simplified JSON-LD payloads to its fields.
"""
import asyncio
import functools
import logging

import aiohttp
import pyld.documentloader.requests
import pyld.jsonld
from django.conf import settings
from rest_framework import serializers
from rest_framework.fields import empty

from . import contexts

logger = logging.getLogger(__name__)

# Private sentinel meaning "no value could be extracted for this field".
_MISSING = object()


def cached_contexts(loader):
    """
    Wrap a pyld document loader so that the contexts listed in
    ``contexts.CONTEXTS`` are served locally instead of being fetched.

    Any URL that does not match a known context is delegated to ``loader``.
    """

    @functools.wraps(loader)
    def load(url, *args, **kwargs):
        for cached in contexts.CONTEXTS:
            if url == cached["documentUrl"]:
                return cached
            if cached["shortId"] == "LITEPUB" and "/schemas/litepub-" in url:
                # XXX UGLY fix for pleroma because they host their schema
                # under each instance domain, which makes caching harder
                return cached
        return loader(url, *args, **kwargs)

    return load


def get_document_loader():
    """
    Return a requests-based pyld document loader wrapped with the local
    context cache. SSL verification follows
    ``settings.EXTERNAL_REQUESTS_VERIFY_SSL``.
    """
    loader = pyld.documentloader.requests.requests_document_loader(
        verify=settings.EXTERNAL_REQUESTS_VERIFY_SSL
    )
    return cached_contexts(loader)


def expand(doc, options=None, default_contexts=("AS", "FW", "SEC")):
    """
    Expand a JSON-LD document with pyld and return the first expanded node.

    ``doc`` may be a document or a string; a string is passed to the
    document loader and the loaded document is expanded instead. Each
    context named in ``default_contexts`` (keys of
    ``contexts.CONTEXTS_BY_ID``) is inserted into the document first, which
    mutates ``doc``.

    Raises ``ValueError`` when pyld returns an empty expansion.
    """
    options = options or {}
    options.setdefault("documentLoader", get_document_loader())
    if isinstance(doc, str):
        doc = options["documentLoader"](doc)["document"]

    for context_name in default_contexts:
        ctx = contexts.CONTEXTS_BY_ID[context_name]["documentUrl"]
        try:
            insert_context(ctx, doc)
        except KeyError:
            # probably an already expanded document
            pass

    # XXX This is a hotfix for a bug in pyld. JSON-LD allows empty dicts or
    # lists as part of the context, but this makes pyld fail to parse the
    # context the right way. So we remove all empty items from the contexts.
    try:
        active_contexts = doc["@context"]
    except KeyError:
        # Nothing to do here if no context is available at all
        pass
    else:
        if isinstance(active_contexts, list):
            # Filter in a single pass (in place) rather than removing while
            # iterating, which would skip consecutive empty entries. Only
            # empty containers/strings are dropped; other values such as
            # None are left untouched.
            active_contexts[:] = [
                item
                for item in active_contexts
                if not (isinstance(item, (str, list, dict)) and len(item) == 0)
            ]

    result = pyld.jsonld.expand(doc, options=options)
    try:
        # jsonld.expand returns a list, which is useless for us
        return result[0]
    except IndexError as err:
        raise ValueError("Impossible to expand this jsonld document") from err


def insert_context(ctx, doc):
    """
    In some situations, we may want to add a default context to an existing
    document. This function enables that (this will mutate the original
    document).

    Raises ``KeyError`` when ``doc`` has no ``@context`` entry. Returns
    ``doc``.
    """
    existing = doc["@context"]
    if isinstance(existing, list):
        if ctx not in existing:
            # work on a copy so the original list object is left untouched
            existing = existing[:]
            existing.append(ctx)
            doc["@context"] = existing
    else:
        doc["@context"] = [existing, ctx]
    return doc


def get_session():
    """Return an aiohttp client session that raises on HTTP error statuses."""
    return aiohttp.ClientSession(raise_for_status=True)


async def fetch_json(url, session, cache=None, lock=None):
    """
    GET ``url`` with ``session`` and return a ``(url, decoded_json)`` tuple.

    ``cache`` and ``lock`` are accepted but not used by this function.
    """
    async with session.get(url) as response:
        response.raise_for_status()
        return url, await response.json()


async def fetch_many(*ids, references=None):
    """
    Given a list of object ids, will fetch the remote JSON representations
    for those objects and return a dictionary with id as the key and the
    fetched payload as the value.

    When ``references`` is given, it is updated in place and returned; ids
    already present in it are not fetched again.
    """
    ids = set(ids)
    results = references if references is not None else {}

    if not ids:
        return results

    async with get_session() as session:
        tasks = [fetch_json(url, session) for url in ids if url not in results]
        tasks_results = await asyncio.gather(*tasks)

    for url, payload in tasks_results:
        results[url] = payload

    return results


DEFAULT_PREPARE_CONFIG = {
    "type": {"property": "@type", "keep": "first"},
    "id": {"property": "@id"},
}


def dereference(value, references):
    """
    Given a payload and a dictionary containing ids and objects, will replace
    all the matching objects in the payload by the one in the references
    dictionary.

    The payload is modified in place and also returned. A dict carrying an
    ``@id`` is replaced as a whole (or left alone when its id is unknown);
    only dicts without ``@id`` and lists are traversed further.
    """

    def replace(obj, obj_id):
        try:
            matching = references[obj_id]
        except KeyError:
            return
        # we clear the current dict, and replace its content by the matching obj
        obj.clear()
        obj.update(matching)

    if isinstance(value, dict):
        if "@id" in value:
            replace(value, value["@id"])
        else:
            for attr in value.values():
                dereference(attr, references)
    elif isinstance(value, list):
        # we loop on nested objects and trigger dereferencing
        for obj in value:
            dereference(obj, references)

    return value


def get_value(value, keep=None, attr=None):
    """
    Simplify a raw value taken from an expanded payload.

    With ``keep == "first"``, return the first item (or its ``attr`` entry
    when ``attr`` is set). Otherwise, when ``attr`` is set, return the list
    of ``attr`` entries of the items that have one. With neither option the
    value is returned unchanged.
    """
    if keep == "first":
        value = value[0]
        if attr:
            value = value[attr]
    elif attr:
        value = [obj[attr] for obj in value if attr in obj]

    return value


def _resolve_field(payload, field_config):
    """
    Extract a field from ``payload`` using its main property first, then each
    alias in order; return ``_MISSING`` when none of them can be read.
    """
    candidates = [field_config, *(field_config.get("aliases") or ())]
    for candidate in candidates:
        try:
            return get_value(
                payload[candidate["property"]],
                keep=candidate.get("keep"),
                attr=candidate.get("attr"),
            )
        except (IndexError, KeyError):
            continue
    return _MISSING


def prepare_for_serializer(payload, config, fallbacks=None):
    """
    Json-ld payloads, as returned by expand, are quite complex to handle,
    because every attr is basically a list of dictionaries. To make code
    simpler, we use this function to clean the payload a little bit, based
    on the config object.

    Config is a dictionary, with keys being serializer field names, and
    values being dictionaries describing how to handle this field
    (``property``, and optionally ``keep``, ``attr`` and ``aliases``).
    ``DEFAULT_PREPARE_CONFIG`` entries are applied unless overridden.

    ``fallbacks`` maps a field name to a list of other field names whose
    value is copied over when the field itself could not be extracted.
    """
    fallbacks = {} if fallbacks is None else fallbacks
    final_config = {**DEFAULT_PREPARE_CONFIG, **config}

    final_payload = {}
    for field, field_config in final_config.items():
        value = _resolve_field(payload, field_config)
        if value is not _MISSING:
            final_payload[field] = value

    for key, choices in fallbacks.items():
        if key in final_payload:
            # initial attr was found, no need to rely on fallbacks
            continue

        # NOTE(review): there is no early exit here, so when several choices
        # are available the last one wins; confirm this is the intended
        # priority order.
        for choice in choices:
            if choice in final_payload:
                final_payload[key] = final_payload[choice]

    return final_payload


def get_ids(v):
    """
    Yield the ``@id`` of ``v`` when it is a dict that has one, or recursively
    the ids found in each item when ``v`` is a list.
    """
    if isinstance(v, dict) and "@id" in v:
        yield v["@id"]

    if isinstance(v, list):
        for obj in v:
            yield from get_ids(obj)


def get_default_context():
    """
    Return the default ``@context`` list.

    NOTE(review): the three leading entries are empty strings in this file;
    presumably context URLs are expected here, to be confirmed.
    """
    return [
        "",
        "",
        "",
        {
            "manuallyApprovesFollowers": "as:manuallyApprovesFollowers",
            "Hashtag": "as:Hashtag",
        },
    ]


class JsonLdSerializer(serializers.Serializer):
    """
    Serializer base class that expands and simplifies incoming JSON-LD data
    before running the regular validation.

    Subclasses may declare ``Meta.jsonld_mapping`` (config passed to
    ``prepare_for_serializer``) and ``Meta.jsonld_fallbacks``.
    """

    def __init__(self, *args, **kwargs):
        # Whether to expand incoming data; can be overridden per call through
        # the "expand" key of the serializer context.
        self.jsonld_expand = kwargs.pop("jsonld_expand", True)
        super().__init__(*args, **kwargs)
        self.jsonld_context = []

    def run_validation(self, data=empty):
        """
        Expand, simplify and dereference ``data`` (when provided), then
        delegate to the standard serializer validation.

        Raises ``serializers.ValidationError`` when expansion fails.
        """
        if data and data is not empty:
            self.jsonld_context = data.get("@context", [])
            if self.context.get("expand", self.jsonld_expand):
                try:
                    data = expand(data)
                except ValueError as e:
                    raise serializers.ValidationError(
                        f"{data} is not a valid jsonld document: {e}"
                    ) from e
            try:
                config = self.Meta.jsonld_mapping
            except AttributeError:
                config = {}
            try:
                fallbacks = self.Meta.jsonld_fallbacks
            except AttributeError:
                fallbacks = {}

            data = prepare_for_serializer(data, config, fallbacks=fallbacks)
            dereferenced_fields = [
                k
                for k, c in config.items()
                if k in data and c.get("dereference", False)
            ]
            dereferenced_ids = set()
            for field in dereferenced_fields:
                dereferenced_ids.update(get_ids(data[field]))

            if dereferenced_ids:
                owns_loop = False
                try:
                    loop = asyncio.get_event_loop()
                except RuntimeError as exception:
                    logger.debug(exception)
                    loop = asyncio.new_event_loop()
                    owns_loop = True
                references = self.context.setdefault("references", {})
                try:
                    loop.run_until_complete(
                        fetch_many(*dereferenced_ids, references=references)
                    )
                finally:
                    # Only close a loop we created ourselves, so it does not
                    # leak; a pre-existing loop belongs to someone else.
                    if owns_loop:
                        loop.close()
                data = dereference(data, references)
        return super().run_validation(data)


def first_attr(property, attr, aliases=None):
    """
    Build a field config keeping ``attr`` of the first item of ``property``.

    A fresh alias list is created when none is given, so configs never share
    a default list.
    """
    aliases = [] if aliases is None else aliases
    return {"property": property, "keep": "first", "attr": attr, "aliases": aliases}


def first_val(property, aliases=None):
    """Build a field config keeping the ``@value`` of the first item."""
    return first_attr(property, "@value", aliases=aliases)


def first_id(property, aliases=None):
    """Build a field config keeping the ``@id`` of the first item."""
    return first_attr(property, "@id", aliases=aliases)


def first_obj(property, aliases=None):
    """Build a field config keeping the whole first item of ``property``."""
    aliases = [] if aliases is None else aliases
    return {"property": property, "keep": "first", "aliases": aliases}


def raw(property, aliases=None):
    """Build a field config returning the value of ``property`` unchanged."""
    aliases = [] if aliases is None else aliases
    return {"property": property, "aliases": aliases}


def is_present_recursive(data, key):
    """
    Return True when ``key`` equals ``data`` or is found while walking
    ``data`` recursively.

    Lists are walked item by item. Iterating a dict yields its keys, so dict
    keys are compared but dict values are not visited.
    NOTE(review): confirm that skipping dict values is intended.
    """
    if isinstance(data, (dict, list)):
        return any(is_present_recursive(v, key) for v in data)
    return bool(data == key)