funkwhale/api/funkwhale_api/federation/jsonld.py

import aiohttp
import asyncio
import functools

import pyld.jsonld
from django.conf import settings
import pyld.documentloader.requests
from rest_framework import serializers
from rest_framework.fields import empty
from . import contexts


def cached_contexts(loader):
    functools.wraps(loader)

    def load(url, *args, **kwargs):
        for cached in contexts.CONTEXTS:
            if url == cached["documentUrl"]:
                return cached
            if cached["shortId"] == "LITEPUB" and "/schemas/litepub-" in url:
                # XXX UGLY fix for pleroma because they host their schema
                # under each instance domain, which makes caching harder
                return cached
        return loader(url, *args, **kwargs)

    return load


def get_document_loader():
    loader = pyld.documentloader.requests.requests_document_loader(
        verify=settings.EXTERNAL_REQUESTS_VERIFY_SSL
    )
    return cached_contexts(loader)


def expand(doc, options=None, default_contexts=["AS", "FW", "SEC"]):
    options = options or {}
    options.setdefault("documentLoader", get_document_loader())
    if isinstance(doc, str):
        doc = options["documentLoader"](doc)["document"]
    for context_name in default_contexts:
        ctx = contexts.CONTEXTS_BY_ID[context_name]["documentUrl"]
        try:
            insert_context(ctx, doc)
        except KeyError:
            # probably an already expanded document
            pass

    # XXX This is a hotfix for a bug in pyld. The JSON-LD allows empty dicts or lists as part of the
    # context, but this makes pyld failing to parse the context the right way. So we remove all
    # empty items from the contexts
    try:
        for active_ctx in doc["@context"]:
            if len(active_ctx) == 0:
                doc["@context"].remove(active_ctx)
    except KeyError:
        # Nothing to do here if no context is available at all
        pass

    result = pyld.jsonld.expand(doc, options=options)
    try:
        # jsonld.expand returns a list, which is useless for us
        return result[0]
    except IndexError:
        raise ValueError("Impossible to expand this jsonld document")


def insert_context(ctx, doc):
    """
    In some situations, we may want to add a default context to an existing document.
    This function enable that (this will mutate the original document)
    """
    existing = doc["@context"]
    if isinstance(existing, list):
        if ctx not in existing:
            existing = existing[:]
            existing.append(ctx)
            doc["@context"] = existing
    else:
        doc["@context"] = [existing, ctx]
    return doc


def get_session():
    return aiohttp.ClientSession(raise_for_status=True)


async def fetch_json(url, session, cache=None, lock=None):
    async with session.get(url) as response:
        response.raise_for_status()
        return url, await response.json()


async def fetch_many(*ids, references=None):
    """
    Given a list of object ids, will fetch the remote
    representations for those objects, expand them
    and return a dictionnary with id as the key and expanded document as the values
    """
    ids = set(ids)
    results = references if references is not None else {}

    if not ids:
        return results

    async with get_session() as session:
        tasks = [fetch_json(url, session) for url in ids if url not in results]
        tasks_results = await asyncio.gather(*tasks)

    for url, payload in tasks_results:
        results[url] = payload

    return results


DEFAULT_PREPARE_CONFIG = {
    "type": {"property": "@type", "keep": "first"},
    "id": {"property": "@id"},
}


def dereference(value, references):
    """
    Given a payload and a dictonary containing ids and objects, will replace
    all the matching objects in the payload by the one in the references dictionary.
    """

    def replace(obj, id):
        try:
            matching = references[id]
        except KeyError:
            return
        # we clear the current dict, and replace its content by the matching obj
        obj.clear()
        obj.update(matching)

    if isinstance(value, dict):
        if "@id" in value:
            replace(value, value["@id"])
        else:
            for attr in value.values():
                dereference(attr, references)

    elif isinstance(value, list):
        # we loop on nested objects and trigger dereferencing
        for obj in value:
            dereference(obj, references)

    return value


def get_value(value, keep=None, attr=None):

    if keep == "first":
        value = value[0]
        if attr:
            value = value[attr]

    elif attr:
        value = [obj[attr] for obj in value if attr in obj]

    return value


def prepare_for_serializer(payload, config, fallbacks={}):
    """
    Json-ld payloads, as returned by expand are quite complex to handle, because
    every attr is basically a list of dictionnaries. To make code simpler,
    we use this function to clean the payload a little bit, base on the config object.

    Config is a dictionnary, with keys being serializer field names, and values
    being dictionaries describing how to handle this field.
    """
    final_payload = {}
    final_config = {}
    final_config.update(DEFAULT_PREPARE_CONFIG)
    final_config.update(config)
    for field, field_config in final_config.items():
        try:
            value = get_value(
                payload[field_config["property"]],
                keep=field_config.get("keep"),
                attr=field_config.get("attr"),
            )
        except (IndexError, KeyError):
            aliases = field_config.get("aliases", {})
            noop = object()
            value = noop
            if not aliases:
                continue

            for a in aliases:
                try:
                    value = get_value(
                        payload[a["property"]],
                        keep=a.get("keep"),
                        attr=a.get("attr"),
                    )
                except (IndexError, KeyError):
                    continue

                break

            if value is noop:
                continue

        final_payload[field] = value

    for key, choices in fallbacks.items():
        if key in final_payload:
            # initial attr was found, no need to rely on fallbacks
            continue

        for choice in choices:
            if choice not in final_payload:
                continue

            final_payload[key] = final_payload[choice]

    return final_payload


def get_ids(v):
    if isinstance(v, dict) and "@id" in v:
        yield v["@id"]

    if isinstance(v, list):
        for obj in v:
            yield from get_ids(obj)


def get_default_context():
    return [
        "https://www.w3.org/ns/activitystreams",
        "https://w3id.org/security/v1",
        "https://funkwhale.audio/ns",
        {
            "manuallyApprovesFollowers": "as:manuallyApprovesFollowers",
            "Hashtag": "as:Hashtag",
        },
    ]


class JsonLdSerializer(serializers.Serializer):
    def __init__(self, *args, **kwargs):
        self.jsonld_expand = kwargs.pop("jsonld_expand", True)
        super().__init__(*args, **kwargs)
        self.jsonld_context = []

    def run_validation(self, data=empty):
        if data and data is not empty:

            self.jsonld_context = data.get("@context", [])
            if self.context.get("expand", self.jsonld_expand):
                try:
                    data = expand(data)
                except ValueError as e:
                    raise serializers.ValidationError(
                        "{} is not a valid jsonld document: {}".format(data, e)
                    )
            try:
                config = self.Meta.jsonld_mapping
            except AttributeError:
                config = {}
            try:
                fallbacks = self.Meta.jsonld_fallbacks
            except AttributeError:
                fallbacks = {}

            data = prepare_for_serializer(data, config, fallbacks=fallbacks)
            dereferenced_fields = [
                k
                for k, c in config.items()
                if k in data and c.get("dereference", False)
            ]
            dereferenced_ids = set()
            for field in dereferenced_fields:
                for i in get_ids(data[field]):
                    dereferenced_ids.add(i)

            if dereferenced_ids:
                try:
                    loop = asyncio.get_event_loop()
                except RuntimeError:
                    loop = asyncio.new_event_loop()
                references = self.context.setdefault("references", {})
                loop.run_until_complete(
                    fetch_many(*dereferenced_ids, references=references)
                )
                data = dereference(data, references)
        return super().run_validation(data)


def first_attr(property, attr, aliases=[]):
    return {"property": property, "keep": "first", "attr": attr, "aliases": aliases}


def first_val(property, aliases=[]):
    return first_attr(property, "@value", aliases=aliases)


def first_id(property, aliases=[]):
    return first_attr(property, "@id", aliases=aliases)


def first_obj(property, aliases=[]):
    return {"property": property, "keep": "first", "aliases": aliases}


def raw(property, aliases=[]):
    return {"property": property, "aliases": aliases}


def is_present_recursive(data, key):
    if isinstance(data, (dict, list)):
        for v in data:
            if is_present_recursive(v, key):
                return True
    else:
        if data == key:
            return True

    return False