bridgy-fed/common.py

# coding=utf-8
"""Misc common utilities.
"""
import itertools
import logging
import os
import re
import urllib.parse

from flask import render_template, request
from flask.views import View
from granary import as2
from oauth_dropins.webutil import util, webmention
import requests
from webob import exc
from werkzeug.exceptions import abort

import common
from models import Response

# Regex fragments: a bare domain, and an optionally acct:-prefixed user@domain
# address that captures the user and the domain.
DOMAIN_RE = r'([^/:]+\.[^/:]+)'
ACCT_RE = rf'(?:acct:)?([^@]+)@{DOMAIN_RE}'
TLD_BLOCKLIST = (
    '7z', 'asp', 'aspx', 'gif', 'html', 'ico', 'jpg', 'jpeg', 'js',
    'json', 'php', 'png', 'rar', 'txt', 'yaml', 'yml', 'zip',
)
# Headers merged into the outbound HTTP requests made by this module.
HEADERS = {'User-Agent': 'Bridgy Fed (https://fed.brid.gy/)'}
XML_UTF8 = "<?xml version='1.0' encoding='UTF-8'?>\n"
# USERNAME = 'me'
# USERNAME_EMOJI = '🌎'  # globe
# Matches a single HTTP Link header value, capturing the URL and the rel.
LINK_HEADER_RE = re.compile(r""" *< *([^ >]+) *> *; *rel=['"]([^'"]+)['"] *""")
AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'

# Content-Type values, all plain str.
#
# ActivityPub Content-Type details:
# https://www.w3.org/TR/activitypub/#retrieving-objects
CONTENT_TYPE_AS2_LD = 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
CONTENT_TYPE_AS2 = 'application/activity+json'
CONTENT_TYPE_AS1 = 'application/stream+json'
CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
CONTENT_TYPE_ATOM = 'application/atom+xml'
CONTENT_TYPE_MAGIC_ENVELOPE = 'application/magic-envelope+xml'

# Accept headers for content negotiation: AS2 first, optionally falling back
# to HTML at a lower q value.
CONNEG_HEADERS_AS2 = {
    'Accept': f'{CONTENT_TYPE_AS2}; q=0.9, {CONTENT_TYPE_AS2_LD}; q=0.8',
}
CONNEG_HEADERS_AS2_HTML = {
    'Accept': f'{CONNEG_HEADERS_AS2["Accept"]}, {CONTENT_TYPE_HTML}; q=0.7',
}

# AS1 verbs that send_webmentions() accepts.
SUPPORTED_VERBS = (
    'checkin',
    'create',
    'follow',
    'like',
    'post',
    'share',
    'tag',
    'update',
)

PRIMARY_DOMAIN = 'fed.brid.gy'
OTHER_DOMAINS = (
    'bridgy-federated.appspot.com',
    'localhost',
)
DOMAINS = (PRIMARY_DOMAIN, *OTHER_DOMAINS)


# TODO: add to all handlers:
  #   self.response.headers.update({
  #     'Access-Control-Allow-Headers': '*',
  #     'Access-Control-Allow-Methods': '*',
  #     'Access-Control-Allow-Origin': '*',
  #     # see https://content-security-policy.com/
  #     'Content-Security-Policy':
  #       "script-src https: localhost:8080 my.dev.com:8080 'unsafe-inline'; "
  #       "frame-ancestors 'self'; "
  #       "report-uri /csp-report; ",
  #     # 16070400 seconds is 6 months
  #     'Strict-Transport-Security': 'max-age=16070400; preload',
  #     'X-Content-Type-Options': 'nosniff',
  #     # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
  #     'X-Frame-Options': 'SAMEORIGIN',
  #     'X-XSS-Protection': '1; mode=block',
  #   })

  # def options(self, *args, **kwargs):
  #   """Respond to CORS pre-flight OPTIONS requests."""
  #   pass


def not_5xx(resp):
    """Returns True if resp is a response tuple whose status isn't 5xx.

    Handles the tuple shapes returned elsewhere in this file: (body, status)
    from :func:`error` and (body, headers) from
    :meth:`XrdOrJrd.dispatch_request`.

    Args:
      resp: any view return value

    Returns:
      boolean. False for non-tuples and for tuples with fewer than two
      elements.
    """
    if not isinstance(resp, tuple) or len(resp) < 2:
        return False

    status = resp[1]
    if isinstance(status, int):
        return status // 100 != 5
    if isinstance(status, str):
        # status line form, eg '503 Service Unavailable'
        code = status.strip()[:3]
        return not (code.isdigit() and code.startswith('5'))

    # (body, headers): no explicit status, so it isn't a 5xx
    return True


def requests_get(url, **kwargs):
    """Runs util.requests_get through :func:`_requests_fn`.

    Args:
      url: string
      kwargs: passed through to :func:`_requests_fn`

    Returns:
      whatever :func:`_requests_fn` returns
    """
    fetch = util.requests_get
    return _requests_fn(fetch, url, **kwargs)


def requests_post(url, **kwargs):
    """Runs util.requests_post through :func:`_requests_fn`.

    Args:
      url: string
      kwargs: passed through to :func:`_requests_fn`

    Returns:
      whatever :func:`_requests_fn` returns
    """
    send = util.requests_post
    return _requests_fn(send, url, **kwargs)


def _requests_fn(fn, url, parse_json=False, **kwargs):
    """Calls a requests wrapper with our User-Agent and logs the response.

    Args:
      fn: callable, eg util.requests_get. Called as
        fn(url, gateway=True, **kwargs).
      url: string
      parse_json: boolean, whether to decode and return the body as JSON
        instead of returning the response object
      kwargs: passed through to fn. headers, if provided, is copied, never
        modified in place.

    Returns:
      the response object returned by fn, or its decoded JSON body if
      parse_json is True

    Raises:
      webob.exc.HTTPBadGateway: if parse_json is True and the body isn't
        valid JSON
    """
    # Build a fresh dict rather than updating the caller's: callers pass shared
    # module-level constants (eg CONNEG_HEADERS_AS2) that must stay untouched.
    kwargs['headers'] = {**(kwargs.get('headers') or {}), **HEADERS}
    # NOTE(review): gateway=True is presumably what makes fn raise on HTTP
    # errors; confirm against oauth_dropins.webutil.util.
    resp = fn(url, gateway=True, **kwargs)

    logging.info('Got %s headers:%s', resp.status_code, resp.headers)
    # only log textual, non-HTML bodies
    mime = content_type(resp)
    if (mime and mime != 'text/html' and
        (mime.startswith('text/') or mime.endswith(('+json', '/json')))):
        logging.info(resp.text)

    if parse_json:
        try:
            return resp.json()
        except ValueError:
            msg = "Couldn't parse response as JSON"
            logging.info(msg, exc_info=True)
            raise exc.HTTPBadGateway(msg)

    return resp


def get_as2(url):
    """Tries to fetch the given URL as ActivityStreams 2.

    Uses HTTP content negotiation via the Accept header. If the response isn't
    AS2 and its HTML has a rel-alternate or rel-self link with an AS2 content
    type, fetches and returns that link's URL instead.

    Args:
        url: string

    Returns:
        requests.Response

    Raises:
        requests.HTTPError, webob.exc.HTTPException

        If we raise webob HTTPException, it will have an additional response
        attribute with the last requests.Response we received.
    """
    def _error(resp):
        msg = "Couldn't fetch %s as ActivityStreams 2" % url
        logging.warning(msg)
        err = exc.HTTPBadGateway(msg)
        err.response = resp
        raise err

    def _is_as2(resp):
        """Returns True if resp's Content-Type is one of the AS2 media types."""
        base = (content_type(resp) or '').strip()
        if base == CONTENT_TYPE_AS2:
            return True
        # content_type() drops parameters, so it can never equal
        # CONTENT_TYPE_AS2_LD as a whole. Compare the bare media type and look
        # for the activitystreams profile in the raw header separately.
        ld_base, _, ld_param = CONTENT_TYPE_AS2_LD.partition(';')
        profile = ld_param.partition('=')[2].strip().strip('"')
        raw = resp.headers.get('Content-Type') or ''
        return base == ld_base and profile in raw

    resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
    if _is_as2(resp):
        return resp

    parsed = util.parse_html(resp)
    link = parsed.find('link', rel=('alternate', 'self'), type=(
        CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
    # .get() so that a matching <link> without an href attribute is reported
    # as a fetch failure instead of raising KeyError
    href = link.get('href') if link is not None else None
    if not href:
        _error(resp)

    resp = requests_get(urllib.parse.urljoin(resp.url, href),
                        headers=CONNEG_HEADERS_AS2)
    if _is_as2(resp):
        return resp

    _error(resp)


def content_type(resp):
    """Returns a response's Content-Type without parameters, eg charset.

    Whitespace around the media type is stripped, so 'text/html ; charset=x'
    yields 'text/html'.

    Args:
      resp: object with a dict-like headers attribute, eg requests.Response

    Returns:
      string media type, or None if the header is missing or empty
    """
    header = resp.headers.get('Content-Type')
    if header:
        return header.split(';')[0].strip() or None


def get_required_param(name):
    """Returns a request parameter, aborting with HTTP 400 if it's unavailable.

    Looks in the query string first, then in the form body.

    Args:
      name: string, parameter name

    Returns:
      the parameter's value, always truthy
    """
    try:
        value = request.args.get(name)
        if not value:
            value = request.form.get(name)
    except (UnicodeDecodeError, UnicodeEncodeError) as e:
        abort(400, f"Couldn't decode parameters as UTF-8: {e}")

    if value:
        return value

    abort(400, f'Missing required parameter: {name}')


def error(msg, status=None, exc_info=False):
    """Logs an error and builds a (message, status) response tuple for it.

    Args:
      msg: string, response body
      status: HTTP status code. Falsy values default to 400.
      exc_info: passed through to logging.info

    Returns:
      (msg, status) tuple
    """
    status = status or 400
    line = 'Returning %s: %s' % (status, msg)
    logging.info(line, exc_info=exc_info)
    return msg, status


def send_webmentions(activity_wrapped, proxy=None, **response_props):
    """Sends webmentions for an incoming Salmon slap or ActivityPub inbox delivery.

    Targets are collected from the activity's and its object's inReplyTo,
    from mention tags that point at this host, and, for follows, likes, and
    shares, from the object itself. Same-domain targets are skipped. A
    Response is stored for every target we attempt.

    Args:
      activity_wrapped: dict, AS1 activity
      proxy: if truthy, the stored Response's proxy URL is always used as the
        webmention source instead of the original post URL
      response_props: passed through to the newly created Responses

    Returns:
      None if no attempt raised, otherwise a (message, status) tuple from
      :func:`error`. Also returns such a tuple if the verb is unsupported or
      no source or target URL could be found.
    """
    activity = redirect_unwrap(activity_wrapped)

    verb = activity.get('verb')
    if verb and verb not in SUPPORTED_VERBS:
        return error('%s activities are not supported yet.' % verb)

    # extract source and targets
    source = activity.get('url') or activity.get('id')
    obj = activity.get('object')
    obj_url = util.get_url(obj)

    targets = util.get_list(activity, 'inReplyTo')
    if isinstance(obj, dict):
        if not source or verb in ('create', 'post', 'update'):
            source = obj_url or obj.get('id')
        targets.extend(util.get_list(obj, 'inReplyTo'))

    # mention tags are read from the still-wrapped activity so that their URLs
    # can be matched against this host before unwrapping
    tags = util.get_list(activity_wrapped, 'tags')
    obj_wrapped = activity_wrapped.get('object')
    if isinstance(obj_wrapped, dict):
        tags.extend(util.get_list(obj_wrapped, 'tags'))
    for tag in tags:
        if tag.get('objectType') == 'mention':
            url = tag.get('url')
            if url and url.startswith(request.host_url):
                targets.append(redirect_unwrap(url))

    if verb in ('follow', 'like', 'share') and obj_url:
        targets.append(obj_url)

    targets = util.dedupe_urls(util.get_url(t) for t in targets)
    if not source:
        return error("Couldn't find original post URL")
    if not targets:
        return error("Couldn't find any target URLs in inReplyTo, object, or mention tags")

    # send webmentions and store Responses
    errors = []  # (exception, status code or None) pairs
    for target in targets:
        if util.domain_from_link(target) == util.domain_from_link(source):
            logging.info('Skipping same-domain webmention from %s to %s',
                         source, target)
            continue

        response = Response(source=source, target=target, direction='in',
                            **response_props)
        response.put()
        wm_source = (response.proxy_url()
                     if verb in ('follow', 'like', 'share') or proxy
                     else source)
        logging.info('Sending webmention from %s to %s', wm_source, target)

        try:
            endpoint = webmention.discover(target, headers=HEADERS).endpoint
            if endpoint:
                webmention.send(endpoint, wm_source, target, headers=HEADERS)
                response.status = 'complete'
                logging.info('Success!')
        except Exception as e:
            # Exception, not BaseException, so that KeyboardInterrupt and
            # SystemExit still propagate. Keep the interpreted status code so
            # it can be reported below.
            code, _ = util.interpret_http_exception(e)
            logging.warning(f'Failed! {e}')
            errors.append((e, code))
        response.put()

    if errors:
        msg = 'Errors:\n' + '\n'.join(str(e) for e, _ in errors)
        first, code = errors[0]
        status = getattr(first, 'http_status', None)
        if not status and code and str(code).isdigit():
            status = int(code)
        return error(msg, status=status)


def postprocess_as2(activity, target=None, key=None):
    """Prepare an AS2 object to be served or sent via ActivityPub.

    Modifies activity (and any actor and object dicts inside it) in place.

    Args:
      activity: dict, AS2 object or activity
      target: dict, AS2 object, optional. The target of activity's inReplyTo or
        Like/Announce/etc object, if any.
      key: MagicKey, optional. populated into publicKey field if provided.

    Returns:
      dict. Person actors are returned directly. Article and Note objects are
      returned wrapped in a new Create activity. Everything except Person is
      passed through util.trim_nulls.
    """
    obj_type = activity.get('type')

    # actor objects
    if obj_type == 'Person':
        postprocess_as2_actor(activity)
        # key is optional; without one there's nothing to publish
        if key is not None and not activity.get('publicKey'):
            # underspecified, inferred from this issue and Mastodon's implementation:
            # https://github.com/w3c/activitypub/issues/203#issuecomment-297553229
            # https://github.com/tootsuite/mastodon/blob/bc2c263504e584e154384ecc2d804aeb1afb1ba3/app/services/activitypub/process_account_service.rb#L77
            activity.update({
                'publicKey': {
                    'id': activity.get('preferredUsername'),
                    'publicKeyPem': key.public_pem().decode(),
                },
                '@context': (util.get_list(activity, '@context') +
                             ['https://w3id.org/security/v1']),
            })
        return activity

    for actor in (util.get_list(activity, 'attributedTo') +
                  util.get_list(activity, 'actor')):
        # actors may also be bare id/URL strings, which have nothing to process
        if isinstance(actor, dict):
            postprocess_as2_actor(actor)

    # inReplyTo: singly valued, prefer id over url
    target_id = target.get('id') if target else None
    in_reply_to = activity.get('inReplyTo')
    if in_reply_to:
        if target_id:
            activity['inReplyTo'] = target_id
        elif isinstance(in_reply_to, list):
            if len(in_reply_to) > 1:
                logging.warning(
                    "AS2 doesn't support multiple inReplyTo URLs! "
                    'Only using the first: %s' % in_reply_to[0])
            activity['inReplyTo'] = in_reply_to[0]

        # Mastodon evidently requires a Mention tag for replies to generate a
        # notification to the original post's author. not required for likes,
        # reposts, etc. details:
        # https://github.com/snarfed/bridgy-fed/issues/34
        if target:
            for to in (util.get_list(target, 'attributedTo') +
                       util.get_list(target, 'actor')):
                if isinstance(to, dict):
                    to = to.get('url') or to.get('id')
                if to:
                    activity.setdefault('tag', []).append({
                        'type': 'Mention',
                        'href': to,
                    })

    # activity objects (for Like, Announce, etc): prefer id over url
    obj = activity.get('object')
    if obj:
        if isinstance(obj, dict) and not obj.get('id'):
            obj['id'] = target_id or obj.get('url')
        elif target_id and obj != target_id:
            activity['object'] = target_id

    # id is required for most things. default to url if it's not set.
    if not activity.get('id'):
        activity['id'] = activity.get('url')

    # TODO: find a better way to check this, sometimes or always?
    # removed for now since it fires on posts without u-id or u-url, eg
    # https://chrisbeckstrom.com/2018/12/27/32551/
    # assert activity.get('id') or (isinstance(obj, dict) and obj.get('id'))

    activity['id'] = redirect_wrap(activity.get('id'))
    activity['url'] = redirect_wrap(activity.get('url'))

    # copy image(s) into attachment(s). may be Mastodon-specific.
    # https://github.com/snarfed/bridgy-fed/issues/33#issuecomment-440965618
    #
    # get_list so that a single, non-list image or attachment value is handled
    # as one element instead of being iterated over.
    obj_or_activity = obj if isinstance(obj, dict) else activity
    obj_or_activity['attachment'] = (
        util.get_list(obj_or_activity, 'attachment') +
        util.get_list(obj_or_activity, 'image'))

    # cc public and target's author(s) and recipients
    # https://www.w3.org/TR/activitystreams-vocabulary/#audienceTargeting
    # https://w3c.github.io/activitypub/#delivery
    if obj_type in as2.TYPE_TO_VERB or obj_type in ('Article', 'Note'):
        recips = [AS2_PUBLIC_AUDIENCE]
        if target:
            recips += itertools.chain(*(util.get_list(target, field) for field in
                                        ('actor', 'attributedTo', 'to', 'cc')))
        activity['cc'] = util.dedupe_urls(
            util.get_url(recip) or
            (recip.get('id') if isinstance(recip, dict) else None)
            for recip in recips)

    # wrap articles and notes in a Create activity
    if obj_type in ('Article', 'Note'):
        # without an id there's nothing to derive the Create's id from; leave it
        # null so that trim_nulls drops it
        inner_id = activity.get('id')
        activity = {
            '@context': as2.CONTEXT,
            'type': 'Create',
            'id': f'{inner_id}#bridgy-fed-create' if inner_id else None,
            'object': activity,
        }

    return util.trim_nulls(activity)


def postprocess_as2_actor(actor):
    """Prepare an AS2 actor object to be served or sent via ActivityPub.

    Modifies actor in place. If it has a url, its id is set to that URL's
    domain on this host, its url is redirect-wrapped, and preferredUsername
    defaults to the domain.

    Args:
      actor: dict, AS2 actor object
    """
    profile_url = actor.get('url')
    if profile_url:
        host = urllib.parse.urlparse(profile_url).netloc
        actor.setdefault('preferredUsername', host)
        actor.update(id=request.host_url + host,
                     url=redirect_wrap(profile_url))

    # required by pixelfed. https://github.com/snarfed/bridgy-fed/issues/39
    if 'summary' not in actor:
        actor['summary'] = ''


def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Args:
      url: string

    Returns: string, redirect url. Falsy and already wrapped values are
      returned unchanged.
    """
    if url:
        wrapper = urllib.parse.urljoin(request.host_url, '/r/')
        if not url.startswith(wrapper):
            return wrapper + url

    return url


def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    Dicts and lists are unwrapped recursively into new containers. Strings
    that don't start with this host's URL, and values of any other type, are
    returned unchanged.

    Args:
      val: string, dict, or list

    Returns: same type as val, with wrapped URLs replaced by the final URL
      that util.follow_redirects resolves them to
    """
    if isinstance(val, dict):
        return {key: redirect_unwrap(item) for key, item in val.items()}

    if isinstance(val, list):
        return list(map(redirect_unwrap, val))

    if not isinstance(val, str):
        return val

    wrapper = urllib.parse.urljoin(request.host_url, '/r/')
    if val.startswith(wrapper):
        # /r/ENCLOSED-URL: strip our prefix
        enclosed = val[len(wrapper):]
        return util.follow_redirects(enclosed).url

    if val.startswith(request.host_url):
        # any other URL on this host: the path is run through
        # util.domain_from_link and the result is followed instead
        path = urllib.parse.urlparse(val).path.strip('/')
        return util.follow_redirects(util.domain_from_link(path)).url

    return val


class XrdOrJrd(View):
    """Renders and serves an XRD or JRD file.

    The format is chosen from, in order of precedence:

    1. the request path's extension (.jrd or .json for JRD, .xrd or .xml for
       XRD) or the format query parameter (jrd or json, xrd or xml)
    2. the request's Accept header: whichever of jrd/json and xrd/xml appears
       in it, or appears first if both do
    3. DEFAULT_TYPE

    Subclasses must override :meth:`template_prefix()` and
    :meth:`template_vars()`. URL route variables are passed through to
    :meth:`template_vars()` as keyword args.

    Class members:
      DEFAULT_TYPE: either JRD or XRD, which type to return by default if the
        request doesn't ask for one explicitly.
    """
    JRD = 'jrd'
    XRD = 'xrd'
    DEFAULT_TYPE = JRD  # either JRD or XRD

    def template_prefix(self):
        """Returns template filename, without extension."""
        raise NotImplementedError()

    def template_vars(self, **kwargs):
        """Returns a dict with template variables.

        URL route variables are passed through as kwargs.
        """
        raise NotImplementedError()

    def _type(self):
        """Returns XRD or JRD."""
        requested = request.args.get('format', '').lower()
        _, ext = os.path.splitext(request.path)

        if ext in ('.jrd', '.json') or requested in ('jrd', 'json'):
            return self.JRD
        if ext in ('.xrd', '.xml') or requested in ('xrd', 'xml'):
            return self.XRD

        # We don't do full content negotiation (Accept Header parsing); we just
        # check whether jrd/json and xrd/xml are in the header, and if they both
        # are, which one comes first. :/
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation
        accept = request.headers.get('Accept', '').lower()
        candidates = (
            (self.JRD, re.search(r'jrd|json', accept)),
            (self.XRD, re.search(r'xrd|xml', accept)),
        )
        positions = {kind: found.start() for kind, found in candidates if found}
        if positions:
            # the two patterns can't match at the same offset, so no ties
            return min(positions, key=positions.get)

        assert self.DEFAULT_TYPE in (self.JRD, self.XRD)
        return self.DEFAULT_TYPE

    def dispatch_request(self, **kwargs):
        """Serves the template variables as JRD or the rendered XRD template.

        Non-dict return values from :meth:`template_vars()` are returned as is.
        """
        data = self.template_vars(**kwargs)
        if not isinstance(data, dict):
            return data

        # Content-Types are from https://tools.ietf.org/html/rfc7033#section-10.2
        kind = self._type()
        if kind == self.JRD:
            return data, {'Content-Type': 'application/jrd+json'}

        template = f'{self.template_prefix()}.{kind}'
        body = render_template(template, **data)
        return body, {'Content-Type': 'application/xrd+xml; charset=utf-8'}