bridgy-fed/common.py

# coding=utf-8
"""Misc common utilities.
"""
import itertools
import logging
import os
import re
import urllib.parse

from flask import request
from granary import as2, microformats2
import mf2util
from oauth_dropins.webutil import util, webmention
from oauth_dropins.webutil.flask_util import error
from oauth_dropins.webutil.util import json_dumps, json_loads
import requests
from werkzeug.exceptions import BadGateway

import common
from models import Activity, User

logger = logging.getLogger(__name__)

DOMAIN_RE = r'([^/:]+\.[^/:]+)'
ACCT_RE = r'(?:acct:)?([^@]+)@' + DOMAIN_RE
TLD_BLOCKLIST = ('7z', 'asp', 'aspx', 'gif', 'html', 'ico', 'jpg', 'jpeg', 'js',
                 'json', 'php', 'png', 'rar', 'txt', 'yaml', 'yml', 'zip')
XML_UTF8 = "<?xml version='1.0' encoding='UTF-8'?>\n"
LINK_HEADER_RE = re.compile(r""" *< *([^ >]+) *> *; *rel=['"]([^'"]+)['"] *""")

# Content-Type values. All non-unicode strings because App Engine's wsgi.py
# requires header values to be str, not unicode.
#
# ActivityPub Content-Type details:
# https://www.w3.org/TR/activitypub/#retrieving-objects
CONTENT_TYPE_AS2_LD = 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
CONTENT_TYPE_AS2 = 'application/activity+json'
CONTENT_TYPE_AS1 = 'application/stream+json'
CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
CONTENT_TYPE_ATOM = 'application/atom+xml'
CONTENT_TYPE_MAGIC_ENVELOPE = 'application/magic-envelope+xml'

CONNEG_HEADERS_AS2 = {
    'Accept': '%s; q=0.9, %s; q=0.8' % (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD),
}
CONNEG_HEADERS_AS2_HTML = {
    'Accept': CONNEG_HEADERS_AS2['Accept'] + ', %s; q=0.7' % CONTENT_TYPE_HTML,
}

SUPPORTED_VERBS = (
    'checkin',
    'create',
    'follow',
    'like',
    'post',
    'share',
    'tag',
    'update',
)

PRIMARY_DOMAIN = 'fed.brid.gy'
OTHER_DOMAINS = (
    'bridgy-federated.appspot.com',
    'localhost',
)
DOMAINS = (PRIMARY_DOMAIN,) + OTHER_DOMAINS
# TODO: unify with Bridgy's
DOMAIN_BLOCKLIST = frozenset((
    'facebook.com',
    'fb.com',
    't.co',
    'twitter.com',
) + DOMAINS)


def requests_get(url, **kwargs):
    return _requests_fn(util.requests_get, url, **kwargs)


def requests_post(url, **kwargs):
    return _requests_fn(util.requests_post, url, **kwargs)


def _requests_fn(fn, url, parse_json=False, **kwargs):
    """Wraps requests.* and adds raise_for_status()."""
    kwargs.setdefault('gateway', True)
    resp = fn(url, **kwargs)

    logger.info(f'Got {resp.status_code} headers: {resp.headers}')
    type = content_type(resp)
    if (type and type != 'text/html' and
        (type.startswith('text/') or type.endswith('+json') or type.endswith('/json'))):
        logger.info(resp.text)

    if parse_json:
        try:
            return resp.json()
        except ValueError:
            msg = "Couldn't parse response as JSON"
            logger.info(msg, exc_info=True)
            raise BadGateway(msg)

    return resp


def get_as2(url):
    """Tries to fetch the given URL as ActivityStreams 2.

    Uses HTTP content negotiation via the Content-Type header. If the url is
    HTML and it has a rel-alternate link with an AS2 content type, fetches and
    returns that URL.

    Args:
        url: string

    Returns:
        :class:`requests.Response`

    Raises:
        :class:`requests.HTTPError`, :class:`werkzeug.exceptions.HTTPException`

        If we raise a werkzeug HTTPException, it will have an additional
        requests_response attribute with the last requests.Response we received.
    """
    def _error(resp):
        msg = "Couldn't fetch %s as ActivityStreams 2" % url
        logger.warning(msg)
        err = BadGateway(msg)
        err.requests_response = resp
        raise err

    resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    parsed = util.parse_html(resp)
    as2 = parsed.find('link', rel=('alternate', 'self'), type=(
        CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
    if not (as2 and as2['href']):
        _error(resp)

    resp = requests_get(urllib.parse.urljoin(resp.url, as2['href']),
                        headers=CONNEG_HEADERS_AS2)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    _error(resp)


def content_type(resp):
    """Returns a :class:`requests.Response`'s Content-Type, without charset suffix."""
    type = resp.headers.get('Content-Type')
    if type:
        return type.split(';')[0]


def remove_blocklisted(urls):
    """Returns the subset of input URLs that aren't in our domain blocklist.

    Args:
      urls: sequence of str

    Returns: list of str
    """
    return [u for u in urls if not util.domain_or_parent_in(
              util.domain_from_link(u), DOMAIN_BLOCKLIST)]


def send_webmentions(activity_wrapped, proxy=None, **activity_props):
    """Sends webmentions for an incoming Salmon slap or ActivityPub inbox delivery.
    Args:
      activity_wrapped: dict, AS1 activity
      activity_props: passed through to the newly created Activity entities

    Returns: boolean, True if any webmentions were sent, False otherwise
    """
    activity = redirect_unwrap(activity_wrapped)

    verb = activity.get('verb')
    if verb and verb not in SUPPORTED_VERBS:
        error(f'{verb} activities are not supported yet.')

    # extract source and targets
    source = activity.get('url') or activity.get('id')
    obj = activity.get('object')
    obj_url = util.get_url(obj)

    targets = util.get_list(activity, 'inReplyTo')
    if isinstance(obj, dict):
        if not source or verb in ('create', 'post', 'update'):
            source = obj_url or obj.get('id')
        targets.extend(util.get_list(obj, 'inReplyTo'))

    if not source:
        error("Couldn't find original post URL")

    tags = util.get_list(activity_wrapped, 'tags')
    obj_wrapped = activity_wrapped.get('object')
    if isinstance(obj_wrapped, dict):
        tags.extend(util.get_list(obj_wrapped, 'tags'))
    for tag in tags:
        if tag.get('objectType') == 'mention':
            url = tag.get('url')
            if url and url.startswith(request.host_url):
                targets.append(redirect_unwrap(url))

    if verb in ('follow', 'like', 'share'):
         targets.append(obj_url)

    targets = util.dedupe_urls(util.get_url(t) for t in targets)
    targets = remove_blocklisted(t.lower() for t in targets)
    if not targets:
        logger.info("Couldn't find any IndieWeb target URLs in inReplyTo, object, or mention tags")
        return False

    logger.info(f'targets: {targets}')

    # send webmentions and store Activitys
    errors = []  # stores (code, body) tuples
    for target in targets:
        domain = util.domain_from_link(target, minimize=False)
        if (domain == util.domain_from_link(source, minimize=False)):
            logger.info(f'Skipping same-domain webmention from {source} to {target}')
            continue

        activity = Activity(source=source, target=target, direction='in',
                            domain=[domain], **activity_props)
        activity.put()
        wm_source = (activity.proxy_url()
                     if verb in ('follow', 'like', 'share') or proxy
                     else source)
        logger.info(f'Sending webmention from {wm_source} to {target}')

        try:
            endpoint = webmention.discover(target).endpoint
            if endpoint:
                webmention.send(endpoint, wm_source, target)
                activity.status = 'complete'
                logger.info('Success!')
            else:
                activity.status = 'ignored'
                logger.info('Ignoring.')
        except BaseException as e:
            errors.append(util.interpret_http_exception(e))
        activity.put()

    if errors:
        msg = 'Errors: ' + ', '.join(f'{code} {body}' for code, body in errors)
        error(msg, status=int(errors[0][0] or 502))

    return True


def postprocess_as2(activity, user=None, target=None):
    """Prepare an AS2 object to be served or sent via ActivityPub.

    Args:
      activity: dict, AS2 object or activity
      user: :class:`User`, required. populated into actor.id and
        publicKey fields if needed.
      target: dict, AS2 object, optional. The target of activity's inReplyTo or
        Like/Announce/etc object, if any.
    """
    assert user
    type = activity.get('type')

    # actor objects
    if type == 'Person':
        postprocess_as2_actor(activity, user)
        if not activity.get('publicKey'):
            # underspecified, inferred from this issue and Mastodon's implementation:
            # https://github.com/w3c/activitypub/issues/203#issuecomment-297553229
            # https://github.com/tootsuite/mastodon/blob/bc2c263504e584e154384ecc2d804aeb1afb1ba3/app/services/activitypub/process_account_service.rb#L77
            actor_url = request.host_url + activity.get('preferredUsername')
            activity.update({
                'publicKey': {
                    'id': actor_url,
                    'owner': actor_url,
                    'publicKeyPem': user.public_pem().decode(),
                },
                '@context': (util.get_list(activity, '@context') +
                             ['https://w3id.org/security/v1']),
            })
        return activity

    for actor in (util.get_list(activity, 'attributedTo') +
                  util.get_list(activity, 'actor')):
        postprocess_as2_actor(actor, user)

    # inReplyTo: singly valued, prefer id over url
    target_id = target.get('id') if target else None
    in_reply_to = activity.get('inReplyTo')
    if in_reply_to:
        if target_id:
            activity['inReplyTo'] = target_id
        elif isinstance(in_reply_to, list):
            if len(in_reply_to) > 1:
                logger.warning(
                    "AS2 doesn't support multiple inReplyTo URLs! "
                    'Only using the first: %s' % in_reply_to[0])
            activity['inReplyTo'] = in_reply_to[0]

        # Mastodon evidently requires a Mention tag for replies to generate a
        # notification to the original post's author. not required for likes,
        # reposts, etc. details:
        # https://github.com/snarfed/bridgy-fed/issues/34
        if target:
            for to in (util.get_list(target, 'attributedTo') +
                       util.get_list(target, 'actor')):
                if isinstance(to, dict):
                    to = to.get('url') or to.get('id')
                if to:
                    activity.setdefault('tag', []).append({
                        'type': 'Mention',
                        'href': to,
                    })

    # activity objects (for Like, Announce, etc): prefer id over url
    obj = activity.get('object')
    if obj:
        if isinstance(obj, dict) and not obj.get('id'):
            obj['id'] = target_id or obj.get('url')
        elif target_id and obj != target_id:
            activity['object'] = target_id

    # id is required for most things. default to url if it's not set.
    if not activity.get('id'):
        activity['id'] = activity.get('url')

    # TODO: find a better way to check this, sometimes or always?
    # removed for now since it fires on posts without u-id or u-url, eg
    # https://chrisbeckstrom.com/2018/12/27/32551/
    # assert activity.get('id') or (isinstance(obj, dict) and obj.get('id'))

    activity['id'] = redirect_wrap(activity.get('id'))
    activity['url'] = redirect_wrap(activity.get('url'))

    # copy image(s) into attachment(s). may be Mastodon-specific.
    # https://github.com/snarfed/bridgy-fed/issues/33#issuecomment-440965618
    obj_or_activity = obj if isinstance(obj, dict) else activity
    img = obj_or_activity.get('image')
    if img:
        obj_or_activity.setdefault('attachment', []).append(img)

    # cc target's author(s) and recipients
    # https://www.w3.org/TR/activitystreams-vocabulary/#audienceTargeting
    # https://w3c.github.io/activitypub/#delivery
    if target and (type in as2.TYPE_TO_VERB or type in ('Article', 'Note')):
        recips = itertools.chain(*(util.get_list(target, field) for field in
                                 ('actor', 'attributedTo', 'to', 'cc')))
        activity['cc'] = util.dedupe_urls(util.get_url(recip) or recip.get('id')
                                          for recip in recips)

    # to public, since Mastodon interprets to public as public, cc public as unlisted:
    # https://socialhub.activitypub.rocks/t/visibility-to-cc-mapping/284
    # https://wordsmith.social/falkreon/securing-activitypub
    to = activity.setdefault('to', [])
    if as2.PUBLIC_AUDIENCE not in to:
        to.append(as2.PUBLIC_AUDIENCE)

    # wrap articles and notes in a Create activity
    if type in ('Article', 'Note'):
        activity = {
            '@context': as2.CONTEXT,
            'type': 'Create',
            'id': f'{activity["id"]}#bridgy-fed-create',
            'actor': postprocess_as2_actor({}, user),
            'object': activity,
        }

    return util.trim_nulls(activity)


def postprocess_as2_actor(actor, user=None):
    """Prepare an AS2 actor object to be served or sent via ActivityPub.

    Modifies actor in place.

    Args:
      actor: dict, AS2 actor object
      user: :class:`User`

    Returns:
      actor dict
    """
    url = actor.get('url') or f'https://{user.key.id()}/'
    domain = urllib.parse.urlparse(url).netloc

    actor.setdefault('id', request.host_url + domain)
    actor.update({
        'url': redirect_wrap(url),
        'preferredUsername': domain,
    })

    # required by pixelfed. https://github.com/snarfed/bridgy-fed/issues/39
    actor.setdefault('summary', '')
    return actor


def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    Args:
      url: string

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Returns: string, redirect url
    """
    if not url:
        return url

    prefix = urllib.parse.urljoin(request.host_url, '/r/')
    if url.startswith(prefix):
        return url

    return prefix + url


def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    url may be a string, dict, or list. dicts and lists are unwrapped
    recursively.

    Strings that aren't wrapped URLs are left unchanged.

    Args:
      url: string

    Returns: string, unwrapped url
    """
    if isinstance(val, dict):
        return {k: redirect_unwrap(v) for k, v in val.items()}

    elif isinstance(val, list):
        return [redirect_unwrap(v) for v in val]

    elif isinstance(val, str):
        prefix = urllib.parse.urljoin(request.host_url, '/r/')
        if val.startswith(prefix):
            return util.follow_redirects(val[len(prefix):]).url
        elif val.startswith(request.host_url):
            domain = util.domain_from_link(urllib.parse.urlparse(val).path.strip('/'),
                                           minimize=False)
            return util.follow_redirects(domain).url

    return val


def actor(domain, user=None):
    """Fetches a home page, converts its representative h-card to AS2 actor.

    Creates a User for the given domain if one doesn't already exist.

    Args:
      domain: str
      user: :class:`User`, optional

    Returns: dict, AS2 actor
    """
    tld = domain.split('.')[-1]
    if tld in TLD_BLOCKLIST:
        error('', status=404)

    mf2 = util.fetch_mf2(f'https://{domain}/', gateway=True)
    hcard = mf2util.representative_hcard(mf2, mf2['url'])
    logger.info(f'Representative h-card: {json_dumps(hcard, indent=2)}')
    if not hcard:
        error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {mf2['url']}")

    if not user:
        user = User.get_or_create(domain)

    actor = postprocess_as2(
        as2.from_as1(microformats2.json_to_object(hcard)), user=user)
    urls = util.dedupe_urls(microformats2.get_string_urls([hcard]))
    username = common.get_username(domain, urls)
    actor.update({
        'id': f'{request.host_url}{domain}',
        'preferredUsername': username,
        'inbox': f'{request.host_url}{domain}/inbox',
        'outbox': f'{request.host_url}{domain}/outbox',
        'following': f'{request.host_url}{domain}/following',
        'followers': f'{request.host_url}{domain}/followers',
        'endpoints': {
            'sharedInbox': f'{request.host_url}inbox',
        },
    })

    logger.info(f'Generated AS2 actor: {json_dumps(actor, indent=2)}')
    return actor


def get_username(domain, urls):
    """Returns a user's preferred username from an acct: url, if available.

    If there's no acct: URL, returns domain.

    Args:
      domain: str
      urls: sequence of str

    Returns: str
    """
    assert domain
    assert urls

    for url in urls:
        if url.startswith('acct:'):
            urluser, urldomain = util.parse_acct_uri(url)
            if urldomain == domain:
                logger.info(f'Found custom username: urluser')
                return urluser

    logger.info(f'Defaulting username to domain {domain}')
    return domain