# coding=utf-8
"""Misc common utilities.
"""
import itertools
import logging
import os
import re
import urllib.parse

from flask import request
from granary import as2
from oauth_dropins.webutil import util, webmention
from oauth_dropins.webutil.flask_util import error
import requests
from werkzeug.exceptions import BadGateway

from models import Response

logger = logging.getLogger(__name__)

DOMAIN_RE = r'([^/:]+\.[^/:]+)'
ACCT_RE = r'(?:acct:)?([^@]+)@' + DOMAIN_RE
TLD_BLOCKLIST = ('7z', 'asp', 'aspx', 'gif', 'html', 'ico', 'jpg', 'jpeg', 'js',
                 'json', 'php', 'png', 'rar', 'txt', 'yaml', 'yml', 'zip')
HEADERS = {
    'User-Agent': 'Bridgy Fed (https://fed.brid.gy/)',
}
# NOTE(review): the name suggests an XML declaration prefix, but the value here
# is only a newline. Confirm this is intended.
XML_UTF8 = "\n"
LINK_HEADER_RE = re.compile(r""" *< *([^ >]+) *> *; *rel=['"]([^'"]+)['"] *""")
AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'

# Content-Type values. All non-unicode strings because App Engine's wsgi.py
# requires header values to be str, not unicode.
#
# ActivityPub Content-Type details:
# https://www.w3.org/TR/activitypub/#retrieving-objects
CONTENT_TYPE_AS2_LD = 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
CONTENT_TYPE_AS2 = 'application/activity+json'
CONTENT_TYPE_AS1 = 'application/stream+json'
CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
CONTENT_TYPE_ATOM = 'application/atom+xml'
CONTENT_TYPE_MAGIC_ENVELOPE = 'application/magic-envelope+xml'

# Bare media types (no parameters) that get_as2() accepts as AS2.
# content_type() strips parameters, so its result has to be compared against
# the part of CONTENT_TYPE_AS2_LD before the '; profile=...' suffix.
_AS2_MEDIA_TYPES = (
    CONTENT_TYPE_AS2,
    CONTENT_TYPE_AS2_LD.split(';')[0].strip(),
)

CONNEG_HEADERS_AS2 = {
    'Accept': '%s; q=0.9, %s; q=0.8' % (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD),
}
CONNEG_HEADERS_AS2_HTML = {
    'Accept': CONNEG_HEADERS_AS2['Accept'] + ', %s; q=0.7' % CONTENT_TYPE_HTML,
}

SUPPORTED_VERBS = (
    'checkin',
    'create',
    'follow',
    'like',
    'post',
    'share',
    'tag',
    'update',
)

PRIMARY_DOMAIN = 'fed.brid.gy'
OTHER_DOMAINS = (
    'bridgy-federated.appspot.com',
    'localhost',
)
DOMAINS = (PRIMARY_DOMAIN,) + OTHER_DOMAINS


def requests_get(url, **kwargs):
    """GETs url via :func:`_requests_fn`; kwargs are passed through."""
    return _requests_fn(util.requests_get, url, **kwargs)


def requests_post(url, **kwargs):
    """POSTs to url via :func:`_requests_fn`; kwargs are passed through."""
    return _requests_fn(util.requests_post, url, **kwargs)


def _requests_fn(fn, url, parse_json=False, **kwargs):
    """Wraps requests.* and adds raise_for_status() and User-Agent.

    Calls fn with gateway=True. NOTE(review): presumably that is what makes
    the webutil wrapper raise on HTTP errors; confirm against webutil.

    Args:
      fn: callable, eg util.requests_get or util.requests_post
      url: string
      parse_json: boolean, whether to parse the response body as JSON and
        return the parsed value instead of the response
      kwargs: passed through to fn. Any 'headers' are merged with
        :const:`HEADERS`, which takes precedence.

    Returns:
      :class:`requests.Response`, or the decoded JSON value if parse_json

    Raises:
      :class:`werkzeug.exceptions.BadGateway` if parse_json is set and the
      body isn't valid JSON
    """
    # Build a new dict instead of updating in place: callers pass shared
    # module-level constants (eg CONNEG_HEADERS_AS2) that must not be mutated.
    kwargs['headers'] = {**(kwargs.get('headers') or {}), **HEADERS}
    resp = fn(url, gateway=True, **kwargs)

    logger.info(f'Got {resp.status_code} headers: {resp.headers}')
    media_type = content_type(resp)
    # log textual bodies only, and skip HTML since it's usually large
    if (media_type and media_type != 'text/html' and
            (media_type.startswith('text/') or
             media_type.endswith(('+json', '/json')))):
        logger.info(resp.text)

    if parse_json:
        try:
            return resp.json()
        except ValueError:
            msg = "Couldn't parse response as JSON"
            logger.info(msg, exc_info=True)
            raise BadGateway(msg)

    return resp


def get_as2(url):
    """Tries to fetch the given URL as ActivityStreams 2.

    Uses HTTP content negotiation via the Accept header. If the url is HTML
    and it has a rel-alternate (or rel-self) link with an AS2 content type,
    fetches and returns that URL.

    Args:
      url: string

    Returns:
      :class:`requests.Response`

    Raises:
      :class:`requests.HTTPError`, :class:`werkzeug.exceptions.HTTPException`

      If we raise a werkzeug HTTPException, it will have an additional
      requests_response attribute with the last requests.Response we received.
    """
    def _error(resp):
        msg = "Couldn't fetch %s as ActivityStreams 2" % url
        logger.warning(msg)
        err = BadGateway(msg)
        err.requests_response = resp
        raise err

    resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
    if content_type(resp) in _AS2_MEDIA_TYPES:
        return resp

    # not AS2 itself; look for an HTML <link> pointing at an AS2 alternate
    parsed = util.parse_html(resp)
    link = parsed.find('link', rel=('alternate', 'self'), type=(
        CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
    # .get() because a <link> with no href attribute would raise KeyError
    href = link.get('href') if link else None
    if not href:
        _error(resp)

    resp = requests_get(urllib.parse.urljoin(resp.url, href),
                        headers=CONNEG_HEADERS_AS2)
    if content_type(resp) in _AS2_MEDIA_TYPES:
        return resp

    _error(resp)


def content_type(resp):
    """Returns a :class:`requests.Response`'s Content-Type, without charset suffix.

    Everything from the first ';' on is dropped and surrounding whitespace is
    stripped. Returns None if the header is missing or empty.
    """
    header = resp.headers.get('Content-Type')
    if header:
        return header.split(';')[0].strip()
    return None


def send_webmentions(activity_wrapped, proxy=None, **response_props):
    """Sends webmentions for an incoming Salmon slap or ActivityPub inbox delivery.

    Stores a :class:`models.Response` for each target on a different domain
    than the source, then discovers the target's webmention endpoint and
    sends to it.

    Args:
      activity_wrapped: dict, AS1 activity
      proxy: if truthy, the stored Response's proxy_url() is used as the
        webmention source instead of the original source URL. (That is always
        done for follow, like, and share.)
      response_props: passed through to the newly created Responses

    Reports failures via :func:`error`: unsupported verb, missing source or
    targets, or any webmention discovery/send error.
    """
    activity = redirect_unwrap(activity_wrapped)

    verb = activity.get('verb')
    if verb and verb not in SUPPORTED_VERBS:
        error(f'{verb} activities are not supported yet.')

    # extract source and targets
    source = activity.get('url') or activity.get('id')
    obj = activity.get('object')
    obj_url = util.get_url(obj)

    targets = util.get_list(activity, 'inReplyTo')
    if isinstance(obj, dict):
        if not source or verb in ('create', 'post', 'update'):
            source = obj_url or obj.get('id')
        targets.extend(util.get_list(obj, 'inReplyTo'))

    # mention tags are read from the still-wrapped activity so that their
    # URLs can be matched against our own host
    tags = util.get_list(activity_wrapped, 'tags')
    obj_wrapped = activity_wrapped.get('object')
    if isinstance(obj_wrapped, dict):
        tags.extend(util.get_list(obj_wrapped, 'tags'))
    for tag in tags:
        if tag.get('objectType') == 'mention':
            url = tag.get('url')
            if url and url.startswith(request.host_url):
                targets.append(redirect_unwrap(url))

    if verb in ('follow', 'like', 'share'):
        targets.append(obj_url)

    targets = util.dedupe_urls(util.get_url(t) for t in targets)
    if not source:
        error("Couldn't find original post URL")
    if not targets:
        error("Couldn't find any target URLs in inReplyTo, object, or mention tags")

    # send webmentions and store Responses
    errors = []  # stores (code, body) tuples
    for target in targets:
        if util.domain_from_link(target) == util.domain_from_link(source):
            logger.info(f'Skipping same-domain webmention from {source} to {target}')
            continue

        response = Response(source=source, target=target, direction='in',
                            **response_props)
        response.put()
        wm_source = (response.proxy_url()
                     if verb in ('follow', 'like', 'share') or proxy
                     else source)
        logger.info(f'Sending webmention from {wm_source} to {target}')

        try:
            endpoint = webmention.discover(target, headers=HEADERS).endpoint
            if endpoint:
                webmention.send(endpoint, wm_source, target, headers=HEADERS)
                response.status = 'complete'
                logger.info('Success!')
            else:
                response.status = 'ignored'
                logger.info('Ignoring.')
        # NOTE(review): deliberately broad so one failing target doesn't stop
        # the rest, but BaseException also swallows KeyboardInterrupt and
        # SystemExit. Consider narrowing to Exception.
        except BaseException as e:
            errors.append(util.interpret_http_exception(e))
        response.put()

    if errors:
        msg = 'Errors: ' + ', '.join(f'{code} {body}' for code, body in errors)
        error(msg, status=int(errors[0][0] or 502))


def postprocess_as2(activity, target=None, key=None):
    """Prepare an AS2 object to be served or sent via ActivityPub.

    Modifies activity (and its nested actor/object dicts) in place.

    Args:
      activity: dict, AS2 object or activity
      target: dict, AS2 object, optional. The target of activity's inReplyTo or
        Like/Announce/etc object, if any.
      key: :class:`models.MagicKey`, optional. populated into publicKey field
        if provided.

    Returns:
      dict, the processed AS2 object. Person objects are returned as is;
      Articles and Notes are wrapped in a new Create activity.
    """
    as2_type = activity.get('type')

    # actor objects
    if as2_type == 'Person':
        postprocess_as2_actor(activity)
        # key is optional; without one there's nothing to publish
        if key is not None and not activity.get('publicKey'):
            # underspecified, inferred from this issue and Mastodon's implementation:
            # https://github.com/w3c/activitypub/issues/203#issuecomment-297553229
            # https://github.com/tootsuite/mastodon/blob/bc2c263504e584e154384ecc2d804aeb1afb1ba3/app/services/activitypub/process_account_service.rb#L77
            activity.update({
                'publicKey': {
                    'id': activity.get('preferredUsername'),
                    'publicKeyPem': key.public_pem().decode(),
                },
                '@context': (util.get_list(activity, '@context') +
                             ['https://w3id.org/security/v1']),
            })
        return activity

    for actor in (util.get_list(activity, 'attributedTo') +
                  util.get_list(activity, 'actor')):
        postprocess_as2_actor(actor)

    # inReplyTo: singly valued, prefer id over url
    target_id = target.get('id') if target else None
    in_reply_to = activity.get('inReplyTo')
    if in_reply_to:
        if target_id:
            activity['inReplyTo'] = target_id
        elif isinstance(in_reply_to, list):
            if len(in_reply_to) > 1:
                logger.warning(
                    "AS2 doesn't support multiple inReplyTo URLs! "
                    'Only using the first: %s', in_reply_to[0])
            activity['inReplyTo'] = in_reply_to[0]

        # Mastodon evidently requires a Mention tag for replies to generate a
        # notification to the original post's author. not required for likes,
        # reposts, etc. details:
        # https://github.com/snarfed/bridgy-fed/issues/34
        if target:
            for to in (util.get_list(target, 'attributedTo') +
                       util.get_list(target, 'actor')):
                if isinstance(to, dict):
                    to = to.get('url') or to.get('id')
                if to:
                    activity.setdefault('tag', []).append({
                        'type': 'Mention',
                        'href': to,
                    })

    # activity objects (for Like, Announce, etc): prefer id over url
    obj = activity.get('object')
    if obj:
        if isinstance(obj, dict) and not obj.get('id'):
            obj['id'] = target_id or obj.get('url')
        elif target_id and obj != target_id:
            activity['object'] = target_id

    # id is required for most things. default to url if it's not set.
    if not activity.get('id'):
        activity['id'] = activity.get('url')

    # TODO: find a better way to check this, sometimes or always?
    # removed for now since it fires on posts without u-id or u-url, eg
    # https://chrisbeckstrom.com/2018/12/27/32551/
    # assert activity.get('id') or (isinstance(obj, dict) and obj.get('id'))

    activity['id'] = redirect_wrap(activity.get('id'))
    activity['url'] = redirect_wrap(activity.get('url'))

    # copy image(s) into attachment(s). may be Mastodon-specific.
    # https://github.com/snarfed/bridgy-fed/issues/33#issuecomment-440965618
    #
    # image and attachment may each be a single object or a list, so
    # normalize both with get_list. Extending with a bare dict would append
    # its keys instead of the image itself.
    obj_or_activity = obj if isinstance(obj, dict) else activity
    attachments = util.get_list(obj_or_activity, 'attachment')
    attachments.extend(util.get_list(obj_or_activity, 'image'))
    obj_or_activity['attachment'] = attachments

    # cc public and target's author(s) and recipients
    # https://www.w3.org/TR/activitystreams-vocabulary/#audienceTargeting
    # https://w3c.github.io/activitypub/#delivery
    if as2_type in as2.TYPE_TO_VERB or as2_type in ('Article', 'Note'):
        recips = [AS2_PUBLIC_AUDIENCE]
        if target:
            recips += itertools.chain.from_iterable(
                util.get_list(target, field)
                for field in ('actor', 'attributedTo', 'to', 'cc'))
        # only fall back to id for dicts; recipients may be plain strings
        activity['cc'] = util.dedupe_urls(
            util.get_url(recip) or
            (recip.get('id') if isinstance(recip, dict) else None)
            for recip in recips)

    # wrap articles and notes in a Create activity
    if as2_type in ('Article', 'Note'):
        activity = {
            '@context': as2.CONTEXT,
            'type': 'Create',
            'id': f'{activity["id"]}#bridgy-fed-create',
            'object': activity,
        }

    return util.trim_nulls(activity)


def postprocess_as2_actor(actor):
    """Prepare an AS2 actor object to be served or sent via ActivityPub.

    Modifies actor in place. Non-dict actors, eg bare string ids, are left
    untouched.

    Args:
      actor: dict, AS2 actor object
    """
    # actor and attributedTo values may be plain id strings, which have
    # nothing to rewrite and no .get()
    if not isinstance(actor, dict):
        return

    url = actor.get('url')
    if url:
        domain = urllib.parse.urlparse(url).netloc
        actor.update({
            'id': request.host_url + domain,
            'url': redirect_wrap(url),
            'preferredUsername': domain,
        })

    # required by pixelfed. https://github.com/snarfed/bridgy-fed/issues/39
    actor.setdefault('summary', '')


def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Args:
      url: string

    Returns:
      string, redirect url. Falsy and already-wrapped urls are returned
      unchanged.
    """
    if not url:
        return url

    prefix = urllib.parse.urljoin(request.host_url, '/r/')
    if url.startswith(prefix):
        return url

    return prefix + url


def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string, dict, or list. dicts and lists are unwrapped
    recursively. Strings that aren't wrapped URLs are left unchanged.

    Args:
      val: string, dict, or list

    Returns:
      same type as val, with wrapped URLs replaced by the final URL that
      :func:`util.follow_redirects` resolves them to
    """
    if isinstance(val, dict):
        return {k: redirect_unwrap(v) for k, v in val.items()}

    elif isinstance(val, list):
        return [redirect_unwrap(v) for v in val]

    elif isinstance(val, str):
        prefix = urllib.parse.urljoin(request.host_url, '/r/')
        if val.startswith(prefix):
            return util.follow_redirects(val[len(prefix):]).url
        elif val.startswith(request.host_url):
            # other URLs on our host: the path is treated as the user's domain
            domain = util.domain_from_link(urllib.parse.urlparse(val).path.strip('/'))
            return util.follow_redirects(domain).url

    return val