bridgy-fed/common.py

527 wiersze
18 KiB
Python

# coding=utf-8
"""Misc common utilities.
"""
import itertools
import logging
import os
import re
import urllib.parse
from flask import render_template, request
from flask.views import View
from granary import as2
from oauth_dropins.webutil import util, webmention
import requests
from webob import exc
from werkzeug.exceptions import abort
import common
from models import Response
DOMAIN_RE = r'([^/:]+\.[^/:]+)'
ACCT_RE = r'(?:acct:)?([^@]+)@' + DOMAIN_RE
TLD_BLOCKLIST = ('7z', 'asp', 'aspx', 'gif', 'html', 'ico', 'jpg', 'jpeg', 'js',
'json', 'php', 'png', 'rar', 'txt', 'yaml', 'yml', 'zip')
HEADERS = {
'User-Agent': 'Bridgy Fed (https://fed.brid.gy/)',
}
XML_UTF8 = "<?xml version='1.0' encoding='UTF-8'?>\n"
# USERNAME = 'me'
# USERNAME_EMOJI = '🌎' # globe
LINK_HEADER_RE = re.compile(r""" *< *([^ >]+) *> *; *rel=['"]([^'"]+)['"] *""")
AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
# Content-Type values. All non-unicode strings because App Engine's wsgi.py
# requires header values to be str, not unicode.
#
# ActivityPub Content-Type details:
# https://www.w3.org/TR/activitypub/#retrieving-objects
CONTENT_TYPE_AS2_LD = 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
CONTENT_TYPE_AS2 = 'application/activity+json'
CONTENT_TYPE_AS1 = 'application/stream+json'
CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
CONTENT_TYPE_ATOM = 'application/atom+xml'
CONTENT_TYPE_MAGIC_ENVELOPE = 'application/magic-envelope+xml'
CONNEG_HEADERS_AS2 = {
'Accept': '%s; q=0.9, %s; q=0.8' % (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD),
}
CONNEG_HEADERS_AS2_HTML = {
'Accept': CONNEG_HEADERS_AS2['Accept'] + ', %s; q=0.7' % CONTENT_TYPE_HTML,
}
SUPPORTED_VERBS = (
'checkin',
'create',
'follow',
'like',
'post',
'share',
'tag',
'update',
)
PRIMARY_DOMAIN = 'fed.brid.gy'
OTHER_DOMAINS = (
'bridgy-federated.appspot.com',
'localhost',
)
DOMAINS = (PRIMARY_DOMAIN,) + OTHER_DOMAINS
# TODO: add to all handlers:
# self.response.headers.update({
# 'Access-Control-Allow-Headers': '*',
# 'Access-Control-Allow-Methods': '*',
# 'Access-Control-Allow-Origin': '*',
# # see https://content-security-policy.com/
# 'Content-Security-Policy':
# "script-src https: localhost:8080 my.dev.com:8080 'unsafe-inline'; "
# "frame-ancestors 'self'; "
# "report-uri /csp-report; ",
# # 16070400 seconds is 6 months
# 'Strict-Transport-Security': 'max-age=16070400; preload',
# 'X-Content-Type-Options': 'nosniff',
# # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
# 'X-Frame-Options': 'SAMEORIGIN',
# 'X-XSS-Protection': '1; mode=block',
# })
# def options(self, *args, **kwargs):
# """Respond to CORS pre-flight OPTIONS requests."""
# pass
def not_5xx(resp):
return isinstance(resp, tuple) and resp[1] // 100 != 5
def requests_get(url, **kwargs):
return _requests_fn(util.requests_get, url, **kwargs)
def requests_post(url, **kwargs):
return _requests_fn(util.requests_post, url, **kwargs)
def _requests_fn(fn, url, parse_json=False, **kwargs):
"""Wraps requests.* and adds raise_for_status() and User-Agent."""
kwargs.setdefault('headers', {}).update(HEADERS)
resp = fn(url, gateway=True, **kwargs)
logging.info('Got %s headers:%s', resp.status_code, resp.headers)
type = content_type(resp)
if (type and type != 'text/html' and
(type.startswith('text/') or type.endswith('+json') or type.endswith('/json'))):
logging.info(resp.text)
if parse_json:
try:
return resp.json()
except ValueError:
msg = "Couldn't parse response as JSON"
logging.info(msg, exc_info=True)
raise exc.HTTPBadGateway(msg)
return resp
def get_as2(url):
"""Tries to fetch the given URL as ActivityStreams 2.
Uses HTTP content negotiation via the Content-Type header. If the url is
HTML and it has a rel-alternate link with an AS2 content type, fetches and
returns that URL.
Args:
url: string
Returns:
requests.Response
Raises:
requests.HTTPError, webob.exc.HTTPException
If we raise webob HTTPException, it will have an additional response
attribute with the last requests.Response we received.
"""
def _error(resp):
msg = "Couldn't fetch %s as ActivityStreams 2" % url
logging.warning(msg)
err = exc.HTTPBadGateway(msg)
err.response = resp
raise err
resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
return resp
parsed = util.parse_html(resp)
as2 = parsed.find('link', rel=('alternate', 'self'), type=(
CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
if not (as2 and as2['href']):
_error(resp)
resp = requests_get(urllib.parse.urljoin(resp.url, as2['href']),
headers=CONNEG_HEADERS_AS2)
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
return resp
_error(resp)
def content_type(resp):
"""Returns a requests.Response's Content-Type, without charset suffix."""
type = resp.headers.get('Content-Type')
if type:
return type.split(';')[0]
def get_required_param(name):
try:
val = request.args.get(name) or request.form.get(name)
except (UnicodeDecodeError, UnicodeEncodeError) as e:
abort(400, f"Couldn't decode parameters as UTF-8: {e}")
if not val:
abort(400, f'Missing required parameter: {name}')
return val
def error(msg, status=None, exc_info=False):
if not status:
status = 400
logging.info('Returning %s: %s' % (status, msg), exc_info=exc_info)
return (msg, status)
def send_webmentions(activity_wrapped, proxy=None, **response_props):
"""Sends webmentions for an incoming Salmon slap or ActivityPub inbox delivery.
Args:
activity_wrapped: dict, AS1 activity
response_props: passed through to the newly created Responses
"""
activity = redirect_unwrap(activity_wrapped)
verb = activity.get('verb')
if verb and verb not in SUPPORTED_VERBS:
return error('%s activities are not supported yet.' % verb)
# extract source and targets
source = activity.get('url') or activity.get('id')
obj = activity.get('object')
obj_url = util.get_url(obj)
targets = util.get_list(activity, 'inReplyTo')
if isinstance(obj, dict):
if not source or verb in ('create', 'post', 'update'):
source = obj_url or obj.get('id')
targets.extend(util.get_list(obj, 'inReplyTo'))
tags = util.get_list(activity_wrapped, 'tags')
obj_wrapped = activity_wrapped.get('object')
if isinstance(obj_wrapped, dict):
tags.extend(util.get_list(obj_wrapped, 'tags'))
for tag in tags:
if tag.get('objectType') == 'mention':
url = tag.get('url')
if url and url.startswith(request.host_url):
targets.append(redirect_unwrap(url))
if verb in ('follow', 'like', 'share'):
targets.append(obj_url)
targets = util.dedupe_urls(util.get_url(t) for t in targets)
if not source:
return error("Couldn't find original post URL")
if not targets:
return error("Couldn't find any target URLs in inReplyTo, object, or mention tags")
# send webmentions and store Responses
errors = []
for target in targets:
if util.domain_from_link(target) == util.domain_from_link(source):
logging.info('Skipping same-domain webmention from %s to %s',
source, target)
continue
response = Response(source=source, target=target, direction='in',
**response_props)
response.put()
wm_source = (response.proxy_url()
if verb in ('follow', 'like', 'share') or proxy
else source)
logging.info('Sending webmention from %s to %s', wm_source, target)
try:
endpoint = webmention.discover(target, headers=HEADERS).endpoint
if endpoint:
webmention.send(endpoint, wm_source, target, headers=HEADERS)
response.status = 'complete'
logging.info('Success!')
except BaseException as e:
util.interpret_http_exception(e)
logging.warning(f'Failed! {e}')
errors.append(e)
response.put()
if errors:
msg = 'Errors:\n' + '\n'.join(str(e) for e in errors)
return error(msg, status=getattr(errors[0], 'http_status', None))
def postprocess_as2(activity, target=None, key=None):
"""Prepare an AS2 object to be served or sent via ActivityPub.
Args:
activity: dict, AS2 object or activity
target: dict, AS2 object, optional. The target of activity's inReplyTo or
Like/Announce/etc object, if any.
key: MagicKey, optional. populated into publicKey field if provided.
"""
type = activity.get('type')
# actor objects
if type == 'Person':
postprocess_as2_actor(activity)
if not activity.get('publicKey'):
# underspecified, inferred from this issue and Mastodon's implementation:
# https://github.com/w3c/activitypub/issues/203#issuecomment-297553229
# https://github.com/tootsuite/mastodon/blob/bc2c263504e584e154384ecc2d804aeb1afb1ba3/app/services/activitypub/process_account_service.rb#L77
activity.update({
'publicKey': {
'id': activity.get('preferredUsername'),
'publicKeyPem': key.public_pem().decode(),
},
'@context': (util.get_list(activity, '@context') +
['https://w3id.org/security/v1']),
})
return activity
for actor in (util.get_list(activity, 'attributedTo') +
util.get_list(activity, 'actor')):
postprocess_as2_actor(actor)
# inReplyTo: singly valued, prefer id over url
target_id = target.get('id') if target else None
in_reply_to = activity.get('inReplyTo')
if in_reply_to:
if target_id:
activity['inReplyTo'] = target_id
elif isinstance(in_reply_to, list):
if len(in_reply_to) > 1:
logging.warning(
"AS2 doesn't support multiple inReplyTo URLs! "
'Only using the first: %s' % in_reply_to[0])
activity['inReplyTo'] = in_reply_to[0]
# Mastodon evidently requires a Mention tag for replies to generate a
# notification to the original post's author. not required for likes,
# reposts, etc. details:
# https://github.com/snarfed/bridgy-fed/issues/34
if target:
for to in (util.get_list(target, 'attributedTo') +
util.get_list(target, 'actor')):
if isinstance(to, dict):
to = to.get('url') or to.get('id')
if to:
activity.setdefault('tag', []).append({
'type': 'Mention',
'href': to,
})
# activity objects (for Like, Announce, etc): prefer id over url
obj = activity.get('object')
if obj:
if isinstance(obj, dict) and not obj.get('id'):
obj['id'] = target_id or obj.get('url')
elif target_id and obj != target_id:
activity['object'] = target_id
# id is required for most things. default to url if it's not set.
if not activity.get('id'):
activity['id'] = activity.get('url')
# TODO: find a better way to check this, sometimes or always?
# removed for now since it fires on posts without u-id or u-url, eg
# https://chrisbeckstrom.com/2018/12/27/32551/
# assert activity.get('id') or (isinstance(obj, dict) and obj.get('id'))
activity['id'] = redirect_wrap(activity.get('id'))
activity['url'] = redirect_wrap(activity.get('url'))
# copy image(s) into attachment(s). may be Mastodon-specific.
# https://github.com/snarfed/bridgy-fed/issues/33#issuecomment-440965618
obj_or_activity = obj if isinstance(obj, dict) else activity
obj_or_activity.setdefault('attachment', []).extend(
obj_or_activity.get('image', []))
# cc public and target's author(s) and recipients
# https://www.w3.org/TR/activitystreams-vocabulary/#audienceTargeting
# https://w3c.github.io/activitypub/#delivery
if type in as2.TYPE_TO_VERB or type in ('Article', 'Note'):
recips = [AS2_PUBLIC_AUDIENCE]
if target:
recips += itertools.chain(*(util.get_list(target, field) for field in
('actor', 'attributedTo', 'to', 'cc')))
activity['cc'] = util.dedupe_urls(util.get_url(recip) or recip.get('id')
for recip in recips)
# wrap articles and notes in a Create activity
if type in ('Article', 'Note'):
activity = {
'@context': as2.CONTEXT,
'type': 'Create',
'id': f'{activity["id"]}#bridgy-fed-create',
'object': activity,
}
return util.trim_nulls(activity)
def postprocess_as2_actor(actor):
"""Prepare an AS2 actor object to be served or sent via ActivityPub.
Args:
actor: dict, AS2 actor object
"""
url = actor.get('url')
if url:
domain = urllib.parse.urlparse(url).netloc
actor.setdefault('preferredUsername', domain)
actor['id'] = request.host_url + domain
actor['url'] = redirect_wrap(url)
# required by pixelfed. https://github.com/snarfed/bridgy-fed/issues/39
actor.setdefault('summary', '')
def redirect_wrap(url):
"""Returns a URL on our domain that redirects to this URL.
...to satisfy Mastodon's non-standard domain matching requirement. :(
Args:
url: string
https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747
Returns: string, redirect url
"""
if not url:
return url
prefix = urllib.parse.urljoin(request.host_url, '/r/')
if url.startswith(prefix):
return url
return prefix + url
def redirect_unwrap(val):
"""Removes our redirect wrapping from a URL, if it's there.
url may be a string, dict, or list. dicts and lists are unwrapped
recursively.
Strings that aren't wrapped URLs are left unchanged.
Args:
url: string
Returns: string, unwrapped url
"""
if isinstance(val, dict):
return {k: redirect_unwrap(v) for k, v in val.items()}
elif isinstance(val, list):
return [redirect_unwrap(v) for v in val]
elif isinstance(val, str):
prefix = urllib.parse.urljoin(request.host_url, '/r/')
if val.startswith(prefix):
return util.follow_redirects(val[len(prefix):]).url
elif val.startswith(request.host_url):
domain = util.domain_from_link(urllib.parse.urlparse(val).path.strip('/'))
return util.follow_redirects(domain).url
return val
class XrdOrJrd(View):
"""Renders and serves an XRD or JRD file.
JRD is served if the request path ends in .jrd or .json, or the format query
parameter is 'jrd' or 'json', or the request's Accept header includes 'jrd' or
'json'.
XRD is served if the request path ends in .xrd or .xml, or the format query
parameter is 'xml' or 'xrd', or the request's Accept header includes 'xml' or
'xrd'.
Otherwise, defaults to DEFAULT_TYPE.
Subclasses must override :meth:`template_prefix()` and
:meth:`template_vars()`. URL route variables are passed through to
:meth:`template_vars()` as keyword args.
Class members:
DEFAULT_TYPE: either JRD or XRD, which type to return by default if the
request doesn't ask for one explicitly with the Accept header.
"""
JRD = 'jrd'
XRD = 'xrd'
DEFAULT_TYPE = JRD # either JRD or XRD
def template_prefix(self):
"""Returns template filename, without extension."""
raise NotImplementedError()
def template_vars(self, **kwargs):
"""Returns a dict with template variables.
URL route variables are passed through as kwargs.
"""
raise NotImplementedError()
def _type(self):
"""Returns XRD or JRD."""
format = request.args.get('format', '').lower()
ext = os.path.splitext(request.path)[1]
if ext in ('.jrd', '.json') or format in ('jrd', 'json'):
return self.JRD
elif ext in ('.xrd', '.xml') or format in ('xrd', 'xml'):
return self.XRD
# We don't do full content negotiation (Accept Header parsing); we just
# check whether jrd/json and xrd/xml are in the header, and if they both
# are, which one comes first. :/
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation
accept = request.headers.get('Accept', '').lower()
jrd = re.search(r'jrd|json', accept)
xrd = re.search(r'xrd|xml', accept)
if jrd and (not xrd or jrd.start() < xrd.start()):
return self.JRD
elif xrd and (not jrd or xrd.start() < jrd.start()):
return self.XRD
assert self.DEFAULT_TYPE in (self.JRD, self.XRD)
return self.DEFAULT_TYPE
def dispatch_request(self, **kwargs):
data = self.template_vars(**kwargs)
if not isinstance(data, dict):
return data
# Content-Types are from https://tools.ietf.org/html/rfc7033#section-10.2
if self._type() == self.JRD:
return data, {'Content-Type': 'application/jrd+json'}
else:
template = f'{self.template_prefix()}.{self._type()}'
return (render_template(template, **data),
{'Content-Type': 'application/xrd+xml; charset=utf-8'})