bridgy-fed/web.py

560 wiersze
21 KiB
Python

"""Handles inbound webmentions."""
import difflib
import logging
from urllib.parse import urlencode, urljoin, urlparse
import feedparser
from flask import g, redirect, request
from flask.views import View
from google.cloud.ndb import Key
from granary import as1, as2, microformats2
import mf2util
from oauth_dropins.webutil import flask_util, util
from oauth_dropins.webutil.appengine_config import tasks_client
from oauth_dropins.webutil.appengine_info import APP_ID
from oauth_dropins.webutil.flask_util import error, flash
from oauth_dropins.webutil.util import json_dumps, json_loads
from oauth_dropins.webutil import webmention
from requests import HTTPError, RequestException, URLRequired
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
import activitypub
from flask_app import app
import common
from models import Follower, Object, PROTOCOLS, Target, User
from protocol import Protocol
logger = logging.getLogger(__name__)
# https://cloud.google.com/appengine/docs/locations
TASKS_LOCATION = 'us-central1'
CHAR_AFTER_SPACE = chr(ord(' ') + 1)
# https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset((
'www.jvt.me',
))
class Web(User, Protocol):
"""Web user and webmention protocol implementation.
The key name is the domain.
"""
LABEL = 'webmention'
@classmethod
def _get_kind(cls):
return 'MagicKey'
def verify(self):
"""Fetches site a couple ways to check for redirects and h-card.
Returns: :class:`Web` that was verified. May be different than
self! eg if self's domain started with www and we switch to the root
domain.
"""
domain = self.key.id()
logger.info(f'Verifying {domain}')
if domain.startswith('www.') and domain not in WWW_DOMAINS:
# if root domain redirects to www, use root domain instead
# https://github.com/snarfed/bridgy-fed/issues/314
root = domain.removeprefix("www.")
root_site = f'https://{root}/'
try:
resp = util.requests_get(root_site, gateway=False)
if resp.ok and self.is_homepage(resp.url):
logger.info(f'{root_site} redirects to {resp.url} ; using {root} instead')
root_user = Web.get_or_create(root)
self.use_instead = root_user.key
self.put()
return root_user.verify()
except RequestException:
pass
# check webfinger redirect
path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
self.has_redirects = False
self.redirects_error = None
try:
url = urljoin(self.homepage, path)
resp = util.requests_get(url, gateway=False)
domain_urls = ([f'https://{domain}/' for domain in common.DOMAINS] +
[common.host_url()])
expected = [urljoin(url, path) for url in domain_urls]
if resp.ok:
if resp.url in expected:
self.has_redirects = True
elif resp.url:
diff = '\n'.join(difflib.Differ().compare([resp.url], [expected[0]]))
self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
else:
lines = [url, f' returned HTTP {resp.status_code}']
if resp.url != url:
lines[1:1] = [' redirected to:', resp.url]
self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
except RequestException:
pass
# check home page
try:
obj = Web.load(self.homepage, gateway=True)
self.actor_as2 = activitypub.postprocess_as2(as2.from_as1(obj.as1))
self.has_hcard = True
except (BadRequest, NotFound):
self.actor_as2 = None
self.has_hcard = False
return self
@classmethod
def send(cls, obj, url):
"""Sends a webmention to a given target URL.
See :meth:`Protocol.send` for details.
"""
source_url = obj.proxy_url()
logger.info(f'Sending webmention from {source_url} to {url}')
endpoint = common.webmention_discover(url).endpoint
if endpoint:
webmention.send(endpoint, source_url, url)
return True
@classmethod
def fetch(cls, obj, gateway=False, check_backlink=None):
"""Fetches a URL over HTTP and extracts its microformats2.
Follows redirects, but doesn't change the original URL in obj's id! The
:class:`Model` class doesn't allow that anyway, but more importantly, we
want to preserve that original URL becase other objects may refer to it
instead of the final redirect destination URL.
See :meth:`Protocol.fetch` for other background.
Args:
gateway: passed through to :func:`webutil.util.fetch_mf2`
check_backlink: bool, optional, whether to require a link to Bridgy Fed
"""
url = obj.key.id()
is_homepage = ((g.user and g.user.is_homepage(url)) or
(g.external_user and g.external_user == url))
require_backlink = None
if check_backlink or (check_backlink is None and not is_homepage):
require_backlink = common.host_url().rstrip('/')
try:
parsed = util.fetch_mf2(url, gateway=gateway,
require_backlink=require_backlink)
except (ValueError, URLRequired) as e:
error(str(e))
if parsed is None:
error(f'id {urlparse(url).fragment} not found in {url}')
# find mf2 item
if is_homepage:
logger.info(f"{url} is user's homepage")
entry = mf2util.representative_hcard(parsed, parsed['url'])
logger.info(f'Representative h-card: {json_dumps(entry, indent=2)}')
if not entry:
error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {parsed['url']}")
else:
entry = mf2util.find_first_entry(parsed, ['h-entry'])
if not entry:
error(f'No microformats2 found in {url}')
# store final URL in mf2 object, and also default url property to it,
# since that's the fallback for AS1/AS2 id
entry['url'] = parsed['url']
if is_homepage:
entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
props = entry.setdefault('properties', {})
props.setdefault('url', [parsed['url']])
logger.info(f'Extracted microformats2 entry: {json_dumps(entry, indent=2)}')
# run full authorship algorithm if necessary: https://indieweb.org/authorship
# duplicated in microformats2.json_to_object
author = util.get_first(props, 'author')
if not isinstance(author, dict) and not is_homepage:
logger.info(f'Fetching full authorship for author {author}')
author = mf2util.find_author({'items': [entry]}, hentry=entry,
fetch_mf2_func=util.fetch_mf2)
logger.info(f'Got: {author}')
if author:
props['author'] = util.trim_nulls([{
"type": ["h-card"],
'properties': {
field: [author[field]] if author.get(field) else []
for field in ('name', 'photo', 'url')
},
}])
obj.mf2 = entry
return obj
@classmethod
def serve(cls, obj):
"""Serves an :class:`Object` as HTML."""
obj_as1 = obj.as1
from_proto = PROTOCOLS.get(obj.source_protocol)
if from_proto:
# fill in author/actor if available
for field in 'author', 'actor':
val = as1.get_object(obj.as1, field)
if val.keys() == set(['id']) and val['id']:
loaded = from_proto.load(val['id'])
if loaded and loaded.as1:
obj_as1 = {**obj_as1, field: loaded.as1}
else:
logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')
html = microformats2.activities_to_html([obj_as1])
# add HTML meta redirect to source page. should trigger for end users in
# browsers but not for webmention receivers (hopefully).
url = util.get_url(obj_as1)
if url:
utf8 = '<meta charset="utf-8">'
refresh = f'<meta http-equiv="refresh" content="0;url={url}">'
html = html.replace(utf8, utf8 + '\n' + refresh)
return html, {'Content-Type': common.CONTENT_TYPE_HTML}
@app.post('/webmention')
def webmention_external():
"""Handles inbound webmention, enqueue task to process.
Use a task queue to deliver to followers because we send to each inbox in
serial, which can take a long time with many followers/instances.
"""
source = flask_util.get_required_param('source').strip()
if not util.is_web(source):
error(f'Bad URL {source}')
domain = util.domain_from_link(source, minimize=False)
g.user = Web.get_by_id(domain)
if not g.user:
error(f'No user found for domain {domain}')
queue_path = tasks_client.queue_path(APP_ID, TASKS_LOCATION, 'webmention')
task = tasks_client.create_task(
parent=queue_path,
task={
'app_engine_http_request': {
'http_method': 'POST',
'relative_uri': '/_ah/queue/webmention',
'body': urlencode(request.form).encode(),
# https://googleapis.dev/python/cloudtasks/latest/gapic/v2/types.html#google.cloud.tasks_v2.types.AppEngineHttpRequest.headers
'headers': {'Content-Type': 'application/x-www-form-urlencoded'},
},
},
)
msg = f'Enqueued task {task.name}.'
logger.info(msg)
return msg, 202
@app.post('/webmention-interactive')
def webmention_interactive():
"""Handler that runs interactive webmention-based requests from the web UI.
...eg the update profile button on user pages.
"""
try:
webmention_external()
flash(f'Updating fediverse profile from <a href="{g.user.homepage}">{g.user.key.id()}</a>...')
except HTTPException as e:
flash(util.linkify(str(e.description), pretty=True))
path = f'/user/{g.user.key.id()}' if g.user else '/'
return redirect(path, code=302)
@app.post('/_ah/queue/webmention')
def webmention_task():
"""Handles webmention task, converts to ActivityPub and delivers."""
logger.info(f'Params: {list(request.form.items())}')
# load user
source = flask_util.get_required_param('source').strip()
domain = util.domain_from_link(source, minimize=False)
logger.info(f'webmention from {domain}')
g.user = Web.get_by_id(domain)
if not g.user:
error(f'No user found for domain {domain}', status=304)
# fetch source page
try:
obj = Web.load(source, refresh=True)
except BadRequest as e:
error(str(e.description), status=304)
except HTTPError as e:
if e.response.status_code not in (410, 404):
error(f'{e} ; {e.response.text if e.response else ""}', status=502)
create_id = f'{source}#bridgy-fed-create'
logger.info(f'Interpreting as Delete. Looking for {create_id}')
create = Object.get_by_id(create_id)
if not create or create.status != 'complete':
error(f"Bridgy Fed hasn't successfully published {source}", status=304)
id = f'{source}#bridgy-fed-delete'
obj = Object(id=id, our_as1={
'id': id,
'objectType': 'activity',
'verb': 'delete',
'actor': g.user.actor_id(),
'object': source,
})
if obj.mf2:
# set actor to user
props = obj.mf2['properties']
author_urls = microformats2.get_string_urls(props.get('author', []))
if author_urls and not g.user.is_homepage(author_urls[0]):
logger.info(f'Overriding author {author_urls[0]} with {g.user.actor_id()}')
props['author'] = [g.user.actor_id()]
logger.info(f'Converted to AS1: {obj.type}: {json_dumps(obj.as1, indent=2)}')
# if source is home page, send an actor Update to followers' instances
if g.user.is_homepage(obj.key.id()):
obj.put()
actor_as1 = {
**obj.as1,
'id': g.user.actor_id(),
'updated': util.now().isoformat(),
}
id = common.host_url(f'{obj.key.id()}#update-{util.now().isoformat()}')
obj = Object(id=id, our_as1={
'objectType': 'activity',
'verb': 'update',
'id': id,
'actor': g.user.actor_id(),
'object': actor_as1,
})
inboxes_to_targets = _activitypub_targets(obj)
obj.populate(
domains=[g.user.key.id()],
source_protocol='webmention',
)
if not inboxes_to_targets:
obj.labels.append('user')
obj.status = 'ignored'
obj.put()
return 'No ActivityPub targets'
err = None
last_success = None
log_data = True
if obj.type in ('note', 'article', 'comment'):
# have we already seen this object? has it changed? or is it new?
if obj.changed:
logger.info(f'Content has changed from last time at {obj.updated}! Redelivering to all inboxes')
updated = util.now().isoformat()
id = f'{obj.key.id()}#bridgy-fed-update-{updated}'
logger.info(f'Wrapping in update activity {id}')
obj.put()
update_as1 = {
'objectType': 'activity',
'verb': 'update',
'id': id,
'actor': g.user.actor_id(),
'object': {
# Mastodon requires the updated field for Updates, so
# add a default value.
# https://docs.joinmastodon.org/spec/activitypub/#supported-activities-for-statuses
# https://socialhub.activitypub.rocks/t/what-could-be-the-reason-that-my-update-activity-does-not-work/2893/4
# https://github.com/mastodon/documentation/pull/1150
'updated': updated,
**obj.as1,
},
}
obj = Object(id=id, mf2=obj.mf2, our_as1=update_as1, labels=['user'],
domains=[g.user.key.id()], source_protocol='webmention')
elif obj.new:
logger.info(f'New Object {obj.key.id()}')
id = f'{obj.key.id()}#bridgy-fed-create'
logger.info(f'Wrapping in post activity {id}')
obj.put()
create_as1 = {
'objectType': 'activity',
'verb': 'post',
'id': id,
'actor': g.user.actor_id(),
'object': obj.as1,
}
obj = Object(id=id, mf2=obj.mf2, our_as1=create_as1,
domains=[g.user.key.id()], labels=['user'],
source_protocol='webmention')
else:
msg = f'{obj.key.id()} is unchanged, nothing to do'
logger.info(msg)
return msg, 204
# TODO: collect by inbox, add 'to' fields, de-dupe inboxes and recipients
#
# make copy of undelivered because we modify it below
obj.populate(
status='in progress',
labels=['user'],
delivered=[],
failed=[],
undelivered=[Target(uri=uri, protocol='activitypub')
for uri in inboxes_to_targets.keys()],
)
logger.info(f'Delivering to inboxes: {sorted(t.uri for t in obj.undelivered)}')
for target in list(obj.undelivered):
inbox = target.uri
if inbox in inboxes_to_targets:
target_as2 = inboxes_to_targets[inbox]
else:
logger.warning(f'Missing target_as2 for inbox {inbox}!')
target_as2 = None
if obj.type == 'follow':
# prefer AS2 id or url, if available
# https://github.com/snarfed/bridgy-fed/issues/307
dest = target_as2 or as1.get_object(obj.as1)
dest_id = dest.get('id') or dest.get('url')
if not dest_id:
error('follow missing target ')
Follower.get_or_create(dest=dest_id,
src=g.user.key.id(),
last_follow=as2.from_as1({
**obj.as1,
'object': dest_id,
}))
# this is reused later in ActivityPub.send()
# TODO: find a better way
obj.target_as2 = target_as2
try:
last = activitypub.ActivityPub.send(obj, inbox, log_data=log_data)
obj.delivered.append(target)
last_success = last
except BaseException as e:
code, body = util.interpret_http_exception(e)
if not code and not body:
raise
obj.failed.append(target)
err = e
finally:
log_data = False
obj.undelivered.remove(target)
obj.put()
obj.status = ('complete' if obj.delivered
else 'failed' if obj.failed
else 'ignored')
obj.put()
# Pass the AP response status code and body through as our response
if last_success:
return last_success.text or 'Sent!', last_success.status_code
elif isinstance(err, BadGateway):
raise err
elif isinstance(err, HTTPError):
return str(err), err.status_code
else:
return str(err)
def _activitypub_targets(obj):
"""
Args:
obj: :class:`models.Object`
Returns: dict of {str inbox URL: dict target AS2 object}
"""
# if there's in-reply-to, like-of, or repost-of, they're the targets.
# otherwise, it's all followers' inboxes.
targets = util.get_urls(obj.as1, 'inReplyTo')
verb = obj.as1.get('verb')
if targets:
logger.info(f'targets from inReplyTo: {targets}')
elif verb in as1.VERBS_WITH_OBJECT:
targets = util.get_urls(obj.as1, 'object')
logger.info(f'targets from object: {targets}')
targets = common.remove_blocklisted(targets)
inboxes_to_targets = {}
target_obj = None
for target in targets:
# fetch target page as AS2 object
try:
# TODO: make this generic across protocols
target_stored = activitypub.ActivityPub.load(target)
target_obj = target_stored.as2 or as2.from_as1(target_stored.as1)
except (HTTPError, BadGateway) as e:
resp = getattr(e, 'requests_response', None)
if resp and resp.ok:
type = common.content_type(resp)
if type and type.startswith('text/html'):
continue # give up
raise
inbox_url = target_obj.get('inbox')
if not inbox_url:
# TODO: test actor/attributedTo and not, with/without inbox
actor = (util.get_first(target_obj, 'actor') or
util.get_first(target_obj, 'attributedTo'))
if isinstance(actor, dict):
inbox_url = actor.get('inbox')
actor = util.get_first(actor, 'url') or actor.get('id')
if not inbox_url and not actor:
error('Target object has no actor or attributedTo with URL or id.', status=304)
elif not isinstance(actor, str):
error(f'Target actor or attributedTo has unexpected url or id object: {actor}', status=304)
if not inbox_url:
# fetch actor as AS object
# TODO: make this generic across protocols
actor_obj = activitypub.ActivityPub.load(actor)
actor = actor_obj.as2 or as2.from_as1(actor_obj.as1)
inbox_url = actor.get('inbox')
if not inbox_url:
# TODO: probably need a way to surface errors like this
logger.error('Target actor has no inbox')
continue
inbox_url = urljoin(target, inbox_url)
inboxes_to_targets[inbox_url] = target_obj
if not targets or verb == 'share':
logger.info('Delivering to followers')
domain = g.user.key.id()
for follower in Follower.query().filter(
Follower.key > Key('Follower', domain + ' '),
Follower.key < Key('Follower', domain + CHAR_AFTER_SPACE)):
if follower.status != 'inactive' and follower.last_follow:
actor = follower.last_follow.get('actor')
if actor and isinstance(actor, dict):
inbox = (actor.get('endpoints', {}).get('sharedInbox') or
actor.get('publicInbox') or
actor.get('inbox'))
# HACK: use last target object from above for reposts, which
# has its resolved id
inboxes_to_targets[inbox] = (target_obj if verb == 'share' else None)
return inboxes_to_targets