bridgy-fed/web.py

572 wiersze
20 KiB
Python

"""Handles inbound webmentions."""
import datetime
import difflib
import logging
import re
import urllib.parse
from urllib.parse import urlencode, urljoin, urlparse
from flask import g, redirect, render_template, request
from google.cloud import ndb
from google.cloud.ndb import ComputedProperty
from granary import as1, as2, microformats2
import mf2util
from oauth_dropins.webutil import flask_util, util
from oauth_dropins.webutil.appengine_config import tasks_client
from oauth_dropins.webutil.appengine_info import APP_ID
from oauth_dropins.webutil.flask_util import error, flash
from oauth_dropins.webutil.util import json_dumps, json_loads
from oauth_dropins.webutil import webmention
from requests import HTTPError, RequestException, URLRequired
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
import common
from common import add
from flask_app import app, cache
from models import Follower, Object, PROTOCOLS, Target, User
from protocol import Protocol
logger = logging.getLogger(__name__)
# https://cloud.google.com/appengine/docs/locations
TASKS_LOCATION = 'us-central1'
CHAR_AFTER_SPACE = chr(ord(' ') + 1)
# https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset((
'www.jvt.me',
))
NON_TLDS = frozenset(('html', 'json', 'php', 'xml'))
class Web(User, Protocol):
"""Web user and webmention protocol implementation.
The key name is the domain.
"""
ABBREV = 'web'
OTHER_LABELS = ('webmention',)
has_redirects = ndb.BooleanProperty()
redirects_error = ndb.TextProperty()
has_hcard = ndb.BooleanProperty()
@classmethod
def _get_kind(cls):
return 'MagicKey'
@ComputedProperty
def readable_id(self):
# prettify if domain, noop if username
username = self.username()
if username != self.key.id():
return util.domain_from_link(username, minimize=False)
def _pre_put_hook(self):
"""Validate domain id, don't allow upper case or invalid characters."""
super()._pre_put_hook()
id = self.key.id()
assert re.match(common.DOMAIN_RE, id)
assert id.lower() == id, f'upper case is not allowed in Web key id: {id}'
assert id not in common.DOMAINS, f'{id} is a Bridgy Fed domain'
@classmethod
def get_or_create(cls, id, **kwargs):
"""Lower cases id (domain), then passes through to :meth:`User.get_or_create`."""
return super().get_or_create(id.lower(), **kwargs)
def web_url(self):
"""Returns this user's web URL aka web_url, eg 'https://foo.com/'."""
return f'https://{self.key.id()}/'
def ap_address(self):
"""Returns this user's ActivityPub address, eg '@foo.com@foo.com'.
Uses the user's domain if they're direct, fed.brid.gy if they're not.
"""
if self.direct:
return f'@{self.username()}@{self.key.id()}'
else:
return f'@{self.key.id()}@{request.host}'
def ap_actor(self, rest=None):
"""Returns this user's ActivityPub/AS2 actor id.
Eg 'https://fed.brid.gy/foo.com'
Web users are special cased to not have an /ap/web/ prefix, for backward
compatibility.
"""
url = common.host_url(self.key.id())
if rest:
url += f'/{rest}'
return url
def user_page_path(self, rest=None):
"""Always use domain."""
path = f'/{self.ABBREV}/{self.key.id()}'
if rest:
if not rest.startswith('?'):
path += '/'
path += rest
return path
def username(self):
"""Returns the user's preferred username.
Uses stored representative h-card if available, falls back to id.
Returns: str
"""
id = self.key.id()
if self.obj and self.obj.as1 and self.direct:
for url in (util.get_list(self.obj.as1, 'url') +
util.get_list(self.obj.as1, 'urls')):
url = url.get('value') if isinstance(url, dict) else url
if url and url.startswith('acct:'):
try:
urluser, urldomain = util.parse_acct_uri(url)
except ValueError as e:
continue
if urldomain == id:
logger.info(f'Found custom username: {urluser}')
return urluser
logger.info(f'Defaulting username to key id {id}')
return id
def verify(self):
"""Fetches site a couple ways to check for redirects and h-card.
Returns: :class:`Web` that was verified. May be different than
self! eg if self's domain started with www and we switch to the root
domain.
"""
domain = self.key.id()
logger.info(f'Verifying {domain}')
if domain.startswith('www.') and domain not in WWW_DOMAINS:
# if root domain redirects to www, use root domain instead
# https://github.com/snarfed/bridgy-fed/issues/314
root = domain.removeprefix("www.")
root_site = f'https://{root}/'
try:
resp = util.requests_get(root_site, gateway=False)
if resp.ok and self.is_web_url(resp.url):
logger.info(f'{root_site} redirects to {resp.url} ; using {root} instead')
root_user = Web.get_or_create(root)
self.use_instead = root_user.key
self.put()
return root_user.verify()
except RequestException:
pass
# check webfinger redirect
path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
self.has_redirects = False
self.redirects_error = None
try:
url = urljoin(self.web_url(), path)
resp = util.requests_get(url, gateway=False)
domain_urls = ([f'https://{domain}/' for domain in common.DOMAINS] +
[common.host_url()])
expected = [urljoin(url, path) for url in domain_urls]
if resp.ok and resp.url:
got = urllib.parse.unquote(resp.url)
if got in expected:
self.has_redirects = True
elif got:
diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
else:
lines = [url, f' returned HTTP {resp.status_code}']
if resp.url and resp.url != url:
lines[1:1] = [' redirected to:', resp.url]
self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
except RequestException:
pass
# check home page
try:
self.obj = Web.load(self.web_url(), remote=True, gateway=True)
self.has_hcard = True
except (BadRequest, NotFound, common.NoMicroformats):
self.obj = None
self.has_hcard = False
self.put()
return self
@classmethod
def key_for(cls, id):
"""Returns the :class:`ndb.Key` for a given id.
If id is a domain, uses it as is. If it's a home page URL or fed.brid.gy
or web.brid.gy AP actor URL, extracts the domain and uses that.
Otherwise, raises AssertionError.
Args:
id: str
Raises:
ValueError
"""
if not id:
raise ValueError()
if util.is_web(id):
parsed = urlparse(id)
if parsed.path in ('', '/'):
id = parsed.netloc
if re.match(common.DOMAIN_RE, id):
tld = id.split('.')[-1]
if tld in NON_TLDS:
raise ValueError(f"{id} looks like a domain but {tld} isn't a TLD")
return cls(id=id).key
raise ValueError(f'{id} is not a domain or usable home page URL')
@classmethod
def owns_id(cls, id):
"""Returns None if id is a domain or http(s) URL, False otherwise.
All web pages are http(s) URLs, but not all http(s) URLs are web pages.
"""
if not id:
return False
try:
key = cls.key_for(id)
if key:
user = key.get()
return True if user and user.has_redirects else None
except ValueError as e:
logger.info(e)
return None if util.is_web(id) else False
@classmethod
def target_for(cls, obj, shared=False):
"""Returns `obj`'s id, as a URL webmention target."""
# TODO: we have entities in prod that fail this, eg
# https://indieweb.social/users/bismark has source_protocol webmention
# assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)
if not util.is_web(obj.key.id()):
logger.warning(f"{obj.key} is source_protocol web but id isn't a URL!")
return None
return obj.key.id()
@classmethod
def send(cls, obj, url, **kwargs):
"""Sends a webmention to a given target URL.
See :meth:`Protocol.send` for details.
Returns False if the target URL doesn't advertise a webmention endpoint,
or if webmention/microformats2 don't support the activity type.
https://fed.brid.gy/docs#error-handling
"""
if obj.as1.get('verb') == 'accept':
logger.info(f'Skipping sending accept activity to {url}')
return False
source_url = obj.proxy_url()
logger.info(f'Sending webmention from {source_url} to {url}')
endpoint = common.webmention_discover(url).endpoint
if not endpoint:
return False
webmention.send(endpoint, source_url, url)
return True
@classmethod
def fetch(cls, obj, gateway=False, check_backlink=False, **kwargs):
"""Fetches a URL over HTTP and extracts its microformats2.
Follows redirects, but doesn't change the original URL in obj's id! The
:class:`Model` class doesn't allow that anyway, but more importantly, we
want to preserve that original URL becase other objects may refer to it
instead of the final redirect destination URL.
See :meth:`Protocol.fetch` for other background.
Args:
gateway: passed through to :func:`webutil.util.fetch_mf2`
check_backlink: bool, optional, whether to require a link to Bridgy
Fed. Ignored if the URL is a homepage, ie has no path.
kwargs: ignored
"""
url = obj.key.id()
is_homepage = urlparse(url).path.strip('/') == ''
require_backlink = (common.host_url().rstrip('/')
if check_backlink and not is_homepage
else None)
try:
parsed = util.fetch_mf2(url, gateway=gateway,
require_backlink=require_backlink)
except (ValueError, URLRequired) as e:
error(str(e))
if parsed is None:
error(f'id {urlparse(url).fragment} not found in {url}')
# find mf2 item
if is_homepage:
logger.info(f"{url} is user's web url")
entry = mf2util.representative_hcard(parsed, parsed['url'])
logger.info(f'Representative h-card: {json_dumps(entry, indent=2)}')
if not entry:
msg = f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {parsed['url']}"
logging.info(msg)
raise common.NoMicroformats(msg)
else:
entry = mf2util.find_first_entry(parsed, ['h-entry'])
if not entry:
error(f'No microformats2 found in {url}')
# store final URL in mf2 object, and also default url property to it,
# since that's the fallback for AS1/AS2 id
if is_homepage:
entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
entry.setdefault('type', ['h-card'])
props = entry.setdefault('properties', {})
if parsed['url']:
entry['url'] = parsed['url']
props.setdefault('url', [parsed['url']])
logger.info(f'Extracted microformats2 entry: {json_dumps(entry, indent=2)}')
if not is_homepage:
# default actor/author to home page URL
if not props.get('author'):
homepage = urljoin(url, '/')
logger.info(f'Defaulting author to {homepage}')
props['author'] = [homepage]
# run full authorship algorithm if necessary:
# https://indieweb.org/authorship
# duplicated in microformats2.json_to_object
author = util.get_first(props, 'author')
if not isinstance(author, dict) and not is_homepage:
logger.info(f'Fetching full authorship for author {author}')
author = mf2util.find_author({'items': [entry]}, hentry=entry,
fetch_mf2_func=util.fetch_mf2)
logger.info(f'Got: {author}')
if author:
props['author'] = util.trim_nulls([{
"type": ["h-card"],
'properties': {
field: [author[field]] if author.get(field) else []
for field in ('name', 'photo', 'url')
},
}])
obj.mf2 = entry
return obj
@classmethod
def serve(cls, obj):
"""Serves an :class:`Object` as HTML."""
obj_as1 = obj.as1
from_proto = PROTOCOLS.get(obj.source_protocol)
if from_proto:
# fill in author/actor if available
for field in 'author', 'actor':
val = as1.get_object(obj.as1, field)
if val.keys() == set(['id']) and val['id']:
loaded = from_proto.load(val['id'])
if loaded and loaded.as1:
obj_as1 = {**obj_as1, field: loaded.as1}
else:
logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')
html = microformats2.activities_to_html([obj_as1])
# add HTML meta redirect to source page. should trigger for end users in
# browsers but not for webmention receivers (hopefully).
url = util.get_url(obj_as1)
if url:
utf8 = '<meta charset="utf-8">'
refresh = f'<meta http-equiv="refresh" content="0;url={url}">'
html = html.replace(utf8, utf8 + '\n' + refresh)
return html, {'Content-Type': common.CONTENT_TYPE_HTML}
@app.get('/web-site')
@flask_util.cached(cache, datetime.timedelta(days=1))
def enter_web_site():
return render_template('enter_web_site.html')
@app.post('/web-site')
def check_web_site():
url = request.values['url']
# this normalizes and lower cases domain
domain = util.domain_from_link(url, minimize=False)
if not domain:
flash(f'No domain found in {url}')
return render_template('enter_web_site.html')
elif domain in common.DOMAINS:
flash(f'{domain} is a Bridgy Fed domain')
return render_template('enter_web_site.html')
g.user = Web.get_or_create(domain, direct=True)
try:
g.user = g.user.verify()
except BaseException as e:
code, body = util.interpret_http_exception(e)
if code:
flash(f"Couldn't connect to {url}: {e}")
return render_template('enter_web_site.html')
raise
g.user.put()
return redirect(g.user.user_page_path())
@app.post('/webmention')
def webmention_external():
"""Handles inbound webmention, enqueue task to process.
Use a task queue to deliver to followers because we send to each inbox in
serial, which can take a long time with many followers/instances.
"""
logger.info(f'Params: {list(request.form.items())}')
source = flask_util.get_required_param('source').strip()
if not util.is_web(source):
error(f'Bad URL {source}')
domain = util.domain_from_link(source, minimize=False)
if not domain:
error(f'Bad source URL {source}')
g.user = Web.get_by_id(domain)
if not g.user:
error(f'No user found for domain {domain}')
queue_path = tasks_client.queue_path(APP_ID, TASKS_LOCATION, 'webmention')
task = tasks_client.create_task(
parent=queue_path,
task={
'app_engine_http_request': {
'http_method': 'POST',
'relative_uri': '/_ah/queue/webmention',
'body': urlencode(request.form).encode(),
# https://googleapis.dev/python/cloudtasks/latest/gapic/v2/types.html#google.cloud.tasks_v2.types.AppEngineHttpRequest.headers
'headers': {'Content-Type': 'application/x-www-form-urlencoded'},
},
},
)
msg = f'Enqueued task {task.name}.'
logger.info(msg)
return msg, 202
@app.post('/webmention-interactive')
def webmention_interactive():
"""Handler that runs interactive webmention-based requests from the web UI.
...eg the update profile button on user pages.
"""
try:
webmention_external()
flash(f'Updating fediverse profile from <a href="{g.user.web_url()}">{g.user.key.id()}</a>...')
except HTTPException as e:
flash(util.linkify(str(e.description), pretty=True))
path = g.user.user_page_path() if g.user else '/'
return redirect(path, code=302)
@app.post('/_ah/queue/webmention')
def webmention_task():
"""Handles inbound webmention task."""
logger.info(f'Params: {list(request.form.items())}')
# load user
source = flask_util.get_required_param('source').strip()
domain = util.domain_from_link(source, minimize=False)
logger.info(f'webmention from {domain}')
g.user = Web.get_by_id(domain)
if not g.user:
error(f'No user found for domain {domain}', status=304)
# fetch source page
try:
# remote=True to force fetch, local=True to populate new/changed attrs
obj = Web.load(source, local=True, remote=True, check_backlink=True)
except BadRequest as e:
error(str(e.description), status=304)
except HTTPError as e:
if e.response.status_code not in (410, 404):
error(f'{e} ; {e.response.text if e.response else ""}', status=502)
create_id = f'{source}#bridgy-fed-create'
logger.info(f'Interpreting as Delete. Looking for {create_id}')
create = Object.get_by_id(create_id)
if not create or create.status != 'complete':
error(f"Bridgy Fed hasn't successfully published {source}", status=304)
id = f'{source}#bridgy-fed-delete'
obj = Object(id=id, our_as1={
'id': id,
'objectType': 'activity',
'verb': 'delete',
'actor': g.user.ap_actor(),
'object': source,
})
if not obj.mf2 and obj.type != 'delete':
error(f'No microformats2 found in {source}', status=304)
elif obj.mf2:
# default actor to user
props = obj.mf2['properties']
author_urls = microformats2.get_string_urls(props.get('author', []))
if author_urls and not g.user.is_web_url(author_urls[0]):
logger.info(f'Overriding author {author_urls[0]} with {g.user.ap_actor()}')
props['author'] = [g.user.ap_actor()]
# if source is home page, update Web user and send an actor Update to
# followers' instances
if g.user and (g.user.key.id() == obj.key.id()
or g.user.is_web_url(obj.key.id())):
logger.info(f'Converted to AS1: {obj.type}: {json_dumps(obj.as1, indent=2)}')
obj.put()
g.user.obj = obj
g.user.put()
logger.info('Wrapping in Update for home page user profile')
actor_as1 = {
**obj.as1,
'id': g.user.ap_actor(),
'updated': util.now().isoformat(),
}
id = common.host_url(f'{obj.key.id()}#update-{util.now().isoformat()}')
obj = Object(id=id, our_as1={
'objectType': 'activity',
'verb': 'update',
'id': id,
'actor': g.user.ap_actor(),
'object': actor_as1,
})
try:
return Web.receive(obj)
except ValueError as e:
logger.warning(e, exc_info=True)
error(e, status=304)