2017-09-12 14:31:50 +00:00
|
|
|
# coding=utf-8
|
2017-08-15 06:07:24 +00:00
|
|
|
"""Misc common utilities.
|
|
|
|
"""
|
2017-09-12 14:31:50 +00:00
|
|
|
from __future__ import unicode_literals
|
2017-10-20 14:00:42 +00:00
|
|
|
import copy
|
2017-11-05 23:50:23 +00:00
|
|
|
import itertools
|
2017-10-16 14:13:43 +00:00
|
|
|
import json
|
2017-08-15 06:07:24 +00:00
|
|
|
import logging
|
2017-09-13 14:48:32 +00:00
|
|
|
import re
|
2017-10-20 14:00:42 +00:00
|
|
|
import urlparse
|
2017-08-15 06:07:24 +00:00
|
|
|
|
2017-10-20 14:00:42 +00:00
|
|
|
from bs4 import BeautifulSoup
|
2017-09-28 14:25:21 +00:00
|
|
|
from granary import as2
|
2017-10-24 04:49:43 +00:00
|
|
|
from oauth_dropins.webutil import handlers, util
|
2017-08-15 06:07:24 +00:00
|
|
|
import requests
|
2017-10-16 14:13:43 +00:00
|
|
|
from webmentiontools import send
|
2017-08-15 14:39:22 +00:00
|
|
|
from webob import exc
|
|
|
|
|
2017-10-24 04:49:43 +00:00
|
|
|
import appengine_config
|
2017-10-16 14:13:43 +00:00
|
|
|
from models import Response
|
|
|
|
|
2017-09-03 19:54:10 +00:00
|
|
|
# Regexps for webfinger-style identifiers: a bare domain, and user@domain
# with an optional acct: prefix.
DOMAIN_RE = r'([^/]+\.[^/]+)'
ACCT_RE = r'(?:acct:)?([^@]+)@' + DOMAIN_RE

# Default headers sent with all outbound HTTP requests.
HEADERS = {
    'User-Agent': 'Bridgy Fed (https://fed.brid.gy/)',
}

# see redirect_wrap() and redirect_unwrap()
REDIRECT_PREFIX = urlparse.urljoin(appengine_config.HOST_URL, '/r/')

# Standard XML declaration, prepended to generated XML documents.
XML_UTF8 = "<?xml version='1.0' encoding='UTF-8'?>\n"
# USERNAME = 'me'
# USERNAME_EMOJI = '🌎' # globe

# Parses one entry of an HTTP Link header: <url>; rel="value".
LINK_HEADER_RE = re.compile(r""" *< *([^ >]+) *> *; *rel=['"]([^'"]+)['"] *""")
AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'

# Content-Type values. All non-unicode strings because App Engine's wsgi.py
# requires header values to be str, not unicode.
#
# ActivityPub Content-Type details:
# https://www.w3.org/TR/activitypub/#retrieving-objects
CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
CONTENT_TYPE_AS2 = b'application/activity+json'
CONTENT_TYPE_AS1 = b'application/stream+json'
CONTENT_TYPE_HTML = b'text/html'
CONTENT_TYPE_ATOM = b'application/atom+xml'
CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'

# Accept headers used for HTTP content negotiation when fetching AS2 objects;
# the second variant also accepts HTML (at lower priority) as a fallback.
CONNEG_HEADERS_AS2 = {
    'Accept': '%s; q=0.9, %s; q=0.8' % (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD),
}
CONNEG_HEADERS_AS2_HTML = {
    'Accept': CONNEG_HEADERS_AS2['Accept'] + ', %s; q=0.7' % CONTENT_TYPE_HTML,
}

# AS1 verbs that send_webmentions() knows how to handle.
SUPPORTED_VERBS = (
    'checkin',
    'create',
    'follow',
    'like',
    'post',
    'share',
    'tag',
    'update',
)

# Handler decorator that redirects requests on the appspot domain to the
# canonical custom domain.
canonicalize_domain = handlers.redirect('bridgy-federated.appspot.com', 'fed.brid.gy')
|
|
|
|
|
2017-08-15 06:07:24 +00:00
|
|
|
|
|
|
|
def requests_get(url, **kwargs):
    """Fetches url with HTTP GET. Thin wrapper over _requests_fn; see it for details."""
    return _requests_fn(util.requests_get, url, **kwargs)
|
2017-08-15 06:07:24 +00:00
|
|
|
|
|
|
|
|
2017-08-15 14:39:22 +00:00
|
|
|
def requests_post(url, **kwargs):
    """Fetches url with HTTP POST. Thin wrapper over _requests_fn; see it for details."""
    return _requests_fn(util.requests_post, url, **kwargs)
|
2017-08-15 06:07:24 +00:00
|
|
|
|
|
|
|
|
2017-10-20 14:49:25 +00:00
|
|
|
def _requests_fn(fn, url, parse_json=False, **kwargs):
    """Wraps requests.* and adds raise_for_status() and User-Agent.

    Args:
      fn: callable with requests.get/post semantics, e.g. util.requests_get
      url: string
      parse_json: boolean. If True, returns the parsed JSON response body
        instead of the requests.Response.
      kwargs: passed through to fn; the default HEADERS are merged into any
        caller-provided headers

    Returns:
      requests.Response, or decoded JSON if parse_json is True

    Raises:
      webob.exc.HTTPBadGateway: on a 4xx/5xx response, or when parse_json is
        True and the response body isn't valid JSON
    """
    kwargs.setdefault('headers', {}).update(HEADERS)

    resp = fn(url, **kwargs)

    logging.info('Got %s headers:%s', resp.status_code, resp.headers)
    # log textual and JSON bodies for debugging, but skip HTML pages since
    # they're usually big and uninteresting.
    # (renamed local from `type` to avoid shadowing the builtin.)
    resp_type = content_type(resp)
    if (resp_type and resp_type != 'text/html' and
        (resp_type.startswith('text/') or resp_type.endswith('+json') or
         resp_type.endswith('/json'))):
        logging.info(resp.text)

    if resp.status_code // 100 in (4, 5):
        msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
        logging.info(msg)
        raise exc.HTTPBadGateway(msg)

    if parse_json:
        try:
            return resp.json()
        except ValueError:
            msg = "Couldn't parse response as JSON"
            logging.info(msg, exc_info=True)
            raise exc.HTTPBadGateway(msg)

    return resp
|
2017-08-23 15:14:51 +00:00
|
|
|
|
|
|
|
|
2017-10-20 14:00:42 +00:00
|
|
|
def get_as2(url):
    """Tries to fetch the given URL as ActivityStreams 2.

    Uses HTTP content negotiation via the Content-Type header. If the url is
    HTML and it has a rel-alternate link with an AS2 content type, fetches and
    returns that URL.

    Args:
      url: string

    Returns:
      requests.Response

    Raises:
      requests.HTTPError, webob.exc.HTTPException

      If we raise webob HTTPException, it will have an additional response
      attribute with the last requests.Response we received.
    """
    def _error(resp):
        msg = "Couldn't fetch %s as ActivityStreams 2" % url
        logging.error(msg)
        err = exc.HTTPBadGateway(msg)
        # attach the last response so callers can inspect it
        err.response = resp
        raise err

    resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    # content negotiation didn't give us AS2 directly; look for a
    # rel-alternate/rel-self link with an AS2 content type in the HTML.
    # (renamed local from `as2` to avoid shadowing the granary.as2 import.)
    parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
    as2_link = parsed.find('link', rel=('alternate', 'self'), type=(
        CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
    if not (as2_link and as2_link['href']):
        _error(resp)

    resp = requests_get(urlparse.urljoin(resp.url, as2_link['href']),
                        headers=CONNEG_HEADERS_AS2)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    _error(resp)
|
2017-10-20 14:00:42 +00:00
|
|
|
|
|
|
|
|
2017-10-21 03:35:07 +00:00
|
|
|
def content_type(resp):
    """Returns a requests.Response's Content-Type, without charset suffix."""
    value = resp.headers.get('Content-Type')
    if not value:
        return None
    return value.split(';')[0]
|
|
|
|
|
|
|
|
|
2017-10-17 14:46:42 +00:00
|
|
|
def error(handler, msg, status=None, exc_info=False):
    """Logs msg and aborts the request with the given HTTP status (default 400)."""
    status = status or 400
    logging.info('Returning %s: %s' % (status, msg), exc_info=exc_info)
    handler.abort(status, msg)
|
2017-09-28 14:25:21 +00:00
|
|
|
|
|
|
|
|
2018-10-17 14:49:04 +00:00
|
|
|
def send_webmentions(handler, activity, proxy=None, **response_props):
    """Sends webmentions for an incoming Salmon slap or ActivityPub inbox delivery.

    Args:
      handler: RequestHandler
      activity: dict, AS1 activity
      proxy: boolean; if True, always use our proxy URL as the webmention source
      response_props: passed through to the newly created Responses
    """
    verb = activity.get('verb')
    if verb and verb not in SUPPORTED_VERBS:
        error(handler, '%s activities are not supported yet.' % verb)

    # extract source and targets
    source = activity.get('url') or activity.get('id')
    obj = activity.get('object')
    obj_url = util.get_url(obj)

    targets = util.get_list(activity, 'inReplyTo')
    if isinstance(obj, dict):
        # for creates/updates (or when the activity has no URL of its own),
        # the inner object is the real source post
        if not source or verb in ('create', 'post', 'update'):
            source = obj_url or obj.get('id')
        targets.extend(util.get_list(obj, 'inReplyTo'))
    # for follows/likes/shares, the object itself is the webmention target
    if verb in ('follow', 'like', 'share'):
        targets.append(obj_url)

    targets = util.dedupe_urls(util.get_url(t) for t in targets)
    if not source:
        error(handler, "Couldn't find original post URL")
    if not targets:
        error(handler, "Couldn't find target URLs (inReplyTo or object)")

    # send webmentions and store Responses
    errors = []
    for target in targets:
        if not target:
            continue

        # resolve the final target URL: strip our redirect wrapper, then
        # follow any HTTP redirects
        target = util.follow_redirects(redirect_unwrap(target)).url
        response = Response(source=source, target=target, direction='in',
                            **response_props)
        response.put()
        # follows/likes/shares have no renderable content of their own, so we
        # send our proxy page as the webmention source instead
        wm_source = (response.proxy_url()
                     if verb in ('follow', 'like', 'share') or proxy
                     else source)
        logging.info('Sending webmention from %s to %s', wm_source, target)

        wm = send.WebmentionSend(wm_source, target)
        if wm.send(headers=HEADERS):
            logging.info('Success: %s', wm.response)
            response.status = 'complete'
        else:
            logging.warning('Failed: %s', wm.error)
            errors.append(wm.error)
            response.status = 'error'
        # second put records the final send status
        response.put()

    if errors:
        # report all failures; use the first error's HTTP status for ours
        msg = 'Errors:\n' + '\n'.join(json.dumps(e, indent=2) for e in errors)
        error(handler, msg, status=errors[0].get('http_status'))
|
|
|
|
|
|
|
|
|
2017-10-24 04:23:33 +00:00
|
|
|
def postprocess_as2(activity, target=None, key=None):
    """Prepare an AS2 object to be served or sent via ActivityPub.

    Args:
      activity: dict, AS2 object or activity
      target: dict, AS2 object, optional. The target of activity's inReplyTo or
        Like/Announce/etc object, if any.
      key: MagicKey, optional. populated into publicKey field if provided.

    Returns:
      dict, the postprocessed activity. Articles and Notes are wrapped in a
      Create activity.
    """
    # (renamed local from `type` to avoid shadowing the builtin.)
    obj_type = activity.get('type')

    # actor objects
    if obj_type == 'Person':
        postprocess_as2_actor(activity)
        if not activity.get('publicKey'):
            # underspecified, inferred from this issue and Mastodon's implementation:
            # https://github.com/w3c/activitypub/issues/203#issuecomment-297553229
            # https://github.com/tootsuite/mastodon/blob/bc2c263504e584e154384ecc2d804aeb1afb1ba3/app/services/activitypub/process_account_service.rb#L77
            activity['publicKey'] = {
                'publicKeyPem': key.public_pem(),
            }
        return activity

    for actor in (util.get_list(activity, 'attributedTo') +
                  util.get_list(activity, 'actor')):
        postprocess_as2_actor(actor)

    # inReplyTo: singly valued, prefer id over url
    target_id = target.get('id') if target else None
    in_reply_to = activity.get('inReplyTo')
    if in_reply_to:
        if target_id:
            activity['inReplyTo'] = target_id
        elif isinstance(in_reply_to, list):
            if len(in_reply_to) > 1:
                logging.warning(
                    "AS2 doesn't support multiple inReplyTo URLs! "
                    # bug fix: was `in_reply_tos[0]`, an undefined name that
                    # raised NameError whenever this branch ran.
                    'Only using the first: %s' % in_reply_to[0])
            activity['inReplyTo'] = in_reply_to[0]

    # Mastodon evidently requires a Mention tag for replies to generate a
    # notification to the original post's author. not required for likes,
    # reposts, etc. details:
    # https://github.com/snarfed/bridgy-fed/issues/34
    # bug fix: guard against target=None (its default) before calling .get().
    if target:
        to = target.get('actor') or target.get('attributedTo')
        if isinstance(to, dict):
            to = to.get('url') or to.get('id')
        if to:
            activity.setdefault('tag', []).append({
                'type': 'Mention',
                'href': to,
            })

    # activity objects (for Like, Announce, etc): prefer id over url
    obj = activity.get('object', {})
    if obj:
        if isinstance(obj, dict) and not obj.get('id'):
            obj['id'] = target_id or obj.get('url')
        elif obj != target_id:
            activity['object'] = target_id

    # id is required for most things. default to url if it's not set.
    if not activity.get('id'):
        activity['id'] = activity.get('url')

    assert activity.get('id') or (isinstance(obj, dict) and obj.get('id'))

    # bug fix: only wrap ids/urls that are present. previously a missing 'url'
    # key raised KeyError, and a None id crashed inside redirect_wrap.
    if activity.get('id'):
        activity['id'] = redirect_wrap(activity['id'])
    if activity.get('url'):
        activity['url'] = redirect_wrap(activity['url'])

    # cc public and target's author(s) and recipients
    # https://www.w3.org/TR/activitystreams-vocabulary/#audienceTargeting
    # https://w3c.github.io/activitypub/#delivery
    if obj_type in as2.TYPE_TO_VERB or obj_type in ('Article', 'Note'):
        recips = [AS2_PUBLIC_AUDIENCE]
        if target:
            recips += itertools.chain(*(util.get_list(target, field) for field in
                                        ('actor', 'attributedTo', 'to', 'cc')))
        activity['cc'] = util.dedupe_urls(util.get_url(recip) for recip in recips)

    # wrap articles and notes in a Create activity
    if obj_type in ('Article', 'Note'):
        activity = {
            '@context': as2.CONTEXT,
            'type': 'Create',
            'object': activity,
        }

    return util.trim_nulls(activity)
|
2017-10-24 04:49:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
def postprocess_as2_actor(actor):
    """Prepare an AS2 actor object to be served or sent via ActivityPub.

    Modifies actor in place: defaults preferredUsername to the actor's domain,
    rewrites id to point at our domain, and wraps url in our redirector. No-op
    if the actor has no url.

    Args:
      actor: dict, AS2 actor object
    """
    url = actor.get('url')
    if not url:
        return

    domain = urlparse.urlparse(url).netloc
    actor.setdefault('preferredUsername', domain)
    actor['id'] = '%s/%s' % (appengine_config.HOST_URL, domain)
    actor['url'] = redirect_wrap(url)
|
|
|
|
|
|
|
|
|
|
|
|
def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Args:
      url: string, or None

    Returns:
      string: the wrapped URL, or url unchanged if it's falsy or already
      wrapped
    """
    # bug fix: pass falsy values (None, '') through unchanged instead of
    # crashing on .startswith().
    if not url or url.startswith(REDIRECT_PREFIX):
        return url
    return REDIRECT_PREFIX + url
|
|
|
|
|
|
|
|
|
2018-10-17 14:49:04 +00:00
|
|
|
def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string or dict. If it's a dict, all string and dict values are
    unwrapped, recursively.

    Strings that aren't wrapped URLs are left unchanged.
    """
    if isinstance(val, dict):
        return {field: redirect_unwrap(value) for field, value in val.items()}

    if isinstance(val, basestring):
        if val.startswith(REDIRECT_PREFIX):
            # strip our redirect prefix to recover the original URL
            return val[len(REDIRECT_PREFIX):]
        if val.startswith(appengine_config.HOST_URL):
            # a user page on our own domain; extract the domain from the path
            return util.domain_from_link(urlparse.urlparse(val).path.strip('/'))

    return val
|
2018-10-12 02:12:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
def beautifulsoup_parse(html, **kwargs):
    """Parses an HTML string with BeautifulSoup. Centralizes our parsing config.

    *Copied from bridgy/util.py.*

    We currently use lxml, which BeautifulSoup claims is the fastest and best:
    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use

    lxml is a native module, so we don't bundle and deploy it to App Engine.
    Instead, we use App Engine's version by declaring it in app.yaml.
    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27

    We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
    to use lxml explicitly to ensure we use the same parser and version in prod
    and locally, since we've been bit by at least one meaningful difference
    between lxml and e.g. html5lib: lxml includes the contents of <noscript> tags,
    html5lib omits them. :(
    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015

    Args:
      html: string, the HTML document to parse
      kwargs: passed through to BeautifulSoup, e.g. from_encoding

    Returns:
      BeautifulSoup
    """
    return BeautifulSoup(html, 'lxml', **kwargs)
|