From 7c9a03c827be04220d2bb7082dfba68e94dec008 Mon Sep 17 00:00:00 2001
From: Ryan Barrett
Date: Thu, 3 Oct 2019 21:08:26 -0700
Subject: [PATCH] unify HTTP fetching, HTML and mf2 parsing, and error
 handling into webutil

see:
* snarfed/granary#171
* snarfed/webutil@f994884b2b95b8bb10e2079940ef83db4c91f75f
* snarfed/oauth-dropins@f5b6e73530561f992ce86ea7bc40c55ee4300acf
* snarfed/granary@284eb115083f2d5c982d914b76d83c4ebb1e3f59
---
 activitypub.py            | 10 ++++-----
 common.py                 | 43 +++-------------------------------------
 redirect.py               |  7 +++----
 requirements.freeze.txt   |  2 +-
 tests/test_activitypub.py | 12 +++++------
 tests/test_redirect.py    |  8 +++-----
 tests/test_webfinger.py   |  4 ++--
 tests/test_webmention.py  | 19 ++++++++---------
 tests/testutil.py         |  1 +
 webfinger.py              |  5 ++---
 webmention.py             |  9 ++++----
 11 files changed, 38 insertions(+), 82 deletions(-)

diff --git a/activitypub.py b/activitypub.py
index ca671a9..673d726 100644
--- a/activitypub.py
+++ b/activitypub.py
@@ -8,7 +8,6 @@ import appengine_config
 
 from google.appengine.ext import ndb
 from granary import as2, microformats2
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
@@ -73,16 +72,15 @@ class ActorHandler(webapp2.RequestHandler):
 
     @memcache_response(CACHE_TIME)
     def get(self, domain):
-        url = 'http://%s/' % domain
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
+                             headers=common.HEADERS)
         # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
 
-        hcard = mf2util.representative_hcard(mf2, resp.url)
+        hcard = mf2util.representative_hcard(mf2, mf2['url'])
         logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
         if not hcard:
             common.error(self, """\
-Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)
+Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])
 
         key = MagicKey.get_or_create(domain)
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
diff --git a/common.py b/common.py
index 9fc78cc..23d9e79 100644
--- a/common.py
+++ b/common.py
@@ -8,7 +8,6 @@ import logging
 import re
 import urlparse
 
-from bs4 import BeautifulSoup
 from granary import as2
 from oauth_dropins.webutil import handlers, util
 import requests
@@ -42,7 +41,7 @@ AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
 CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
 CONTENT_TYPE_AS2 = b'application/activity+json'
 CONTENT_TYPE_AS1 = b'application/stream+json'
-CONTENT_TYPE_HTML = b'text/html'
+CONTENT_TYPE_HTML = b'text/html; charset=utf-8'
 CONTENT_TYPE_ATOM = b'application/atom+xml'
 CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'
 
@@ -78,16 +77,7 @@ def requests_post(url, **kwargs):
 def _requests_fn(fn, url, parse_json=False, **kwargs):
     """Wraps requests.* and adds raise_for_status() and User-Agent."""
     kwargs.setdefault('headers', {}).update(HEADERS)
-
-    try:
-        resp = fn(url, **kwargs)
-    except ValueError as e:
-        msg = 'Bad URL %s: %s' % (url, e)
-        logging.warning(msg)
-        raise exc.HTTPBadRequest(msg)
-    except (requests.ConnectionError, requests.Timeout) as e:
-        logging.warning(url, exc_info=True)
-        raise exc.HTTPBadGateway(unicode(e))
+    resp = fn(url, gateway=True, **kwargs)
 
     logging.info('Got %s headers:%s', resp.status_code,
                  resp.headers)
     type = content_type(resp)
@@ -95,11 +85,6 @@ def _requests_fn(fn, url, parse_json=False, **kwargs):
             (type.startswith('text/') or type.endswith('+json') or
              type.endswith('/json'))):
         logging.info(resp.text)
-    if resp.status_code // 100 in (4, 5):
-        msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
-        logging.info(msg)
-        raise exc.HTTPBadGateway(msg)
-
     if parse_json:
         try:
             return resp.json()
@@ -141,7 +126,7 @@ def get_as2(url):
     if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
         return resp
 
-    parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
+    parsed = util.parse_html(resp)
     as2 = parsed.find('link', rel=('alternate', 'self'), type=(
         CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
     if not (as2 and as2['href']):
@@ -402,25 +387,3 @@ def redirect_unwrap(val):
                                  cache=memcache).url
 
     return val
-
-
-def beautifulsoup_parse(html, **kwargs):
-    """Parses an HTML string with BeautifulSoup. Centralizes our parsing config.
-
-    *Copied from bridgy/util.py.*
-
-    We currently use lxml, which BeautifulSoup claims is the fastest and best:
-    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
-
-    lxml is a native module, so we don't bundle and deploy it to App Engine.
-    Instead, we use App Engine's version by declaring it in app.yaml.
-    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
-
-    We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
-    to use lxml explicitly to ensure we use the same parser and version in prod
-    and locally, since we've been bit by at least one meaningful difference
-    between lxml and e.g. html5lib: lxml includes the contents of
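
--

Note (not part of the patch): a rough sketch of the consolidated calling
pattern the hunks above adopt. It relies only on what the diff itself shows:
util.fetch_mf2() with gateway=True wraps the fetch + HTML parse + mf2 extract
pipeline and maps fetch errors to HTTP gateway errors, replacing the deleted
try/except blocks in common.py. fetch_representative_hcard() and the HEADERS
value here are illustrative names, not code from this repo.

    # illustrative sketch only; names marked hypothetical are not in the patch
    import mf2util
    from oauth_dropins.webutil import util

    # hypothetical User-Agent; the real project keeps its own common.HEADERS
    HEADERS = {'User-Agent': 'example-fetcher'}

    def fetch_representative_hcard(domain):  # hypothetical helper
        """Fetches a domain's home page and returns its representative h-card.

        util.fetch_mf2() GETs the URL, parses the HTML, and returns the mf2
        dict; the final (post-redirect) URL is available under mf2['url'],
        which is why the patch swaps resp.url for mf2['url']. gateway=True
        raises HTTPBadGateway on connection or HTTP errors instead of making
        each caller hand-roll that handling.
        """
        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
                             headers=HEADERS)
        return mf2util.representative_hcard(mf2, mf2['url'])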