unify HTTP fetching, HTML and mf2 parsing, and error handling into webutil

see:
* snarfed/granary#171
* snarfed/webutil@f994884b2b
* snarfed/oauth-dropins@f5b6e73530
* snarfed/granary@284eb11508
Ryan Barrett 2019-10-03 21:08:26 -07:00
parent 00d0cc5557
commit 7c9a03c827
No known key found for this signature in database
GPG key ID: 6BE31FDF4776E9D4
11 changed files with 38 additions and 82 deletions
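
In short: handlers that previously fetched a page, handled request errors, and ran mf2py themselves now make one webutil call. A minimal before/after sketch drawn from the handler diffs below (that the returned dict carries the final fetched URL under mf2['url'] is an inference from the call sites, not documented API):

    # before: every caller wired up fetching, error handling, and parsing
    resp = common.requests_get(url)
    mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)

    # after: webutil does all three; gateway=True turns fetch failures
    # into HTTP error responses instead of raising requests exceptions
    mf2 = util.fetch_mf2(url, gateway=True, headers=common.HEADERS)
    hcard = mf2util.representative_hcard(mf2, mf2['url'])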

View file

@@ -8,7 +8,6 @@ import appengine_config
 from google.appengine.ext import ndb
 from granary import as2, microformats2
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
@@ -73,16 +72,15 @@ class ActorHandler(webapp2.RequestHandler):
     @memcache_response(CACHE_TIME)
     def get(self, domain):
-        url = 'http://%s/' % domain
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
+                             headers=common.HEADERS)
         # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
-        hcard = mf2util.representative_hcard(mf2, resp.url)
+        hcard = mf2util.representative_hcard(mf2, mf2['url'])
         logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
         if not hcard:
             common.error(self, """\
-Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)
+Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])
         key = MagicKey.get_or_create(domain)
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),

View file

@@ -8,7 +8,6 @@ import logging
 import re
 import urlparse
-from bs4 import BeautifulSoup
 from granary import as2
 from oauth_dropins.webutil import handlers, util
 import requests
@@ -42,7 +41,7 @@ AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
 CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
 CONTENT_TYPE_AS2 = b'application/activity+json'
 CONTENT_TYPE_AS1 = b'application/stream+json'
-CONTENT_TYPE_HTML = b'text/html'
+CONTENT_TYPE_HTML = b'text/html; charset=utf-8'
 CONTENT_TYPE_ATOM = b'application/atom+xml'
 CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'
@@ -78,16 +77,7 @@ def requests_post(url, **kwargs):
 def _requests_fn(fn, url, parse_json=False, **kwargs):
     """Wraps requests.* and adds raise_for_status() and User-Agent."""
     kwargs.setdefault('headers', {}).update(HEADERS)
-    try:
-        resp = fn(url, **kwargs)
-    except ValueError as e:
-        msg = 'Bad URL %s: %s' % (url, e)
-        logging.warning(msg)
-        raise exc.HTTPBadRequest(msg)
-    except (requests.ConnectionError, requests.Timeout) as e:
-        logging.warning(url, exc_info=True)
-        raise exc.HTTPBadGateway(unicode(e))
+    resp = fn(url, gateway=True, **kwargs)
 
     logging.info('Got %s headers:%s', resp.status_code, resp.headers)
     type = content_type(resp)
@@ -95,11 +85,6 @@ def _requests_fn(fn, url, parse_json=False, **kwargs):
         (type.startswith('text/') or type.endswith('+json') or type.endswith('/json'))):
         logging.info(resp.text)
 
-    if resp.status_code // 100 in (4, 5):
-        msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
-        logging.info(msg)
-        raise exc.HTTPBadGateway(msg)
-
     if parse_json:
         try:
             return resp.json()
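
The try/except and status code check deleted above now live behind webutil's gateway=True flag. A rough sketch of the behavior this caller relies on, reconstructed from the deleted lines rather than from webutil's actual source (fetch_with_gateway is a hypothetical name):

    import logging
    import requests
    from webob import exc

    def fetch_with_gateway(url, **kwargs):
        """Hypothetical stand-in for webutil's gateway=True handling."""
        try:
            resp = requests.get(url, **kwargs)
        except ValueError as e:
            # malformed URL: the client's fault, so 400
            raise exc.HTTPBadRequest('Bad URL %s: %s' % (url, e))
        except (requests.ConnectionError, requests.Timeout) as e:
            # couldn't reach the remote site, so 502
            logging.warning(url, exc_info=True)
            raise exc.HTTPBadGateway(str(e))
        if resp.status_code // 100 in (4, 5):
            # remote site returned an error; surface it as a 502
            raise exc.HTTPBadGateway('Received %s from %s:\n%s' % (
                resp.status_code, url, resp.text))
        return resp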
@@ -141,7 +126,7 @@ def get_as2(url):
     if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
         return resp
 
-    parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
+    parsed = util.parse_html(resp)
     as2 = parsed.find('link', rel=('alternate', 'self'), type=(
         CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
     if not (as2 and as2['href']):
@@ -402,25 +387,3 @@ def redirect_unwrap(val):
             cache=memcache).url
 
     return val
-
-
-def beautifulsoup_parse(html, **kwargs):
-    """Parses an HTML string with BeautifulSoup. Centralizes our parsing config.
-
-    *Copied from bridgy/util.py.*
-
-    We currently use lxml, which BeautifulSoup claims is the fastest and best:
-    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
-
-    lxml is a native module, so we don't bundle and deploy it to App Engine.
-    Instead, we use App Engine's version by declaring it in app.yaml.
-    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
-
-    We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
-    to use lxml explicitly to ensure we use the same parser and version in prod
-    and locally, since we've been bit by at least one meaningful difference
-    between lxml and e.g. html5lib: lxml includes the contents of <noscript> tags,
-    html5lib omits them. :(
-    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015
-    """
-    return BeautifulSoup(html, 'lxml', **kwargs)
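
This helper now lives in webutil as util.parse_html, with the same lxml configuration. Judging by the call sites in this commit, it also accepts a requests.Response directly and picks up the response's encoding itself, replacing the explicit from_encoding= argument; a usage sketch under that assumption:

    parsed = util.parse_html(resp)                # a requests.Response
    parsed = util.parse_html('<html>...</html>')  # or a raw HTML string
    link = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)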

View file

@@ -13,8 +13,8 @@ import json
 import logging
 from granary import as2, microformats2
-import mf2py
 import mf2util
+from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
 import webapp2
@@ -53,10 +53,9 @@ class RedirectHandler(webapp2.RequestHandler):
         Currently mainly for Pixelfed.
         https://github.com/snarfed/bridgy-fed/issues/39
         """
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2(url)
         entry = mf2util.find_first_entry(mf2, ['h-entry'])
-        logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(entry, indent=2))
+        logging.info('Parsed mf2 for %s: %s', mf2['url'], json.dumps(entry, indent=2))
 
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(entry)))
         logging.info('Returning: %s', json.dumps(obj, indent=2))

View file

@@ -1,4 +1,4 @@
-beautifulsoup4==4.6.3
+beautifulsoup4==4.8.0
 brevity==0.2.17
 certifi==2019.3.9
 chardet==3.0.4

View file

@@ -143,11 +143,11 @@ class ActivityPubTest(testutil.TestCase):
 <body>
 <a class="h-card u-url" rel="me" href="/about-me">Mrs. Foo</a>
 </body>
-""", url='https://foo.com/')
+""", url='https://foo.com/', content_type=common.CONTENT_TYPE_HTML)
 
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(200, got.status_int)
         self.assertEquals(common.CONTENT_TYPE_AS2, got.headers['Content-Type'])
         self.assertEquals({
@@ -179,7 +179,7 @@ class ActivityPubTest(testutil.TestCase):
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(400, got.status_int)
         self.assertIn('representative h-card', got.body)
@@ -234,7 +234,7 @@ class ActivityPubTest(testutil.TestCase):
         self.assertEquals(200, got.status_int, got.body)
         mock_head.assert_called_once_with(
-            'http://this', allow_redirects=True, timeout=15)
+            'http://this', allow_redirects=True, stream=True, timeout=15)
         mock_get.assert_not_called()
         mock_post.assert_not_called()
         self.assertEquals(0, Response.query().count())
@@ -293,7 +293,7 @@ class ActivityPubTest(testutil.TestCase):
         as2_headers = copy.deepcopy(common.HEADERS)
         as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
         mock_get.assert_has_calls((
-            call('http://orig/actor', headers=as2_headers, timeout=15),
+            call('http://orig/actor', headers=as2_headers, stream=True, timeout=15),
             call('http://orig/post', headers=common.HEADERS, verify=False),
         ))
@@ -330,7 +330,7 @@ class ActivityPubTest(testutil.TestCase):
         as2_headers = copy.deepcopy(common.HEADERS)
         as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
         mock_get.assert_has_calls((
-            call(FOLLOW['actor'], headers=as2_headers, timeout=15),
+            call(FOLLOW['actor'], headers=as2_headers, stream=True, timeout=15),
         ))
 
         # check AP Accept

View file

@@ -33,7 +33,7 @@ class RedirectTest(testutil.TestCase):
         self._test_as2(common.CONTENT_TYPE_AS2_LD)
 
     @patch('requests.get')
-    def _test_as2(self, content_type, mock_get):
+    def _test_as2(self, accept, mock_get):
         """Currently mainly for Pixelfed.
 
         https://github.com/snarfed/bridgy-fed/issues/39
@@ -45,11 +45,9 @@ class RedirectTest(testutil.TestCase):
         })
         mock_get.return_value = requests_response(
-            REPOST_HTML, content_type=content_type)
+            REPOST_HTML, content_type=common.CONTENT_TYPE_HTML)
 
-        got = app.get_response('/r/https://foo.com/bar', headers={
-            'Accept': 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"',
-        })
+        got = app.get_response('/r/https://foo.com/bar', headers={'Accept': accept})
 
         args, kwargs = mock_get.call_args
         self.assertEqual(('https://foo.com/bar',), args)

View file

@@ -112,7 +112,7 @@ class WebFingerTest(testutil.TestCase):
         self.assertEquals('application/json; charset=utf-8',
                           got.headers['Content-Type'])
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(self.expected_webfinger, json.loads(got.body))
@@ -171,7 +171,7 @@ class WebFingerTest(testutil.TestCase):
         """)
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(400, got.status_int)
         self.assertIn('representative h-card', got.body)

View file

@@ -13,7 +13,6 @@ from django_salmon import magicsigs, utils
 import feedparser
 from granary import atom, microformats2
 from httpsig.sign import HeaderSigner
-import mf2py
 import mock
 from mock import call
 from oauth_dropins.webutil import util
@@ -128,12 +127,12 @@ class WebmentionTest(testutil.TestCase):
 """
         self.reply = requests_response(
             self.reply_html, content_type=CONTENT_TYPE_HTML)
-        self.reply_mf2 = mf2py.parse(self.reply_html, url='http://a/reply')
+        self.reply_mf2 = util.parse_mf2(self.reply_html, url='http://a/reply')
 
         self.repost_html = REPOST_HTML
         self.repost = requests_response(
             self.repost_html, content_type=CONTENT_TYPE_HTML)
-        self.repost_mf2 = mf2py.parse(self.repost_html, url='http://a/repost')
+        self.repost_mf2 = util.parse_mf2(self.repost_html, url='http://a/repost')
         self.repost_as2 = REPOST_AS2
 
         self.like_html = """\
@@ -149,7 +148,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.like = requests_response(
             self.like_html, content_type=CONTENT_TYPE_HTML)
-        self.like_mf2 = mf2py.parse(self.like_html, url='http://a/like')
+        self.like_mf2 = util.parse_mf2(self.like_html, url='http://a/like')
 
         self.actor = requests_response({
             'objectType' : 'person',
@@ -204,7 +203,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.follow = requests_response(
             self.follow_html, content_type=CONTENT_TYPE_HTML)
-        self.follow_mf2 = mf2py.parse(self.follow_html, url='http://a/follow')
+        self.follow_mf2 = util.parse_mf2(self.follow_html, url='http://a/follow')
         self.follow_as2 = {
             '@context': 'https://www.w3.org/ns/activitystreams',
             'type': 'Follow',
@@ -241,7 +240,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.create = requests_response(
             self.create_html, content_type=CONTENT_TYPE_HTML)
-        self.create_mf2 = mf2py.parse(self.create_html, url='http://a/create')
+        self.create_mf2 = util.parse_mf2(self.create_html, url='http://a/create')
         self.create_as2 = {
             '@context': 'https://www.w3.org/ns/activitystreams',
             'type': 'Create',
@@ -567,7 +566,7 @@ class WebmentionTest(testutil.TestCase):
   <img class="u-photo" src="/pic" />
 </body>
 </html>
-""", content_type=CONTENT_TYPE_HTML)
+""", url='http://orig', content_type=CONTENT_TYPE_HTML)
 
         mock_get.side_effect = [repost, author, self.orig_as2, self.actor]
         mock_post.return_value = requests_response('abc xyz', status=201)
@@ -800,7 +799,7 @@ class WebmentionTest(testutil.TestCase):
         mock_get.assert_any_call(
             'http://orig/.well-known/webfinger?resource=acct:ryan@orig',
-            headers=HEADERS, timeout=util.HTTP_TIMEOUT, verify=False)
+            headers=HEADERS, stream=True, timeout=util.HTTP_TIMEOUT, verify=False)
         self.assertEqual(('http://orig/@ryan/salmon',), mock_post.call_args[0])
 
     def test_salmon_no_target_atom(self, mock_get, mock_post):
@@ -835,7 +834,7 @@ class WebmentionTest(testutil.TestCase):
         self.assertEquals(200, got.status_int)
         mock_get.assert_any_call('http://orig/atom/1', headers=HEADERS,
-                                 timeout=util.HTTP_TIMEOUT)
+                                 stream=True, timeout=util.HTTP_TIMEOUT)
 
         data = self.verify_salmon(mock_post)
 
     def test_salmon_relative_atom_href_with_base(self, mock_get, mock_post):
@@ -855,5 +854,5 @@ class WebmentionTest(testutil.TestCase):
         self.assertEquals(200, got.status_int)
         mock_get.assert_any_call('http://orig/base/atom/1', headers=HEADERS,
-                                 timeout=util.HTTP_TIMEOUT)
+                                 stream=True, timeout=util.HTTP_TIMEOUT)
 
         data = self.verify_salmon(mock_post)

View file

@@ -38,5 +38,6 @@ class TestCase(unittest.TestCase, testutil.Asserts):
         kwargs['headers'] = headers
         kwargs.setdefault('timeout', util.HTTP_TIMEOUT)
+        kwargs.setdefault('stream', True)
         return call(url, **kwargs)
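
webutil's request wrappers now pass stream=True on every outgoing call, which is why it shows up in all of the expected mock calls in the test diffs above. This shared helper builds those expected calls; a sketch of the whole thing, with the name req and the headers merging assumed from the surrounding context lines:

    import copy
    from mock import call

    def req(url, **kwargs):
        """Returns a mock.call matching a webutil-style outgoing request."""
        headers = copy.deepcopy(common.HEADERS)
        headers.update(kwargs.pop('headers', {}))
        kwargs['headers'] = headers
        kwargs.setdefault('timeout', util.HTTP_TIMEOUT)
        kwargs.setdefault('stream', True)
        return call(url, **kwargs)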

View file

@@ -19,7 +19,6 @@ import urlparse
 import appengine_config
-import mf2py
 import mf2util
 from oauth_dropins.webutil import handlers, util
 import webapp2
@@ -55,8 +54,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
         for candidate in urls:
             resp = common.requests_get(candidate)
-            parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
-            mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
+            parsed = util.parse_html(resp)
+            mf2 = util.parse_mf2(parsed, url=resp.url)
             # logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
             hcard = mf2util.representative_hcard(mf2, resp.url)
             if hcard:
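
Worth noting: the page is parsed once with util.parse_html, and the resulting BeautifulSoup tree is handed straight to util.parse_mf2 rather than re-parsing the HTML inside mf2py. That parse_mf2 accepts an already-parsed document, and defaults img_with_alt=True so it no longer needs passing explicitly, is assumed from this call site:

    parsed = util.parse_html(resp)              # one BeautifulSoup parse
    mf2 = util.parse_mf2(parsed, url=resp.url)  # reuses the tree for mf2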

View file

@@ -17,7 +17,6 @@ import feedparser
 from google.appengine.api import mail
 from google.appengine.ext.ndb import Key
 from granary import as2, atom, microformats2, source
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 import requests
@@ -47,8 +46,9 @@ class WebmentionHandler(webapp2.RequestHandler):
         source_resp = common.requests_get(source)
         self.source_url = source_resp.url or source
         self.source_domain = urlparse.urlparse(self.source_url).netloc.split(':')[0]
-        self.source_mf2 = mf2py.parse(source_resp.text, url=self.source_url, img_with_alt=True)
-        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
+        self.source_mf2 = util.parse_mf2(source_resp)
+        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
 
         # check for backlink to bridgy fed (for webmention spec and to confirm
         # source's intent to federate to mastodon)
@@ -239,8 +239,7 @@ class WebmentionHandler(webapp2.RequestHandler):
         if not self.target_resp:
             self.target_resp = common.requests_get(resp.target())
-        parsed = common.beautifulsoup_parse(self.target_resp.content,
-                                            from_encoding=self.target_resp.encoding)
+        parsed = util.parse_html(self.target_resp)
         atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
         if not atom_url or not atom_url.get('href'):
             common.error(self, 'Target post %s has no Atom link' % resp.target(),