From 7c9a03c827be04220d2bb7082dfba68e94dec008 Mon Sep 17 00:00:00 2001
From: Ryan Barrett
Date: Thu, 3 Oct 2019 21:08:26 -0700
Subject: [PATCH] unify HTTP fetching, HTML and mf2 parsing, and error
 handling into webutil

see:
* snarfed/granary#171
* snarfed/webutil@f994884b2b95b8bb10e2079940ef83db4c91f75f
* snarfed/oauth-dropins@f5b6e73530561f992ce86ea7bc40c55ee4300acf
* snarfed/granary@284eb115083f2d5c982d914b76d83c4ebb1e3f59
---
 activitypub.py            | 10 ++++-----
 common.py                 | 43 +++-------------------------------------
 redirect.py               |  7 +++----
 requirements.freeze.txt   |  2 +-
 tests/test_activitypub.py | 12 +++++------
 tests/test_redirect.py    |  8 +++-----
 tests/test_webfinger.py   |  4 ++--
 tests/test_webmention.py  | 19 ++++++++---------
 tests/testutil.py         |  1 +
 webfinger.py              |  5 ++---
 webmention.py             |  9 ++++----
 11 files changed, 38 insertions(+), 82 deletions(-)

diff --git a/activitypub.py b/activitypub.py
index ca671a9..673d726 100644
--- a/activitypub.py
+++ b/activitypub.py
@@ -8,7 +8,6 @@ import appengine_config
 
 from google.appengine.ext import ndb
 from granary import as2, microformats2
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
@@ -73,16 +72,15 @@ class ActorHandler(webapp2.RequestHandler):
 
     @memcache_response(CACHE_TIME)
     def get(self, domain):
-        url = 'http://%s/' % domain
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
+                             headers=common.HEADERS)
         # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
 
-        hcard = mf2util.representative_hcard(mf2, resp.url)
+        hcard = mf2util.representative_hcard(mf2, mf2['url'])
         logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
         if not hcard:
             common.error(self, """\
-Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)
+Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])
 
         key = MagicKey.get_or_create(domain)
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
diff --git a/common.py b/common.py
index 9fc78cc..23d9e79 100644
--- a/common.py
+++ b/common.py
@@ -8,7 +8,6 @@ import logging
 import re
 import urlparse
 
-from bs4 import BeautifulSoup
 from granary import as2
 from oauth_dropins.webutil import handlers, util
 import requests
@@ -42,7 +41,7 @@ AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
 CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
 CONTENT_TYPE_AS2 = b'application/activity+json'
 CONTENT_TYPE_AS1 = b'application/stream+json'
-CONTENT_TYPE_HTML = b'text/html'
+CONTENT_TYPE_HTML = b'text/html; charset=utf-8'
 CONTENT_TYPE_ATOM = b'application/atom+xml'
 CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'
 
@@ -78,16 +77,7 @@ def requests_post(url, **kwargs):
 def _requests_fn(fn, url, parse_json=False, **kwargs):
     """Wraps requests.* and adds raise_for_status() and User-Agent."""
     kwargs.setdefault('headers', {}).update(HEADERS)
-
-    try:
-        resp = fn(url, **kwargs)
-    except ValueError as e:
-        msg = 'Bad URL %s: %s' % (url, e)
-        logging.warning(msg)
-        raise exc.HTTPBadRequest(msg)
-    except (requests.ConnectionError, requests.Timeout) as e:
-        logging.warning(url, exc_info=True)
-        raise exc.HTTPBadGateway(unicode(e))
+    resp = fn(url, gateway=True, **kwargs)
 
     logging.info('Got %s headers:%s', resp.status_code,
                  resp.headers)
     type = content_type(resp)
@@ -95,11 +85,6 @@ def _requests_fn(fn, url, parse_json=False, **kwargs):
             (type.startswith('text/') or type.endswith('+json') or
              type.endswith('/json'))):
         logging.info(resp.text)
-    if resp.status_code // 100 in (4, 5):
-        msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
-        logging.info(msg)
-        raise exc.HTTPBadGateway(msg)
-
     if parse_json:
         try:
             return resp.json()
@@ -141,7 +126,7 @@ def get_as2(url):
     if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
         return resp
 
-    parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
+    parsed = util.parse_html(resp)
     as2 = parsed.find('link', rel=('alternate', 'self'), type=(
         CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
     if not (as2 and as2['href']):
@@ -402,25 +387,3 @@ def redirect_unwrap(val):
                                  cache=memcache).url
 
     return val
-
-
-def beautifulsoup_parse(html, **kwargs):
-    """Parses an HTML string with BeautifulSoup. Centralizes our parsing config.
-
-    *Copied from bridgy/util.py.*
-
-    We currently use lxml, which BeautifulSoup claims is the fastest and best:
-    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
-
-    lxml is a native module, so we don't bundle and deploy it to App Engine.
-    Instead, we use App Engine's version by declaring it in app.yaml.
-    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
-
-    We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
-    to use lxml explicitly to ensure we use the same parser and version in prod
-    and locally, since we've been bit by at least one meaningful difference
-    between lxml and e.g. html5lib: lxml includes the contents of
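
--

Note (not part of the patch): a rough sketch of the consolidated calling
pattern the hunks above adopt. It relies only on what the diff itself shows:
util.fetch_mf2() with gateway=True wraps the fetch + HTML parse + mf2 extract
pipeline and maps fetch errors to HTTP gateway errors, replacing the deleted
try/except blocks in common.py. fetch_representative_hcard() and the HEADERS
value here are illustrative names, not code from this repo.

    # illustrative sketch only; names marked hypothetical are not in the patch
    import mf2util
    from oauth_dropins.webutil import util

    # hypothetical User-Agent; the real project keeps its own common.HEADERS
    HEADERS = {'User-Agent': 'example-fetcher'}

    def fetch_representative_hcard(domain):  # hypothetical helper
        """Fetches a domain's home page and returns its representative h-card.

        util.fetch_mf2() GETs the URL, parses the HTML, and returns the mf2
        dict; the final (post-redirect) URL is available under mf2['url'],
        which is why the patch swaps resp.url for mf2['url']. gateway=True
        raises HTTPBadGateway on connection or HTTP errors instead of making
        each caller hand-roll that handling.
        """
        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
                             headers=HEADERS)
        return mf2util.representative_hcard(mf2, mf2['url'])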