upgrade BS4 to 4.6.3, pin lxml to app engine prod's version

pull/36/head
Ryan Barrett 2018-10-11 19:12:18 -07:00
rodzic 6681a5f2c7
commit 5047337738
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 6BE31FDF4776E9D4
6 zmienionych plików z 33 dodań i 9 usunięć

Wyświetl plik

@ -31,7 +31,7 @@ class ActorHandler(webapp2.RequestHandler):
def get(self, domain):
url = 'http://%s/' % domain
resp = common.requests_get(url)
mf2 = mf2py.parse(resp.text, url=resp.url)
mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
# logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
hcard = mf2util.representative_hcard(mf2, resp.url)

Wyświetl plik

@ -127,7 +127,7 @@ def get_as2(url):
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
return resp
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
as2 = parsed.find('link', rel=('alternate', 'self'), type=(
CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
if not (as2 and as2['href']):
@ -299,3 +299,25 @@ def postprocess_as2_actor(actor):
domain = urlparse.urlparse(url).netloc
actor.setdefault('preferredUsername', domain)
actor['id'] = '%s/%s' % (appengine_config.HOST_URL, domain)
def beautifulsoup_parse(html, **kwargs):
    """Parse an HTML string into a BeautifulSoup tree using our standard config.

    *Copied from bridgy/util.py.*

    The parser is pinned to lxml, which BeautifulSoup documents as the fastest
    and most lenient option:
    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use

    lxml is a native extension, so rather than bundling it we rely on App
    Engine's built-in copy, declared in app.yaml:
    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27

    That same version is pinned in requirements.freeze.txt, and the parser is
    named explicitly here so prod and local dev behave identically. Parser
    choice matters in practice: lxml keeps the contents of <noscript> tags
    while html5lib drops them. :(
    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015

    Args:
      html: string, the HTML document to parse
      **kwargs: passed through to the BeautifulSoup constructor
        (e.g. from_encoding)

    Returns:
      BeautifulSoup instance
    """
    parser = 'lxml'  # pinned; see docstring for why we never let bs4 auto-pick
    return BeautifulSoup(html, parser, **kwargs)

Wyświetl plik

@ -1,6 +1,5 @@
beautifulsoup4==4.6.0
beautifulsoup4==4.6.3
brevity==0.2.17
bs4==0.0.1
certifi==2018.4.16
chardet==3.0.4
coverage==4.0.3
@ -19,6 +18,9 @@ httplib2==0.10.3
humanize==0.5.1
idna==2.7
Jinja2==2.10
# app engine's built in lxml is 3.7.3
# https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
lxml==3.7.3
MarkupSafe==1.0
mf2py==1.1.2
mf2util==0.5.0

Wyświetl plik

@ -1,6 +1,6 @@
-e git+https://github.com/snarfed/django-salmon.git#egg=django_salmon
-e git+https://github.com/snarfed/webmention-tools.git#egg=webmentiontools
bs4
beautifulsoup4
feedparser
granary>=1.12
httpsig

Wyświetl plik

@ -46,8 +46,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
for candidate in urls:
resp = common.requests_get(candidate)
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
mf2 = mf2py.parse(parsed, url=resp.url)
parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
# logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
hcard = mf2util.representative_hcard(mf2, resp.url)
if hcard:

Wyświetl plik

@ -72,7 +72,7 @@ class WebmentionHandler(webapp2.RequestHandler):
# fetch source page, convert to ActivityStreams
source_resp = common.requests_get(source)
source_url = source_resp.url or source
source_mf2 = mf2py.parse(source_resp.text, url=source_url)
source_mf2 = mf2py.parse(source_resp.text, url=source_url, img_with_alt=True)
# logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(source_mf2, indent=2))
entry = mf2util.find_first_entry(source_mf2, ['h-entry'])
@ -177,7 +177,7 @@ class WebmentionHandler(webapp2.RequestHandler):
if not target_resp:
target_resp = common.requests_get(self.resp.target())
parsed = BeautifulSoup(target_resp.content, from_encoding=target_resp.encoding)
parsed = common.beautifulsoup_parse(target_resp.content, from_encoding=target_resp.encoding)
atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
if not atom_url or not atom_url.get('href'):
common.error(self, 'Target post %s has no Atom link' % self.resp.target(),