upgrade BS4 to 4.6.3, pin lxml to app engine prod's version

pull/36/head
Ryan Barrett 2018-10-11 19:12:18 -07:00
rodzic 6681a5f2c7
commit 5047337738
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 6BE31FDF4776E9D4
6 zmienionych plików z 33 dodań i 9 usunięć

Wyświetl plik

@ -31,7 +31,7 @@ class ActorHandler(webapp2.RequestHandler):
def get(self, domain):
url = 'http://%s/' % domain
resp = common.requests_get(url)
mf2 = mf2py.parse(resp.text, url=resp.url)
mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
# logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
hcard = mf2util.representative_hcard(mf2, resp.url)

Wyświetl plik

@ -127,7 +127,7 @@ def get_as2(url):
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
return resp
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
as2 = parsed.find('link', rel=('alternate', 'self'), type=(
CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
if not (as2 and as2['href']):
@ -299,3 +299,25 @@ def postprocess_as2_actor(actor):
domain = urlparse.urlparse(url).netloc
actor.setdefault('preferredUsername', domain)
actor['id'] = '%s/%s' % (appengine_config.HOST_URL, domain)
def beautifulsoup_parse(html, **kwargs):
    """Parse an HTML string into a BeautifulSoup tree using our standard config.

    *Copied from bridgy/util.py.*

    The parser is pinned to lxml, which BeautifulSoup documents as the fastest
    and most lenient option:
    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use

    lxml is a native extension, so rather than bundling it we rely on App
    Engine's built-in copy, declared in app.yaml:
    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27

    That same version is pinned in requirements.freeze.txt, and the parser is
    named explicitly here so prod and local dev behave identically. Parser
    choice matters in practice: lxml keeps the contents of <noscript> tags
    while html5lib drops them. :(
    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015

    Args:
      html: string, the HTML document to parse
      **kwargs: passed through to the BeautifulSoup constructor
        (e.g. from_encoding)

    Returns:
      BeautifulSoup instance
    """
    parser = 'lxml'  # pinned; see docstring for why we never let bs4 auto-pick
    return BeautifulSoup(html, parser, **kwargs)

Wyświetl plik

@ -1,6 +1,5 @@
beautifulsoup4==4.6.0
beautifulsoup4==4.6.3
brevity==0.2.17
bs4==0.0.1
certifi==2018.4.16
chardet==3.0.4
coverage==4.0.3
@ -19,6 +18,9 @@ httplib2==0.10.3
humanize==0.5.1
idna==2.7
Jinja2==2.10
# app engine's built in lxml is 3.7.3
# https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
lxml==3.7.3
MarkupSafe==1.0
mf2py==1.1.2
mf2util==0.5.0

Wyświetl plik

@ -1,6 +1,6 @@
-e git+https://github.com/snarfed/django-salmon.git#egg=django_salmon
-e git+https://github.com/snarfed/webmention-tools.git#egg=webmentiontools
bs4
beautifulsoup4
feedparser
granary>=1.12
httpsig

Wyświetl plik

@ -46,8 +46,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
for candidate in urls:
resp = common.requests_get(candidate)
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
mf2 = mf2py.parse(parsed, url=resp.url)
parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
# logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
hcard = mf2util.representative_hcard(mf2, resp.url)
if hcard:

Wyświetl plik

@ -72,7 +72,7 @@ class WebmentionHandler(webapp2.RequestHandler):
# fetch source page, convert to ActivityStreams
source_resp = common.requests_get(source)
source_url = source_resp.url or source
source_mf2 = mf2py.parse(source_resp.text, url=source_url)
source_mf2 = mf2py.parse(source_resp.text, url=source_url, img_with_alt=True)
# logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(source_mf2, indent=2))
entry = mf2util.find_first_entry(source_mf2, ['h-entry'])
@ -177,7 +177,7 @@ class WebmentionHandler(webapp2.RequestHandler):
if not target_resp:
target_resp = common.requests_get(self.resp.target())
parsed = BeautifulSoup(target_resp.content, from_encoding=target_resp.encoding)
parsed = common.beautifulsoup_parse(target_resp.content, from_encoding=target_resp.encoding)
atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
if not atom_url or not atom_url.get('href'):
common.error(self, 'Target post %s has no Atom link' % self.resp.target(),