kopia lustrzana https://github.com/snarfed/bridgy-fed
upgrade BS4 to 4.6.3, pin lxml to app engine prod's version
rodzic
6681a5f2c7
commit
5047337738
|
@ -31,7 +31,7 @@ class ActorHandler(webapp2.RequestHandler):
|
|||
def get(self, domain):
|
||||
url = 'http://%s/' % domain
|
||||
resp = common.requests_get(url)
|
||||
mf2 = mf2py.parse(resp.text, url=resp.url)
|
||||
mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
|
||||
# logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
|
||||
|
||||
hcard = mf2util.representative_hcard(mf2, resp.url)
|
||||
|
|
24
common.py
24
common.py
|
@ -127,7 +127,7 @@ def get_as2(url):
|
|||
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
|
||||
return resp
|
||||
|
||||
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
|
||||
parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
|
||||
as2 = parsed.find('link', rel=('alternate', 'self'), type=(
|
||||
CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
|
||||
if not (as2 and as2['href']):
|
||||
|
@ -299,3 +299,25 @@ def postprocess_as2_actor(actor):
|
|||
domain = urlparse.urlparse(url).netloc
|
||||
actor.setdefault('preferredUsername', domain)
|
||||
actor['id'] = '%s/%s' % (appengine_config.HOST_URL, domain)
|
||||
|
||||
|
||||
def beautifulsoup_parse(html, **kwargs):
    """Builds a BeautifulSoup tree from HTML using our single shared parser setup.

    *Copied from bridgy/util.py.*

    The parser is always lxml, which the BeautifulSoup docs describe as the
    fastest and best option:
    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use

    Because lxml is a native module, it is not bundled and deployed to App
    Engine. The App Engine built-in copy is used instead, declared in app.yaml:
    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27

    That App Engine version is pinned in requirements.freeze.txt, and lxml is
    named explicitly here, so that prod and local runs share the same parser
    and version. We have been bitten by at least one meaningful difference
    between parsers: lxml keeps the contents of <noscript> tags, while html5lib
    drops them. :(
    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015

    Args:
      html: the HTML markup to parse
      kwargs: passed through unchanged to the BeautifulSoup constructor

    Returns:
      the BeautifulSoup object built from html
    """
    # Name the parser explicitly rather than letting BeautifulSoup pick
    # whichever one happens to be installed.
    parser = 'lxml'
    return BeautifulSoup(html, parser, **kwargs)
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
beautifulsoup4==4.6.0
|
||||
beautifulsoup4==4.6.3
|
||||
brevity==0.2.17
|
||||
bs4==0.0.1
|
||||
certifi==2018.4.16
|
||||
chardet==3.0.4
|
||||
coverage==4.0.3
|
||||
|
@ -19,6 +18,9 @@ httplib2==0.10.3
|
|||
humanize==0.5.1
|
||||
idna==2.7
|
||||
Jinja2==2.10
|
||||
# app engine's built in lxml is 3.7.3
|
||||
# https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
|
||||
lxml==3.7.3
|
||||
MarkupSafe==1.0
|
||||
mf2py==1.1.2
|
||||
mf2util==0.5.0
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
-e git+https://github.com/snarfed/django-salmon.git#egg=django_salmon
|
||||
-e git+https://github.com/snarfed/webmention-tools.git#egg=webmentiontools
|
||||
bs4
|
||||
beautifulsoup4
|
||||
feedparser
|
||||
granary>=1.12
|
||||
httpsig
|
||||
|
|
|
@ -46,8 +46,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
|
|||
|
||||
for candidate in urls:
|
||||
resp = common.requests_get(candidate)
|
||||
parsed = BeautifulSoup(resp.content, from_encoding=resp.encoding)
|
||||
mf2 = mf2py.parse(parsed, url=resp.url)
|
||||
parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
|
||||
mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
|
||||
# logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
|
||||
hcard = mf2util.representative_hcard(mf2, resp.url)
|
||||
if hcard:
|
||||
|
|
|
@ -72,7 +72,7 @@ class WebmentionHandler(webapp2.RequestHandler):
|
|||
# fetch source page, convert to ActivityStreams
|
||||
source_resp = common.requests_get(source)
|
||||
source_url = source_resp.url or source
|
||||
source_mf2 = mf2py.parse(source_resp.text, url=source_url)
|
||||
source_mf2 = mf2py.parse(source_resp.text, url=source_url, img_with_alt=True)
|
||||
# logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(source_mf2, indent=2))
|
||||
|
||||
entry = mf2util.find_first_entry(source_mf2, ['h-entry'])
|
||||
|
@ -177,7 +177,7 @@ class WebmentionHandler(webapp2.RequestHandler):
|
|||
if not target_resp:
|
||||
target_resp = common.requests_get(self.resp.target())
|
||||
|
||||
parsed = BeautifulSoup(target_resp.content, from_encoding=target_resp.encoding)
|
||||
parsed = common.beautifulsoup_parse(target_resp.content, from_encoding=target_resp.encoding)
|
||||
atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
|
||||
if not atom_url or not atom_url.get('href'):
|
||||
common.error(self, 'Target post %s has no Atom link' % self.resp.target(),
|
||||
|
|
Ładowanie…
Reference in New Issue