unify HTTP fetching, HTML and mf2 parsing, and error handling into webutil

see:
* snarfed/granary#171
* snarfed/webutil@f994884b2b
* snarfed/oauth-dropins@f5b6e73530
* snarfed/granary@284eb11508
Ryan Barrett 2019-10-03 21:08:26 -07:00
parent 00d0cc5557
commit 7c9a03c827
No known key found for this signature in database
GPG key ID: 6BE31FDF4776E9D4
11 changed files with 38 additions and 82 deletions
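
In short: handlers that previously fetched a page, handled request errors, and ran mf2py themselves now make one webutil call. A minimal before/after sketch drawn from the handler diffs below (that the returned dict carries the final fetched URL under mf2['url'] is an inference from the call sites, not documented API):

    # before: every caller wired up fetching, error handling, and parsing
    resp = common.requests_get(url)
    mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)

    # after: webutil does all three; gateway=True turns fetch failures
    # into HTTP error responses instead of raising requests exceptions
    mf2 = util.fetch_mf2(url, gateway=True, headers=common.HEADERS)
    hcard = mf2util.representative_hcard(mf2, mf2['url'])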

View file

@@ -8,7 +8,6 @@ import appengine_config
 from google.appengine.ext import ndb
 from granary import as2, microformats2
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
@@ -73,16 +72,15 @@ class ActorHandler(webapp2.RequestHandler):
     @memcache_response(CACHE_TIME)
     def get(self, domain):
-        url = 'http://%s/' % domain
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
+                             headers=common.HEADERS)
         # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
-        hcard = mf2util.representative_hcard(mf2, resp.url)
+        hcard = mf2util.representative_hcard(mf2, mf2['url'])
         logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
         if not hcard:
             common.error(self, """\
-Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)
+Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])
         key = MagicKey.get_or_create(domain)
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),

View file

@@ -8,7 +8,6 @@ import logging
 import re
 import urlparse
-from bs4 import BeautifulSoup
 from granary import as2
 from oauth_dropins.webutil import handlers, util
 import requests
@@ -42,7 +41,7 @@ AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
 CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
 CONTENT_TYPE_AS2 = b'application/activity+json'
 CONTENT_TYPE_AS1 = b'application/stream+json'
-CONTENT_TYPE_HTML = b'text/html'
+CONTENT_TYPE_HTML = b'text/html; charset=utf-8'
 CONTENT_TYPE_ATOM = b'application/atom+xml'
 CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'
@@ -78,16 +77,7 @@ def requests_post(url, **kwargs):
 def _requests_fn(fn, url, parse_json=False, **kwargs):
     """Wraps requests.* and adds raise_for_status() and User-Agent."""
     kwargs.setdefault('headers', {}).update(HEADERS)
-    try:
-        resp = fn(url, **kwargs)
-    except ValueError as e:
-        msg = 'Bad URL %s: %s' % (url, e)
-        logging.warning(msg)
-        raise exc.HTTPBadRequest(msg)
-    except (requests.ConnectionError, requests.Timeout) as e:
-        logging.warning(url, exc_info=True)
-        raise exc.HTTPBadGateway(unicode(e))
+    resp = fn(url, gateway=True, **kwargs)
 
     logging.info('Got %s headers:%s', resp.status_code, resp.headers)
     type = content_type(resp)
@@ -95,11 +85,6 @@ def _requests_fn(fn, url, parse_json=False, **kwargs):
         (type.startswith('text/') or type.endswith('+json') or type.endswith('/json'))):
         logging.info(resp.text)
 
-    if resp.status_code // 100 in (4, 5):
-        msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
-        logging.info(msg)
-        raise exc.HTTPBadGateway(msg)
-
     if parse_json:
         try:
             return resp.json()
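
The try/except and status code check deleted above now live behind webutil's gateway=True flag. A rough sketch of the behavior this caller relies on, reconstructed from the deleted lines rather than from webutil's actual source (fetch_with_gateway is a hypothetical name):

    import logging
    import requests
    from webob import exc

    def fetch_with_gateway(url, **kwargs):
        """Hypothetical stand-in for webutil's gateway=True handling."""
        try:
            resp = requests.get(url, **kwargs)
        except ValueError as e:
            # malformed URL: the client's fault, so 400
            raise exc.HTTPBadRequest('Bad URL %s: %s' % (url, e))
        except (requests.ConnectionError, requests.Timeout) as e:
            # couldn't reach the remote site, so 502
            logging.warning(url, exc_info=True)
            raise exc.HTTPBadGateway(str(e))
        if resp.status_code // 100 in (4, 5):
            # remote site returned an error; surface it as a 502
            raise exc.HTTPBadGateway('Received %s from %s:\n%s' % (
                resp.status_code, url, resp.text))
        return resp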
@@ -141,7 +126,7 @@ def get_as2(url):
     if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
         return resp
 
-    parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
+    parsed = util.parse_html(resp)
     as2 = parsed.find('link', rel=('alternate', 'self'), type=(
         CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
     if not (as2 and as2['href']):
@@ -402,25 +387,3 @@ def redirect_unwrap(val):
             cache=memcache).url
 
     return val
-
-
-def beautifulsoup_parse(html, **kwargs):
-    """Parses an HTML string with BeautifulSoup. Centralizes our parsing config.
-
-    *Copied from bridgy/util.py.*
-
-    We currently use lxml, which BeautifulSoup claims is the fastest and best:
-    http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
-
-    lxml is a native module, so we don't bundle and deploy it to App Engine.
-    Instead, we use App Engine's version by declaring it in app.yaml.
-    https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
-
-    We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
-    to use lxml explicitly to ensure we use the same parser and version in prod
-    and locally, since we've been bit by at least one meaningful difference
-    between lxml and e.g. html5lib: lxml includes the contents of <noscript> tags,
-    html5lib omits them. :(
-    https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015
-    """
-    return BeautifulSoup(html, 'lxml', **kwargs)
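
This helper now lives in webutil as util.parse_html, with the same lxml configuration. Judging by the call sites in this commit, it also accepts a requests.Response directly and picks up the response's encoding itself, replacing the explicit from_encoding= argument; a usage sketch under that assumption:

    parsed = util.parse_html(resp)                # a requests.Response
    parsed = util.parse_html('<html>...</html>')  # or a raw HTML string
    link = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)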

View file

@@ -13,8 +13,8 @@ import json
 import logging
 from granary import as2, microformats2
-import mf2py
 import mf2util
+from oauth_dropins.webutil import util
 from oauth_dropins.webutil.handlers import memcache_response
 import webapp2
@@ -53,10 +53,9 @@ class RedirectHandler(webapp2.RequestHandler):
         Currently mainly for Pixelfed.
         https://github.com/snarfed/bridgy-fed/issues/39
         """
-        resp = common.requests_get(url)
-        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
+        mf2 = util.fetch_mf2(url)
         entry = mf2util.find_first_entry(mf2, ['h-entry'])
-        logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(entry, indent=2))
+        logging.info('Parsed mf2 for %s: %s', mf2['url'], json.dumps(entry, indent=2))
 
         obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(entry)))
         logging.info('Returning: %s', json.dumps(obj, indent=2))

View file

@@ -1,4 +1,4 @@
-beautifulsoup4==4.6.3
+beautifulsoup4==4.8.0
 brevity==0.2.17
 certifi==2019.3.9
 chardet==3.0.4

View file

@@ -143,11 +143,11 @@ class ActivityPubTest(testutil.TestCase):
 <body>
 <a class="h-card u-url" rel="me" href="/about-me">Mrs. Foo</a>
 </body>
-""", url='https://foo.com/')
+""", url='https://foo.com/', content_type=common.CONTENT_TYPE_HTML)
 
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(200, got.status_int)
         self.assertEquals(common.CONTENT_TYPE_AS2, got.headers['Content-Type'])
         self.assertEquals({
@@ -179,7 +179,7 @@ class ActivityPubTest(testutil.TestCase):
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(400, got.status_int)
         self.assertIn('representative h-card', got.body)
@@ -234,7 +234,7 @@ class ActivityPubTest(testutil.TestCase):
         self.assertEquals(200, got.status_int, got.body)
         mock_head.assert_called_once_with(
-            'http://this', allow_redirects=True, timeout=15)
+            'http://this', allow_redirects=True, stream=True, timeout=15)
         mock_get.assert_not_called()
         mock_post.assert_not_called()
         self.assertEquals(0, Response.query().count())
@@ -293,7 +293,7 @@ class ActivityPubTest(testutil.TestCase):
         as2_headers = copy.deepcopy(common.HEADERS)
         as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
         mock_get.assert_has_calls((
-            call('http://orig/actor', headers=as2_headers, timeout=15),
+            call('http://orig/actor', headers=as2_headers, stream=True, timeout=15),
             call('http://orig/post', headers=common.HEADERS, verify=False),
         ))
@@ -330,7 +330,7 @@ class ActivityPubTest(testutil.TestCase):
         as2_headers = copy.deepcopy(common.HEADERS)
         as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
         mock_get.assert_has_calls((
-            call(FOLLOW['actor'], headers=as2_headers, timeout=15),
+            call(FOLLOW['actor'], headers=as2_headers, stream=True, timeout=15),
         ))
 
         # check AP Accept

View file

@@ -33,7 +33,7 @@ class RedirectTest(testutil.TestCase):
         self._test_as2(common.CONTENT_TYPE_AS2_LD)
 
     @patch('requests.get')
-    def _test_as2(self, content_type, mock_get):
+    def _test_as2(self, accept, mock_get):
         """Currently mainly for Pixelfed.
 
         https://github.com/snarfed/bridgy-fed/issues/39
@@ -45,11 +45,9 @@ class RedirectTest(testutil.TestCase):
         })
         mock_get.return_value = requests_response(
-            REPOST_HTML, content_type=content_type)
+            REPOST_HTML, content_type=common.CONTENT_TYPE_HTML)
 
-        got = app.get_response('/r/https://foo.com/bar', headers={
-            'Accept': 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"',
-        })
+        got = app.get_response('/r/https://foo.com/bar', headers={'Accept': accept})
 
         args, kwargs = mock_get.call_args
         self.assertEqual(('https://foo.com/bar',), args)

View file

@@ -112,7 +112,7 @@ class WebFingerTest(testutil.TestCase):
         self.assertEquals('application/json; charset=utf-8',
                           got.headers['Content-Type'])
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(self.expected_webfinger, json.loads(got.body))
@@ -171,7 +171,7 @@ class WebFingerTest(testutil.TestCase):
         """)
         got = app.get_response('/foo.com')
         mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
-                                         timeout=util.HTTP_TIMEOUT)
+                                         stream=True, timeout=util.HTTP_TIMEOUT)
         self.assertEquals(400, got.status_int)
         self.assertIn('representative h-card', got.body)

View file

@@ -13,7 +13,6 @@ from django_salmon import magicsigs, utils
 import feedparser
 from granary import atom, microformats2
 from httpsig.sign import HeaderSigner
-import mf2py
 import mock
 from mock import call
 from oauth_dropins.webutil import util
@@ -128,12 +127,12 @@ class WebmentionTest(testutil.TestCase):
 """
         self.reply = requests_response(
             self.reply_html, content_type=CONTENT_TYPE_HTML)
-        self.reply_mf2 = mf2py.parse(self.reply_html, url='http://a/reply')
+        self.reply_mf2 = util.parse_mf2(self.reply_html, url='http://a/reply')
 
         self.repost_html = REPOST_HTML
         self.repost = requests_response(
             self.repost_html, content_type=CONTENT_TYPE_HTML)
-        self.repost_mf2 = mf2py.parse(self.repost_html, url='http://a/repost')
+        self.repost_mf2 = util.parse_mf2(self.repost_html, url='http://a/repost')
         self.repost_as2 = REPOST_AS2
 
         self.like_html = """\
@@ -149,7 +148,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.like = requests_response(
             self.like_html, content_type=CONTENT_TYPE_HTML)
-        self.like_mf2 = mf2py.parse(self.like_html, url='http://a/like')
+        self.like_mf2 = util.parse_mf2(self.like_html, url='http://a/like')
 
         self.actor = requests_response({
             'objectType' : 'person',
@@ -204,7 +203,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.follow = requests_response(
             self.follow_html, content_type=CONTENT_TYPE_HTML)
-        self.follow_mf2 = mf2py.parse(self.follow_html, url='http://a/follow')
+        self.follow_mf2 = util.parse_mf2(self.follow_html, url='http://a/follow')
         self.follow_as2 = {
             '@context': 'https://www.w3.org/ns/activitystreams',
             'type': 'Follow',
@@ -241,7 +240,7 @@ class WebmentionTest(testutil.TestCase):
 """
         self.create = requests_response(
             self.create_html, content_type=CONTENT_TYPE_HTML)
-        self.create_mf2 = mf2py.parse(self.create_html, url='http://a/create')
+        self.create_mf2 = util.parse_mf2(self.create_html, url='http://a/create')
         self.create_as2 = {
             '@context': 'https://www.w3.org/ns/activitystreams',
             'type': 'Create',
@@ -567,7 +566,7 @@ class WebmentionTest(testutil.TestCase):
   <img class="u-photo" src="/pic" />
 </body>
 </html>
-""", content_type=CONTENT_TYPE_HTML)
+""", url='http://orig', content_type=CONTENT_TYPE_HTML)
 
         mock_get.side_effect = [repost, author, self.orig_as2, self.actor]
         mock_post.return_value = requests_response('abc xyz', status=201)
@@ -800,7 +799,7 @@ class WebmentionTest(testutil.TestCase):
         mock_get.assert_any_call(
             'http://orig/.well-known/webfinger?resource=acct:ryan@orig',
-            headers=HEADERS, timeout=util.HTTP_TIMEOUT, verify=False)
+            headers=HEADERS, stream=True, timeout=util.HTTP_TIMEOUT, verify=False)
         self.assertEqual(('http://orig/@ryan/salmon',), mock_post.call_args[0])
 
     def test_salmon_no_target_atom(self, mock_get, mock_post):
@@ -835,7 +834,7 @@ class WebmentionTest(testutil.TestCase):
         self.assertEquals(200, got.status_int)
         mock_get.assert_any_call('http://orig/atom/1', headers=HEADERS,
-                                 timeout=util.HTTP_TIMEOUT)
+                                 stream=True, timeout=util.HTTP_TIMEOUT)
 
         data = self.verify_salmon(mock_post)
 
     def test_salmon_relative_atom_href_with_base(self, mock_get, mock_post):
@@ -855,5 +854,5 @@ class WebmentionTest(testutil.TestCase):
         self.assertEquals(200, got.status_int)
         mock_get.assert_any_call('http://orig/base/atom/1', headers=HEADERS,
-                                 timeout=util.HTTP_TIMEOUT)
+                                 stream=True, timeout=util.HTTP_TIMEOUT)
 
         data = self.verify_salmon(mock_post)

View file

@@ -38,5 +38,6 @@ class TestCase(unittest.TestCase, testutil.Asserts):
         kwargs['headers'] = headers
         kwargs.setdefault('timeout', util.HTTP_TIMEOUT)
+        kwargs.setdefault('stream', True)
         return call(url, **kwargs)
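
webutil's request wrappers now pass stream=True on every outgoing call, which is why it shows up in all of the expected mock calls in the test diffs above. This shared helper builds those expected calls; a sketch of the whole thing, with the name req and the headers merging assumed from the surrounding context lines:

    import copy
    from mock import call

    def req(url, **kwargs):
        """Returns a mock.call matching a webutil-style outgoing request."""
        headers = copy.deepcopy(common.HEADERS)
        headers.update(kwargs.pop('headers', {}))
        kwargs['headers'] = headers
        kwargs.setdefault('timeout', util.HTTP_TIMEOUT)
        kwargs.setdefault('stream', True)
        return call(url, **kwargs)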

View file

@@ -19,7 +19,6 @@ import urlparse
 import appengine_config
-import mf2py
 import mf2util
 from oauth_dropins.webutil import handlers, util
 import webapp2
@@ -55,8 +54,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
         for candidate in urls:
             resp = common.requests_get(candidate)
-            parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
-            mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
+            parsed = util.parse_html(resp)
+            mf2 = util.parse_mf2(parsed, url=resp.url)
             # logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
             hcard = mf2util.representative_hcard(mf2, resp.url)
             if hcard:
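
Worth noting: the page is parsed once with util.parse_html, and the resulting BeautifulSoup tree is handed straight to util.parse_mf2 rather than re-parsing the HTML inside mf2py. That parse_mf2 accepts an already-parsed document, and defaults img_with_alt=True so it no longer needs passing explicitly, is assumed from this call site:

    parsed = util.parse_html(resp)              # one BeautifulSoup parse
    mf2 = util.parse_mf2(parsed, url=resp.url)  # reuses the tree for mf2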

View file

@@ -17,7 +17,6 @@ import feedparser
 from google.appengine.api import mail
 from google.appengine.ext.ndb import Key
 from granary import as2, atom, microformats2, source
-import mf2py
 import mf2util
 from oauth_dropins.webutil import util
 import requests
@@ -47,8 +46,9 @@ class WebmentionHandler(webapp2.RequestHandler):
         source_resp = common.requests_get(source)
         self.source_url = source_resp.url or source
         self.source_domain = urlparse.urlparse(self.source_url).netloc.split(':')[0]
-        self.source_mf2 = mf2py.parse(source_resp.text, url=self.source_url, img_with_alt=True)
-        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
+        self.source_mf2 = util.parse_mf2(source_resp)
+        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
 
         # check for backlink to bridgy fed (for webmention spec and to confirm
         # source's intent to federate to mastodon)
@@ -239,8 +239,7 @@ class WebmentionHandler(webapp2.RequestHandler):
         if not self.target_resp:
             self.target_resp = common.requests_get(resp.target())
-        parsed = common.beautifulsoup_parse(self.target_resp.content,
-                                            from_encoding=self.target_resp.encoding)
+        parsed = util.parse_html(self.target_resp)
         atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
         if not atom_url or not atom_url.get('href'):
             common.error(self, 'Target post %s has no Atom link' % resp.target(),