kopia lustrzana https://github.com/snarfed/bridgy-fed
unify HTTP fetching, HTML and mf2 parsing, and error handling into webutil
see: * snarfed/granary#171 * snarfed/webutil@f994884b2b * snarfed/oauth-dropins@f5b6e73530 * snarfed/granary@284eb11508
pull/59/head
rodzic
00d0cc5557
commit
7c9a03c827
|
@ -8,7 +8,6 @@ import appengine_config
|
|||
|
||||
from google.appengine.ext import ndb
|
||||
from granary import as2, microformats2
|
||||
import mf2py
|
||||
import mf2util
|
||||
from oauth_dropins.webutil import util
|
||||
from oauth_dropins.webutil.handlers import memcache_response
|
||||
|
@ -73,16 +72,15 @@ class ActorHandler(webapp2.RequestHandler):
|
|||
|
||||
@memcache_response(CACHE_TIME)
|
||||
def get(self, domain):
|
||||
url = 'http://%s/' % domain
|
||||
resp = common.requests_get(url)
|
||||
mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
|
||||
mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
|
||||
headers=common.HEADERS)
|
||||
# logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
|
||||
|
||||
hcard = mf2util.representative_hcard(mf2, resp.url)
|
||||
hcard = mf2util.representative_hcard(mf2, mf2['url'])
|
||||
logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
|
||||
if not hcard:
|
||||
common.error(self, """\
|
||||
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)
|
||||
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])
|
||||
|
||||
key = MagicKey.get_or_create(domain)
|
||||
obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
|
||||
|
|
43
common.py
43
common.py
|
@ -8,7 +8,6 @@ import logging
|
|||
import re
|
||||
import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from granary import as2
|
||||
from oauth_dropins.webutil import handlers, util
|
||||
import requests
|
||||
|
@ -42,7 +41,7 @@ AS2_PUBLIC_AUDIENCE = 'https://www.w3.org/ns/activitystreams#Public'
|
|||
CONTENT_TYPE_AS2_LD = b'application/ld+json; profile="https://www.w3.org/ns/activitystreams"'
|
||||
CONTENT_TYPE_AS2 = b'application/activity+json'
|
||||
CONTENT_TYPE_AS1 = b'application/stream+json'
|
||||
CONTENT_TYPE_HTML = b'text/html'
|
||||
CONTENT_TYPE_HTML = b'text/html; charset=utf-8'
|
||||
CONTENT_TYPE_ATOM = b'application/atom+xml'
|
||||
CONTENT_TYPE_MAGIC_ENVELOPE = b'application/magic-envelope+xml'
|
||||
|
||||
|
@ -78,16 +77,7 @@ def requests_post(url, **kwargs):
|
|||
def _requests_fn(fn, url, parse_json=False, **kwargs):
|
||||
"""Wraps requests.* and adds raise_for_status() and User-Agent."""
|
||||
kwargs.setdefault('headers', {}).update(HEADERS)
|
||||
|
||||
try:
|
||||
resp = fn(url, **kwargs)
|
||||
except ValueError as e:
|
||||
msg = 'Bad URL %s: %s' % (url, e)
|
||||
logging.warning(msg)
|
||||
raise exc.HTTPBadRequest(msg)
|
||||
except (requests.ConnectionError, requests.Timeout) as e:
|
||||
logging.warning(url, exc_info=True)
|
||||
raise exc.HTTPBadGateway(unicode(e))
|
||||
resp = fn(url, gateway=True, **kwargs)
|
||||
|
||||
logging.info('Got %s headers:%s', resp.status_code, resp.headers)
|
||||
type = content_type(resp)
|
||||
|
@ -95,11 +85,6 @@ def _requests_fn(fn, url, parse_json=False, **kwargs):
|
|||
(type.startswith('text/') or type.endswith('+json') or type.endswith('/json'))):
|
||||
logging.info(resp.text)
|
||||
|
||||
if resp.status_code // 100 in (4, 5):
|
||||
msg = 'Received %s from %s:\n%s' % (resp.status_code, url, resp.text)
|
||||
logging.info(msg)
|
||||
raise exc.HTTPBadGateway(msg)
|
||||
|
||||
if parse_json:
|
||||
try:
|
||||
return resp.json()
|
||||
|
@ -141,7 +126,7 @@ def get_as2(url):
|
|||
if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
|
||||
return resp
|
||||
|
||||
parsed = beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
|
||||
parsed = util.parse_html(resp)
|
||||
as2 = parsed.find('link', rel=('alternate', 'self'), type=(
|
||||
CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
|
||||
if not (as2 and as2['href']):
|
||||
|
@ -402,25 +387,3 @@ def redirect_unwrap(val):
|
|||
cache=memcache).url
|
||||
|
||||
return val
|
||||
|
||||
|
||||
def beautifulsoup_parse(html, **kwargs):
|
||||
"""Parses an HTML string with BeautifulSoup. Centralizes our parsing config.
|
||||
|
||||
*Copied from bridgy/util.py.*
|
||||
|
||||
We currently use lxml, which BeautifulSoup claims is the fastest and best:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
|
||||
|
||||
lxml is a native module, so we don't bundle and deploy it to App Engine.
|
||||
Instead, we use App Engine's version by declaring it in app.yaml.
|
||||
https://cloud.google.com/appengine/docs/standard/python/tools/built-in-libraries-27
|
||||
|
||||
We pin App Engine's version in requirements.freeze.txt and tell BeautifulSoup
|
||||
to use lxml explicitly to ensure we use the same parser and version in prod
|
||||
and locally, since we've been bit by at least one meaningful difference
|
||||
between lxml and e.g. html5lib: lxml includes the contents of <noscript> tags,
|
||||
html5lib omits them. :(
|
||||
https://github.com/snarfed/bridgy/issues/798#issuecomment-370508015
|
||||
"""
|
||||
return BeautifulSoup(html, 'lxml', **kwargs)
|
||||
|
|
|
@ -13,8 +13,8 @@ import json
|
|||
import logging
|
||||
|
||||
from granary import as2, microformats2
|
||||
import mf2py
|
||||
import mf2util
|
||||
from oauth_dropins.webutil import util
|
||||
from oauth_dropins.webutil.handlers import memcache_response
|
||||
import webapp2
|
||||
|
||||
|
@ -53,10 +53,9 @@ class RedirectHandler(webapp2.RequestHandler):
|
|||
Currently mainly for Pixelfed.
|
||||
https://github.com/snarfed/bridgy-fed/issues/39
|
||||
"""
|
||||
resp = common.requests_get(url)
|
||||
mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
|
||||
mf2 = util.fetch_mf2(url)
|
||||
entry = mf2util.find_first_entry(mf2, ['h-entry'])
|
||||
logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(entry, indent=2))
|
||||
logging.info('Parsed mf2 for %s: %s', mf2['url'], json.dumps(entry, indent=2))
|
||||
|
||||
obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(entry)))
|
||||
logging.info('Returning: %s', json.dumps(obj, indent=2))
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
beautifulsoup4==4.6.3
|
||||
beautifulsoup4==4.8.0
|
||||
brevity==0.2.17
|
||||
certifi==2019.3.9
|
||||
chardet==3.0.4
|
||||
|
|
|
@ -143,11 +143,11 @@ class ActivityPubTest(testutil.TestCase):
|
|||
<body>
|
||||
<a class="h-card u-url" rel="me" href="/about-me">Mrs. ☕ Foo</a>
|
||||
</body>
|
||||
""", url='https://foo.com/')
|
||||
""", url='https://foo.com/', content_type=common.CONTENT_TYPE_HTML)
|
||||
|
||||
got = app.get_response('/foo.com')
|
||||
mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
self.assertEquals(200, got.status_int)
|
||||
self.assertEquals(common.CONTENT_TYPE_AS2, got.headers['Content-Type'])
|
||||
self.assertEquals({
|
||||
|
@ -179,7 +179,7 @@ class ActivityPubTest(testutil.TestCase):
|
|||
|
||||
got = app.get_response('/foo.com')
|
||||
mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
self.assertEquals(400, got.status_int)
|
||||
self.assertIn('representative h-card', got.body)
|
||||
|
||||
|
@ -234,7 +234,7 @@ class ActivityPubTest(testutil.TestCase):
|
|||
self.assertEquals(200, got.status_int, got.body)
|
||||
|
||||
mock_head.assert_called_once_with(
|
||||
'http://this', allow_redirects=True, timeout=15)
|
||||
'http://this', allow_redirects=True, stream=True, timeout=15)
|
||||
mock_get.assert_not_called()
|
||||
mock_post.assert_not_called()
|
||||
self.assertEquals(0, Response.query().count())
|
||||
|
@ -293,7 +293,7 @@ class ActivityPubTest(testutil.TestCase):
|
|||
as2_headers = copy.deepcopy(common.HEADERS)
|
||||
as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
|
||||
mock_get.assert_has_calls((
|
||||
call('http://orig/actor', headers=as2_headers, timeout=15),
|
||||
call('http://orig/actor', headers=as2_headers, stream=True, timeout=15),
|
||||
call('http://orig/post', headers=common.HEADERS, verify=False),
|
||||
))
|
||||
|
||||
|
@ -330,7 +330,7 @@ class ActivityPubTest(testutil.TestCase):
|
|||
as2_headers = copy.deepcopy(common.HEADERS)
|
||||
as2_headers.update(common.CONNEG_HEADERS_AS2_HTML)
|
||||
mock_get.assert_has_calls((
|
||||
call(FOLLOW['actor'], headers=as2_headers, timeout=15),
|
||||
call(FOLLOW['actor'], headers=as2_headers, stream=True, timeout=15),
|
||||
))
|
||||
|
||||
# check AP Accept
|
||||
|
|
|
@ -33,7 +33,7 @@ class RedirectTest(testutil.TestCase):
|
|||
self._test_as2(common.CONTENT_TYPE_AS2_LD)
|
||||
|
||||
@patch('requests.get')
|
||||
def _test_as2(self, content_type, mock_get):
|
||||
def _test_as2(self, accept, mock_get):
|
||||
"""Currently mainly for Pixelfed.
|
||||
|
||||
https://github.com/snarfed/bridgy-fed/issues/39
|
||||
|
@ -45,11 +45,9 @@ class RedirectTest(testutil.TestCase):
|
|||
})
|
||||
|
||||
mock_get.return_value = requests_response(
|
||||
REPOST_HTML, content_type=content_type)
|
||||
REPOST_HTML, content_type=common.CONTENT_TYPE_HTML)
|
||||
|
||||
got = app.get_response('/r/https://foo.com/bar', headers={
|
||||
'Accept': 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"',
|
||||
})
|
||||
got = app.get_response('/r/https://foo.com/bar', headers={'Accept': accept})
|
||||
|
||||
args, kwargs = mock_get.call_args
|
||||
self.assertEqual(('https://foo.com/bar',), args)
|
||||
|
|
|
@ -112,7 +112,7 @@ class WebFingerTest(testutil.TestCase):
|
|||
self.assertEquals('application/json; charset=utf-8',
|
||||
got.headers['Content-Type'])
|
||||
mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
|
||||
self.assertEquals(self.expected_webfinger, json.loads(got.body))
|
||||
|
||||
|
@ -171,7 +171,7 @@ class WebFingerTest(testutil.TestCase):
|
|||
""")
|
||||
got = app.get_response('/foo.com')
|
||||
mock_get.assert_called_once_with('http://foo.com/', headers=common.HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
self.assertEquals(400, got.status_int)
|
||||
self.assertIn('representative h-card', got.body)
|
||||
|
||||
|
|
|
@ -13,7 +13,6 @@ from django_salmon import magicsigs, utils
|
|||
import feedparser
|
||||
from granary import atom, microformats2
|
||||
from httpsig.sign import HeaderSigner
|
||||
import mf2py
|
||||
import mock
|
||||
from mock import call
|
||||
from oauth_dropins.webutil import util
|
||||
|
@ -128,12 +127,12 @@ class WebmentionTest(testutil.TestCase):
|
|||
"""
|
||||
self.reply = requests_response(
|
||||
self.reply_html, content_type=CONTENT_TYPE_HTML)
|
||||
self.reply_mf2 = mf2py.parse(self.reply_html, url='http://a/reply')
|
||||
self.reply_mf2 = util.parse_mf2(self.reply_html, url='http://a/reply')
|
||||
|
||||
self.repost_html = REPOST_HTML
|
||||
self.repost = requests_response(
|
||||
self.repost_html, content_type=CONTENT_TYPE_HTML)
|
||||
self.repost_mf2 = mf2py.parse(self.repost_html, url='http://a/repost')
|
||||
self.repost_mf2 = util.parse_mf2(self.repost_html, url='http://a/repost')
|
||||
self.repost_as2 = REPOST_AS2
|
||||
|
||||
self.like_html = """\
|
||||
|
@ -149,7 +148,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
"""
|
||||
self.like = requests_response(
|
||||
self.like_html, content_type=CONTENT_TYPE_HTML)
|
||||
self.like_mf2 = mf2py.parse(self.like_html, url='http://a/like')
|
||||
self.like_mf2 = util.parse_mf2(self.like_html, url='http://a/like')
|
||||
|
||||
self.actor = requests_response({
|
||||
'objectType' : 'person',
|
||||
|
@ -204,7 +203,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
"""
|
||||
self.follow = requests_response(
|
||||
self.follow_html, content_type=CONTENT_TYPE_HTML)
|
||||
self.follow_mf2 = mf2py.parse(self.follow_html, url='http://a/follow')
|
||||
self.follow_mf2 = util.parse_mf2(self.follow_html, url='http://a/follow')
|
||||
self.follow_as2 = {
|
||||
'@context': 'https://www.w3.org/ns/activitystreams',
|
||||
'type': 'Follow',
|
||||
|
@ -241,7 +240,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
"""
|
||||
self.create = requests_response(
|
||||
self.create_html, content_type=CONTENT_TYPE_HTML)
|
||||
self.create_mf2 = mf2py.parse(self.create_html, url='http://a/create')
|
||||
self.create_mf2 = util.parse_mf2(self.create_html, url='http://a/create')
|
||||
self.create_as2 = {
|
||||
'@context': 'https://www.w3.org/ns/activitystreams',
|
||||
'type': 'Create',
|
||||
|
@ -567,7 +566,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
<img class="u-photo" src="/pic" />
|
||||
</body>
|
||||
</html>
|
||||
""", content_type=CONTENT_TYPE_HTML)
|
||||
""", url='http://orig', content_type=CONTENT_TYPE_HTML)
|
||||
mock_get.side_effect = [repost, author, self.orig_as2, self.actor]
|
||||
mock_post.return_value = requests_response('abc xyz', status=201)
|
||||
|
||||
|
@ -800,7 +799,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
|
||||
mock_get.assert_any_call(
|
||||
'http://orig/.well-known/webfinger?resource=acct:ryan@orig',
|
||||
headers=HEADERS, timeout=util.HTTP_TIMEOUT, verify=False)
|
||||
headers=HEADERS, stream=True, timeout=util.HTTP_TIMEOUT, verify=False)
|
||||
self.assertEqual(('http://orig/@ryan/salmon',), mock_post.call_args[0])
|
||||
|
||||
def test_salmon_no_target_atom(self, mock_get, mock_post):
|
||||
|
@ -835,7 +834,7 @@ class WebmentionTest(testutil.TestCase):
|
|||
self.assertEquals(200, got.status_int)
|
||||
|
||||
mock_get.assert_any_call('http://orig/atom/1', headers=HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
data = self.verify_salmon(mock_post)
|
||||
|
||||
def test_salmon_relative_atom_href_with_base(self, mock_get, mock_post):
|
||||
|
@ -855,5 +854,5 @@ class WebmentionTest(testutil.TestCase):
|
|||
self.assertEquals(200, got.status_int)
|
||||
|
||||
mock_get.assert_any_call('http://orig/base/atom/1', headers=HEADERS,
|
||||
timeout=util.HTTP_TIMEOUT)
|
||||
stream=True, timeout=util.HTTP_TIMEOUT)
|
||||
data = self.verify_salmon(mock_post)
|
||||
|
|
|
@ -38,5 +38,6 @@ class TestCase(unittest.TestCase, testutil.Asserts):
|
|||
kwargs['headers'] = headers
|
||||
|
||||
kwargs.setdefault('timeout', util.HTTP_TIMEOUT)
|
||||
kwargs.setdefault('stream', True)
|
||||
|
||||
return call(url, **kwargs)
|
||||
|
|
|
@ -19,7 +19,6 @@ import urlparse
|
|||
|
||||
import appengine_config
|
||||
|
||||
import mf2py
|
||||
import mf2util
|
||||
from oauth_dropins.webutil import handlers, util
|
||||
import webapp2
|
||||
|
@ -55,8 +54,8 @@ class UserHandler(handlers.XrdOrJrdHandler):
|
|||
|
||||
for candidate in urls:
|
||||
resp = common.requests_get(candidate)
|
||||
parsed = common.beautifulsoup_parse(resp.content, from_encoding=resp.encoding)
|
||||
mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
|
||||
parsed = util.parse_html(resp)
|
||||
mf2 = util.parse_mf2(parsed, url=resp.url)
|
||||
# logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
|
||||
hcard = mf2util.representative_hcard(mf2, resp.url)
|
||||
if hcard:
|
||||
|
|
|
@ -17,7 +17,6 @@ import feedparser
|
|||
from google.appengine.api import mail
|
||||
from google.appengine.ext.ndb import Key
|
||||
from granary import as2, atom, microformats2, source
|
||||
import mf2py
|
||||
import mf2util
|
||||
from oauth_dropins.webutil import util
|
||||
import requests
|
||||
|
@ -47,8 +46,9 @@ class WebmentionHandler(webapp2.RequestHandler):
|
|||
source_resp = common.requests_get(source)
|
||||
self.source_url = source_resp.url or source
|
||||
self.source_domain = urlparse.urlparse(self.source_url).netloc.split(':')[0]
|
||||
self.source_mf2 = mf2py.parse(source_resp.text, url=self.source_url, img_with_alt=True)
|
||||
# logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
|
||||
self.source_mf2 = util.parse_mf2(source_resp)
|
||||
|
||||
# logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))
|
||||
|
||||
# check for backlink to bridgy fed (for webmention spec and to confirm
|
||||
# source's intent to federate to mastodon)
|
||||
|
@ -239,8 +239,7 @@ class WebmentionHandler(webapp2.RequestHandler):
|
|||
if not self.target_resp:
|
||||
self.target_resp = common.requests_get(resp.target())
|
||||
|
||||
parsed = common.beautifulsoup_parse(self.target_resp.content,
|
||||
from_encoding=self.target_resp.encoding)
|
||||
parsed = util.parse_html(self.target_resp)
|
||||
atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
|
||||
if not atom_url or not atom_url.get('href'):
|
||||
common.error(self, 'Target post %s has no Atom link' % resp.target(),
|
||||
|
|
Ładowanie…
Reference in New Issue