kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Be sure that documents returned with an application/json header are not parsed with inscriptis (#337)
* Auto-detect JSON by Content-Type header
* Add test to not parse JSON responses with inscriptis
pull/350/head
rodzic
fbd9ecab62
commit
023951a10e
|
@ -15,6 +15,7 @@ class Fetcher():
|
|||
error = None
|
||||
status_code = None
|
||||
content = None # Should always be bytes.
|
||||
headers = None
|
||||
|
||||
fetcher_description ="No description"
|
||||
|
||||
|
@ -113,6 +114,7 @@ class html_webdriver(Fetcher):
|
|||
# @todo - dom wait loaded?
|
||||
time.sleep(5)
|
||||
self.content = driver.page_source
|
||||
self.headers = {}
|
||||
|
||||
driver.quit()
|
||||
|
||||
|
@ -156,4 +158,5 @@ class html_requests(Fetcher):
|
|||
|
||||
self.status_code = r.status_code
|
||||
self.content = html
|
||||
self.headers = r.headers
|
||||
|
||||
|
|
|
@ -103,9 +103,16 @@ class perform_site_check():
|
|||
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
|
||||
# return content().textfilter().jsonextract().checksumcompare() ?
|
||||
|
||||
is_html = True
|
||||
is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
|
||||
is_html = not is_json
|
||||
css_filter_rule = watch['css_filter']
|
||||
if css_filter_rule and len(css_filter_rule.strip()):
|
||||
|
||||
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
|
||||
if is_json and not has_filter_rule:
|
||||
css_filter_rule = "json:$"
|
||||
has_filter_rule = True
|
||||
|
||||
if has_filter_rule:
|
||||
if 'json:' in css_filter_rule:
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
||||
is_html = False
|
||||
|
@ -116,7 +123,7 @@ class perform_site_check():
|
|||
if is_html:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html_content = fetcher.content
|
||||
if css_filter_rule and len(css_filter_rule.strip()):
|
||||
if has_filter_rule:
|
||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
|
||||
# get_text() via inscriptis
|
||||
|
|
|
@ -111,6 +111,21 @@ def set_original_response():
|
|||
f.write(test_return_data)
|
||||
return None
|
||||
|
||||
|
||||
def set_response_with_html():
|
||||
test_return_data = """
|
||||
{
|
||||
"test": [
|
||||
{
|
||||
"html": "<b>"
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(test_return_data)
|
||||
return None
|
||||
|
||||
def set_modified_response():
|
||||
test_return_data = """
|
||||
{
|
||||
|
@ -138,6 +153,37 @@ def set_modified_response():
|
|||
|
||||
return None
|
||||
|
||||
def test_check_json_without_filter(client, live_server):
|
||||
# Request a JSON document from an application/json source containing HTML
|
||||
# and be sure it doesn't get chewed up by inscriptis
|
||||
set_response_with_html()
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint_json', _external=True)
|
||||
client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(3)
|
||||
|
||||
res = client.get(
|
||||
url_for("preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b'"<b>' in res.data
|
||||
assert res.data.count(b'{\n') >= 2
|
||||
|
||||
|
||||
def test_check_json_filter(client, live_server):
|
||||
json_filter = 'json:boss.name'
|
||||
|
||||
|
|
|
@ -44,6 +44,16 @@ def live_server_setup(live_server):
|
|||
with open("test-datastore/endpoint-content.txt", "r") as f:
|
||||
return f.read()
|
||||
|
||||
@live_server.app.route('/test-endpoint-json')
|
||||
def test_endpoint_json():
|
||||
|
||||
from flask import make_response
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "r") as f:
|
||||
resp = make_response(f.read())
|
||||
resp.headers['Content-Type'] = 'application/json'
|
||||
return resp
|
||||
|
||||
# Just return the headers in the request
|
||||
@live_server.app.route('/test-headers')
|
||||
def test_headers():
|
||||
|
|
Ładowanie…
Reference in New Issue