kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Be sure that documents returned with an application/json header are not parsed with inscriptis (#337)
* Auto-detect JSON by Content-Type header
* Add test to not parse JSON responses with inscriptis
pull/350/head
rodzic
fbd9ecab62
commit
023951a10e
|
@ -15,6 +15,7 @@ class Fetcher():
|
||||||
error = None
|
error = None
|
||||||
status_code = None
|
status_code = None
|
||||||
content = None # Should always be bytes.
|
content = None # Should always be bytes.
|
||||||
|
headers = None
|
||||||
|
|
||||||
fetcher_description ="No description"
|
fetcher_description ="No description"
|
||||||
|
|
||||||
|
@ -113,6 +114,7 @@ class html_webdriver(Fetcher):
|
||||||
# @todo - dom wait loaded?
|
# @todo - dom wait loaded?
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
self.content = driver.page_source
|
self.content = driver.page_source
|
||||||
|
self.headers = {}
|
||||||
|
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
|
@ -156,4 +158,5 @@ class html_requests(Fetcher):
|
||||||
|
|
||||||
self.status_code = r.status_code
|
self.status_code = r.status_code
|
||||||
self.content = html
|
self.content = html
|
||||||
|
self.headers = r.headers
|
||||||
|
|
||||||
|
|
|
@ -103,9 +103,16 @@ class perform_site_check():
|
||||||
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
|
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
|
||||||
# return content().textfilter().jsonextract().checksumcompare() ?
|
# return content().textfilter().jsonextract().checksumcompare() ?
|
||||||
|
|
||||||
is_html = True
|
is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
|
||||||
|
is_html = not is_json
|
||||||
css_filter_rule = watch['css_filter']
|
css_filter_rule = watch['css_filter']
|
||||||
if css_filter_rule and len(css_filter_rule.strip()):
|
|
||||||
|
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
|
||||||
|
if is_json and not has_filter_rule:
|
||||||
|
css_filter_rule = "json:$"
|
||||||
|
has_filter_rule = True
|
||||||
|
|
||||||
|
if has_filter_rule:
|
||||||
if 'json:' in css_filter_rule:
|
if 'json:' in css_filter_rule:
|
||||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
||||||
is_html = False
|
is_html = False
|
||||||
|
@ -116,7 +123,7 @@ class perform_site_check():
|
||||||
if is_html:
|
if is_html:
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||||
html_content = fetcher.content
|
html_content = fetcher.content
|
||||||
if css_filter_rule and len(css_filter_rule.strip()):
|
if has_filter_rule:
|
||||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||||
|
|
||||||
# get_text() via inscriptis
|
# get_text() via inscriptis
|
||||||
|
|
|
@ -111,6 +111,21 @@ def set_original_response():
|
||||||
f.write(test_return_data)
|
f.write(test_return_data)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def set_response_with_html():
|
||||||
|
test_return_data = """
|
||||||
|
{
|
||||||
|
"test": [
|
||||||
|
{
|
||||||
|
"html": "<b>"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write(test_return_data)
|
||||||
|
return None
|
||||||
|
|
||||||
def set_modified_response():
|
def set_modified_response():
|
||||||
test_return_data = """
|
test_return_data = """
|
||||||
{
|
{
|
||||||
|
@ -138,6 +153,37 @@ def set_modified_response():
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def test_check_json_without_filter(client, live_server):
|
||||||
|
# Request a JSON document from an application/json source containing HTML
|
||||||
|
# and be sure it doesn't get chewed up by inscriptis
|
||||||
|
set_response_with_html()
|
||||||
|
|
||||||
|
# Give the endpoint time to spin up
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint_json', _external=True)
|
||||||
|
client.post(
|
||||||
|
url_for("import_page"),
|
||||||
|
data={"urls": test_url},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Trigger a check
|
||||||
|
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||||
|
|
||||||
|
# Give the thread time to pick it up
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
res = client.get(
|
||||||
|
url_for("preview_page", uuid="first"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert b'"<b>' in res.data
|
||||||
|
assert res.data.count(b'{\n') >= 2
|
||||||
|
|
||||||
|
|
||||||
def test_check_json_filter(client, live_server):
|
def test_check_json_filter(client, live_server):
|
||||||
json_filter = 'json:boss.name'
|
json_filter = 'json:boss.name'
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,16 @@ def live_server_setup(live_server):
|
||||||
with open("test-datastore/endpoint-content.txt", "r") as f:
|
with open("test-datastore/endpoint-content.txt", "r") as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
@live_server.app.route('/test-endpoint-json')
|
||||||
|
def test_endpoint_json():
|
||||||
|
|
||||||
|
from flask import make_response
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "r") as f:
|
||||||
|
resp = make_response(f.read())
|
||||||
|
resp.headers['Content-Type'] = 'application/json'
|
||||||
|
return resp
|
||||||
|
|
||||||
# Just return the headers in the request
|
# Just return the headers in the request
|
||||||
@live_server.app.route('/test-headers')
|
@live_server.app.route('/test-headers')
|
||||||
def test_headers():
|
def test_headers():
|
||||||
|
|
Ładowanie…
Reference in New Issue