kopia lustrzana https://github.com/dgtlmoon/changedetection.io
UI + Fetching - Be more helpful when a filter contains no text, suggest ways to deal with images in filters (#1819)
rodzic
1e88136325
commit
e30b17b8bc
|
@ -77,11 +77,13 @@ class ScreenshotUnavailable(Exception):
|
||||||
|
|
||||||
|
|
||||||
class ReplyWithContentButNoText(Exception):
|
class ReplyWithContentButNoText(Exception):
|
||||||
def __init__(self, status_code, url, screenshot=None):
|
def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
|
||||||
# Set this so we can use it in other parts of the app
|
# Set this so we can use it in other parts of the app
|
||||||
self.status_code = status_code
|
self.status_code = status_code
|
||||||
self.url = url
|
self.url = url
|
||||||
self.screenshot = screenshot
|
self.screenshot = screenshot
|
||||||
|
self.has_filters = has_filters
|
||||||
|
self.html_content = html_content
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -314,7 +314,12 @@ class perform_site_check(difference_detection_processor):
|
||||||
# Treat pages with no renderable text content as a change? No by default
|
# Treat pages with no renderable text content as a change? No by default
|
||||||
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
||||||
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
||||||
raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
|
raise content_fetcher.ReplyWithContentButNoText(url=url,
|
||||||
|
status_code=fetcher.get_last_status_code(),
|
||||||
|
screenshot=screenshot,
|
||||||
|
has_filters=has_filter_rule,
|
||||||
|
html_content=html_content
|
||||||
|
)
|
||||||
|
|
||||||
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
||||||
# in the future we'll implement other mechanisms.
|
# in the future we'll implement other mechanisms.
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import time
|
import time
|
||||||
from flask import url_for
|
from flask import url_for
|
||||||
from . util import live_server_setup
|
from .util import live_server_setup, wait_for_all_checks
|
||||||
|
|
||||||
from ..html_tools import *
|
from ..html_tools import *
|
||||||
|
|
||||||
|
@ -176,3 +176,77 @@ def test_check_multiple_filters(client, live_server):
|
||||||
assert b"Blob A" in res.data # CSS was ok
|
assert b"Blob A" in res.data # CSS was ok
|
||||||
assert b"Blob B" in res.data # xPath was ok
|
assert b"Blob B" in res.data # xPath was ok
|
||||||
assert b"Blob C" not in res.data # Should not be included
|
assert b"Blob C" not in res.data # Should not be included
|
||||||
|
|
||||||
|
# The filter exists, but did not contain anything useful
|
||||||
|
# Mainly used when the filter contains just an IMG, this can happen when someone selects an image in the visual-selector
|
||||||
|
# Tests fetcher can throw a "ReplyWithContentButNoText" exception after applying filter and extracting text
|
||||||
|
def test_filter_is_empty_help_suggestion(client, live_server):
|
||||||
|
#live_server_setup(live_server)
|
||||||
|
|
||||||
|
include_filters = "#blob-a"
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write("""<html><body>
|
||||||
|
<div id="blob-a">
|
||||||
|
<img src="something.jpg">
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint', _external=True)
|
||||||
|
res = client.post(
|
||||||
|
url_for("import_page"),
|
||||||
|
data={"urls": test_url},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"1 Imported" in res.data
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
|
# Go to the edit page and set the include filter for this watch
|
||||||
|
# Submit the watch edit form (same URL, now with the include filter applied)
|
||||||
|
res = client.post(
|
||||||
|
url_for("edit_page", uuid="first"),
|
||||||
|
data={"include_filters": include_filters,
|
||||||
|
"url": test_url,
|
||||||
|
"tags": "",
|
||||||
|
"headers": "",
|
||||||
|
'fetch_backend': "html_requests"},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"Updated watch." in res.data
|
||||||
|
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
|
|
||||||
|
res = client.get(
|
||||||
|
url_for("index"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert b'empty result or contain only an image' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
### Just an empty selector, no image
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write("""<html><body>
|
||||||
|
<div id="blob-a">
|
||||||
|
<!-- doo doo -->
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""")
|
||||||
|
|
||||||
|
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
|
res = client.get(
|
||||||
|
url_for("index"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert b'empty result or contain only an image' not in res.data
|
||||||
|
assert b'but contained no usable text' in res.data
|
||||||
|
|
|
@ -3,7 +3,7 @@ import threading
|
||||||
import queue
|
import queue
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from changedetectionio import content_fetcher
|
from changedetectionio import content_fetcher, html_tools
|
||||||
from .processors.text_json_diff import FilterNotFoundInResponse
|
from .processors.text_json_diff import FilterNotFoundInResponse
|
||||||
from .processors.restock_diff import UnableToExtractRestockData
|
from .processors.restock_diff import UnableToExtractRestockData
|
||||||
|
|
||||||
|
@ -251,7 +251,20 @@ class update_worker(threading.Thread):
|
||||||
# Totally fine, it's by choice - just continue on, nothing more to care about
|
# Totally fine, it's by choice - just continue on, nothing more to care about
|
||||||
# Page had elements/content but no renderable text
|
# Page had elements/content but no renderable text
|
||||||
# Backend (not filters) gave zero output
|
# Backend (not filters) gave zero output
|
||||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
|
extra_help = ""
|
||||||
|
if e.has_filters:
|
||||||
|
# Maybe it contains an image? offer a more helpful link
|
||||||
|
has_img = html_tools.include_filters(include_filters='img',
|
||||||
|
html_content=e.html_content)
|
||||||
|
if has_img:
|
||||||
|
extra_help = ", it's possible that the filters you have give an empty result or contain only an image <a href=\"https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images\">more help here</a>."
|
||||||
|
else:
|
||||||
|
extra_help = ", it's possible that the filters were found, but contained no usable text."
|
||||||
|
|
||||||
|
self.datastore.update_watch(uuid=uuid, update_obj={
|
||||||
|
'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
|
||||||
|
})
|
||||||
|
|
||||||
if e.screenshot:
|
if e.screenshot:
|
||||||
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
|
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
|
||||||
process_changedetection_results = False
|
process_changedetection_results = False
|
||||||
|
|
Ładowanie…
Reference in New Issue