kopia lustrzana https://github.com/dgtlmoon/changedetection.io
New filter - Block change-detection if text matches - for example, block change-detection while the text "out of stock" is on the page, know when the text is no longer on the page (#698)
rodzic
bb732d3d2e
commit
7da32f9ac3
|
@ -225,25 +225,40 @@ class perform_site_check():
|
||||||
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
|
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
|
||||||
|
|
||||||
############ Blocking rules, after checksum #################
|
############ Blocking rules, after checksum #################
|
||||||
blocked_by_not_found_trigger_text = False
|
blocked = False
|
||||||
|
|
||||||
if len(watch['trigger_text']):
|
if len(watch['trigger_text']):
|
||||||
# Yeah, lets block first until something matches
|
# Assume blocked
|
||||||
blocked_by_not_found_trigger_text = True
|
blocked = True
|
||||||
# Filter and trigger works the same, so reuse it
|
# Filter and trigger works the same, so reuse it
|
||||||
# It should return the line numbers that match
|
# It should return the line numbers that match
|
||||||
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
||||||
wordlist=watch['trigger_text'],
|
wordlist=watch['trigger_text'],
|
||||||
mode="line numbers")
|
mode="line numbers")
|
||||||
# If it returned any lines that matched..
|
# Unblock if the trigger was found
|
||||||
if result:
|
if result:
|
||||||
blocked_by_not_found_trigger_text = False
|
blocked = False
|
||||||
|
|
||||||
if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
|
|
||||||
|
if len(watch['text_should_not_be_present']):
|
||||||
|
# If anything matched, then we should block a change from happening
|
||||||
|
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
||||||
|
wordlist=watch['text_should_not_be_present'],
|
||||||
|
mode="line numbers")
|
||||||
|
if result:
|
||||||
|
blocked = True
|
||||||
|
|
||||||
|
# The main thing that all this at the moment comes down to :)
|
||||||
|
if watch['previous_md5'] != fetched_md5:
|
||||||
changed_detected = True
|
changed_detected = True
|
||||||
|
|
||||||
|
# Looks like something changed, but did it match all the rules?
|
||||||
|
if blocked:
|
||||||
|
changed_detected = False
|
||||||
|
else:
|
||||||
update_obj["last_changed"] = timestamp
|
update_obj["last_changed"] = timestamp
|
||||||
|
|
||||||
|
|
||||||
# Extract title as title
|
# Extract title as title
|
||||||
if is_html:
|
if is_html:
|
||||||
if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
|
if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
|
||||||
|
@ -257,5 +272,4 @@ class perform_site_check():
|
||||||
if not watch.get('previous_md5'):
|
if not watch.get('previous_md5'):
|
||||||
watch['previous_md5'] = fetched_md5
|
watch['previous_md5'] = fetched_md5
|
||||||
|
|
||||||
|
|
||||||
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data
|
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data
|
||||||
|
|
|
@ -341,6 +341,8 @@ class watchForm(commonSettingsForm):
|
||||||
method = SelectField('Request method', choices=valid_method, default=default_method)
|
method = SelectField('Request method', choices=valid_method, default=default_method)
|
||||||
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
|
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
|
||||||
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
|
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
|
||||||
|
text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
|
||||||
|
|
||||||
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
||||||
save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
|
save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
|
||||||
proxy = RadioField('Proxy')
|
proxy = RadioField('Proxy')
|
||||||
|
|
|
@ -38,6 +38,7 @@ class model(dict):
|
||||||
'extract_text': [], # Extract text by regex after filters
|
'extract_text': [], # Extract text by regex after filters
|
||||||
'subtractive_selectors': [],
|
'subtractive_selectors': [],
|
||||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||||
|
'text_should_not_be_present': [], # Text that should not be present on the page
|
||||||
'fetch_backend': None,
|
'fetch_backend': None,
|
||||||
'extract_title_as_title': False,
|
'extract_title_as_title': False,
|
||||||
'proxy': None, # Preferred proxy connection
|
'proxy': None, # Preferred proxy connection
|
||||||
|
@ -85,7 +86,7 @@ class model(dict):
|
||||||
# Read the history file as a dict
|
# Read the history file as a dict
|
||||||
fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt")
|
fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt")
|
||||||
if os.path.isfile(fname):
|
if os.path.isfile(fname):
|
||||||
logging.debug("Disk IO accessed " + str(time.time()))
|
logging.debug("Reading history index " + str(time.time()))
|
||||||
with open(fname, "r") as f:
|
with open(fname, "r") as f:
|
||||||
tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
|
tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
|
||||||
|
|
||||||
|
|
|
@ -9,8 +9,6 @@
|
||||||
# exit when any command fails
|
# exit when any command fails
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
export MINIMUM_SECONDS_RECHECK_TIME=0
|
|
||||||
|
|
||||||
find tests/test_*py -type f|while read test_name
|
find tests/test_*py -type f|while read test_name
|
||||||
do
|
do
|
||||||
echo "TEST RUNNING $test_name"
|
echo "TEST RUNNING $test_name"
|
||||||
|
|
|
@ -290,14 +290,15 @@ class ChangeDetectionStore:
|
||||||
headers={'App-Guid': self.__data['app_guid']})
|
headers={'App-Guid': self.__data['app_guid']})
|
||||||
res = r.json()
|
res = r.json()
|
||||||
|
|
||||||
# List of permisable stuff we accept from the wild internet
|
# List of permissible attributes we accept from the wild internet
|
||||||
for k in ['url', 'tag',
|
for k in ['url', 'tag',
|
||||||
'paused', 'title',
|
'paused', 'title',
|
||||||
'previous_md5', 'headers',
|
'previous_md5', 'headers',
|
||||||
'body', 'method',
|
'body', 'method',
|
||||||
'ignore_text', 'css_filter',
|
'ignore_text', 'css_filter',
|
||||||
'subtractive_selectors', 'trigger_text',
|
'subtractive_selectors', 'trigger_text',
|
||||||
'extract_title_as_title', 'extract_text']:
|
'extract_title_as_title', 'extract_text',
|
||||||
|
'text_should_not_be_present']:
|
||||||
if res.get(k):
|
if res.get(k):
|
||||||
apply_extras[k] = res[k]
|
apply_extras[k] = res[k]
|
||||||
|
|
||||||
|
|
|
@ -199,6 +199,22 @@ nav
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
<fieldset>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
{{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock
|
||||||
|
Sold out
|
||||||
|
Not in stock
|
||||||
|
Unavailable") }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
<ul>
|
||||||
|
<li>Block change-detection while this text is on the page. All text and regex are tested <i>case-insensitive</i>. Good for waiting until a product is available again</li>
|
||||||
|
<li>Block text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
|
||||||
|
<li>None of these lines may be present on the page; a match on any one line blocks change-detection (think of each line as "OR")</li>
|
||||||
|
<li>Note: Wrap in forward slashes / to use regex, for example: <code>/foo\d/</code></li>
|
||||||
|
</ul>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</fieldset>
|
||||||
<fieldset>
|
<fieldset>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
|
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
|
||||||
|
|
|
@ -32,6 +32,8 @@ def app(request):
|
||||||
"""Create application for the tests."""
|
"""Create application for the tests."""
|
||||||
datastore_path = "./test-datastore"
|
datastore_path = "./test-datastore"
|
||||||
|
|
||||||
|
# So they don't delay in fetching
|
||||||
|
os.environ["MINIMUM_SECONDS_RECHECK_TIME"] = "0"
|
||||||
try:
|
try:
|
||||||
os.mkdir(datastore_path)
|
os.mkdir(datastore_path)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
|
|
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
from flask import url_for
|
||||||
|
from . util import live_server_setup
|
||||||
|
from changedetectionio import html_tools
|
||||||
|
|
||||||
|
def set_original_ignore_response():
    """Write the baseline HTML page to test-datastore/endpoint-content.txt.

    This version of the page has no "out of stock" paragraph.
    """
    page_lines = [
        "<html>",
        "<body>",
        "Some initial text</br>",
        "<p>Which is across multiple lines</p>",
        "</br>",
        "So let's see what happens. </br>",
        "</body>",
        "</html>",
        # Two empty entries reproduce the blank line and trailing newline
        # that end the page.
        "",
        "",
    ]

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write("\n".join(page_lines))
|
||||||
|
|
||||||
|
|
||||||
|
def set_modified_original_ignore_response():
    """Write a changed HTML page that contains the "out of stock" paragraph.

    The output goes to test-datastore/endpoint-content.txt, replacing
    whatever page was written there before.
    """
    page_lines = [
        "<html>",
        "<body>",
        "Some NEW nice initial text</br>",
        "<p>Which is across multiple lines</p>",
        "</br>",
        "So let's see what happens. </br>",
        "<p>new ignore stuff</p>",
        "<p>out of stock</p>",
        "<p>blah</p>",
        "</body>",
        "</html>",
        # Two empty entries reproduce the blank line and trailing newline
        # that end the page.
        "",
        "",
    ]

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write("\n".join(page_lines))
|
||||||
|
|
||||||
|
|
||||||
|
# Changed page WITHOUT the "out of stock" paragraph; it carries a new
# "now on sale" paragraph instead.
def set_modified_response_minus_block_text():
    """Write a changed HTML page that no longer contains "out of stock".

    The output goes to test-datastore/endpoint-content.txt, replacing
    whatever page was written there before.
    """
    # The "now on sale" paragraph previously ended in a malformed "/p>";
    # it is closed with a proper "</p>" here so the fixture is well-formed.
    test_return_data = """<html>
<body>
Some NEW nice initial text</br>
<p>Which is across multiple lines</p>
<p>now on sale $2</p>
</br>
So let's see what happens. </br>
<p>new ignore stuff</p>
<p>blah</p>
</body>
</html>

"""

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_block_changedetection_text_NOT_present(client, live_server):
    """Check the "block change-detection if text matches" filter end to end.

    Steps:
      1. Import a watch for the test endpoint and set its block text.
      2. Recheck the unchanged page: no change is reported.
      3. Serve a changed page that contains "out of stock": still no change.
      4. Serve a changed page without "out of stock": a change is reported.
    """
    fetch_wait_seconds = 3
    live_server_setup(live_server)
    # Mixed case ("stoCk") to prove matching is case-insensitive.
    block_text = "out of stoCk\r\nfoobar"

    def recheck_then_load_index():
        # Queue a recheck, give the fetch thread time to run, then return
        # the body of the index page.
        client.get(url_for("form_watch_checknow"), follow_redirects=True)
        time.sleep(fetch_wait_seconds)
        return client.get(url_for("index")).data

    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Import the test endpoint URL as a new watch
    test_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # Save the block text on the watch's edit page
    edit_form = {
        "text_should_not_be_present": block_text,
        "url": test_url,
        'fetch_backend': "html_requests",
    }
    save_response = client.post(
        url_for("edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True
    )
    assert b"Updated watch." in save_response.data

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # Reload the edit page and confirm the block text was saved
    edit_page = client.get(url_for("edit_page", uuid="first"))
    assert block_text.encode('utf-8') in edit_page.data

    # Page unchanged: nothing should be reported (no 'unviewed' class)
    index_body = recheck_then_load_index()
    assert b'unviewed' not in index_body
    assert b'/test-endpoint' in index_body

    # The page changed, but it now contains "out of stock", so the change
    # must stay blocked
    set_modified_original_ignore_response()
    index_body = recheck_then_load_index()
    assert b'unviewed' not in index_body
    assert b'/test-endpoint' in index_body

    # The page changed again and "out of stock" is gone: now it should trigger
    set_modified_response_minus_block_text()
    index_body = recheck_then_load_index()
    assert b'unviewed' in index_body

    # Clean up all watches
    delete_response = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in delete_response.data
|
|
@ -98,9 +98,16 @@ class update_worker(threading.Thread):
|
||||||
|
|
||||||
# Notifications should only trigger on the second time (first time, we gather the initial snapshot)
|
# Notifications should only trigger on the second time (first time, we gather the initial snapshot)
|
||||||
if watch.history_n >= 2:
|
if watch.history_n >= 2:
|
||||||
|
print(">> Change detected in UUID {} - {}".format(uuid, watch['url']))
|
||||||
dates = list(watch.history.keys())
|
watch_history = watch.history
|
||||||
prev_fname = watch.history[dates[-2]]
|
dates = list(watch_history.keys())
|
||||||
|
# Theoretically it's possible that this could be just 1 long,
|
||||||
|
# - In the case that the timestamp key was not unique
|
||||||
|
if len(dates) == 1:
|
||||||
|
raise ValueError(
|
||||||
|
"History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
|
||||||
|
)
|
||||||
|
prev_fname = watch_history[dates[-2]]
|
||||||
|
|
||||||
|
|
||||||
# Did it have any notification alerts to hit?
|
# Did it have any notification alerts to hit?
|
||||||
|
|
Ładowanie…
Reference in New Issue