diff --git a/changedetectionio/conditions/__init__.py b/changedetectionio/conditions/__init__.py
index 05ef53e2..c09e526d 100644
--- a/changedetectionio/conditions/__init__.py
+++ b/changedetectionio/conditions/__init__.py
@@ -5,7 +5,7 @@ from json_logic.builtins import BUILTINS
 from .exceptions import EmptyConditionRuleRowNotUsable
 from .pluggy_interface import plugin_manager  # Import the pluggy plugin manager
 from . import default_plugin
-
+from loguru import logger
 # List of all supported JSON Logic operators
 operator_choices = [
     (None, "Choose one - Operator"),
@@ -113,12 +113,14 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
                 application_datastruct=application_datastruct,
                 ephemeral_data=ephemeral_data
             )
-
+            logger.debug(f"Trying plugin {plugin}....")
+
             # Set a timeout of 10 seconds
             try:
                 new_execute_data = future.result(timeout=10)
                 if new_execute_data and isinstance(new_execute_data, dict):
                     EXECUTE_DATA.update(new_execute_data)
+
             except concurrent.futures.TimeoutError:
                 # The plugin took too long, abort processing for this watch
                 raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")
diff --git a/changedetectionio/conditions/plugins/levenshtein_plugin.py b/changedetectionio/conditions/plugins/levenshtein_plugin.py
index df8341d9..dbd32391 100644
--- a/changedetectionio/conditions/plugins/levenshtein_plugin.py
+++ b/changedetectionio/conditions/plugins/levenshtein_plugin.py
@@ -9,15 +9,20 @@ def levenshtein_ratio_recent_history(watch, incoming_text=None):
     try:
         from Levenshtein import ratio, distance
         k = list(watch.history.keys())
-        if len(k) >= 2:
-            # When called from ui_edit_stats_extras, we don't have incoming_text
-            if incoming_text is None:
-                a = watch.get_history_snapshot(timestamp=k[-1])  # Latest snapshot
-                b = watch.get_history_snapshot(timestamp=k[-2])  # Previous snapshot
-            else:
-                a = watch.get_history_snapshot(timestamp=k[-2])  # Second newest, incoming_text will be "newest"
-                b = incoming_text
-
+        a = None
+        b = None
+
+        if incoming_text is None:
+            # Called from ui_edit_stats_extras without incoming_text: compare the two newest snapshots
+            if len(k) >= 2:  # k[-2] only exists once at least two snapshots are saved
+                a = watch.get_history_snapshot(timestamp=k[-1])  # Latest snapshot
+                b = watch.get_history_snapshot(timestamp=k[-2])  # Previous snapshot
+        elif len(k) >= 1:
+            # incoming_text acts as the newest version; needs at least one saved snapshot to compare against
+            a = watch.get_history_snapshot(timestamp=k[-1])  # Latest saved snapshot
+            b = incoming_text  # Never fall back to k[-2]: that is a history key, not snapshot text
+
+        if a and b:
             distance_value = distance(a, b)
             ratio_value = ratio(a, b)
             return {
@@ -53,7 +58,7 @@ def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
 
     # ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc
     if watch and 'text' in ephemeral_data:
-        lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data['text'])
+        lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data.get('text',''))
         if isinstance(lev_data, dict):
             res['levenshtein_ratio'] = lev_data.get('ratio', 0)
             res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)
diff --git a/changedetectionio/tests/test_conditions.py b/changedetectionio/tests/test_conditions.py
index fdb6d880..14dde024 100644
--- a/changedetectionio/tests/test_conditions.py
+++ b/changedetectionio/tests/test_conditions.py
@@ -196,7 +196,11 @@ def test_condition_validate_rule_row(client, live_server):
     )
     assert res.status_code == 200
     assert b'false' in res.data
-
+    # cleanup for the next test
+    client.get(
+        url_for("ui.form_delete", uuid="all"),
+        follow_redirects=True
+    )
 
 
 
@@ -235,4 +239,107 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage):
     )
 
     # Assert the word count is counted correctly
-    assert b'13' in res.data
\ No newline at end of file
+    assert b'13' in res.data
+
+    # cleanup for the next test
+    client.get(
+        url_for("ui.form_delete", uuid="all"),
+        follow_redirects=True
+    )
+
+# If there was only a
small change in the text (levenshtein_ratio stays at or above the 0.8 threshold), then we shouldn't have a change detected; a large change must be detected +def test_lev_conditions_plugin(client, live_server, measure_memory_usage): + #live_server_setup(live_server) + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(""" + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+ + + """) + + # Add our URL via the quick-add form, using the "Edit > Watch" button so the watch starts paused + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("ui.ui_views.form_quick_watch_add"), + data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'}, + follow_redirects=True + ) + assert b"Watch added in Paused state, saving will unpause" in res.data + + uuid = next(iter(live_server.app.config['DATASTORE'].data['watching'])) + # Give the thread time to pick it up + wait_for_all_checks(client) + res = client.post( + url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1), + data={ + "url": test_url, + "fetch_backend": "html_requests", + "conditions_match_logic": "ALL", # ALL = AND logic + "conditions-0-field": "levenshtein_ratio", + "conditions-0-operator": "<", + "conditions-0-value": "0.8" # ratio must drop below 0.8 (i.e. a bigger diff) to trigger a change + }, + follow_redirects=True + ) + + assert b"unpaused" in res.data + + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'unviewed' not in res.data + + # Check the content saved initially, even though a condition was set - this is the first snapshot so it shouldn't be affected by conditions + res = client.get( + url_for("ui.ui_views.preview_page", uuid=uuid), + follow_redirects=True + ) + assert b'Which is across multiple lines' in res.data + + + ############### Now change it a LITTLE bit... + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(""" + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happenxxxxxxxxx.
+ + + """) + + res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + assert b'Queued 1 watch for rechecking.' in res.data + wait_for_all_checks(client) + + res = client.get(url_for("watchlist.index")) + assert b'unviewed' not in res.data # because the ratio will be about 0.90, not below the 0.8 threshold + + ############### Now change it by MORE THAN 50% + test_return_data = """ + + Some sxxxx
+

Which is across a lines

+
+ ok.
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + assert b'Queued 1 watch for rechecking.' in res.data + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'unviewed' in res.data + # cleanup for the next test + client.get( + url_for("ui.form_delete", uuid="all"), + follow_redirects=True + ) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 83f97720..c6c0495a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -90,6 +90,8 @@ extruct # For cleaning up unknown currency formats babel +levenshtein + # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096 greenlet >= 3.0.3