kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Conditions - Levenshtein text similarity plugin - adding test, fixing import, fixing check for watches with 1 snapshot history (#3161)
rodzic
d0da8c9825
commit
5fd8200fd9
|
@ -5,7 +5,7 @@ from json_logic.builtins import BUILTINS
|
||||||
from .exceptions import EmptyConditionRuleRowNotUsable
|
from .exceptions import EmptyConditionRuleRowNotUsable
|
||||||
from .pluggy_interface import plugin_manager # Import the pluggy plugin manager
|
from .pluggy_interface import plugin_manager # Import the pluggy plugin manager
|
||||||
from . import default_plugin
|
from . import default_plugin
|
||||||
|
from loguru import logger
|
||||||
# List of all supported JSON Logic operators
|
# List of all supported JSON Logic operators
|
||||||
operator_choices = [
|
operator_choices = [
|
||||||
(None, "Choose one - Operator"),
|
(None, "Choose one - Operator"),
|
||||||
|
@ -113,12 +113,14 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
|
||||||
application_datastruct=application_datastruct,
|
application_datastruct=application_datastruct,
|
||||||
ephemeral_data=ephemeral_data
|
ephemeral_data=ephemeral_data
|
||||||
)
|
)
|
||||||
|
logger.debug(f"Trying plugin {plugin}....")
|
||||||
|
|
||||||
# Set a timeout of 10 seconds
|
# Set a timeout of 10 seconds
|
||||||
try:
|
try:
|
||||||
new_execute_data = future.result(timeout=10)
|
new_execute_data = future.result(timeout=10)
|
||||||
if new_execute_data and isinstance(new_execute_data, dict):
|
if new_execute_data and isinstance(new_execute_data, dict):
|
||||||
EXECUTE_DATA.update(new_execute_data)
|
EXECUTE_DATA.update(new_execute_data)
|
||||||
|
|
||||||
except concurrent.futures.TimeoutError:
|
except concurrent.futures.TimeoutError:
|
||||||
# The plugin took too long, abort processing for this watch
|
# The plugin took too long, abort processing for this watch
|
||||||
raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")
|
raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")
|
||||||
|
|
|
@ -9,15 +9,20 @@ def levenshtein_ratio_recent_history(watch, incoming_text=None):
|
||||||
try:
|
try:
|
||||||
from Levenshtein import ratio, distance
|
from Levenshtein import ratio, distance
|
||||||
k = list(watch.history.keys())
|
k = list(watch.history.keys())
|
||||||
if len(k) >= 2:
|
a = None
|
||||||
# When called from ui_edit_stats_extras, we don't have incoming_text
|
b = None
|
||||||
if incoming_text is None:
|
|
||||||
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest snapshot
|
# When called from ui_edit_stats_extras, we don't have incoming_text
|
||||||
b = watch.get_history_snapshot(timestamp=k[-2]) # Previous snapshot
|
if incoming_text is None:
|
||||||
else:
|
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest snapshot
|
||||||
a = watch.get_history_snapshot(timestamp=k[-2]) # Second newest, incoming_text will be "newest"
|
b = watch.get_history_snapshot(timestamp=k[-2]) # Previous snapshot
|
||||||
b = incoming_text
|
|
||||||
|
# Needs at least one snapshot
|
||||||
|
elif len(k) >= 1: # Should be atleast one snapshot to compare against
|
||||||
|
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest saved snapshot
|
||||||
|
b = incoming_text if incoming_text else k[-2]
|
||||||
|
|
||||||
|
if a and b:
|
||||||
distance_value = distance(a, b)
|
distance_value = distance(a, b)
|
||||||
ratio_value = ratio(a, b)
|
ratio_value = ratio(a, b)
|
||||||
return {
|
return {
|
||||||
|
@ -53,7 +58,7 @@ def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
|
||||||
# ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc
|
# ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc
|
||||||
|
|
||||||
if watch and 'text' in ephemeral_data:
|
if watch and 'text' in ephemeral_data:
|
||||||
lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data['text'])
|
lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data.get('text',''))
|
||||||
if isinstance(lev_data, dict):
|
if isinstance(lev_data, dict):
|
||||||
res['levenshtein_ratio'] = lev_data.get('ratio', 0)
|
res['levenshtein_ratio'] = lev_data.get('ratio', 0)
|
||||||
res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)
|
res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)
|
||||||
|
|
|
@ -196,7 +196,11 @@ def test_condition_validate_rule_row(client, live_server):
|
||||||
)
|
)
|
||||||
assert res.status_code == 200
|
assert res.status_code == 200
|
||||||
assert b'false' in res.data
|
assert b'false' in res.data
|
||||||
|
# cleanup for the next
|
||||||
|
client.get(
|
||||||
|
url_for("ui.form_delete", uuid="all"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -235,4 +239,107 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Assert the word count is counted correctly
|
# Assert the word count is counted correctly
|
||||||
assert b'<td>13</td>' in res.data
|
assert b'<td>13</td>' in res.data
|
||||||
|
|
||||||
|
# cleanup for the next
|
||||||
|
client.get(
|
||||||
|
url_for("ui.form_delete", uuid="all"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# If there was only a change in the whitespace, then we shouldn't have a change detected
|
||||||
|
def test_lev_conditions_plugin(client, live_server, measure_memory_usage):
|
||||||
|
#live_server_setup(live_server)
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write("""<html>
|
||||||
|
<body>
|
||||||
|
Some initial text<br>
|
||||||
|
<p>Which is across multiple lines</p>
|
||||||
|
<br>
|
||||||
|
So let's see what happens. <br>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint', _external=True)
|
||||||
|
res = client.post(
|
||||||
|
url_for("ui.ui_views.form_quick_watch_add"),
|
||||||
|
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"Watch added in Paused state, saving will unpause" in res.data
|
||||||
|
|
||||||
|
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
|
||||||
|
# Give the thread time to pick it up
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
res = client.post(
|
||||||
|
url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
|
||||||
|
data={
|
||||||
|
"url": test_url,
|
||||||
|
"fetch_backend": "html_requests",
|
||||||
|
"conditions_match_logic": "ALL", # ALL = AND logic
|
||||||
|
"conditions-0-field": "levenshtein_ratio",
|
||||||
|
"conditions-0-operator": "<",
|
||||||
|
"conditions-0-value": "0.8" # needs to be more of a diff to trigger a change
|
||||||
|
},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert b"unpaused" in res.data
|
||||||
|
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
res = client.get(url_for("watchlist.index"))
|
||||||
|
assert b'unviewed' not in res.data
|
||||||
|
|
||||||
|
# Check the content saved initially, even though a condition was set - this is the first snapshot so it shouldn't be affected by conditions
|
||||||
|
res = client.get(
|
||||||
|
url_for("ui.ui_views.preview_page", uuid=uuid),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b'Which is across multiple lines' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
############### Now change it a LITTLE bit...
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write("""<html>
|
||||||
|
<body>
|
||||||
|
Some initial text<br>
|
||||||
|
<p>Which is across multiple lines</p>
|
||||||
|
<br>
|
||||||
|
So let's see what happenxxxxxxxxx. <br>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""")
|
||||||
|
|
||||||
|
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||||
|
assert b'Queued 1 watch for rechecking.' in res.data
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
|
res = client.get(url_for("watchlist.index"))
|
||||||
|
assert b'unviewed' not in res.data #because this will be like 0.90 not 0.8 threshold
|
||||||
|
|
||||||
|
############### Now change it by MORE THAN 50%
|
||||||
|
test_return_data = """<html>
|
||||||
|
<body>
|
||||||
|
Some sxxxx<br>
|
||||||
|
<p>Which is across a lines</p>
|
||||||
|
<br>
|
||||||
|
ok. <br>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write(test_return_data)
|
||||||
|
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||||
|
assert b'Queued 1 watch for rechecking.' in res.data
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
res = client.get(url_for("watchlist.index"))
|
||||||
|
assert b'unviewed' in res.data
|
||||||
|
# cleanup for the next
|
||||||
|
client.get(
|
||||||
|
url_for("ui.form_delete", uuid="all"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
|
@ -90,6 +90,8 @@ extruct
|
||||||
# For cleaning up unknown currency formats
|
# For cleaning up unknown currency formats
|
||||||
babel
|
babel
|
||||||
|
|
||||||
|
levenshtein
|
||||||
|
|
||||||
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
|
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
|
||||||
greenlet >= 3.0.3
|
greenlet >= 3.0.3
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue