kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Text filters - Adding filters "Trim whitespace" and "Remove duplicate lines"
rodzic
c6589ee1b4
commit
e830fb2320
|
@ -480,8 +480,10 @@ class processor_text_json_diff_form(commonSettingsForm):
|
|||
body = TextAreaField('Request body', [validators.Optional()])
|
||||
method = SelectField('Request method', choices=valid_method, default=default_method)
|
||||
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
|
||||
check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
|
||||
check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
|
||||
remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
|
||||
sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False)
|
||||
trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)
|
||||
|
||||
filter_text_added = BooleanField('Added lines', default=True)
|
||||
filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
|
||||
|
|
|
@ -60,6 +60,8 @@ class watch_base(dict):
|
|||
'time_between_check_use_default': True,
|
||||
'title': None,
|
||||
'track_ldjson_price_data': None,
|
||||
'trim_text_whitespace': False,
|
||||
'remove_duplicate_lines': False,
|
||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||
'url': '',
|
||||
'uuid': str(uuid.uuid4()),
|
||||
|
|
|
@ -218,11 +218,19 @@ class perform_site_check(difference_detection_processor):
|
|||
is_rss=is_rss)) #1874 activate the <title workaround hack
|
||||
stripped_text_from_html = future.result()
|
||||
|
||||
if watch.get('sort_text_alphabetically') and stripped_text_from_html:
|
||||
|
||||
if watch.get('trim_text_whitespace'):
|
||||
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
|
||||
|
||||
if watch.get('remove_duplicate_lines'):
|
||||
stripped_text_from_html = '\n'.join(dict.fromkeys(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
|
||||
|
||||
if watch.get('sort_text_alphabetically'):
|
||||
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
|
||||
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
|
||||
stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
|
||||
stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
|
||||
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
|
||||
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
|
||||
|
||||
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
@ -304,7 +312,7 @@ class perform_site_check(difference_detection_processor):
|
|||
for match in res:
|
||||
regex_matched_output += [match] + [b'\n']
|
||||
|
||||
# Now we will only show what the regex matched
|
||||
##########################################################
|
||||
stripped_text_from_html = b''
|
||||
text_content_before_ignored_filter = b''
|
||||
if regex_matched_output:
|
||||
|
@ -312,6 +320,8 @@ class perform_site_check(difference_detection_processor):
|
|||
stripped_text_from_html = b''.join(regex_matched_output)
|
||||
text_content_before_ignored_filter = stripped_text_from_html
|
||||
|
||||
|
||||
|
||||
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
||||
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
||||
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
|
||||
|
|
|
@ -331,11 +331,22 @@ nav
|
|||
<span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
|
||||
<span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.check_unique_lines) }}
|
||||
<span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.remove_duplicate_lines) }}
|
||||
<span class="pure-form-message-inline">Remove duplicate lines of text</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.sort_text_alphabetically) }}
|
||||
<span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.trim_text_whitespace) }}
|
||||
<span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.check_unique_lines) }}
|
||||
<span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
|
||||
|
|
|
@ -11,6 +11,8 @@ def set_original_ignore_response():
|
|||
<p>Some initial text</p>
|
||||
<p>Which is across multiple lines</p>
|
||||
<p>So let's see what happens.</p>
|
||||
<p> So let's see what happens. <br> </p>
|
||||
<p>A - sortable line</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
@ -164,5 +166,52 @@ def test_sort_lines_functionality(client, live_server, measure_memory_usage):
|
|||
assert res.data.find(b'A uppercase') < res.data.find(b'Z last')
|
||||
assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines')
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
|
||||
def test_extra_filters(client, live_server, measure_memory_usage):
|
||||
#live_server_setup(live_server)
|
||||
|
||||
set_original_ignore_response()
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("edit_page", uuid="first"),
|
||||
data={"remove_duplicate_lines": "y",
|
||||
"trim_text_whitespace": "y",
|
||||
"sort_text_alphabetically": "", # leave this OFF for testing
|
||||
"url": test_url,
|
||||
"fetch_backend": "html_requests"},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"Updated watch." in res.data
|
||||
# Give the thread time to pick it up
|
||||
wait_for_all_checks(client)
|
||||
# Trigger a check
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(
|
||||
url_for("preview_page", uuid="first")
|
||||
)
|
||||
|
||||
assert res.data.count(b"see what happens.") == 1
|
||||
|
||||
# still should remain unsorted ('A - sortable line') stays at the end
|
||||
assert res.data.find(b'A - sortable line') > res.data.find(b'Which is across multiple lines')
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
Ładowanie…
Reference in New Issue