kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes
rodzic
1890881977
commit
9809af142d
|
@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
|
||||
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
|
||||
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
|
||||
form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content']
|
||||
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
|
||||
form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
|
||||
form.notification_title.data = datastore.data['settings']['application']['notification_title']
|
||||
|
@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
|
||||
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
|
||||
datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data
|
||||
datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data
|
||||
|
||||
if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password:
|
||||
datastore.data['settings']['application']['password'] = form.password.encrypted_password
|
||||
|
|
|
@ -4,7 +4,6 @@ import re
|
|||
import time
|
||||
import urllib3
|
||||
|
||||
from inscriptis import get_text
|
||||
from changedetectionio import content_fetcher, html_tools
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
@ -88,7 +87,7 @@ class perform_site_check():
|
|||
|
||||
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
|
||||
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
||||
|
||||
|
||||
if is_json and not has_filter_rule:
|
||||
css_filter_rule = "json:$"
|
||||
has_filter_rule = True
|
||||
|
@ -117,9 +116,14 @@ class perform_site_check():
|
|||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
if has_subtractive_selectors:
|
||||
html_content = html_tools.element_removal(subtractive_selectors, html_content)
|
||||
# get_text() via inscriptis
|
||||
stripped_text_from_html = get_text(html_content)
|
||||
|
||||
# extract text
|
||||
stripped_text_from_html = \
|
||||
html_tools.html_to_text(
|
||||
html_content,
|
||||
render_anchor_tag_content=self.datastore.data["settings"][
|
||||
"application"].get(
|
||||
"render_anchor_tag_content", False)
|
||||
)
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
||||
|
|
|
@ -231,7 +231,7 @@ class ValidateListRegex(object):
|
|||
except re.error:
|
||||
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
|
||||
raise ValidationError(message % (line))
|
||||
|
||||
|
||||
class ValidateCSSJSONXPATHInput(object):
|
||||
"""
|
||||
Filter validation
|
||||
|
@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object):
|
|||
# Re #265 - maybe in the future fetch the page and offer a
|
||||
# warning/notice that its possible the rule doesnt yet match anything?
|
||||
|
||||
|
||||
|
||||
class quickWatchForm(Form):
|
||||
# https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
|
||||
# `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
|
||||
|
@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm):
|
|||
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
||||
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
||||
ignore_whitespace = BooleanField('Ignore whitespace')
|
||||
|
||||
render_anchor_tag_content = BooleanField('Render Anchor Tag Content',
|
||||
default=False)
|
||||
|
||||
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
||||
real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
|
||||
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
|
||||
|
|
|
@ -4,6 +4,9 @@ from typing import List
|
|||
|
||||
from bs4 import BeautifulSoup
|
||||
from jsonpath_ng.ext import parse
|
||||
import re
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
|
||||
|
||||
class JSONNotFound(ValueError):
|
||||
|
@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content):
|
|||
item.decompose()
|
||||
return str(soup)
|
||||
|
||||
|
||||
|
||||
def element_removal(selectors: List[str], html_content):
|
||||
"""Joins individual filters into one css filter."""
|
||||
selector = ",".join(selectors)
|
||||
return subtractive_css_selector(selector, html_content)
|
||||
|
||||
|
||||
|
||||
# Return str Utf-8 of matched rules
|
||||
def xpath_filter(xpath_filter, html_content):
|
||||
|
@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
|||
return ignored_line_numbers
|
||||
|
||||
return "\n".encode('utf8').join(output)
|
||||
|
||||
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
||||
"""Converts html string to a string with just the text. If ignoring
|
||||
rendering anchor tag content is enable, anchor tag content are also
|
||||
included in the text
|
||||
|
||||
:param html_content: string with html content
|
||||
:param render_anchor_tag_content: boolean flag indicating whether to extract
|
||||
hyperlinks (the anchor tag content) together with text. This refers to the
|
||||
'href' inside 'a' tags.
|
||||
Anchor tag content is rendered in the following manner:
|
||||
'[ text ](anchor tag content)'
|
||||
:return: extracted text from the HTML
|
||||
"""
|
||||
# if anchor tag content flag is set to True define a config for
|
||||
# extracting this content
|
||||
if render_anchor_tag_content:
|
||||
|
||||
parser_config = ParserConfig(
|
||||
annotation_rules={"a": ["hyperlink"]}, display_links=True
|
||||
)
|
||||
|
||||
# otherwise set config to None
|
||||
else:
|
||||
parser_config = None
|
||||
|
||||
# get text and annotations via inscriptis
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
|
||||
return text_content
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@ class ChangeDetectionStore:
|
|||
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||
'global_subtractive_selectors': [],
|
||||
'ignore_whitespace': False,
|
||||
'render_anchor_tag_content': False,
|
||||
'notification_urls': [], # Apprise URL list
|
||||
# Custom notification content
|
||||
'notification_title': default_notification_title,
|
||||
|
|
|
@ -91,10 +91,16 @@
|
|||
<fieldset class="pure-group">
|
||||
{{ render_field(form.ignore_whitespace) }}
|
||||
<span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
|
||||
<i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc.
|
||||
<i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
|
||||
</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-group">
|
||||
{{ render_field(form.render_anchor_tag_content) }}
|
||||
<span class="pure-form-message-inline">Render anchor tag content, default disabled, when enabled renders links as <code>(link text)[https://somesite.com]</code>
|
||||
<br/>
|
||||
<i>Note:</i> Changing this could affect the content of your existing watches, possibly trigger alerts etc.
|
||||
</span>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="pure-group">
|
||||
{{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
|
||||
footer
|
||||
|
|
|
@ -26,7 +26,8 @@ def test_snapshot_api_detects_change(client, live_server):
|
|||
time.sleep(1)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
|
||||
test_url = url_for('test_endpoint', content_type="text/plain",
|
||||
_external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
#!/usr/bin/python3
|
||||
"""Test suite for the method to extract text from an html string"""
|
||||
from ..html_tools import html_to_text
|
||||
|
||||
|
||||
def test_html_to_text_func():
|
||||
test_html = """<html>
|
||||
<body>
|
||||
Some initial text</br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<a href="/first_link"> More Text </a>
|
||||
</br>
|
||||
So let's see what happens. </br>
|
||||
<a href="second_link.com"> Even More Text </a>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
# extract text, with 'render_anchor_tag_content' set to False
|
||||
text_content = html_to_text(test_html, render_anchor_tag_content=False)
|
||||
|
||||
no_links_text = \
|
||||
"Some initial text\n\nWhich is across multiple " \
|
||||
"lines\n\nMore Text So let's see what happens. Even More Text"
|
||||
|
||||
# check that no links are in the extracted text
|
||||
assert text_content == no_links_text
|
||||
|
||||
# extract text, with 'render_anchor_tag_content' set to True
|
||||
text_content = html_to_text(test_html, render_anchor_tag_content=True)
|
||||
|
||||
links_text = \
|
||||
"Some initial text\n\nWhich is across multiple lines\n\n[ More Text " \
|
||||
"](/first_link) So let's see what happens. [ Even More Text ]" \
|
||||
"(second_link.com)"
|
||||
|
||||
# check that links are present in the extracted text
|
||||
assert text_content == links_text
|
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/python3
|
||||
"""Test suite for the render/not render anchor tag content functionality"""
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup
|
||||
|
||||
|
||||
def test_setup(live_server):
|
||||
live_server_setup(live_server)
|
||||
|
||||
def set_original_ignore_response():
|
||||
test_return_data = """<html>
|
||||
<body>
|
||||
Some initial text</br>
|
||||
<a href="/original_link"> Some More Text </a>
|
||||
</br>
|
||||
So let's see what happens. </br>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(test_return_data)
|
||||
|
||||
|
||||
# Should be the same as set_original_ignore_response() but with a different
|
||||
# link
|
||||
def set_modified_ignore_response():
|
||||
test_return_data = """<html>
|
||||
<body>
|
||||
Some initial text</br>
|
||||
<a href="/modified_link"> Some More Text </a>
|
||||
</br>
|
||||
So let's see what happens. </br>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(test_return_data)
|
||||
|
||||
def test_render_anchor_tag_content_true(client, live_server):
|
||||
"""Testing that the link changes are detected when
|
||||
render_anchor_tag_content setting is set to true"""
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
# set original html text
|
||||
set_original_ignore_response()
|
||||
|
||||
# Goto the settings page, choose not to ignore links
|
||||
res = client.post(
|
||||
url_for("settings_page"),
|
||||
data={
|
||||
"minutes_between_check": 180,
|
||||
"render_anchor_tag_content": "true",
|
||||
"fetch_backend": "html_requests",
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
assert b"Settings updated." in res.data
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for("test_endpoint", _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"), data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# set a new html text with a modified link
|
||||
set_modified_ignore_response()
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# check that the anchor tag content is rendered
|
||||
res = client.get(url_for("preview_page", uuid="first"))
|
||||
assert '(/modified_link)' in res.data.decode()
|
||||
|
||||
# since the link has changed, and we chose to render anchor tag content,
|
||||
# we should detect a change (new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b"unviewed" in res.data
|
||||
assert b"/test-endpoint" in res.data
|
||||
|
||||
# Cleanup everything
|
||||
res = client.get(url_for("api_delete", uuid="all"),
|
||||
follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
|
||||
def test_render_anchor_tag_content_false(client, live_server):
|
||||
"""Testing that anchor tag content changes are ignored when
|
||||
render_anchor_tag_content setting is set to false"""
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
# set the original html text
|
||||
set_original_ignore_response()
|
||||
|
||||
# Goto the settings page, choose to ignore hyperlinks
|
||||
res = client.post(
|
||||
url_for("settings_page"),
|
||||
data={
|
||||
"minutes_between_check": 180,
|
||||
"render_anchor_tag_content": "false",
|
||||
"fetch_backend": "html_requests",
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
assert b"Settings updated." in res.data
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for("test_endpoint", _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# set a new html text, with a modified link
|
||||
set_modified_ignore_response()
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# check that the anchor tag content is not rendered
|
||||
res = client.get(url_for("preview_page", uuid="first"))
|
||||
assert '(/modified_link)' not in res.data.decode()
|
||||
|
||||
# even though the link has changed, we shouldn't detect a change since
|
||||
# we selected to not render anchor tag content (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b"unviewed" not in res.data
|
||||
assert b"/test-endpoint" in res.data
|
||||
|
||||
# Cleanup everything
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
|
||||
def test_render_anchor_tag_content_default(client, live_server):
|
||||
"""Testing that anchor tag content changes are ignored when the
|
||||
render_anchor_tag_content setting is not explicitly selected"""
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
# set the original html text
|
||||
set_original_ignore_response()
|
||||
|
||||
# Goto the settings page, not passing the render_anchor_tag_content setting
|
||||
res = client.post(
|
||||
url_for("settings_page"),
|
||||
data={
|
||||
"minutes_between_check": 180,
|
||||
"fetch_backend": "html_requests",
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
assert b"Settings updated." in res.data
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for("test_endpoint", _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# set a new html text, with a modified link
|
||||
set_modified_ignore_response()
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# check that the anchor tag content is not rendered
|
||||
res = client.get(url_for("preview_page", uuid="first"))
|
||||
assert '(/modified_link)' not in res.data.decode()
|
||||
|
||||
# even though the link has changed, we shouldn't detect a change since
|
||||
# we did not select the setting and the default behaviour is to not
|
||||
# render anchor tag content (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b"unviewed" not in res.data
|
||||
assert b"/test-endpoint" in res.data
|
||||
|
||||
# Cleanup everything
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
Ładowanie…
Reference in New Issue