Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes

2022-04-09 10:35:14 +02:00 · 2022-04-09 10:35:14 +02:00 · 9809af142d
commit 9809af142d
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None):
            form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
            form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
            form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
+            form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content']
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
            form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
            form.notification_title.data = datastore.data['settings']['application']['notification_title']
@@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['settings']['application']['global_ignore_text'] =  form.global_ignore_text.data
            datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
            datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data
+            datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data

            if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password:
                datastore.data['settings']['application']['password'] = form.password.encrypted_password
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -4,7 +4,6 @@ import re
 import time
 import urllib3

-from inscriptis import get_text
 from changedetectionio import content_fetcher, html_tools

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -88,7 +87,7 @@ class perform_site_check():

            has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
            has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
-            
+
            if is_json and not has_filter_rule:
                css_filter_rule = "json:$"
                has_filter_rule = True
@@ -117,9 +116,14 @@ class perform_site_check():
                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
                    if has_subtractive_selectors:
                        html_content = html_tools.element_removal(subtractive_selectors, html_content)
-                    # get_text() via inscriptis
-                    stripped_text_from_html = get_text(html_content)
-
+                    # extract text
+                    stripped_text_from_html = \
+                        html_tools.html_to_text(
+                            html_content,
+                            render_anchor_tag_content=self.datastore.data["settings"][
+                                "application"].get(
+                                "render_anchor_tag_content", False)
+                        )
            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -231,7 +231,7 @@ class ValidateListRegex(object):
                except re.error:
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))
-              
+
 class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
@@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object):
                # Re #265 - maybe in the future fetch the page and offer a
                # warning/notice that its possible the rule doesnt yet match anything?

-            
+
 class quickWatchForm(Form):
    # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
    # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
@@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm):
    global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
    ignore_whitespace = BooleanField('Ignore whitespace')
+
+    render_anchor_tag_content = BooleanField('Render Anchor Tag Content',
+                                             default=False)
+
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
    real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
    removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -4,6 +4,9 @@ from typing import List

 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
+import re
+from inscriptis import get_text
+from inscriptis.model.config import ParserConfig


 class JSONNotFound(ValueError):
@@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content):
        item.decompose()
    return str(soup)

-    
+
 def element_removal(selectors: List[str], html_content):
    """Joins individual filters into one css filter."""
    selector = ",".join(selectors)
    return subtractive_css_selector(selector, html_content)
-    
+

 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content):
@@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"):
        return ignored_line_numbers

    return "\n".encode('utf8').join(output)
+
+
+def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
+    """Converts html string to a string with just the text. If rendering
+    anchor tag content is enabled, the anchor tag content is also included
+    in the text
+
+    :param html_content: string with html content
+    :param render_anchor_tag_content: boolean flag indicating whether to extract
+    hyperlinks (the anchor tag content) together with text. This refers to the
+    'href' inside 'a' tags.
+    Anchor tag content is rendered in the following manner:
+    '[ text ](anchor tag content)'
+    :return: extracted text from the HTML
+    """
+    #  if anchor tag content flag is set to True define a config for
+    #  extracting this content
+    if render_anchor_tag_content:
+
+        parser_config = ParserConfig(
+            annotation_rules={"a": ["hyperlink"]}, display_links=True
+        )
+
+    # otherwise set config to None
+    else:
+        parser_config = None
+
+    # get text and annotations via inscriptis
+    text_content = get_text(html_content, config=parser_config)
+
+    return text_content
+
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -52,6 +52,7 @@ class ChangeDetectionStore:
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
                    'global_subtractive_selectors': [],
                    'ignore_whitespace': False,
+                    'render_anchor_tag_content': False,
                    'notification_urls': [], # Apprise URL list
                    # Custom notification content
                    'notification_title': default_notification_title,
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -91,10 +91,16 @@
                    <fieldset class="pure-group">
                    {{ render_field(form.ignore_whitespace) }}
                    <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
-                    <i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc.
+                    <i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
+                    </span>
+                    </fieldset>
+                <fieldset class="pure-group">
+                    {{ render_field(form.render_anchor_tag_content) }}
+                    <span class="pure-form-message-inline">Render anchor tag content, default disabled, when enabled renders links as <code>[ link text ](https://somesite.com)</code>
+                        <br/>
+                    <i>Note:</i> Changing this could affect the content of your existing watches, possibly trigger alerts etc.
                    </span>
                    </fieldset>
-
                    <fieldset class="pure-group">
                      {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
 footer
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@@ -26,7 +26,8 @@ def test_snapshot_api_detects_change(client, live_server):
    time.sleep(1)

    # Add our URL to the import page
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
+    test_url = url_for('test_endpoint', content_type="text/plain",
+                       _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
--- a/changedetectionio/tests/test_html_to_text.py
+++ b/changedetectionio/tests/test_html_to_text.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+"""Test suite for the method to extract text from an html string"""
+from ..html_tools import html_to_text
+
+
+def test_html_to_text_func():
+    test_html = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     <a href="/first_link"> More Text </a>
+     </br>
+     So let's see what happens.  </br>
+     <a href="second_link.com"> Even More Text </a>
+     </body>
+     </html>
+    """
+
+    # extract text, with 'render_anchor_tag_content' set to False
+    text_content = html_to_text(test_html, render_anchor_tag_content=False)
+
+    no_links_text = \
+        "Some initial text\n\nWhich is across multiple " \
+        "lines\n\nMore Text So let's see what happens. Even More Text"
+
+    # check that no links are in the extracted text
+    assert text_content == no_links_text
+
+    # extract text, with 'render_anchor_tag_content' set to True
+    text_content = html_to_text(test_html, render_anchor_tag_content=True)
+
+    links_text = \
+        "Some initial text\n\nWhich is across multiple lines\n\n[ More Text " \
+        "](/first_link) So let's see what happens. [ Even More Text ]" \
+        "(second_link.com)"
+
+    # check that links are present in the extracted text
+    assert text_content == links_text
--- a/changedetectionio/tests/test_ignorehyperlinks.py
+++ b/changedetectionio/tests/test_ignorehyperlinks.py
@@ -0,0 +1,219 @@
+#!/usr/bin/python3
+"""Test suite for the render/not render anchor tag content functionality"""
+
+import time
+from flask import url_for
+from .util import live_server_setup
+
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <a href="/original_link"> Some More Text </a>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+# Should be the same as set_original_ignore_response() but with a different
+# link
+def set_modified_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <a href="/modified_link"> Some More Text </a>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+def test_render_anchor_tag_content_true(client, live_server):
+    """Testing that the link changes are detected when
+    render_anchor_tag_content setting is set to true"""
+    sleep_time_for_fetch_thread = 3
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # set original html text
+    set_original_ignore_response()
+
+    # Go to the settings page, choose to render anchor tag content
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "minutes_between_check": 180,
+            "render_anchor_tag_content": "true",
+            "fetch_backend": "html_requests",
+        },
+        follow_redirects=True,
+    )
+    assert b"Settings updated." in res.data
+
+    # Add our URL to the import page
+    test_url = url_for("test_endpoint", _external=True)
+    res = client.post(
+        url_for("import_page"), data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # set a new html text with a modified link
+    set_modified_ignore_response()
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # check that the anchor tag content is rendered
+    res = client.get(url_for("preview_page", uuid="first"))
+    assert '(/modified_link)' in res.data.decode()
+
+    # since the link has changed, and we chose to render anchor tag content,
+    # we should detect a change (new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b"unviewed" in res.data
+    assert b"/test-endpoint" in res.data
+
+    # Cleanup everything
+    res = client.get(url_for("api_delete", uuid="all"),
+                     follow_redirects=True)
+    assert b'Deleted' in res.data
+
+
+def test_render_anchor_tag_content_false(client, live_server):
+    """Testing that anchor tag content changes are ignored when
+    render_anchor_tag_content setting is set to false"""
+    sleep_time_for_fetch_thread = 3
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # set the original html text
+    set_original_ignore_response()
+
+    # Go to the settings page, choose not to render anchor tag content
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "minutes_between_check": 180,
+            "render_anchor_tag_content": "false",
+            "fetch_backend": "html_requests",
+        },
+        follow_redirects=True,
+    )
+    assert b"Settings updated." in res.data
+
+    # Add our URL to the import page
+    test_url = url_for("test_endpoint", _external=True)
+    res = client.post(
+        url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # set a new html text, with a modified link
+    set_modified_ignore_response()
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # check that the anchor tag content is not rendered
+    res = client.get(url_for("preview_page", uuid="first"))
+    assert '(/modified_link)' not in res.data.decode()
+
+    # even though the link has changed, we shouldn't detect a change since
+    # we selected to not render anchor tag content (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data
+    assert b"/test-endpoint" in res.data
+
+    # Cleanup everything
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+
+def test_render_anchor_tag_content_default(client, live_server):
+    """Testing that anchor tag content changes are ignored when the
+    render_anchor_tag_content setting is not explicitly selected"""
+    sleep_time_for_fetch_thread = 3
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # set the original html text
+    set_original_ignore_response()
+
+    # Goto the settings page, not passing the render_anchor_tag_content setting
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "minutes_between_check": 180,
+            "fetch_backend": "html_requests",
+        },
+        follow_redirects=True,
+    )
+    assert b"Settings updated." in res.data
+
+    # Add our URL to the import page
+    test_url = url_for("test_endpoint", _external=True)
+    res = client.post(
+        url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # set a new html text, with a modified link
+    set_modified_ignore_response()
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # check that the anchor tag content is not rendered
+    res = client.get(url_for("preview_page", uuid="first"))
+    assert '(/modified_link)' not in res.data.decode()
+
+    # even though the link has changed, we shouldn't detect a change since
+    # we did not select the setting and the default behaviour is to not
+    # render anchor tag content (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data
+    assert b"/test-endpoint" in res.data
+
+    # Cleanup everything
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data