Filters - Process all CSS and XPath 'subtract' selectors in a single pass to prevent index shifting and reference loss during DOM manipulation. (#2754)

2024-10-30 12:00:53 +01:00 · 2024-10-30 12:00:53 +01:00 · 87510becb5
commit 87510becb5
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -54,29 +54,64 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting
 def subtractive_css_selector(css_selector, html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
-    for item in soup.select(css_selector):
+
+    # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM
+    elements_to_remove = soup.select(css_selector)
+
+    # Then, remove them in a separate loop
+    for item in elements_to_remove:
        item.decompose()
+
    return str(soup)

-def subtractive_xpath_selector(xpath_selector, html_content): 
+def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
+    # Parse the HTML content using lxml
    html_tree = etree.HTML(html_content)
-    elements_to_remove = html_tree.xpath(xpath_selector)

+    # First, collect all elements to remove
+    elements_to_remove = []
+
+    # Iterate over the list of XPath selectors
+    for selector in selectors:
+        # Collect elements for each selector
+        elements_to_remove.extend(html_tree.xpath(selector))
+
+    # Then, remove them in a separate loop
    for element in elements_to_remove:
-        element.getparent().remove(element)
+        if element.getparent() is not None:  # Ensure the element has a parent before removing
+            element.getparent().remove(element)

+    # Convert the modified HTML tree back to a string
    modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
    return modified_html

+
 def element_removal(selectors: List[str], html_content):
-    """Removes elements that match a list of CSS or xPath selectors."""
+    """Removes elements that match a list of CSS or XPath selectors."""
    modified_html = html_content
+    css_selectors = []
+    xpath_selectors = []
+
    for selector in selectors:
        if selector.startswith(('xpath:', 'xpath1:', '//')):
+            # Handle XPath selectors separately
            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
-            modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
+            xpath_selectors.append(xpath_selector)
        else:
-            modified_html = subtractive_css_selector(selector, modified_html)
+            # Collect CSS selectors as one "hit", see comment in subtractive_css_selector
+            css_selectors.append(selector.strip().strip(","))
+
+    if xpath_selectors:
+        modified_html = subtractive_xpath_selector(xpath_selectors, modified_html)
+
+    if css_selectors:
+        # Remove duplicates, then combine all CSS selectors into one string, separated by commas
+        # This stops the elements index shifting
+        unique_selectors = list(set(css_selectors))  # Ensure uniqueness
+        combined_css_selector = " , ".join(unique_selectors)
+        modified_html = subtractive_css_selector(combined_css_selector, modified_html)
+
+
    return modified_html

 def elementpath_tostring(obj):
--- a/changedetectionio/tests/test_element_removal.py
+++ b/changedetectionio/tests/test_element_removal.py
@ -11,6 +11,35 @@ from .util import live_server_setup, wait_for_all_checks
 def test_setup(live_server):
    live_server_setup(live_server)

+def set_response_with_multiple_index():
+    data= """<!DOCTYPE html>
+<html>
+<body>
+
+<!-- NOTE!! CHROME WILL ADD TBODY HERE IF ITS NOT THERE!! -->
+<table style="width:100%">
+  <tr>
+    <th>Person 1</th>
+    <th>Person 2</th>
+    <th>Person 3</th>
+  </tr>
+  <tr>
+    <td>Emil</td>
+    <td>Tobias</td>
+    <td>Linus</td>
+  </tr>
+  <tr>
+    <td>16</td>
+    <td>14</td>
+    <td>10</td>
+  </tr>
+</table>
+</body>
+</html>
+"""
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(data)
+

 def set_original_response():
    test_return_data = """<html>
@ -177,3 +206,61 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
    # There should not be an unviewed change, as changes should be removed
    res = client.get(url_for("index"))
    assert b"unviewed" not in res.data
+
+# Re #2752
+def test_element_removal_nth_offset_no_shift(client, live_server, measure_memory_usage):
+    #live_server_setup(live_server)
+
+    set_response_with_multiple_index()
+    subtractive_selectors_data = ["""
+body > table > tr:nth-child(1) > th:nth-child(2)
+body > table >  tr:nth-child(2) > td:nth-child(2)
+body > table > tr:nth-child(3) > td:nth-child(2)
+body > table > tr:nth-child(1) > th:nth-child(3)
+body > table >  tr:nth-child(2) > td:nth-child(3)
+body > table > tr:nth-child(3) > td:nth-child(3)""",
+"""//body/table/tr[1]/th[2]
+//body/table/tr[2]/td[2]
+//body/table/tr[3]/td[2]
+//body/table/tr[1]/th[3]
+//body/table/tr[2]/td[3]
+//body/table/tr[3]/td[3]"""]
+
+    for selector_list in subtractive_selectors_data:
+
+        res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+        assert b'Deleted' in res.data
+
+        # Add our URL to the import page
+        test_url = url_for("test_endpoint", _external=True)
+        res = client.post(
+            url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+        )
+        assert b"1 Imported" in res.data
+        wait_for_all_checks(client)
+
+        res = client.post(
+            url_for("edit_page", uuid="first"),
+            data={
+                "subtractive_selectors": selector_list,
+                "url": test_url,
+                "tags": "",
+                "fetch_backend": "html_requests",
+            },
+            follow_redirects=True,
+        )
+        assert b"Updated watch." in res.data
+        wait_for_all_checks(client)
+
+        res = client.get(
+            url_for("preview_page", uuid="first"),
+            follow_redirects=True
+        )
+
+        assert b"Tobias" not in res.data
+        assert b"Linus" not in res.data
+        assert b"Person 2" not in res.data
+        assert b"Person 3" not in res.data
+        # First column should exist
+        assert b"Emil" in res.data
+