Bug fix for newer lxml module - module 'lxml.etree' has no attribute '_ElementStringResult' - reimplement _ElementStringResult (#2313 #2312)

2024-04-17 19:55:45 +02:00 · 2024-04-17 19:55:45 +02:00 · 74707909f1
commit 74707909f1
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -169,15 +169,13 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
-        if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
+        if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

-        if type(element) == etree._ElementStringResult:
-            html_block += str(element)
-        elif type(element) == etree._ElementUnicodeResult:
-            html_block += str(element)
+        if isinstance(element, str):
+            html_block += element
        else:
-            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
+            html_block += etree.tostring(element, pretty_print=True, encoding='unicode')  # 'unicode' makes lxml return str; a byte encoding such as 'utf-8' returns bytes, which cannot be appended to the str html_block

    return html_block

--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -1,4 +1,4 @@
-#!/usr/bin/python3
+# -*- coding: utf-8 -*-

 import time
 from flask import url_for
@ -255,6 +255,69 @@ def test_xpath23_prefix_validation(client, live_server):
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

+def test_xpath1_lxml(client, live_server):
+    # Regression test for #2312 (xpath1 text() filter vs. lxml without _ElementStringResult). NOTE(review): live_server_setup(live_server) is left disabled here - presumably an earlier test sets the server up; confirm.
+
+    d = '''<?xml version="1.0" encoding="UTF-8"?>
+    <rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
+    	<channel>
+    		<title>rpilocator.com</title>
+    		<link>https://rpilocator.com</link>
+    		<description>Find Raspberry Pi Computers in Stock</description>
+    		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
+    		<image>
+    			<url>https://rpilocator.com/favicon.png</url>
+    			<title>rpilocator.com</title>
+    			<link>https://rpilocator.com/</link>
+    			<width>32</width>
+    			<height>32</height>
+    		</image>
+    		<item>
+    			<title>Stock Alert (UK): RPi CM4</title>
+    			<foo>something else unrelated</foo>
+    		</item>
+    		<item>
+    			<title>Stock Alert (UK): Big monitorěěěě</title>
+    			<foo>something else unrelated</foo>
+    		</item>		
+    	</channel>
+    </rss>'''.encode('utf-8')
+
+    with open("test-datastore/endpoint-content.txt", "wb") as f:
+        f.write(d)
+
+
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"include_filters": "xpath1://title/text()", "url": test_url, "tags": "", "headers": "",
+              'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+
+    ##### #2312 - after re-checking with the xpath1 filter, the index page must not show the lxml error
+    wait_for_all_checks(client)
+    res = client.get(url_for("index"))
+    assert b'_ElementStringResult' not in res.data # tested with lxml 5.1.1 (where it was removed) and with 5.1.0
+    assert b'Exception' not in res.data
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"rpilocator.com" in res.data  # <title> text, matched by the selector
+    assert "Stock Alert (UK): Big monitorěěěě".encode('utf-8') in res.data  # also <title> text matched by the selector; checks the non-ASCII characters survive
+
+    #####
+

 def test_xpath1_validation(client, live_server):
    # Add our URL to the import page
--- a/requirements.txt
+++ b/requirements.txt
@ -52,6 +52,7 @@ cryptography~=3.4
 beautifulsoup4

 # XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
+# #2312 - lxml 5.1.1 removed _ElementStringResult (ImportError: cannot import name '_ElementStringResult' from 'lxml.etree'); html_tools.py no longer references it
 lxml

 # XPath 2.0-3.1 support - 4.2.0 broke something?