Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778)

2022-07-28 11:50:31 +02:00 · 2022-07-28 11:50:31 +02:00 · b1c51c0a65
commit b1c51c0a65
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -52,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
    if len(html_content) > 0 and len(r) == 0:
        raise FilterNotFoundInResponse(xpath_filter)
-    for item in r:
+    # NOTE: //title/text() won't work where the <title> content is wrapped in CDATA.
-        html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
+
    for element in r:
        if type(element) == etree._ElementStringResult:
            html_block += str(element) + "<br/>"
        elif type(element) == etree._ElementUnicodeResult:
            html_block += str(element) + "<br/>"
        else:
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"
    return html_block
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
                        </div>
                    </fieldset>
                    <div class="pure-control-group">
-                        {{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
+                        {% set field = render_field(form.css_filter,
-                        class="m-d") }}
+                            placeholder=".class-name or #some-id, or other CSS selector rule.",
                            class="m-d")
                        %}
                        {{ field }}
                        {% if '/text()' in  field %}
                          <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
                        {% endif %}
                        <span class="pure-form-message-inline">
                    <ul>
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required,  <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
                            <ul>
                                <li>Example:  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                                <li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
                            </ul>
                            </li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(1)
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
    assert b'Deleted' in res.data
 # Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
 def test_check_xpath_text_function_utf8(client, live_server):
    """Check that an XPath ``text()`` filter works against a UTF-8 RSS feed.

    The test writes a small RSS document to the test endpoint's content
    file, imports the endpoint URL as a watch, sets the watch filter to
    ``//item/title/text()`` and then verifies that:

    * the index page does not show the "Unicode strings with encoding
      declaration are not supported." error, and
    * the preview page contains the text of both ``<item><title>`` nodes.

    Finally all watches are deleted so later tests start clean.
    """
    # Named xpath_rule (not "filter") so the builtin is not shadowed.
    xpath_rule = '//item/title/text()'
    rss_feed = '''<?xml version="1.0" encoding="UTF-8"?>
 <rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
 	<channel>
 		<title>rpilocator.com</title>
 		<link>https://rpilocator.com</link>
 		<description>Find Raspberry Pi Computers in Stock</description>
 		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
 		<image>
 			<url>https://rpilocator.com/favicon.png</url>
 			<title>rpilocator.com</title>
 			<link>https://rpilocator.com/</link>
 			<width>32</width>
 			<height>32</height>
 		</image>
 		<item>
 			<title>Stock Alert (UK): RPi CM4</title>
 			<foo>something else unrelated</foo>
 		</item>
 		<item>
 			<title>Stock Alert (UK): Big monitor</title>
 			<foo>something else unrelated</foo>
 		</item>		
 	</channel>
 </rss>'''

    # The document declares UTF-8 and is served with charset=UTF-8, so write
    # it with that encoding explicitly instead of the platform default.
    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
        f.write(rss_feed)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # NOTE(review): presumably gives the background fetch time to run - confirm.
    time.sleep(1)

    # Apply the XPath text() rule to the freshly imported watch.
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": xpath_rule, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data

    # NOTE(review): presumably waits for the re-check triggered by the edit - confirm.
    time.sleep(3)

    # The overview page must not report the encoding-declaration parse error.
    res = client.get(url_for("index"))
    assert b'Unicode strings with encoding declaration are not supported.' not in res.data

    # The preview must contain the text of every <item><title> node.
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
    assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
    assert b'<div class="">Stock Alert (UK): Big monitor' in res.data

    # Clean up so following tests start without watches.
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
 def test_check_markup_xpath_filter_restriction(client, live_server):
    sleep_time_for_fetch_thread = 3