kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778)
rodzic
232bd92389
commit
b1c51c0a65
|
@ -52,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
|
||||||
if len(html_content) > 0 and len(r) == 0:
|
if len(html_content) > 0 and len(r) == 0:
|
||||||
raise FilterNotFoundInResponse(xpath_filter)
|
raise FilterNotFoundInResponse(xpath_filter)
|
||||||
|
|
||||||
for item in r:
|
#@note: //title/text() wont work where <title>CDATA..
|
||||||
html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
|
|
||||||
|
for element in r:
|
||||||
|
if type(element) == etree._ElementStringResult:
|
||||||
|
html_block += str(element) + "<br/>"
|
||||||
|
elif type(element) == etree._ElementUnicodeResult:
|
||||||
|
html_block += str(element) + "<br/>"
|
||||||
|
else:
|
||||||
|
html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"
|
||||||
|
|
||||||
return html_block
|
return html_block
|
||||||
|
|
||||||
|
|
|
@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
|
{% set field = render_field(form.css_filter,
|
||||||
class="m-d") }}
|
placeholder=".class-name or #some-id, or other CSS selector rule.",
|
||||||
|
class="m-d")
|
||||||
|
%}
|
||||||
|
{{ field }}
|
||||||
|
{% if '/text()' in field %}
|
||||||
|
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/>
|
||||||
|
{% endif %}
|
||||||
<span class="pure-form-message-inline">
|
<span class="pure-form-message-inline">
|
||||||
<ul>
|
<ul>
|
||||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||||
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
|
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
|
||||||
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
||||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
|
||||||
|
<ul>
|
||||||
|
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
||||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||||
|
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
|
||||||
|
</ul>
|
||||||
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
|
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
|
||||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
||||||
|
|
|
@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
|
||||||
follow_redirects=True
|
follow_redirects=True
|
||||||
)
|
)
|
||||||
assert b"1 Imported" in res.data
|
assert b"1 Imported" in res.data
|
||||||
|
time.sleep(1)
|
||||||
res = client.post(
|
res = client.post(
|
||||||
url_for("edit_page", uuid="first"),
|
url_for("edit_page", uuid="first"),
|
||||||
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||||
|
@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
|
||||||
assert b'Deleted' in res.data
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
|
||||||
|
def test_check_xpath_text_function_utf8(client, live_server):
|
||||||
|
filter='//item/title/text()'
|
||||||
|
|
||||||
|
d='''<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
||||||
|
<channel>
|
||||||
|
<title>rpilocator.com</title>
|
||||||
|
<link>https://rpilocator.com</link>
|
||||||
|
<description>Find Raspberry Pi Computers in Stock</description>
|
||||||
|
<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
|
||||||
|
<image>
|
||||||
|
<url>https://rpilocator.com/favicon.png</url>
|
||||||
|
<title>rpilocator.com</title>
|
||||||
|
<link>https://rpilocator.com/</link>
|
||||||
|
<width>32</width>
|
||||||
|
<height>32</height>
|
||||||
|
</image>
|
||||||
|
<item>
|
||||||
|
<title>Stock Alert (UK): RPi CM4</title>
|
||||||
|
<foo>something else unrelated</foo>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>Stock Alert (UK): Big monitor</title>
|
||||||
|
<foo>something else unrelated</foo>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>'''
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write(d)
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
|
||||||
|
res = client.post(
|
||||||
|
url_for("import_page"),
|
||||||
|
data={"urls": test_url},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"1 Imported" in res.data
|
||||||
|
time.sleep(1)
|
||||||
|
res = client.post(
|
||||||
|
url_for("edit_page", uuid="first"),
|
||||||
|
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"Updated watch." in res.data
|
||||||
|
time.sleep(3)
|
||||||
|
res = client.get(url_for("index"))
|
||||||
|
assert b'Unicode strings with encoding declaration are not supported.' not in res.data
|
||||||
|
|
||||||
|
# The service should echo back the request headers
|
||||||
|
res = client.get(
|
||||||
|
url_for("preview_page", uuid="first"),
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
|
||||||
|
assert b'<div class="">Stock Alert (UK): Big monitor' in res.data
|
||||||
|
|
||||||
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
def test_check_markup_xpath_filter_restriction(client, live_server):
|
def test_check_markup_xpath_filter_restriction(client, live_server):
|
||||||
sleep_time_for_fetch_thread = 3
|
sleep_time_for_fetch_thread = 3
|
||||||
|
|
Ładowanie…
Reference in New Issue