kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778)
rodzic
232bd92389
commit
b1c51c0a65
|
@ -52,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
|
|||
if len(html_content) > 0 and len(r) == 0:
|
||||
raise FilterNotFoundInResponse(xpath_filter)
|
||||
|
||||
for item in r:
|
||||
html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
|
||||
#@note: //title/text() wont work where <title>CDATA..
|
||||
|
||||
for element in r:
|
||||
if type(element) == etree._ElementStringResult:
|
||||
html_block += str(element) + "<br/>"
|
||||
elif type(element) == etree._ElementUnicodeResult:
|
||||
html_block += str(element) + "<br/>"
|
||||
else:
|
||||
html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"
|
||||
|
||||
return html_block
|
||||
|
||||
|
|
|
@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
|
|||
</div>
|
||||
</fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
|
||||
class="m-d") }}
|
||||
{% set field = render_field(form.css_filter,
|
||||
placeholder=".class-name or #some-id, or other CSS selector rule.",
|
||||
class="m-d")
|
||||
%}
|
||||
{{ field }}
|
||||
{% if '/text()' in field %}
|
||||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/>
|
||||
{% endif %}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
|
||||
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
|
||||
<ul>
|
||||
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
||||
|
|
|
@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
|
|||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
time.sleep(1)
|
||||
res = client.post(
|
||||
url_for("edit_page", uuid="first"),
|
||||
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||
|
@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
|
|||
assert b'Deleted' in res.data
|
||||
|
||||
|
||||
# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
|
||||
def test_check_xpath_text_function_utf8(client, live_server):
|
||||
filter='//item/title/text()'
|
||||
|
||||
d='''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
||||
<channel>
|
||||
<title>rpilocator.com</title>
|
||||
<link>https://rpilocator.com</link>
|
||||
<description>Find Raspberry Pi Computers in Stock</description>
|
||||
<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
|
||||
<image>
|
||||
<url>https://rpilocator.com/favicon.png</url>
|
||||
<title>rpilocator.com</title>
|
||||
<link>https://rpilocator.com/</link>
|
||||
<width>32</width>
|
||||
<height>32</height>
|
||||
</image>
|
||||
<item>
|
||||
<title>Stock Alert (UK): RPi CM4</title>
|
||||
<foo>something else unrelated</foo>
|
||||
</item>
|
||||
<item>
|
||||
<title>Stock Alert (UK): Big monitor</title>
|
||||
<foo>something else unrelated</foo>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>'''
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(d)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
time.sleep(1)
|
||||
res = client.post(
|
||||
url_for("edit_page", uuid="first"),
|
||||
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"Updated watch." in res.data
|
||||
time.sleep(3)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'Unicode strings with encoding declaration are not supported.' not in res.data
|
||||
|
||||
# The service should echo back the request headers
|
||||
res = client.get(
|
||||
url_for("preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
|
||||
assert b'<div class="">Stock Alert (UK): Big monitor' in res.data
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_markup_xpath_filter_restriction(client, live_server):
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
|
Ładowanie…
Reference in New Issue