kopia lustrzana https://github.com/dgtlmoon/changedetection.io
RSS fetch - RSS field <title> was not rendering as text correctly, added workaround #1879
rodzic
1c0fe4c23e
commit
7220afab0a
|
@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
|
||||||
elif type(element) == etree._ElementUnicodeResult:
|
elif type(element) == etree._ElementUnicodeResult:
|
||||||
html_block += str(element)
|
html_block += str(element)
|
||||||
else:
|
else:
|
||||||
if not is_rss:
|
|
||||||
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
||||||
else:
|
|
||||||
html_block += f"<div>{element.text}</div>\n"
|
|
||||||
|
|
||||||
return html_block
|
return html_block
|
||||||
|
|
||||||
|
@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||||
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
|
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
|
||||||
def repl(m):
|
def repl(m):
|
||||||
text = m.group(1)
|
text = m.group(1)
|
||||||
return xml_escape(html_to_text(html_content=text))
|
return xml_escape(html_to_text(html_content=text)).strip()
|
||||||
|
|
||||||
return re.sub(pattern, repl, html_content)
|
return re.sub(pattern, repl, html_content)
|
||||||
|
|
||||||
|
@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||||
# extracting this content
|
# extracting this content
|
||||||
if render_anchor_tag_content:
|
if render_anchor_tag_content:
|
||||||
parser_config = ParserConfig(
|
parser_config = ParserConfig(
|
||||||
annotation_rules={"a": ["hyperlink"]}, display_links=True
|
annotation_rules={"a": ["hyperlink"]},
|
||||||
|
display_links=True
|
||||||
)
|
)
|
||||||
# otherwise set config to None/default
|
# otherwise set config to None/default
|
||||||
else:
|
else:
|
||||||
|
@ -303,12 +301,11 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||||
|
|
||||||
# RSS Mode - Inscriptis will treat `title` as something else.
|
# RSS Mode - Inscriptis will treat `title` as something else.
|
||||||
# Make it as a regular block display element (//item/title)
|
# Make it as a regular block display element (//item/title)
|
||||||
|
# This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
|
||||||
if is_rss:
|
if is_rss:
|
||||||
css = CSS_PROFILES['strict'].copy()
|
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||||
css['title'] = HtmlElement(display=Display.block)
|
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||||
text_content = get_text(html_content, ParserConfig(css=css))
|
|
||||||
else:
|
|
||||||
# get text and annotations via inscriptis
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
text_content = get_text(html_content, config=parser_config)
|
||||||
|
|
||||||
return text_content
|
return text_content
|
||||||
|
|
|
@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
|
||||||
html_tools.html_to_text(
|
html_tools.html_to_text(
|
||||||
html_content=html_content,
|
html_content=html_content,
|
||||||
render_anchor_tag_content=do_anchor,
|
render_anchor_tag_content=do_anchor,
|
||||||
is_rss=is_rss
|
is_rss=is_rss # #1874 activate the <title workaround hack
|
||||||
)
|
)
|
||||||
|
|
||||||
# Re #340 - return the content before the 'ignore text' was applied
|
# Re #340 - return the content before the 'ignore text' was applied
|
||||||
|
|
|
@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
|
||||||
)
|
)
|
||||||
assert b'CDATA' not in res.data
|
assert b'CDATA' not in res.data
|
||||||
assert b'<![' not in res.data
|
assert b'<![' not in res.data
|
||||||
|
# #1874 All but the first <title was getting selected
|
||||||
|
# Convert any HTML with just a top level <title> to <h1> to be sure title renders
|
||||||
|
|
||||||
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
|
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
|
||||||
assert b'Some other title' in res.data # Should ONLY be selected by the xpath
|
assert b'Some other title' in res.data # Should ONLY be selected by the xpath
|
||||||
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
|
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
|
||||||
|
|
Ładowanie…
Reference in New Issue