kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Handle simple obfuscations - HomeDepot.com style price obfuscation (#764)
rodzic
a95468be08
commit
dd0eeca056
|
@ -150,7 +150,9 @@ class perform_site_check():
|
|||
is_html = False
|
||||
|
||||
if is_html or is_source:
|
||||
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
|
||||
html_content = fetcher.content
|
||||
|
||||
# If not JSON, and if it's not text/plain..
|
||||
|
|
|
@ -202,3 +202,17 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
|||
|
||||
return text_content
|
||||
|
||||
def workarounds_for_obfuscations(content):
    """
    Strip simple markup obfuscations from HTML before it is converted to text.

    Some sites are using sneaky tactics to make prices and other information
    un-renderable by Inscriptis.
    This could go into its own Pip package in the future, for faster updates.

    :param content: HTML source as a string. Falsy values (``None``, ``""``)
        are returned unchanged.
    :return: ``content`` with every whitespace-only HTML comment removed.
    """
    if not content:
        return content

    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    # https://github.com/weblyzard/inscriptis/issues/45
    # Raw string: "\s" is not a valid *string* escape, so a plain literal
    # triggers an invalid-escape-sequence warning on modern Python.
    # Only comments whose body is pure whitespace are removed; comments that
    # carry text are left alone.
    return re.sub(r'<!--\s+-->', '', content)
|
||||
|
|
Plik diff jest za duży
Load Diff
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup
|
||||
|
||||
|
||||
def set_original_ignore_response():
    """Write the HTML fixture served by the test endpoint.

    The page contains a price whose characters are separated by
    whitespace-only HTML comments.
    """
    page_html = """<html>
<body>
<span>The price is</span><span>$<!-- -->90<!-- -->.<!-- -->74</span>
</body>
</html>

"""

    with open("test-datastore/endpoint-content.txt", "w") as fixture_file:
        fixture_file.write(page_html)
|
||||
|
||||
|
||||
def test_obfuscations(client, live_server):
    """Check that a comment-obfuscated price shows up as plain text in the preview page."""
    set_original_ignore_response()
    live_server_setup(live_server)
    time.sleep(1)

    # Add our URL to the import page
    endpoint_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": endpoint_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in import_response.data

    # Give the thread time to pick it up
    time.sleep(3)

    # Check HTML conversion detected and worked
    preview_response = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True,
    )

    assert b'$90.74' in preview_response.data
|
Ładowanie…
Reference in New Issue