kopia lustrzana https://github.com/dgtlmoon/changedetection.io
commit
609a0a3aad
|
@ -386,6 +386,17 @@ def changedetection_app(conig=None, datastore_o=None):
|
|||
if len(datastore.data['watching'][uuid]['history']):
|
||||
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
|
||||
|
||||
|
||||
# CSS Filter
|
||||
css_filter = request.form.get('css_filter')
|
||||
if css_filter:
|
||||
datastore.data['watching'][uuid]['css_filter'] = css_filter.strip()
|
||||
|
||||
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
|
||||
if len(datastore.data['watching'][uuid]['history']):
|
||||
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
|
||||
|
||||
|
||||
validators.url(url) # @todo switch to prop/attr/observer
|
||||
datastore.data['watching'][uuid].update(update_obj)
|
||||
datastore.needs_write = True
|
||||
|
@ -876,7 +887,7 @@ def ticker_thread_check_time_launch_checks():
|
|||
if not uuid in running_uuids and uuid not in update_q.queue:
|
||||
update_q.put(uuid)
|
||||
|
||||
time.sleep(1)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Should be low so we can break this out in testing
|
||||
app.config.exit.wait(1)
|
||||
|
|
|
@ -66,25 +66,36 @@ class perform_site_check():
|
|||
timeout=timeout,
|
||||
verify=False)
|
||||
|
||||
stripped_text_from_html = get_text(r.text)
|
||||
# CSS Filter
|
||||
css_filter = self.datastore.data['watching'][uuid]['css_filter']
|
||||
if css_filter and len(css_filter.strip()):
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(r.content, "html.parser")
|
||||
stripped_text_from_html = ""
|
||||
for item in soup.select(css_filter):
|
||||
text = str(item.get_text())+"\n"
|
||||
stripped_text_from_html += text
|
||||
|
||||
else:
|
||||
stripped_text_from_html = get_text(r.text)
|
||||
|
||||
# Usually from networkIO/requests level
|
||||
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
||||
update_obj["last_error"] = str(e)
|
||||
|
||||
print(str(e))
|
||||
|
||||
except requests.exceptions.MissingSchema:
|
||||
print("Skipping {} due to missing schema/bad url".format(uuid))
|
||||
|
||||
# Usually from html2text level
|
||||
except UnicodeDecodeError as e:
|
||||
|
||||
except Exception as e:
|
||||
# except UnicodeDecodeError as e:
|
||||
update_obj["last_error"] = str(e)
|
||||
print(str(e))
|
||||
# figure out how to deal with this cleaner..
|
||||
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
|
||||
|
||||
|
||||
else:
|
||||
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
||||
# in the future we'll implement other mechanisms.
|
||||
|
|
|
@ -61,7 +61,8 @@ class ChangeDetectionStore:
|
|||
'headers': {}, # Extra headers to send
|
||||
'history': {}, # Dict of timestamp and output stripped filename
|
||||
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||
'notification_urls': [] # List of URLs to add to the notification Queue (Usually AppRise)
|
||||
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
|
||||
'css_filter': "",
|
||||
}
|
||||
|
||||
if path.isfile('backend/source.txt'):
|
||||
|
|
|
@ -24,7 +24,13 @@
|
|||
size="5"/>
|
||||
<span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
|
||||
</div>
|
||||
|
||||
</br>
|
||||
<!-- CSS filter field: the label must reference the input's id so it is associated with the control -->
<div class="pure-control-group">
    <label for="css_filter">CSS Filter</label>
    <input type="text" id="css_filter" name="css_filter" value="{{watch.css_filter}}"
           size="25"/>
    <span class="pure-form-message-inline">Limit text to this CSS rule, all matching CSS is included.</span>
</div>
|
||||
<!-- @todo: move to tabs --->
|
||||
<fieldset class="pure-group">
|
||||
<label for="ignore-text">Ignore text</label>
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
|
||||
def test_setup(live_server):
    """Run the shared ``live_server_setup`` helper against the ``live_server`` fixture."""
    live_server_setup(live_server)
|
||||
|
||||
def set_original_response():
    """Write the baseline HTML page to ``test-datastore/output.txt``.

    The page contains a ``#sametext`` div and a ``#changetext`` div.
    Presumably the test endpoint serves this file back to the fetcher -
    confirm against the test utilities.
    """
    original_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that will change</div>
</body>
</html>
"""

    with open("test-datastore/output.txt", "w") as output_file:
        output_file.write(original_html)
|
||||
|
||||
def set_modified_response():
    """Write the changed HTML page to ``test-datastore/output.txt``.

    Compared with the page written by ``set_original_response``, the
    paragraph text and the ``#changetext`` div differ, while the
    ``#sametext`` div is left untouched.
    """
    modified_html = """<html>
<body>
Some initial text</br>
<p>which has this one new line</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that changes</div>
</body>
</html>
"""

    with open("test-datastore/output.txt", "w") as output_file:
        output_file.write(modified_html)
|
||||
|
||||
|
||||
def test_check_markup_css_filter_restriction(client, live_server):
    """Exercise a watch whose compared text is restricted by a CSS filter.

    Imports the test endpoint as a watch, saves ``#sametext`` as its CSS
    filter through the edit page, re-checks, then swaps in the modified
    page and asserts that the index still shows the ``unviewed`` marker.
    """
    fetch_wait_seconds = 3
    css_filter = "#sametext"

    set_original_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Import the test endpoint URL as a new watch
    test_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # Save the CSS filter on the watch through the edit page
    edit_form = {"css_filter": css_filter, "url": test_url, "tag": "", "headers": ""}
    save_response = client.post(
        url_for("edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True
    )
    assert b"Updated watch." in save_response.data

    # Check it saved: the filter must be echoed back on the edit page
    edit_page_response = client.get(url_for("edit_page", uuid="first"))
    assert css_filter.encode('utf-8') in edit_page_response.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # Make a change
    set_modified_response()

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # It should have 'unviewed' still,
    # because it should be looking at only that 'sametext' id.
    # NOTE(review): the marker presumably dates from the re-check made right
    # after the filter was saved - confirm this is the intended assertion.
    index_response = client.get(url_for("index"))
    assert b'unviewed' in index_response.data
|
|
@ -11,4 +11,10 @@ feedgen ~= 0.9
|
|||
flask-login ~= 0.5
|
||||
pytz
|
||||
urllib3
|
||||
|
||||
# Notification library
|
||||
apprise ~= 0.9
|
||||
|
||||
# Used for CSS filtering
|
||||
bs4
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue