kopia lustrzana https://github.com/dgtlmoon/changedetection.io
CSS Filter - It is smarter to just extract the HTML blob and continue with inscriptis, so we get almost the same output as when not using the filter
rodzic
8d59ef2e10
commit
f215adbbe5
|
@ -5,6 +5,17 @@ from inscriptis import get_text
|
||||||
import urllib3
|
import urllib3
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
||||||
|
class css_filter(object):
    """Reduce an HTML document to the markup matched by a CSS selector."""

    def apply(self, css_filter, html_content):
        """Return the HTML of every element matching ``css_filter``.

        ``html_content`` is parsed with BeautifulSoup's ``html.parser``.
        Each matching element is serialised with ``str()``, the pieces are
        concatenated in match order with nothing between them, and a single
        trailing newline is appended. When nothing matches, the result is
        just ``"\n"``.
        """
        # bs4 is imported at call time rather than at module import.
        from bs4 import BeautifulSoup

        parsed = BeautifulSoup(html_content, "html.parser")

        # NOTE(review): `separator` does not look like a documented argument
        # of select(); it is passed through unchanged here - confirm whether
        # it has any effect before removing it.
        matches = parsed.select(css_filter, separator="")

        return "".join(str(match) for match in matches) + "\n"
|
||||||
|
|
||||||
# Some common stuff here that can be moved to a base class
|
# Some common stuff here that can be moved to a base class
|
||||||
class perform_site_check():
|
class perform_site_check():
|
||||||
|
|
||||||
|
@ -82,21 +93,15 @@ class perform_site_check():
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
verify=False)
|
verify=False)
|
||||||
|
|
||||||
# CSS Filter
|
html = r.text
|
||||||
css_filter = self.datastore.data['watching'][uuid]['css_filter']
|
|
||||||
if css_filter and len(css_filter.strip()):
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
soup = BeautifulSoup(r.content, "html.parser")
|
|
||||||
stripped_text_from_html = ""
|
|
||||||
for item in soup.select(css_filter):
|
|
||||||
# By default, bs4's get_text will lump the text together
|
|
||||||
# BS4's element strip() will lose the indentation format; I've tried using a space as separator, setting strip=False, etc., but it doesn't help
|
|
||||||
# @todo ideas? if you compare the css_filtered output to non-filtered snapshot it will always lose the indentation/format
|
|
||||||
text = str(item.get_text(separator="\n", strip=True)).strip() + '\n'
|
|
||||||
stripped_text_from_html += text
|
|
||||||
|
|
||||||
else:
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||||
stripped_text_from_html = get_text(r.text)
|
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
|
||||||
|
if css_filter_rule and len(css_filter_rule.strip()):
|
||||||
|
filter = css_filter()
|
||||||
|
html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
|
||||||
|
|
||||||
|
stripped_text_from_html = get_text(html)
|
||||||
|
|
||||||
# Usually from networkIO/requests level
|
# Usually from networkIO/requests level
|
||||||
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
||||||
|
|
|
@ -43,6 +43,32 @@ def set_modified_response():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Test that the CSS extraction works as we expect; what matters here is the correct placement of newlines (\n)
|
||||||
|
def test_css_filter_output():
    """Check css_filter.apply() followed by inscriptis get_text() on two small documents.

    The assertions pin the exact text produced, including leading whitespace
    and the newline separating sibling blocks.
    """
    from backend import fetch_site_status
    from inscriptis import get_text

    extractor = fetch_site_status.css_filter()

    # Case 1: an element containing inline child markup (<b>) is expected to
    # come out as one line of text.
    single_div_html = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>"""
    selected_html = extractor.apply(css_filter="#thingthing", html_content=single_div_html)
    rendered = get_text(selected_html)
    assert rendered == " Some really bold text"

    # Case 2: two sibling elements share the selected class; the <p> outside
    # the selection is not expected in the output.
    two_divs_html = """<html> <body>
<p>foo bar blah</p>
<div class="parts">Block A</div> <div class="parts">Block B</div></body>
</html>
"""
    selected_html = extractor.apply(css_filter=".parts", html_content=two_divs_html)
    rendered = get_text(selected_html)

    # Divs are converted to 4 whitespaces by inscriptis
    # NOTE(review): the expected strings in this test show a single leading
    # space, which does not match the "4 whitespaces" remark above - runs of
    # spaces may have been collapsed where this code was copied from; confirm
    # against the real inscriptis output.
    assert rendered == " Block A\n Block B"
|
||||||
|
|
||||||
|
|
||||||
|
# Tests the whole stack works with the CSS Filter
|
||||||
def test_check_markup_css_filter_restriction(client, live_server):
|
def test_check_markup_css_filter_restriction(client, live_server):
|
||||||
sleep_time_for_fetch_thread = 3
|
sleep_time_for_fetch_thread = 3
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,6 @@ wtforms ~= 2.3.3
|
||||||
# Notification library
|
# Notification library
|
||||||
apprise ~= 0.9
|
apprise ~= 0.9
|
||||||
|
|
||||||
# Used for CSS filtering
|
# Used for CSS filtering, replace with soupsieve and lxml for xpath
|
||||||
bs4
|
bs4
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue