kopia lustrzana https://github.com/dgtlmoon/changedetection.io
				
				
				
			Auto extract html title as title (#102)
* Auto extract <title> as watch title, Minor refactor for html tooling
pull/107/head
							rodzic
							
								
									9af1ea9fc0
								
							
						
					
					
						commit
						25185e6d00
					
				| 
						 | 
				
			
			@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		|||
        if request.method == 'GET':
 | 
			
		||||
            form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
 | 
			
		||||
            form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
 | 
			
		||||
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
 | 
			
		||||
 | 
			
		||||
            # Password unset is a GET
 | 
			
		||||
            if request.values.get('removepassword') == 'true':
 | 
			
		||||
| 
						 | 
				
			
			@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		|||
 | 
			
		||||
            datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
 | 
			
		||||
            datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
 | 
			
		||||
            datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data
 | 
			
		||||
 | 
			
		||||
            if len(form.notification_urls.data):
 | 
			
		||||
                import apprise
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,18 +3,10 @@ import requests
 | 
			
		|||
import hashlib
 | 
			
		||||
from inscriptis import get_text
 | 
			
		||||
import urllib3
 | 
			
		||||
from . import html_tools
 | 
			
		||||
 | 
			
		||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 | 
			
		||||
 | 
			
		||||
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 | 
			
		||||
class css_filter(object):
 | 
			
		||||
    def apply(self, css_filter, html_content):
 | 
			
		||||
        from bs4 import BeautifulSoup
 | 
			
		||||
        soup = BeautifulSoup(html_content, "html.parser")
 | 
			
		||||
        html_block = ""
 | 
			
		||||
        for item in soup.select(css_filter, separator=""):
 | 
			
		||||
            html_block += str(item)
 | 
			
		||||
 | 
			
		||||
        return html_block+"\n"
 | 
			
		||||
 | 
			
		||||
# Some common stuff here that can be moved to a base class
 | 
			
		||||
class perform_site_check():
 | 
			
		||||
| 
						 | 
				
			
			@ -59,6 +51,7 @@ class perform_site_check():
 | 
			
		|||
 | 
			
		||||
    def run(self, uuid):
 | 
			
		||||
        timestamp = int(time.time())  # used for storage etc too
 | 
			
		||||
 | 
			
		||||
        stripped_text_from_html = False
 | 
			
		||||
        changed_detected = False
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -98,8 +91,7 @@ class perform_site_check():
 | 
			
		|||
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
 | 
			
		||||
            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
 | 
			
		||||
            if css_filter_rule and len(css_filter_rule.strip()):
 | 
			
		||||
                filter = css_filter()
 | 
			
		||||
                html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
 | 
			
		||||
                html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
 | 
			
		||||
 | 
			
		||||
            stripped_text_from_html = get_text(html)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -150,4 +142,10 @@ class perform_site_check():
 | 
			
		|||
 | 
			
		||||
                update_obj["previous_md5"] = fetched_md5
 | 
			
		||||
 | 
			
		||||
            # Extract title as title
 | 
			
		||||
            if self.datastore.data['settings']['application']['extract_title_as_title']:
 | 
			
		||||
                if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
 | 
			
		||||
                    update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        return changed_detected, update_obj, stripped_text_from_html
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -128,4 +128,5 @@ class globalSettingsForm(Form):
 | 
			
		|||
                                               [validators.NumberRange(min=1)])
 | 
			
		||||
 | 
			
		||||
    notification_urls = StringListField('Notification URL List')
 | 
			
		||||
    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
 | 
			
		||||
    trigger_check = BooleanField('Send test notification on save')
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,23 @@
 | 
			
		|||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 | 
			
		||||
def css_filter(css_filter, html_content):
    """Return the markup of every element in ``html_content`` matching a CSS rule.

    The document is parsed with BeautifulSoup's ``html.parser``.  Each element
    selected by ``css_filter`` is rendered with ``str()`` and the pieces are
    concatenated in selection order, followed by one trailing newline.  When
    nothing matches, the result is that lone newline.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    # NOTE(review): ``separator`` is passed through as an extra keyword to the
    # selector call exactly as before; confirm whether it has any effect.
    fragments = [str(node) for node in parsed.select(css_filter, separator="")]
    return "".join(fragments) + "\n"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Extract/find element
 | 
			
		||||
def extract_element(find='title', html_content=''):
    """Return the stripped text of the first ``find`` element, or ``False``.

    ``html_content`` is parsed with BeautifulSoup's ``html.parser`` and the
    first element named ``find`` is looked up.  If there is no such element,
    or its ``.string`` is ``None``, ``False`` is returned; otherwise the
    element's string with surrounding whitespace removed.
    """
    node = BeautifulSoup(html_content, 'html.parser').find(find)

    # Guard clause: nothing usable was found, keep the ``False`` sentinel.
    if not node or node.string is None:
        return False

    return node.string.strip()
 | 
			
		||||
| 
						 | 
				
			
			@ -38,6 +38,7 @@ class ChangeDetectionStore:
 | 
			
		|||
                },
 | 
			
		||||
                'application': {
 | 
			
		||||
                    'password': False,
 | 
			
		||||
                    'extract_title_as_title': False,
 | 
			
		||||
                    'notification_urls': [] # Apprise URL list
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -16,6 +16,10 @@
 | 
			
		|||
                    {{ render_field(form.password, size=10) }}
 | 
			
		||||
                {% endif %}
 | 
			
		||||
            </div>
 | 
			
		||||
            <div class="pure-control-group">
 | 
			
		||||
                {{ render_field(form.extract_title_as_title) }}
 | 
			
		||||
                  <span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
 | 
			
		||||
            </div>
 | 
			
		||||
            <div class="pure-control-group">
 | 
			
		||||
                {{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room
 | 
			
		||||
Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail
 | 
			
		||||
| 
						 | 
				
			
			@ -27,7 +31,6 @@ SMTPS - mailtos://user:pass@mail.domain.com?to=receivingAddress@example.com
 | 
			
		|||
                <div class="pure-controls">
 | 
			
		||||
                    <span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox">
 | 
			
		||||
                        <input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span>
 | 
			
		||||
 | 
			
		||||
                </div>
 | 
			
		||||
 | 
			
		||||
            <br/>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set
 | 
			
		|||
sleep_time_for_fetch_thread = 3
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_check_basic_change_detection_functionality(client, live_server):
 | 
			
		||||
    set_original_response()
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
| 
						 | 
				
			
			@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server):
 | 
			
		|||
        # It should report nothing found (no new 'unviewed' class)
 | 
			
		||||
        res = client.get(url_for("index"))
 | 
			
		||||
        assert b'unviewed' not in res.data
 | 
			
		||||
        assert b'head title' not in res.data # Should not be present because this is off by default
 | 
			
		||||
        assert b'test-endpoint' in res.data
 | 
			
		||||
 | 
			
		||||
    set_original_response()
 | 
			
		||||
 | 
			
		||||
    # Enable auto pickup of <title> in settings
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("settings_page"),
 | 
			
		||||
        data={"extract_title_as_title": "1", "minutes_between_check": 180},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
 | 
			
		||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'unviewed' in res.data
 | 
			
		||||
    # It should have picked up the <title>
 | 
			
		||||
    assert b'head title' in res.data
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # Cleanup everything
 | 
			
		||||
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
    assert b'Deleted' in res.data
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -4,6 +4,8 @@ import time
 | 
			
		|||
from flask import url_for
 | 
			
		||||
from . util import live_server_setup
 | 
			
		||||
 | 
			
		||||
from ..html_tools import *
 | 
			
		||||
 | 
			
		||||
def test_setup(live_server):
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -48,11 +50,9 @@ def test_css_filter_output():
 | 
			
		|||
    from backend import fetch_site_status
 | 
			
		||||
    from inscriptis import get_text
 | 
			
		||||
 | 
			
		||||
    css_filter = fetch_site_status.css_filter()
 | 
			
		||||
 | 
			
		||||
    # Check text with sub-parts renders correctly
 | 
			
		||||
    content = """<html> <body><div id="thingthing" >  Some really <b>bold</b> text  </div> </body> </html>"""
 | 
			
		||||
    html_blob = css_filter.apply(css_filter="#thingthing", html_content=content)
 | 
			
		||||
    html_blob = css_filter(css_filter="#thingthing", html_content=content)
 | 
			
		||||
    text = get_text(html_blob)
 | 
			
		||||
    assert text == "  Some really bold text"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -61,7 +61,7 @@ def test_css_filter_output():
 | 
			
		|||
    <div class="parts">Block A</div> <div class="parts">Block B</div></body> 
 | 
			
		||||
    </html>
 | 
			
		||||
"""
 | 
			
		||||
    html_blob = css_filter.apply(css_filter=".parts", html_content=content)
 | 
			
		||||
    html_blob = css_filter(css_filter=".parts", html_content=content)
 | 
			
		||||
    text = get_text(html_blob)
 | 
			
		||||
 | 
			
		||||
    # Divs are converted to 4 whitespaces by inscriptis
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,7 +3,8 @@
 | 
			
		|||
 | 
			
		||||
def set_original_response():
 | 
			
		||||
    test_return_data = """<html>
 | 
			
		||||
       <body>
 | 
			
		||||
    <head><title>head title</title></head>
 | 
			
		||||
    <body>
 | 
			
		||||
     Some initial text</br>
 | 
			
		||||
     <p>Which is across multiple lines</p>
 | 
			
		||||
     </br>
 | 
			
		||||
| 
						 | 
				
			
			@ -18,7 +19,8 @@ def set_original_response():
 | 
			
		|||
 | 
			
		||||
def set_modified_response():
 | 
			
		||||
    test_return_data = """<html>
 | 
			
		||||
       <body>
 | 
			
		||||
    <head><title>modified head title</title></head>
 | 
			
		||||
    <body>
 | 
			
		||||
     Some initial text</br>
 | 
			
		||||
     <p>which has this one new line</p>
 | 
			
		||||
     </br>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -31,8 +31,10 @@ class update_worker(threading.Thread):
 | 
			
		|||
                    try:
 | 
			
		||||
                        changed_detected, result, contents = update_handler.run(uuid)
 | 
			
		||||
 | 
			
		||||
                    except PermissionError as s:
 | 
			
		||||
                        self.app.logger.error("File permission error updating", uuid, str(s))
 | 
			
		||||
                    except PermissionError as e:
 | 
			
		||||
                        self.app.logger.error("File permission error updating", uuid, str(e))
 | 
			
		||||
                    except Exception as e:
 | 
			
		||||
                        self.app.logger.error("Exception reached", uuid, str(e))
 | 
			
		||||
                    else:
 | 
			
		||||
                        if result:
 | 
			
		||||
                            try:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Ładowanie…
	
		Reference in New Issue