kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Always extract page <title> Re #3402
rodzic
7576bec66a
commit
a7c21c566c
|
@ -310,15 +310,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if process_changedetection_results:
|
if process_changedetection_results:
|
||||||
# Extract title if needed
|
|
||||||
if datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
|
|
||||||
if not watch['title'] or not len(watch['title']):
|
|
||||||
try:
|
|
||||||
update_obj['title'] = html_tools.extract_element(find='title', html_content=update_handler.fetcher.content)
|
|
||||||
logger.info(f"UUID: {uuid} Extract <title> updated title to '{update_obj['title']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"UUID: {uuid} Extract <title> as watch title was enabled, but couldn't find a <title>.")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
datastore.update_watch(uuid=uuid, update_obj=update_obj)
|
datastore.update_watch(uuid=uuid, update_obj=update_obj)
|
||||||
|
|
||||||
|
@ -357,6 +348,14 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
|
||||||
# Always record attempt count
|
# Always record attempt count
|
||||||
count = watch.get('check_count', 0) + 1
|
count = watch.get('check_count', 0) + 1
|
||||||
|
|
||||||
|
# Always record page title (used in notifications, and can change even when the content is the same)
|
||||||
|
try:
|
||||||
|
page_title = html_tools.extract_title(data=update_handler.fetcher.content)
|
||||||
|
logger.info(f"UUID: {uuid} Page <title> is '{page_title}")
|
||||||
|
datastore.update_watch(uuid=uuid, update_obj={'title': page_title})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
|
||||||
|
|
||||||
# Record server header
|
# Record server header
|
||||||
try:
|
try:
|
||||||
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
|
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
|
||||||
|
|
|
@ -84,7 +84,7 @@
|
||||||
<span class="pure-form-message-inline">Love RSS? Does your reader support HTML? Set it here</span>
|
<span class="pure-form-message-inline">Love RSS? Does your reader support HTML? Set it here</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_checkbox_field(form.application.form.extract_title_as_title) }}
|
{{ render_checkbox_field(form.application.form.use_page_title_in_list) }}
|
||||||
<span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
|
<span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
|
|
|
@ -548,7 +548,7 @@ class commonSettingsForm(Form):
|
||||||
self.notification_title.extra_notification_tokens = kwargs.get('extra_notification_tokens', {})
|
self.notification_title.extra_notification_tokens = kwargs.get('extra_notification_tokens', {})
|
||||||
self.notification_urls.extra_notification_tokens = kwargs.get('extra_notification_tokens', {})
|
self.notification_urls.extra_notification_tokens = kwargs.get('extra_notification_tokens', {})
|
||||||
|
|
||||||
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False)
|
use_page_title_in_list = BooleanField('Use page <title> in watch', default=False)
|
||||||
fetch_backend = RadioField(u'Fetch Method', choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
|
fetch_backend = RadioField(u'Fetch Method', choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
|
||||||
notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()])
|
notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()])
|
||||||
notification_format = SelectField('Notification format', choices=valid_notification_formats.keys())
|
notification_format = SelectField('Notification format', choices=valid_notification_formats.keys())
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from typing import List
|
from typing import List
|
||||||
|
import html
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
@ -9,6 +10,11 @@ TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
|
||||||
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
|
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
|
||||||
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
|
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
|
||||||
|
|
||||||
|
# Fast-path <title> grab: DOTALL so titles spanning lines match, case-insensitive tag name.
TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.I | re.S)
# <meta charset="..."> sniffing, used when byte content fails UTF-8 decoding.
META_CS = re.compile(r'<meta[^>]+charset=["\']?\s*([a-z0-9_\-:+.]+)', re.I)
# Legacy <meta http-equiv="Content-Type" content="...; charset=..."> form of the same hint.
META_CT = re.compile(r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]*content=["\'][^>]*charset=([a-z0-9_\-:+.]+)', re.I)
|
||||||
|
|
||||||
|
|
||||||
# 'price' , 'lowPrice', 'highPrice' are usually under here
|
# 'price' , 'lowPrice', 'highPrice' are usually under here
|
||||||
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
|
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
|
||||||
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
|
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
|
||||||
|
@ -510,3 +516,41 @@ def get_triggered_text(content, trigger_text):
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
return triggered_text
|
return triggered_text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_title(data: bytes | str, sniff_bytes: int = 2048, scan_chars: int = 8192) -> str | None:
|
||||||
|
try:
|
||||||
|
# Only decode/process the prefix we need for title extraction
|
||||||
|
match data:
|
||||||
|
case bytes() if data.startswith((b"\xff\xfe", b"\xfe\xff")):
|
||||||
|
prefix = data[:scan_chars * 2].decode("utf-16", errors="replace")
|
||||||
|
case bytes() if data.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")):
|
||||||
|
prefix = data[:scan_chars * 4].decode("utf-32", errors="replace")
|
||||||
|
case bytes():
|
||||||
|
try:
|
||||||
|
prefix = data[:scan_chars].decode("utf-8")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
try:
|
||||||
|
head = data[:sniff_bytes].decode("ascii", errors="ignore")
|
||||||
|
if m := (META_CS.search(head) or META_CT.search(head)):
|
||||||
|
enc = m.group(1).lower()
|
||||||
|
else:
|
||||||
|
enc = "cp1252"
|
||||||
|
prefix = data[:scan_chars * 2].decode(enc, errors="replace")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Title extraction encoding detection failed: {e}")
|
||||||
|
return None
|
||||||
|
case str():
|
||||||
|
prefix = data[:scan_chars] if len(data) > scan_chars else data
|
||||||
|
case _:
|
||||||
|
logger.error(f"Title extraction received unsupported data type: {type(data)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Search only in the prefix
|
||||||
|
if m := TITLE_RE.search(prefix):
|
||||||
|
return html.unescape(" ".join(m.group(1).split())).strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Title extraction failed: {e}")
|
||||||
|
return None
|
|
@ -39,7 +39,6 @@ class model(dict):
|
||||||
'api_access_token_enabled': True,
|
'api_access_token_enabled': True,
|
||||||
'base_url' : None,
|
'base_url' : None,
|
||||||
'empty_pages_are_a_change': False,
|
'empty_pages_are_a_change': False,
|
||||||
'extract_title_as_title': False,
|
|
||||||
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
|
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
|
||||||
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
|
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
|
||||||
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||||
|
@ -57,9 +56,10 @@ class model(dict):
|
||||||
'rss_hide_muted_watches': True,
|
'rss_hide_muted_watches': True,
|
||||||
'schema_version' : 0,
|
'schema_version' : 0,
|
||||||
'shared_diff_access': False,
|
'shared_diff_access': False,
|
||||||
'webdriver_delay': None , # Extra delay in seconds before extracting text
|
|
||||||
'tags': {}, #@todo use Tag.model initialisers
|
'tags': {}, #@todo use Tag.model initialisers
|
||||||
'timezone': None, # Default IANA timezone name
|
'timezone': None, # Default IANA timezone name
|
||||||
|
'use_page_title_in_list': False,
|
||||||
|
'webdriver_delay': None , # Extra delay in seconds before extracting text
|
||||||
'ui': {
|
'ui': {
|
||||||
'open_diff_in_new_tab': True,
|
'open_diff_in_new_tab': True,
|
||||||
'socket_io_enabled': True,
|
'socket_io_enabled': True,
|
||||||
|
|
|
@ -24,7 +24,6 @@ class watch_base(dict):
|
||||||
'content-type': None,
|
'content-type': None,
|
||||||
'date_created': None,
|
'date_created': None,
|
||||||
'extract_text': [], # Extract text by regex after filters
|
'extract_text': [], # Extract text by regex after filters
|
||||||
'extract_title_as_title': False,
|
|
||||||
'fetch_backend': 'system', # plaintext, playwright etc
|
'fetch_backend': 'system', # plaintext, playwright etc
|
||||||
'fetch_time': 0.0,
|
'fetch_time': 0.0,
|
||||||
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
||||||
|
@ -128,6 +127,7 @@ class watch_base(dict):
|
||||||
'remove_duplicate_lines': False,
|
'remove_duplicate_lines': False,
|
||||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||||
'url': '',
|
'url': '',
|
||||||
|
'use_page_title_in_list': False,
|
||||||
'uuid': str(uuid.uuid4()),
|
'uuid': str(uuid.uuid4()),
|
||||||
'webdriver_delay': None,
|
'webdriver_delay': None,
|
||||||
'webdriver_js_execute_code': None, # Run before change-detection
|
'webdriver_js_execute_code': None, # Run before change-detection
|
||||||
|
|
|
@ -262,11 +262,6 @@ class ChangeDetectionStore:
|
||||||
extras = deepcopy(self.data['watching'][uuid])
|
extras = deepcopy(self.data['watching'][uuid])
|
||||||
new_uuid = self.add_watch(url=url, extras=extras)
|
new_uuid = self.add_watch(url=url, extras=extras)
|
||||||
watch = self.data['watching'][new_uuid]
|
watch = self.data['watching'][new_uuid]
|
||||||
|
|
||||||
if self.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
|
|
||||||
# Because it will be recalculated on the next fetch
|
|
||||||
self.data['watching'][new_uuid]['title'] = None
|
|
||||||
|
|
||||||
return new_uuid
|
return new_uuid
|
||||||
|
|
||||||
def url_exists(self, url):
|
def url_exists(self, url):
|
||||||
|
@ -308,7 +303,6 @@ class ChangeDetectionStore:
|
||||||
'browser_steps',
|
'browser_steps',
|
||||||
'css_filter',
|
'css_filter',
|
||||||
'extract_text',
|
'extract_text',
|
||||||
'extract_title_as_title',
|
|
||||||
'headers',
|
'headers',
|
||||||
'ignore_text',
|
'ignore_text',
|
||||||
'include_filters',
|
'include_filters',
|
||||||
|
@ -323,6 +317,7 @@ class ChangeDetectionStore:
|
||||||
'title',
|
'title',
|
||||||
'trigger_text',
|
'trigger_text',
|
||||||
'url',
|
'url',
|
||||||
|
'use_page_title_in_list',
|
||||||
'webdriver_js_execute_code',
|
'webdriver_js_execute_code',
|
||||||
]:
|
]:
|
||||||
if res.get(k):
|
if res.get(k):
|
||||||
|
@ -973,6 +968,16 @@ class ChangeDetectionStore:
|
||||||
f_d.write(zlib.compress(f_j.read()))
|
f_d.write(zlib.compress(f_j.read()))
|
||||||
os.unlink(json_path)
|
os.unlink(json_path)
|
||||||
|
|
||||||
|
def update_20(self):
    """Schema migration: rename the legacy 'extract_title_as_title' flag to
    'use_page_title_in_list', for each watch and for the application settings."""
    # Per-watch: carry the flag over (and drop the old key) only where it was set.
    for watch_uuid, watch in self.data['watching'].items():
        legacy_flag = watch.get('extract_title_as_title')
        if not legacy_flag:
            continue
        watch['use_page_title_in_list'] = legacy_flag
        del watch['extract_title_as_title']

    # Global application setting: same rename when the legacy flag was enabled.
    app_settings = self.data['settings']['application']
    if app_settings.get('extract_title_as_title'):
        app_settings['use_page_title_in_list'] = app_settings.get('extract_title_as_title')
|
|
||||||
def add_notification_url(self, notification_url):
|
def add_notification_url(self, notification_url):
|
||||||
|
|
||||||
logger.debug(f">>> Adding new notification_url - '{notification_url}'")
|
logger.debug(f">>> Adding new notification_url - '{notification_url}'")
|
||||||
|
|
|
@ -70,7 +70,7 @@
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>{{ '{{watch_title}}' }}</code></td>
|
<td><code>{{ '{{watch_title}}' }}</code></td>
|
||||||
<td>The title of the watch.</td>
|
<td>The page title of the watch.</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>{{ '{{watch_tag}}' }}</code></td>
|
<td><code>{{ '{{watch_tag}}' }}</code></td>
|
||||||
|
|
|
@ -102,7 +102,7 @@
|
||||||
<br>
|
<br>
|
||||||
</div>
|
</div>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_checkbox_field(form.extract_title_as_title) }}
|
{{ render_checkbox_field(form.use_page_title_in_list) }}
|
||||||
</div>
|
</div>
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_checkbox_field(form.filter_failure_notification_send) }}
|
{{ render_checkbox_field(form.filter_failure_notification_send) }}
|
||||||
|
|
|
@ -123,7 +123,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||||
# Enable auto pickup of <title> in settings
|
# Enable auto pickup of <title> in settings
|
||||||
res = client.post(
|
res = client.post(
|
||||||
url_for("settings.settings_page"),
|
url_for("settings.settings_page"),
|
||||||
data={"application-extract_title_as_title": "1", "requests-time_between_check-minutes": 180,
|
data={"application-use_page_title_in_list": "1", "requests-time_between_check-minutes": 180,
|
||||||
'application-fetch_backend': "html_requests"},
|
'application-fetch_backend': "html_requests"},
|
||||||
follow_redirects=True
|
follow_redirects=True
|
||||||
)
|
)
|
||||||
|
@ -138,6 +138,15 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||||
# It should have picked up the <title>
|
# It should have picked up the <title>
|
||||||
assert b'head title' in res.data
|
assert b'head title' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
# Recheck it but only with a title change
|
||||||
|
set_original_response(extra_title=" and more")
|
||||||
|
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||||
|
wait_for_all_checks(client)
|
||||||
|
res = client.get(url_for("watchlist.index"))
|
||||||
|
assert b'head title and more' in res.data
|
||||||
|
|
||||||
|
|
||||||
# Be sure the last_viewed is going to be greater than the last snapshot
|
# Be sure the last_viewed is going to be greater than the last snapshot
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
|
|
|
@ -6,9 +6,9 @@ from flask import url_for
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
|
||||||
def set_original_response():
|
def set_original_response(extra_title=''):
|
||||||
test_return_data = """<html>
|
test_return_data = f"""<html>
|
||||||
<head><title>head title</title></head>
|
<head><title>head title{extra_title}</title></head>
|
||||||
<body>
|
<body>
|
||||||
Some initial text<br>
|
Some initial text<br>
|
||||||
<p>Which is across multiple lines</p>
|
<p>Which is across multiple lines</p>
|
||||||
|
|
Ładowanie…
Reference in New Issue