2023-11-13 09:42:56 +00:00
|
|
|
# HTML to TEXT/JSON DIFFERENCE self.fetcher
|
2023-03-12 17:11:53 +00:00
|
|
|
|
2021-01-27 14:12:31 +00:00
|
|
|
import hashlib
|
2022-12-15 08:13:09 +00:00
|
|
|
import json
|
2022-03-21 19:59:20 +00:00
|
|
|
import os
|
2021-08-16 11:13:17 +00:00
|
|
|
import re
|
2022-03-12 12:29:30 +00:00
|
|
|
import urllib3
|
|
|
|
|
2025-03-17 18:20:24 +00:00
|
|
|
from changedetectionio.conditions import execute_ruleset_against_all_plugins
|
2024-07-12 15:09:42 +00:00
|
|
|
from changedetectionio.processors import difference_detection_processor
|
2024-10-10 12:59:39 +00:00
|
|
|
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
|
2024-02-10 23:09:12 +00:00
|
|
|
from changedetectionio import html_tools, content_fetchers
|
2022-12-08 21:35:37 +00:00
|
|
|
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
|
2024-01-16 08:48:16 +00:00
|
|
|
from loguru import logger
|
2021-02-04 11:38:48 +00:00
|
|
|
|
2021-06-24 09:10:19 +00:00
|
|
|
# Runs at import time: turns off urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
2021-06-23 10:40:01 +00:00
|
|
|
|
2023-10-03 15:44:27 +00:00
|
|
|
# Human-readable title of this processor module.
# NOTE(review): presumably read by whatever lists the available processors - confirm against the caller.
name = 'Webpage Text/HTML, JSON and PDF changes'
|
2023-03-18 19:36:26 +00:00
|
|
|
# One-line summary of what this processor module does.
# NOTE(review): presumably displayed next to `name` - confirm against the caller.
description = 'Detects all text changes where possible'
|
2024-07-12 15:09:42 +00:00
|
|
|
|
2024-06-21 11:31:03 +00:00
|
|
|
# Markers that identify an include-filter rule as a JSON filter. run_changedetection() treats any
# rule that contains one of these as JSON and hands it to html_tools.extract_json_as_string().
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
|
2023-10-03 15:44:27 +00:00
|
|
|
|
2022-11-03 11:13:54 +00:00
|
|
|
class FilterNotFoundInResponse(ValueError):
    """Raised when the configured include filters matched nothing in the response.

    Besides the usual exception message, the instance carries the optional
    ``screenshot`` and ``xpath_data`` handed to the constructor so that the
    code catching the error can still access them.
    """

    def __init__(self, msg, screenshot=None, xpath_data=None):
        # Let ValueError store the message in ``args`` first, then attach the extras.
        super().__init__(msg)
        self.screenshot, self.xpath_data = screenshot, xpath_data
|
|
|
|
|
2023-10-03 15:44:27 +00:00
|
|
|
|
2022-12-19 16:51:41 +00:00
|
|
|
class PDFToHTMLToolNotFound(ValueError):
    """Raised when the external PDF-to-HTML command-line tool is not on the system PATH."""

    def __init__(self, msg):
        # Explicit constructor keeps ``msg`` a required argument.
        super().__init__(msg)
|
|
|
|
|
2022-11-03 11:13:54 +00:00
|
|
|
|
2021-02-05 17:43:35 +00:00
|
|
|
# Some common stuff here that can be moved to a base class
|
2022-07-26 15:33:40 +00:00
|
|
|
# (set_proxy_from_list)
|
2023-03-18 19:36:26 +00:00
|
|
|
class perform_site_check(difference_detection_processor):
    """Processor that detects changes in the text of HTML, JSON, RSS/XML, plaintext and PDF responses."""

    def run_changedetection(self, watch):
        """Filter the already-fetched content for ``watch`` and decide whether it changed.

        The content is read from ``self.fetcher`` (the fetch itself happens elsewhere).
        It is normalised (RSS CDATA, PDF -> HTML, JSON key sorting), reduced by the
        include filters / subtractive selectors, converted to text, post-processed
        (trim, diff-type filter, regex extraction, de-duplication, sorting) and
        finally checksummed and compared with ``watch['previous_md5']``.

        Side effects visible in this method:
          * ``self.screenshot`` / ``self.xpath_data`` are copied from the fetcher.
          * ``self.fetcher.content`` may be rewritten in place.
          * ``watch['previous_md5']`` is set on the first unblocked run.
          * ``watch.save_last_text_fetched_before_filters()`` is called when the
            watch has special diff filter options and some history.

        :param watch: the watch being checked; accessed both dict-style
            (``watch.get(...)``) and through attributes/methods such as
            ``watch.link`` and ``watch.is_pdf``.
        :returns: ``(changed_detected, update_obj, text)``. ``text`` is normally a
            ``str``. On the early "no rendered diff" exit the tuple is
            ``(False, {'previous_md5': ...}, text_as_utf8_bytes)``.
        :raises Exception: if ``watch`` is falsy.
        :raises PDFToHTMLToolNotFound: a PDF was received but the converter is not on PATH.
        :raises FilterNotFoundInResponse: include filters are set but matched nothing.
        :raises content_fetchers.exceptions.ReplyWithContentButNoText: the response
            produced no text and empty pages are not configured to count as a change.
        """
        changed_detected = False
        html_content = ""
        stripped_text_from_html = ""

        if not watch:
            raise Exception("Watch no longer exists.")

        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

        url = watch.link

        self.screenshot = self.fetcher.screenshot
        self.xpath_data = self.fetcher.xpath_data

        app_settings = self.datastore.data['settings']['application']

        # The response headers are not modified anywhere below, so read the content-type once.
        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()

        # Track the content type
        update_obj['content_type'] = ctype_header

        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
        update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()

        # Fetching complete, now filters

        # @note: I feel like the following should be in a more obvious chain system
        #  - Check filter text
        #  - Is the checksum different?
        #  - Do we convert to JSON?
        # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
        # return content().textfilter().jsonextract().checksumcompare() ?

        is_json = 'application/json' in ctype_header
        is_html = not is_json
        is_rss = False

        # Go into RSS preprocess for converting CDATA/comment to usable text
        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
            if '<rss' in self.fetcher.content[:100].lower():
                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
                is_rss = True

        # source: support, basically treat it as plaintext
        if watch.is_source_type_url:
            is_html = False
            is_json = False

        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
        if watch.is_pdf or 'application/pdf' in ctype_header or inline_pdf:
            from shutil import which
            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
            if not which(tool):
                raise PDFToHTMLToolNotFound("Command-line `{}` tool was not found in system PATH, was it installed?".format(tool))

            import subprocess
            # NOTE(review): writing all of stdin before reading stdout can block if the tool
            # starts emitting output before it has consumed its input; Popen.communicate()
            # avoids that but would change the timeout semantics, so it is left as-is here.
            proc = subprocess.Popen(
                [tool, '-stdout', '-', '-s', 'out.pdf', '-i'],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE)
            proc.stdin.write(self.fetcher.raw_content)
            proc.stdin.close()
            self.fetcher.content = proc.stdout.read().decode('utf-8')
            proc.wait(timeout=60)

            # Add a little metadata so we know if the file changes (like if an image changes, but the text is the same
            # @todo may cause problems with non-UTF8?
            # NOTE(review): the "Filesize" figure is the length of the converted HTML text, not of
            # the raw PDF bytes. Left unchanged because altering it would change existing snapshots.
            metadata = "<p>Added by changedetection.io: Document checksum - {} Filesize - {} bytes</p>".format(
                hashlib.md5(self.fetcher.raw_content).hexdigest().upper(),
                len(self.fetcher.content))

            self.fetcher.content = self.fetcher.content.replace('</body>', metadata + '</body>')

        # Better would be if Watch.model could access the global data also
        # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__
        # https://realpython.com/inherit-python-dict/ instead of doing it procedurely
        include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='include_filters')

        # 1845 - remove duplicated filters in both group and watch include filter
        # (list(dict.fromkeys(...)) de-duplicates while keeping order and yields a NEW list,
        # so the in-place additions below never touch the watch's own list)
        include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags))

        subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='subtractive_selectors'),
                                 *watch.get("subtractive_selectors", []),
                                 *app_settings.get("global_subtractive_selectors", [])
                                 ]

        # Inject a virtual LD+JSON price tracker rule
        if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
            include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS

        # Only the FIRST entry is inspected: a list whose first rule is blank counts as "no filter".
        has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
        has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

        if is_json and not has_filter_rule:
            include_filters_rule.append("json:$")
            has_filter_rule = True

        if is_json:
            # Sort the JSON so we dont get false alerts when the content is just re-ordered
            try:
                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
            except Exception as e:
                # Might have just been a snippet, or otherwise bad JSON, continue (best effort)
                logger.debug(f"Watch UUID {watch.get('uuid')} JSON could not be re-sorted, using it as-is: {e}")

        if has_filter_rule:
            for json_candidate_rule in include_filters_rule:
                if any(prefix in json_candidate_rule for prefix in json_filter_prefixes):
                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=json_candidate_rule)
                    # Any JSON rule switches the whole check away from HTML processing
                    is_html = False

        if is_html or watch.is_source_type_url:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content

            # If not JSON, and if it's not text/plain..
            if 'text/plain' in ctype_header:
                # Don't run get_text or xpath/css filters on plaintext
                stripped_text_from_html = html_content
            else:
                # Does it have some ld+json price data? used for easier monitoring
                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content)

                # Then we assume HTML
                if has_filter_rule:
                    html_content = ""

                    for filter_rule in include_filters_rule:
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
                                                                    is_rss=is_rss)

                        elif filter_rule.startswith('xpath1:'):
                            html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                     html_content=self.fetcher.content,
                                                                     append_pretty_line_formatting=not watch.is_source_type_url,
                                                                     is_rss=is_rss)
                        else:
                            html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                       html_content=self.fetcher.content,
                                                                       append_pretty_line_formatting=not watch.is_source_type_url)

                    if not html_content.strip():
                        raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)

                if has_subtractive_selectors:
                    html_content = html_tools.element_removal(subtractive_selectors, html_content)

                if watch.is_source_type_url:
                    stripped_text_from_html = html_content
                else:
                    # extract text
                    do_anchor = app_settings.get("render_anchor_tag_content", False)
                    stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                                      render_anchor_tag_content=do_anchor,
                                                                      is_rss=is_rss)  # 1874 activate the <title workaround hack

        if watch.get('trim_text_whitespace'):
            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())

        # Re #340 - return the content before the 'ignore text' was applied
        # Also used to calculate/show what was removed
        text_content_before_ignored_filter = stripped_text_from_html

        # @todo whitespace coming from missing rtrim()?
        # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
        # Rewrite's the processing text based on only what diff result they want to see

        if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
            # Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences
            from changedetectionio import diff
            # needs to not include (added) etc or it may get used twice
            # Replace the processed text with the preferred result
            rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_text_before_filters(),
                                             newest_version_file_contents=stripped_text_from_html,
                                             include_equal=False,  # not the same lines
                                             include_added=watch.get('filter_text_added', True),
                                             include_removed=watch.get('filter_text_removed', True),
                                             include_replaced=watch.get('filter_text_replaced', True),
                                             line_feed_sep="\n",
                                             include_change_type_prefix=False)

            watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))

            if not rendered_diff and stripped_text_from_html:
                # We had some content, but no differences were found
                # Store our new file as the MD5 so it will trigger in the future
                c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
                # Note: this early exit returns the text as UTF-8 bytes, unlike the final return.
                return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
            else:
                stripped_text_from_html = rendered_diff

        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = app_settings.get('empty_pages_are_a_change', False)
        if not is_json and not empty_pages_are_a_change and not stripped_text_from_html.strip():
            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                                        status_code=self.fetcher.get_last_status_code(),
                                                                        screenshot=self.fetcher.screenshot,
                                                                        has_filters=has_filter_rule,
                                                                        html_content=html_content,
                                                                        xpath_data=self.fetcher.xpath_data
                                                                        )

        # We rely on the actual text in the html output.. many sites have random script vars etc,
        # in the future we'll implement other mechanisms.

        update_obj["last_check_status"] = self.fetcher.get_last_status_code()

        # 615 Extract text by regex
        # Build a NEW list: extending the list returned by watch.get() in place would append the
        # tag overrides to the watch's own stored setting on every single check.
        extract_text = [*watch.get('extract_text', []),
                        *self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='extract_text')]
        if extract_text:
            regex_matched_output = []
            for s_re in extract_text:
                # incase they specified something in '/.../x'
                if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
                    regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
                    result = re.findall(regex, stripped_text_from_html)

                    for found in result:
                        if isinstance(found, tuple):
                            # Several capture groups: findall yields one tuple per match
                            # @todo - some formatter option default (between groups)
                            regex_matched_output.extend([*found, '\n'])
                        else:
                            # @todo - some formatter option default (between each ungrouped result)
                            regex_matched_output.extend([found, '\n'])
                else:
                    # Doesnt look like regex, just hunt for plaintext and return that which matches
                    # (the text is a str at this point, so the escaped pattern is used as a str too)
                    r = re.compile(re.escape(s_re), re.IGNORECASE)
                    for match in r.findall(stripped_text_from_html):
                        regex_matched_output.extend([match, '\n'])

            ##########################################################
            # With extract rules configured, ONLY the matched output is kept (possibly nothing)
            stripped_text_from_html = ''

            if regex_matched_output:
                # @todo some formatter for presentation?
                stripped_text_from_html = ''.join(regex_matched_output)

        if watch.get('remove_duplicate_lines'):
            # dict.fromkeys() keeps the first occurrence of each line, in order
            stripped_text_from_html = '\n'.join(dict.fromkeys(stripped_text_from_html.replace("\n\n", "\n").splitlines()))

        if watch.get('sort_text_alphabetically'):
            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
            # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
            stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
            stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=str.lower))

        ### CALCULATE MD5
        # If there's text to ignore
        # (the `+` below already creates a new list, so the later `+=` is safe)
        text_to_ignore = watch.get('ignore_text', []) + app_settings.get('global_ignore_text', [])
        text_to_ignore += self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='ignore_text')

        text_for_checksuming = stripped_text_from_html
        if text_to_ignore:
            text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)

        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if text_for_checksuming and app_settings.get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
        else:
            fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()

        ############ Blocking rules, after checksum #################
        blocked = False
        # New list for the same reason as extract_text above: never extend the watch's own list.
        trigger_text = [*watch.get('trigger_text', []),
                        *self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='trigger_text')]
        if trigger_text:
            # Assume blocked
            blocked = True
            # Filter and trigger works the same, so reuse it
            # It should return the line numbers that match
            # Unblock flow if the trigger was found (some text remained after stripped what didnt match)
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
                                                  wordlist=trigger_text,
                                                  mode="line numbers")
            # Unblock if the trigger was found
            if result:
                blocked = False

        text_should_not_be_present = [*watch.get('text_should_not_be_present', []),
                                      *self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='text_should_not_be_present')]
        if text_should_not_be_present:
            # If anything matched, then we should block a change from happening
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
                                                  wordlist=text_should_not_be_present,
                                                  mode="line numbers")
            if result:
                blocked = True

        # And check if 'conditions' will let this pass through
        if watch.get('conditions') and watch.get('conditions_match_logic'):
            conditions_result = execute_ruleset_against_all_plugins(current_watch_uuid=watch.get('uuid'),
                                                                    application_datastruct=self.datastore.data,
                                                                    ephemeral_data={
                                                                        'text': stripped_text_from_html
                                                                    }
                                                                    )

            if not conditions_result.get('result'):
                # Conditions say "Condition not met" so we block it.
                blocked = True

        # Looks like something changed, but did it match all the rules?
        if blocked:
            changed_detected = False
        else:
            # The main thing that all this at the moment comes down to :)
            if watch.get('previous_md5') != fetched_md5:
                changed_detected = True

            # Always record the new checksum
            update_obj["previous_md5"] = fetched_md5

            # On the first run of a site, watch['previous_md5'] will be None, set it the current one.
            if not watch.get('previous_md5'):
                watch['previous_md5'] = fetched_md5

        logger.debug(f"Watch UUID {watch.get('uuid')} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")

        if changed_detected:
            if watch.get('check_unique_lines', False):
                ignore_whitespace = app_settings.get('ignore_whitespace')

                has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
                    lines=stripped_text_from_html.splitlines(),
                    ignore_whitespace=ignore_whitespace
                )

                # One or more lines? unsure?
                if not has_unique_lines:
                    logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
                    changed_detected = False
                else:
                    logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")

        # stripped_text_from_html - Everything after filters and NO 'ignored' content
        return changed_detected, update_obj, stripped_text_from_html
|