diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index 1e896686..8be939ef 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -70,15 +70,17 @@ class Fetcher():
 
     @abstractmethod
     async def run(self,
-                  url,
-                  timeout,
-                  request_headers,
-                  request_body,
-                  request_method,
-                  ignore_status_codes=False,
-                  current_include_filters=None,
-                  is_binary=False,
-                  empty_pages_are_a_change=False):
+                  fetch_favicon=True,
+                  current_include_filters=None,
+                  empty_pages_are_a_change=False,
+                  ignore_status_codes=False,
+                  is_binary=False,
+                  request_body=None,
+                  request_headers=None,
+                  request_method=None,
+                  timeout=None,
+                  url=None,
+                  ):
         # Should set self.error, self.status_code and self.content
         pass
 
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index 6093036f..04fc3c0f 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -143,15 +143,17 @@ class fetcher(Fetcher):
             f.write(content)
 
     async def run(self,
-                  url,
-                  timeout,
-                  request_headers,
-                  request_body,
-                  request_method,
-                  ignore_status_codes=False,
-                  current_include_filters=None,
-                  is_binary=False,
-                  empty_pages_are_a_change=False):
+                  fetch_favicon=True,
+                  current_include_filters=None,
+                  empty_pages_are_a_change=False,
+                  ignore_status_codes=False,
+                  is_binary=False,
+                  request_body=None,
+                  request_headers=None,
+                  request_method=None,
+                  timeout=None,
+                  url=None,
+                  ):
 
         from playwright.async_api import async_playwright
         import playwright._impl._errors
@@ -234,11 +236,12 @@ class fetcher(Fetcher):
                 await browser.close()
                 raise PageUnloadable(url=url, status_code=None, message=str(e))
 
-            try:
-                self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
-                await self.page.request_gc()
-            except Exception as e:
-                logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
+            if fetch_favicon:
+                try:
+                    self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
+                    await self.page.request_gc()
+                except Exception as e:
+                    logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
 
             if self.status_code != 200 and not ignore_status_codes:
                 screenshot = await capture_full_page_async(self.page)
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 34eec47f..58511d54 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -145,15 +145,16 @@ class fetcher(Fetcher):
     #             f.write(content)
 
     async def fetch_page(self,
-                         url,
-                         timeout,
-                         request_headers,
-                         request_body,
-                         request_method,
-                         ignore_status_codes,
                          current_include_filters,
+                         empty_pages_are_a_change,
+                         fetch_favicon,
+                         ignore_status_codes,
                          is_binary,
-                         empty_pages_are_a_change
+                         request_body,
+                         request_headers,
+                         request_method,
+                         timeout,
+                         url,
                          ):
         import re
         self.delete_browser_steps_screenshots()
@@ -181,6 +182,9 @@ class fetcher(Fetcher):
 
             # more reliable is to just request a new page
             self.page = await browser.newPage()
+
+            # Add console handler to capture console.log from favicon fetcher
+            #self.page.on('console', lambda msg: logger.debug(f"Browser console [{msg.type}]: {msg.text}"))
 
             if '--window-size' in self.browser_connection_url:
                 # Be sure the viewport is always the window-size, this is often not the same thing
@@ -290,10 +294,11 @@ class fetcher(Fetcher):
                 await browser.close()
                 raise PageUnloadable(url=url, status_code=None, message=str(e))
 
-            try:
-                self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
-            except Exception as e:
-                logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
+            if fetch_favicon:
+                try:
+                    self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
+                except Exception as e:
+                    logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
 
             if self.status_code != 200 and not ignore_status_codes:
                 screenshot = await capture_full_page(page=self.page)
@@ -346,8 +351,18 @@ class fetcher(Fetcher):
     async def main(self, **kwargs):
         await self.fetch_page(**kwargs)
 
-    async def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
-            current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
+    async def run(self,
+                  fetch_favicon=True,
+                  current_include_filters=None,
+                  empty_pages_are_a_change=False,
+                  ignore_status_codes=False,
+                  is_binary=False,
+                  request_body=None,
+                  request_headers=None,
+                  request_method=None,
+                  timeout=None,
+                  url=None,
+                  ):
 
         #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
         max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180))
@@ -355,16 +370,17 @@ class fetcher(Fetcher):
         # Now we run this properly in async context since we're called from async worker
         try:
             await asyncio.wait_for(self.main(
-                url=url,
-                timeout=timeout,
-                request_headers=request_headers,
-                request_body=request_body,
-                request_method=request_method,
-                ignore_status_codes=ignore_status_codes,
                 current_include_filters=current_include_filters,
+                empty_pages_are_a_change=empty_pages_are_a_change,
+                fetch_favicon=fetch_favicon,
+                ignore_status_codes=ignore_status_codes,
                 is_binary=is_binary,
-                empty_pages_are_a_change=empty_pages_are_a_change
-            ), timeout=max_time)
+                request_body=request_body,
+                request_headers=request_headers,
+                request_method=request_method,
+                timeout=timeout,
+                url=url,
+            ), timeout=max_time
+            )
         except asyncio.TimeoutError:
-            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
-
+            raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py
index aba5ed0d..cfa9400d 100644
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -104,15 +104,17 @@ class fetcher(Fetcher):
         self.raw_content = r.content
 
     async def run(self,
-                  url,
-                  timeout,
-                  request_headers,
-                  request_body,
-                  request_method,
-                  ignore_status_codes=False,
-                  current_include_filters=None,
-                  is_binary=False,
-                  empty_pages_are_a_change=False):
+                  fetch_favicon=True,
+                  current_include_filters=None,
+                  empty_pages_are_a_change=False,
+                  ignore_status_codes=False,
+                  is_binary=False,
+                  request_body=None,
+                  request_headers=None,
+                  request_method=None,
+                  timeout=None,
+                  url=None,
+                  ):
         """Async wrapper that runs the synchronous requests code in a thread pool"""
 
         loop = asyncio.get_event_loop()
diff --git a/changedetectionio/content_fetchers/res/favicon-fetcher.js b/changedetectionio/content_fetchers/res/favicon-fetcher.js
index f1a0fd8d..3f7a0ad8 100644
--- a/changedetectionio/content_fetchers/res/favicon-fetcher.js
+++ b/changedetectionio/content_fetchers/res/favicon-fetcher.js
@@ -1,79 +1,101 @@
 (async () => {
-    const links = Array.from(document.querySelectorAll(
-        'link[rel~="apple-touch-icon"], link[rel~="icon"]'
-    ));
-
-    const icons = links.map(link => {
-        const sizesStr = link.getAttribute('sizes');
-        let size = 0;
-        if (sizesStr) {
-            const [w] = sizesStr.split('x').map(Number);
-            if (!isNaN(w)) size = w;
-        } else {
-            size = 16;
-        }
-        return {
-            size,
-            rel: link.getAttribute('rel'),
-            href: link.href
-        };
-    });
-
-    // If no icons found, add fallback favicon.ico
-    if (icons.length === 0) {
-        icons.push({
-            size: 16,
-            rel: 'icon',
-            href: '/favicon.ico'
-        });
-    }
-
-    // sort preference
-    icons.sort((a, b) => {
-        const isAppleA = /apple-touch-icon/.test(a.rel);
-        const isAppleB = /apple-touch-icon/.test(b.rel);
-        if (isAppleA && !isAppleB) return -1;
-        if (!isAppleA && isAppleB) return 1;
-        return b.size - a.size;
-    });
-
-    const timeoutMs = 2000;
-
-    for (const icon of icons) {
-        try {
-            const controller = new AbortController();
-            const timeout = setTimeout(() => controller.abort(), timeoutMs);
-
-            const resp = await fetch(icon.href, {
-                signal: controller.signal,
-                redirect: 'follow'
-            });
-
-            clearTimeout(timeout);
-
-            if (!resp.ok) {
-                continue;
-            }
-
-            const blob = await resp.blob();
-
-            // Convert blob to base64
-            const reader = new FileReader();
-            return await new Promise(resolve => {
-                reader.onloadend = () => {
-                    resolve({
-                        url: icon.href,
-                        base64: reader.result.split(",")[1]
-                    });
-                };
-                reader.readAsDataURL(blob);
-            });
-
-        } catch (e) {
-            continue;
-        }
-    }
-
-    // nothing found
-    return null;
+    // Define the function inside the IIFE for console testing
+    window.getFaviconAsBlob = async function() {
+        const links = Array.from(document.querySelectorAll(
+            'link[rel~="apple-touch-icon"], link[rel~="icon"]'
+        ));
+
+        const icons = links.map(link => {
+            const sizesStr = link.getAttribute('sizes');
+            let size = 0;
+            if (sizesStr) {
+                const [w] = sizesStr.split('x').map(Number);
+                if (!isNaN(w)) size = w;
+            } else {
+                size = 16;
+            }
+            return {
+                size,
+                rel: link.getAttribute('rel'),
+                href: link.href,
+                hasSizes: !!sizesStr
+            };
+        });
+
+        // If no icons found, add fallback favicon.ico
+        if (icons.length === 0) {
+            icons.push({
+                size: 16,
+                rel: 'icon',
+                href: '/favicon.ico',
+                hasSizes: false
+            });
+        }
+
+        // sort preference: highest resolution first, then apple-touch-icon, then regular icons
+        icons.sort((a, b) => {
+            // First priority: actual size (highest first)
+            if (a.size !== b.size) {
+                return b.size - a.size;
+            }
+
+            // Second priority: apple-touch-icon over regular icon
+            const isAppleA = /apple-touch-icon/.test(a.rel);
+            const isAppleB = /apple-touch-icon/.test(b.rel);
+            if (isAppleA && !isAppleB) return -1;
+            if (!isAppleA && isAppleB) return 1;
+
+            // Third priority: icons with no size attribute (fallback icons) last
+            const hasNoSizeA = !a.hasSizes;
+            const hasNoSizeB = !b.hasSizes;
+            if (hasNoSizeA && !hasNoSizeB) return 1;
+            if (!hasNoSizeA && hasNoSizeB) return -1;
+
+            return 0;
+        });
+
+        const timeoutMs = 2000;
+
+        for (const icon of icons) {
+            try {
+                const controller = new AbortController();
+                const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+                const resp = await fetch(icon.href, {
+                    signal: controller.signal,
+                    redirect: 'follow'
+                });
+
+                clearTimeout(timeout);
+
+                if (!resp.ok) {
+                    continue;
+                }
+
+                const blob = await resp.blob();
+
+                // Convert blob to base64
+                const reader = new FileReader();
+                return await new Promise(resolve => {
+                    reader.onloadend = () => {
+                        resolve({
+                            url: icon.href,
+                            base64: reader.result.split(",")[1]
+                        });
+                    };
+                    reader.readAsDataURL(blob);
+                });
+
+            } catch (e) {
+                continue;
+            }
+        }
+
+        // nothing found
+        return null;
+    };
+
+    // Auto-execute and return result for page.evaluate()
+    return await window.getFaviconAsBlob();
 })();
+
diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py
index 48897d7a..41cbf5d5 100644
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -4,9 +4,10 @@ import time
 
 from loguru import logger
 
 from changedetectionio.content_fetchers.base import Fetcher
+
 class fetcher(Fetcher):
     if os.getenv("WEBDRIVER_URL"):
-        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
+        fetcher_description = f"WebDriver Chrome/Javascript via \"{os.getenv('WEBDRIVER_URL', '')}\""
     else:
         fetcher_description = "WebDriver Chrome/Javascript"
@@ -25,7 +26,6 @@ class fetcher(Fetcher):
             self.browser_connection_is_custom = True
             self.browser_connection_url = custom_browser_connection_url
 
-
         ##### PROXY SETUP #####
 
         proxy_sources = [
@@ -38,7 +38,7 @@ class fetcher(Fetcher):
            os.getenv('webdriver_proxyHttps'),
            os.getenv('webdriver_httpsProxy'),
            os.getenv('webdriver_sslProxy'),
-           proxy_override, # last one should override
+           proxy_override,  # last one should override
         ]
         # The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
         for k in filter(None, proxy_sources):
@@ -46,20 +46,21 @@ class fetcher(Fetcher):
                 continue
             self.proxy_url = k.strip()
 
-
    async def run(self,
-                  url,
-                  timeout,
-                  request_headers,
-                  request_body,
-                  request_method,
-                  ignore_status_codes=False,
-                  current_include_filters=None,
-                  is_binary=False,
-                  empty_pages_are_a_change=False):
+                  fetch_favicon=True,
+                  current_include_filters=None,
+                  empty_pages_are_a_change=False,
+                  ignore_status_codes=False,
+                  is_binary=False,
+                  request_body=None,
+                  request_headers=None,
+                  request_method=None,
+                  timeout=None,
+                  url=None,
+                  ):
         import asyncio
-        
+
         # Wrap the entire selenium operation in a thread executor
         def _run_sync():
             from selenium.webdriver.chrome.options import Options as ChromeOptions
@@ -140,4 +141,3 @@
         # Run the selenium operations in a thread pool to avoid blocking the event loop
         loop = asyncio.get_event_loop()
         await loop.run_in_executor(None, _run_sync)
-
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index f6dfffcc..4974d918 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -14,6 +14,8 @@ from ..html_tools import TRANSLATE_WHITESPACE_TABLE
 
 # Allowable protocols, protects against javascript: etc
 # file:// is further checked by ALLOW_FILE_URI
 SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
+FAVICON_RESAVE_THRESHOLD_SECONDS=86400
+
 minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3))
 mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
@@ -420,6 +422,28 @@ class model(watch_base):
             # False is not an option for AppRise, must be type None
             return None
 
+    def favicon_is_expired(self):
+        favicon_fname = self.get_favicon_filename()
+        import glob
+        import time
+
+        if not favicon_fname:
+            return True
+        try:
+            fname = next(iter(glob.glob(os.path.join(self.watch_data_dir, "favicon.*"))), None)
+            logger.trace(f"Favicon file maybe found at {fname}")
+            if os.path.isfile(fname):
+                file_age = int(time.time() - os.path.getmtime(fname))
+                logger.trace(f"Favicon file age is {file_age}s")
+                if file_age < FAVICON_RESAVE_THRESHOLD_SECONDS:
+                    return False
+        except Exception as e:
+            logger.critical(f"Exception checking Favicon age {str(e)}")
{str(e)}") + return True + + # Also in the case that the file didnt exist + return True + def bump_favicon(self, url, favicon_base_64: str) -> None: from urllib.parse import urlparse import base64 diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 2ae6df4d..fadffc91 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -146,18 +146,19 @@ class difference_detection_processor(): # And here we go! call the right browser with browser-specific settings empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) - # All fetchers are now async - await self.fetcher.run(url=url, - timeout=timeout, - request_headers=request_headers, - request_body=request_body, - request_method=request_method, - ignore_status_codes=ignore_status_codes, - current_include_filters=self.watch.get('include_filters'), - is_binary=is_binary, - empty_pages_are_a_change=empty_pages_are_a_change - ) + await self.fetcher.run( + current_include_filters=self.watch.get('include_filters'), + empty_pages_are_a_change=empty_pages_are_a_change, + fetch_favicon=self.watch.favicon_is_expired(), + ignore_status_codes=ignore_status_codes, + is_binary=is_binary, + request_body=request_body, + request_headers=request_headers, + request_method=request_method, + timeout=timeout, + url=url, + ) #@todo .quit here could go on close object, so we can run JS if change-detected self.fetcher.quit(watch=self.watch)