Favicons in list - Prefer best/highest quality (#3351)

pull/3383/head
dgtlmoon 2025-08-20 13:08:36 +02:00 zatwierdzone przez GitHub
rodzic 20bcca578a
commit 685bd01156
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
8 zmienionych plików z 218 dodań i 148 usunięć

Wyświetl plik

@ -70,15 +70,17 @@ class Fetcher():
@abstractmethod
async def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
fetch_favicon=True,
current_include_filters=None,
empty_pages_are_a_change=False,
ignore_status_codes=False,
is_binary=False,
request_body=None,
request_headers=None,
request_method=None,
timeout=None,
url=None,
):
# Should set self.error, self.status_code and self.content
pass

Wyświetl plik

@ -143,15 +143,17 @@ class fetcher(Fetcher):
f.write(content)
async def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
fetch_favicon=True,
current_include_filters=None,
empty_pages_are_a_change=False,
ignore_status_codes=False,
is_binary=False,
request_body=None,
request_headers=None,
request_method=None,
timeout=None,
url=None,
):
from playwright.async_api import async_playwright
import playwright._impl._errors
@ -234,11 +236,12 @@ class fetcher(Fetcher):
await browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
await self.page.request_gc()
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if fetch_favicon:
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
await self.page.request_gc()
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page_async(self.page)

Wyświetl plik

@ -145,15 +145,16 @@ class fetcher(Fetcher):
# f.write(content)
async def fetch_page(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
empty_pages_are_a_change,
fetch_favicon,
ignore_status_codes,
is_binary,
empty_pages_are_a_change
request_body,
request_headers,
request_method,
timeout,
url,
):
import re
self.delete_browser_steps_screenshots()
@ -181,6 +182,9 @@ class fetcher(Fetcher):
# more reliable is to just request a new page
self.page = await browser.newPage()
# Add console handler to capture console.log from favicon fetcher
#self.page.on('console', lambda msg: logger.debug(f"Browser console [{msg.type}]: {msg.text}"))
if '--window-size' in self.browser_connection_url:
# Be sure the viewport is always the window-size, this is often not the same thing
@ -290,10 +294,11 @@ class fetcher(Fetcher):
await browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if fetch_favicon:
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page(page=self.page)
@ -346,8 +351,18 @@ class fetcher(Fetcher):
async def main(self, **kwargs):
await self.fetch_page(**kwargs)
async def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
async def run(self,
fetch_favicon=True,
current_include_filters=None,
empty_pages_are_a_change=False,
ignore_status_codes=False,
is_binary=False,
request_body=None,
request_headers=None,
request_method=None,
timeout=None,
url=None,
):
#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180))
@ -355,16 +370,17 @@ class fetcher(Fetcher):
# Now we run this properly in async context since we're called from async worker
try:
await asyncio.wait_for(self.main(
url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters,
empty_pages_are_a_change=empty_pages_are_a_change,
fetch_favicon=fetch_favicon,
ignore_status_codes=ignore_status_codes,
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
), timeout=max_time)
request_body=request_body,
request_headers=request_headers,
request_method=request_method,
timeout=timeout,
url=url,
), timeout=max_time
)
except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))

Wyświetl plik

@ -104,15 +104,17 @@ class fetcher(Fetcher):
self.raw_content = r.content
async def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
fetch_favicon=True,
current_include_filters=None,
empty_pages_are_a_change=False,
ignore_status_codes=False,
is_binary=False,
request_body=None,
request_headers=None,
request_method=None,
timeout=None,
url=None,
):
"""Async wrapper that runs the synchronous requests code in a thread pool"""
loop = asyncio.get_event_loop()

Wyświetl plik

@ -1,79 +1,101 @@
(async () => {
const links = Array.from(document.querySelectorAll(
'link[rel~="apple-touch-icon"], link[rel~="icon"]'
));
// Define the function inside the IIFE for console testing
window.getFaviconAsBlob = async function() {
const links = Array.from(document.querySelectorAll(
'link[rel~="apple-touch-icon"], link[rel~="icon"]'
));
const icons = links.map(link => {
const sizesStr = link.getAttribute('sizes');
let size = 0;
if (sizesStr) {
const [w] = sizesStr.split('x').map(Number);
if (!isNaN(w)) size = w;
} else {
size = 16;
}
return {
size,
rel: link.getAttribute('rel'),
href: link.href
};
});
// If no icons found, add fallback favicon.ico
if (icons.length === 0) {
icons.push({
size: 16,
rel: 'icon',
href: '/favicon.ico'
const icons = links.map(link => {
const sizesStr = link.getAttribute('sizes');
let size = 0;
if (sizesStr) {
const [w] = sizesStr.split('x').map(Number);
if (!isNaN(w)) size = w;
} else {
size = 16;
}
return {
size,
rel: link.getAttribute('rel'),
href: link.href,
hasSizes: !!sizesStr
};
});
}
// sort preference
icons.sort((a, b) => {
const isAppleA = /apple-touch-icon/.test(a.rel);
const isAppleB = /apple-touch-icon/.test(b.rel);
if (isAppleA && !isAppleB) return -1;
if (!isAppleA && isAppleB) return 1;
return b.size - a.size;
});
const timeoutMs = 2000;
for (const icon of icons) {
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
const resp = await fetch(icon.href, {
signal: controller.signal,
redirect: 'follow'
// If no icons found, add fallback favicon.ico
if (icons.length === 0) {
icons.push({
size: 16,
rel: 'icon',
href: '/favicon.ico',
hasSizes: false
});
}
clearTimeout(timeout);
// sort preference: highest resolution first, then apple-touch-icon, then regular icons
icons.sort((a, b) => {
// First priority: actual size (highest first)
if (a.size !== b.size) {
return b.size - a.size;
}
// Second priority: apple-touch-icon over regular icon
const isAppleA = /apple-touch-icon/.test(a.rel);
const isAppleB = /apple-touch-icon/.test(b.rel);
if (isAppleA && !isAppleB) return -1;
if (!isAppleA && isAppleB) return 1;
// Third priority: icons with no size attribute (fallback icons) last
const hasNoSizeA = !a.hasSizes;
const hasNoSizeB = !b.hasSizes;
if (hasNoSizeA && !hasNoSizeB) return 1;
if (!hasNoSizeA && hasNoSizeB) return -1;
return 0;
});
if (!resp.ok) {
const timeoutMs = 2000;
for (const icon of icons) {
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
const resp = await fetch(icon.href, {
signal: controller.signal,
redirect: 'follow'
});
clearTimeout(timeout);
if (!resp.ok) {
continue;
}
const blob = await resp.blob();
// Convert blob to base64
const reader = new FileReader();
return await new Promise(resolve => {
reader.onloadend = () => {
resolve({
url: icon.href,
base64: reader.result.split(",")[1]
});
};
reader.readAsDataURL(blob);
});
} catch (e) {
continue;
}
const blob = await resp.blob();
// Convert blob to base64
const reader = new FileReader();
return await new Promise(resolve => {
reader.onloadend = () => {
resolve({
url: icon.href,
base64: reader.result.split(",")[1]
});
};
reader.readAsDataURL(blob);
});
} catch (e) {
continue;
}
}
// nothing found
return null;
// nothing found
return null;
};
// Auto-execute and return result for page.evaluate()
return await window.getFaviconAsBlob();
})();

Wyświetl plik

@ -4,9 +4,10 @@ import time
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
class fetcher(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
fetcher_description = f"WebDriver Chrome/Javascript via \"{os.getenv('WEBDRIVER_URL', '')}\""
else:
fetcher_description = "WebDriver Chrome/Javascript"
@ -25,7 +26,6 @@ class fetcher(Fetcher):
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
##### PROXY SETUP #####
proxy_sources = [
@ -38,7 +38,7 @@ class fetcher(Fetcher):
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
@ -46,20 +46,21 @@ class fetcher(Fetcher):
continue
self.proxy_url = k.strip()
async def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
fetch_favicon=True,
current_include_filters=None,
empty_pages_are_a_change=False,
ignore_status_codes=False,
is_binary=False,
request_body=None,
request_headers=None,
request_method=None,
timeout=None,
url=None,
):
import asyncio
# Wrap the entire selenium operation in a thread executor
def _run_sync():
from selenium.webdriver.chrome.options import Options as ChromeOptions
@ -140,4 +141,3 @@ class fetcher(Fetcher):
# Run the selenium operations in a thread pool to avoid blocking the event loop
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, _run_sync)

Wyświetl plik

@ -14,6 +14,8 @@ from ..html_tools import TRANSLATE_WHITESPACE_TABLE
# Allowable protocols, protects against javascript: etc
# file:// is further checked by ALLOW_FILE_URI
SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
FAVICON_RESAVE_THRESHOLD_SECONDS=86400
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
@ -420,6 +422,28 @@ class model(watch_base):
# False is not an option for AppRise, must be type None
return None
def favicon_is_expired(self):
favicon_fname = self.get_favicon_filename()
import glob
import time
if not favicon_fname:
return True
try:
fname = next(iter(glob.glob(os.path.join(self.watch_data_dir, "favicon.*"))), None)
logger.trace(f"Favicon file maybe found at {fname}")
if os.path.isfile(fname):
file_age = int(time.time() - os.path.getmtime(fname))
logger.trace(f"Favicon file age is {file_age}s")
if file_age < FAVICON_RESAVE_THRESHOLD_SECONDS:
return False
except Exception as e:
logger.critical(f"Exception checking Favicon age {str(e)}")
return True
# Also in the case that the file didnt exist
return True
def bump_favicon(self, url, favicon_base_64: str) -> None:
from urllib.parse import urlparse
import base64

Wyświetl plik

@ -146,18 +146,19 @@ class difference_detection_processor():
# And here we go! call the right browser with browser-specific settings
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
# All fetchers are now async
await self.fetcher.run(url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=self.watch.get('include_filters'),
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
)
await self.fetcher.run(
current_include_filters=self.watch.get('include_filters'),
empty_pages_are_a_change=empty_pages_are_a_change,
fetch_favicon=self.watch.favicon_is_expired(),
ignore_status_codes=ignore_status_codes,
is_binary=is_binary,
request_body=request_body,
request_headers=request_headers,
request_method=request_method,
timeout=timeout,
url=url,
)
#@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit(watch=self.watch)