Simplify it with just scraping out the favicon

pull/3196/head
dgtlmoon 2025-07-09 09:54:26 +02:00
rodzic 5fc82cf450
commit a3261eddd8
16 zmienionych plików z 174 dodań i 67 usunięć

Wyświetl plik

@ -353,6 +353,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
except Exception as e:
pass
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})

Wyświetl plik

@ -256,12 +256,6 @@ nav
{{ render_checkbox_field(form.application.form.ui.form.socket_io_enabled, class="socket_io_enabled") }}
<span class="pure-form-message-inline">Realtime UI Updates Enabled - (Restart required if this is changed)</span>
</div>
<div class="pure-control-group inline-radio">
{{ render_field(form.application.form.ui.form.thumbnail_type) }}
<span class="pure-form-message-inline">
Favicon/Thumbnail source for the watch overview listing icons.
</span>
</div>
</div>
<div class="tab-pane-inner" id="proxies">
<div id="recommended-proxy">

Wyświetl plik

@ -6,8 +6,6 @@
<script>let nowtimeserver={{ now_time_server }};</script>
<script>let ajax_toggle_url="{{ ajax_toggle_url }}";</script>
<script>let thumbnail_baseURL="{{ url_for('static_content', group='thumbnail', filename="PLACEHOLDER", _external=True) }}"</script>
<script>
// Initialize Feather icons after the page loads
document.addEventListener('DOMContentLoaded', function() {
@ -81,14 +79,13 @@ document.addEventListener('DOMContentLoaded', function() {
{%- set pagination_page = request.args.get('page', 0) -%}
{%- set cols_required = 6 -%}
{%- set any_has_restock_price_processor = datastore.any_watches_have_processor_by_name("restock_diff") -%}
{%- set thumbnail_type = datastore.data['settings']['application']['ui'].get('thumbnail_type') -%}
{%- if any_has_restock_price_processor -%}
{%- set cols_required = cols_required + 1 -%}
{%- endif -%}
<div id="watch-table-wrapper">
<table class="pure-table pure-table-striped watch-table thumbnail-type-{{ thumbnail_type }}">
<table class="pure-table pure-table-striped watch-table">
<thead>
<tr>
{%- set link_order = "desc" if sort_order == 'asc' else "asc" -%}
@ -122,7 +119,7 @@ document.addEventListener('DOMContentLoaded', function() {
'paused' if watch.paused is defined and watch.paused != False else '',
'unviewed' if watch.has_unviewed else '',
'has-restock-info' if watch.has_restock_info else 'no-restock-info',
'has-thumbnail' if watch.get_screenshot() or thumbnail_type not in ('', 'screenshot') else '',
'has-favicon' if watch.get_favicon_filename() else '',
'in-stock' if watch.has_restock_info and watch['restock']['in_stock'] else '',
'not-in-stock' if watch.has_restock_info and not watch['restock']['in_stock'] else '',
'queued' if watch.uuid in queued_uuids else '',
@ -142,17 +139,9 @@ document.addEventListener('DOMContentLoaded', function() {
<td class="title-col inline">
<div class="flex-wrapper">
{%- if thumbnail_type -%}
<div>
{%- if thumbnail_type == 'screenshot' -%}
<img style="display: none;" class="thumbnail" src="{{url_for('static_content', group='thumbnail', filename=watch.uuid)}}">
{% elif thumbnail_type == 'ddg_favicon' %}
<img style="display: none;" class="thumbnail" src="https://icons.duckduckgo.com/ip3/{{ watch.domain_only_from_link }}.ico">
{% elif thumbnail_type == 'google_favicon' %}
<img style="display: none;" class="thumbnail" src="https://t3.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://{{ watch.domain_only_from_link }}&size=64">
{%- endif -%}
<img style="display: none;" class="thumbnail" src="{{url_for('static_content', group='favicon', filename=watch.uuid)}}">
</div>
{%- endif -%}
<div>
{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
<div class="error-text" style="display:none;">{{ watch.compile_error_texts(has_proxies=datastore.proxy_list) }}</div>
@ -169,7 +158,7 @@ document.addEventListener('DOMContentLoaded', function() {
{%- endfor -%}
</div>
<div class="status-icons">
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
<a class="link-spread" href="{{url_for('ui.form_share_put_watch', uuid=watch.uuid)}}"><img src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" ></a>
{%- if watch.get_fetch_backend == "html_webdriver"
or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver' )

Wyświetl plik

@ -28,6 +28,7 @@ from changedetectionio.content_fetchers.requests import fetcher as html_requests
import importlib.resources
XPATH_ELEMENT_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
INSTOCK_DATA_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
FAVICON_FETCHER_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('favicon-fetcher.js').read_text(encoding='utf-8')
def available_fetchers():

Wyświetl plik

@ -48,6 +48,7 @@ class Fetcher():
error = None
fetcher_description = "No description"
headers = {}
favicon_blob = None
instock_data = None
instock_data_js = ""
status_code = None

Wyświetl plik

@ -5,7 +5,7 @@ from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, FAVICON_FETCHER_JS
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
@ -234,6 +234,12 @@ class fetcher(Fetcher):
await browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
await self.page.request_gc()
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page_async(self.page)
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
@ -274,6 +280,7 @@ class fetcher(Fetcher):
await self.page.request_gc()
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large

Wyświetl plik

@ -8,7 +8,7 @@ from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
SCREENSHOT_MAX_TOTAL_HEIGHT
SCREENSHOT_MAX_TOTAL_HEIGHT, FAVICON_FETCHER_JS
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
BrowserConnectError
@ -179,10 +179,8 @@ class fetcher(Fetcher):
except Exception as e:
raise BrowserConnectError(msg=f"Error connecting to the browser - Exception '{str(e)}'")
# Better is to launch chrome with the URL as arg
# non-headless - newPage() will launch an extra tab/window, .browser should already contain 1 page/tab
# headless - ask a new page
self.page = (pages := await browser.pages) and len(pages) or await browser.newPage()
# more reliable is to just request a new page
self.page = await browser.newPage()
if '--window-size' in self.browser_connection_url:
# Be sure the viewport is always the window-size, this is often not the same thing
@ -292,6 +290,11 @@ class fetcher(Fetcher):
await browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
try:
self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
except Exception as e:
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page(page=self.page)

Wyświetl plik

@ -0,0 +1,70 @@
async () => {
    // Find the page's favicon, download it from inside the page context and
    // return {url, base64}. Returns null when no icon could be downloaded.
    //
    // NOTE: this file is handed as-is to page.evaluate(), so it has to stay a
    // single function expression - keep all comments/statements inside the body.

    const links = Array.from(document.querySelectorAll('link[rel~="apple-touch-icon"], link[rel~="icon"]'));

    const icons = links
        // A <link> without a usable href resolves to "" and fetch("") would
        // download the current document instead of an image, so drop those.
        .filter(link => Boolean(link.href))
        .map(link => {
            const sizesStr = link.getAttribute('sizes');
            let size = 0;
            if (sizesStr) {
                // "32x32" and "32X32" are both valid; only the first width is used.
                const [w] = sizesStr.toLowerCase().split('x').map(Number);
                if (!isNaN(w)) size = w;
            } else {
                // No sizes attribute: assume the classic 16px favicon.
                size = 16;
            }
            return {
                size,
                rel: link.getAttribute('rel'),
                href: link.href
            };
        });

    if (icons.length === 0) return null;

    // apple-touch-icon entries first, then the largest declared size.
    icons.sort((a, b) => {
        const isAppleA = /apple-touch-icon/.test(a.rel);
        const isAppleB = /apple-touch-icon/.test(b.rel);
        if (isAppleA && !isAppleB) return -1;
        if (!isAppleA && isAppleB) return 1;
        return b.size - a.size;
    });

    // Timeout per icon in ms, covers both the request and reading the body.
    const timeoutMs = 2000;

    for (const icon of icons) {
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), timeoutMs);
        try {
            const resp = await fetch(icon.href, {
                signal: controller.signal,
                redirect: 'follow'
            });

            if (!resp.ok) {
                // skip 404, 500, etc.
                continue;
            }

            const blob = await resp.blob();

            // Convert to base64. Reject on read errors so a failed read moves on
            // to the next icon instead of leaving the promise pending forever.
            const base64 = await new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onload = () => resolve(String(reader.result).split(",")[1]);
                reader.onerror = () => reject(reader.error);
                reader.readAsDataURL(blob);
            });

            if (!base64) {
                // Empty response body, try the next candidate.
                continue;
            }

            return {
                url: icon.href,
                base64: base64
            };
        } catch (e) {
            // fetch error, timeout, abort or unreadable body
            continue;
        } finally {
            // Always release the timer, also on errors and early `continue`.
            clearTimeout(timeout);
        }
    }

    return null;
}

Wyświetl plik

@ -427,43 +427,24 @@ def changedetection_app(config=None, datastore_o=None):
except FileNotFoundError:
abort(404)
if group == 'thumbnail':
if group == 'favicon':
# Could be sensitive, follow password requirements
if datastore.data['settings']['application']['password'] and not flask_login.current_user.is_authenticated:
abort(403)
# Get the watch object
watch = datastore.data['watching'].get(filename)
if not watch:
abort(404)
t_type = datastore.data['settings']['application']['ui'].get('thumbnail_type')
if t_type == 'screenshot':
# Generate thumbnail if needed
max_age = int(request.args.get('max_age', '3200'))
thumbnail_path = watch.get_screenshot_as_thumbnail(max_age=max_age)
if not thumbnail_path:
abort(404)
try:
# Get file modification time for ETag
file_mtime = int(os.path.getmtime(thumbnail_path))
etag = f'"{file_mtime}"'
# Check if browser has valid cached version
if request.if_none_match and etag in request.if_none_match:
return "", 304 # Not Modified
# Set up response with appropriate cache headers
response = make_response(send_from_directory(os.path.dirname(thumbnail_path), os.path.basename(thumbnail_path)))
response.headers['Content-type'] = 'image/jpeg'
response.headers['ETag'] = etag
response.headers['Cache-Control'] = 'max-age=300, must-revalidate' # Cache for 5 minutes, then revalidate
return response
except FileNotFoundError:
abort(404)
favicon_filename = watch.get_favicon_filename()
if favicon_filename:
import mimetypes
mime, encoding = mimetypes.guess_type(favicon_filename)
response = make_response(send_from_directory(watch.watch_data_dir, favicon_filename))
response.headers['Content-type'] = mime
response.headers['Cache-Control'] = 'max-age=300, must-revalidate' # Cache for 5 minutes, then revalidate
return response
if group == 'visual_selector_data':
# Could be sensitive, follow password requirements

Wyświetl plik

@ -740,7 +740,6 @@ class globalSettingsRequestForm(Form):
class globalSettingsApplicationUIForm(Form):
open_diff_in_new_tab = BooleanField("Open 'History' page in a new tab", default=True, validators=[validators.Optional()])
socket_io_enabled = BooleanField('Realtime UI Updates Enabled', default=True, validators=[validators.Optional()])
thumbnail_type = RadioField(u'Icon / Thumbnail type', choices=[('screenshot', 'Screenshot based'), ('ddg_favicon', "Favicon from DuckDuckGo"), ('google_favicon', "Favicon from Google"), ('', 'None / Off')], default="screenshot")
# datastore.data['settings']['application']..
class globalSettingsApplicationForm(commonSettingsForm):

Wyświetl plik

@ -63,7 +63,6 @@ class model(dict):
'ui': {
'open_diff_in_new_tab': True,
'socket_io_enabled': True,
'thumbnail_type': 'screenshot' # False (off), screenshot, ddg_favicon, google_favicon
},
}
}

Wyświetl plik

@ -420,6 +420,63 @@ class model(watch_base):
# False is not an option for AppRise, must be type None
return None
def bump_favicon(self, url, favicon_base_64: str) -> None:
    """Decode a base64 favicon and store it as ``favicon.<extension>`` in the watch data directory.

    The extension is taken from the path part of ``url``; when no URL is given
    ``ico`` is assumed. Saving is best-effort: problems are logged, never raised.

    :param url: Address the favicon was fetched from, may be empty.
    :param favicon_base_64: Base64 encoded image data.
    :return: None
    """
    from urllib.parse import urlparse
    import base64
    import binascii

    # Nothing to store
    if not favicon_base_64:
        return None

    if url:
        try:
            filename = os.path.basename(urlparse(url).path).lower().strip()
        except ValueError:
            filename = ''
        _, dot, extension = filename.rpartition('.')
        # The URL is supplied by the fetched page, so only accept a short, plain
        # alphanumeric extension before using it as part of a filename on disk.
        if not dot or not (extension.isascii() and extension.isalnum()) or len(extension) > 8:
            logger.error(f"UUID: {self.get('uuid')} Cant work out file extension from '{url}'")
            return None
    else:
        # Assume favicon.ico
        extension = "ico"

    try:
        # validate=True makes sure the string only contains valid base64 chars
        decoded = base64.b64decode(favicon_base_64, validate=True)
    except (binascii.Error, ValueError, TypeError) as e:
        logger.warning(f"UUID: {self.get('uuid')} FavIcon save data (Base64) corrupt? {str(e)}")
        return None

    if not decoded:
        return None

    fname = os.path.join(self.watch_data_dir, f"favicon.{extension}")
    try:
        with open(fname, 'wb') as f:
            f.write(decoded)
    except OSError as e:
        logger.warning(f"UUID: {self.get('uuid')} error saving FavIcon to {fname} - {str(e)}")
        return None

    # @todo - Store some checksum and only write when its different
    # Only reached when the file was actually written
    logger.debug(f"UUID: {self.get('uuid')} updated favicon to at {fname}")
def get_favicon_filename(self) -> str | None:
"""
Find any favicon.* file in the current working directory
and return the contents of the newest one.
Returns:
bytes: Contents of the newest favicon file, or None if not found.
"""
import glob
# Search for all favicon.* files
files = glob.glob(os.path.join(self.watch_data_dir, "favicon.*"))
if not files:
return None
# Find the newest by modification time
newest_file = max(files, key=os.path.getmtime)
return os.path.basename(newest_file)
def get_screenshot_as_thumbnail(self, max_age=3200):
"""Return path to a square thumbnail of the most recent screenshot.

Wyświetl plik

@ -211,7 +211,7 @@ def handle_watch_update(socketio, **kwargs):
'event_timestamp': time.time(),
'fetch_time': watch.get('fetch_time'),
'has_error': True if error_texts else False,
'has_thumbnail': True if watch.get_screenshot_as_thumbnail() else False,
'has_favicon': True if watch.get_screenshot_as_thumbnail() else False,
'history_n': watch.history_n,
'last_changed_text': timeago.format(int(watch.last_changed), time.time()) if watch.history_n >= 2 and int(watch.last_changed) > 0 else 'Not yet',
'last_checked': watch.get('last_checked'),

Wyświetl plik

@ -126,14 +126,14 @@ $(document).ready(function () {
$($watchRow).toggleClass('queued', watch.queued);
$($watchRow).toggleClass('unviewed', watch.unviewed);
$($watchRow).toggleClass('has-error', watch.has_error);
$($watchRow).toggleClass('has-thumbnail', watch.has_thumbnail);
$($watchRow).toggleClass('has-favicon', watch.has_favicon);
$($watchRow).toggleClass('notification_muted', watch.notification_muted);
$($watchRow).toggleClass('paused', watch.paused);
$($watchRow).toggleClass('single-history', watch.history_n === 1);
$($watchRow).toggleClass('multiple-history', watch.history_n >= 2);
$('td.title-col .error-text', $watchRow).html(watch.error_text)
if (watch.has_thumbnail) {
if (watch.has_favicon) {
// Because the event could be emitted from a process that is outside the app context, url_for() might not work.
// Lets use url_for at template generation time to give us a PLACEHOLDER instead
$('img.thumbnail', $watchRow).attr('src', thumbnail_baseURL.replace('/PLACEHOLDER', `/${watch.uuid}`));

Wyświetl plik

@ -6,7 +6,7 @@
}
tr.has-thumbnail {
tr.has-favicon {
img.thumbnail {
display: inline-block !important;
}
@ -55,7 +55,7 @@
padding-right: 4px;
}
tr.has-thumbnail {
tr.has-favicon {
td.inline.title-col {
.flex-wrapper {
display: flex;
@ -66,7 +66,7 @@
}
&.thumbnail-type-screenshot {
tr.has-thumbnail {
tr.has-favicon {
td.inline.title-col {
img.thumbnail {
background-color: #fff; /* fallback bg for SVGs without bg */

Wyświetl plik

@ -689,10 +689,10 @@ ul#conditions_match_logic {
.watch-table th {
vertical-align: middle; }
.watch-table tr.has-thumbnail img.thumbnail {
.watch-table tr.has-favicon img.thumbnail {
display: inline-block !important; }
.watch-table tr.has-thumbnail.unviewed img.thumbnail {
.watch-table tr.has-favicon.unviewed img.thumbnail {
opacity: 1.0 !important; }
.watch-table .status-icons {
@ -729,12 +729,12 @@ ul#conditions_match_logic {
height: 25px;
padding-right: 4px; }
.watch-table tr.has-thumbnail td.inline.title-col .flex-wrapper {
.watch-table tr.has-favicon td.inline.title-col .flex-wrapper {
display: flex;
align-items: center;
gap: 4px; }
.watch-table.thumbnail-type-screenshot tr.has-thumbnail td.inline.title-col img.thumbnail {
.watch-table.thumbnail-type-screenshot tr.has-favicon td.inline.title-col img.thumbnail {
background-color: #fff;
/* fallback bg for SVGs without bg */
border-radius: 4px;