changedetection.io/changedetectionio/async_update_worker.py

299 wiersze
14 KiB
Python
Czysty Zwykły widok Historia

2025-05-30 13:49:41 +00:00
from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
from changedetectionio.flask_app import watch_check_update
import asyncio
import importlib
import os
import time
from loguru import logger
# Async version of update_worker
# Processes jobs from AsyncSignalPriorityQueue instead of threaded queue
async def async_update_worker(worker_id, q, notification_q, app, datastore):
"""
Async worker function that processes watch check jobs from the queue.
Args:
worker_id: Unique identifier for this worker
q: AsyncSignalPriorityQueue containing jobs to process
notification_q: Standard queue for notifications
app: Flask application instance
datastore: Application datastore
"""
logger.info(f"Starting async worker {worker_id}")
while not app.config.exit.is_set():
update_handler = None
watch = None
current_uuid = None
try:
# Use asyncio wait_for to make queue.get() cancellable
queued_item_data = await asyncio.wait_for(q.get(), timeout=1.0)
except asyncio.TimeoutError:
# No jobs available, continue loop
continue
except Exception as e:
logger.error(f"Worker {worker_id} error getting queue item: {e}")
await asyncio.sleep(0.1)
continue
uuid = None
try:
uuid = queued_item_data.item.get('uuid')
fetch_start_time = round(time.time())
current_uuid = uuid
# Mark this UUID as being processed
from changedetectionio import worker_handler
worker_handler.set_uuid_processing(uuid, processing=True)
if uuid in list(datastore.data['watching'].keys()) and datastore.data['watching'][uuid].get('url'):
changed_detected = False
contents = b''
process_changedetection_results = True
update_obj = {}
# Clear last errors
datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
datastore.data['watching'][uuid]['last_checked'] = fetch_start_time
watch = datastore.data['watching'].get(uuid)
logger.info(f"Worker {worker_id} processing watch UUID {uuid} Priority {queued_item_data.priority} URL {watch['url']}")
try:
watch_check_update.send(watch_uuid=uuid)
# Processor is what we are using for detecting the "Change"
processor = watch.get('processor', 'text_json_diff')
# Init a new 'difference_detection_processor'
processor_module_name = f"changedetectionio.processors.{processor}.processor"
try:
processor_module = importlib.import_module(processor_module_name)
except ModuleNotFoundError as e:
print(f"Processor module '{processor}' not found.")
raise e
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid)
# All fetchers are now async, so call directly
await update_handler.call_browser()
# Run change detection (this is synchronous)
changed_detected, update_obj, contents = update_handler.run_changedetection(watch=watch)
except PermissionError as e:
logger.critical(f"File permission error updating file, watch: {uuid}")
logger.critical(str(e))
process_changedetection_results = False
except ProcessorException as e:
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data)
datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False
except content_fetchers_exceptions.ReplyWithContentButNoText as e:
extra_help = ""
if e.has_filters:
has_img = html_tools.include_filters(include_filters='img',
html_content=e.html_content)
if has_img:
extra_help = ", it's possible that the filters you have give an empty result or contain only an image."
else:
extra_help = ", it's possible that the filters were found, but contained no usable text."
datastore.update_watch(uuid=uuid, update_obj={
'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
})
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data)
process_changedetection_results = False
except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
err_text = "Error - 404 (Page not found) received"
elif e.status_code == 407:
err_text = "Error - 407 (Proxy authentication required) received, did you need a username and password for the proxy?"
elif e.status_code == 500:
err_text = "Error - 500 (Internal server error) received from the web site"
else:
extra = ' (Access denied or blocked)' if str(e.status_code).startswith('4') else ''
err_text = f"Error - Request returned a HTTP error code {e.status_code}{extra}"
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data, as_error=True)
if e.page_text:
watch.save_error_text(contents=e.page_text)
datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False
# [Include all other exception handlers from original worker...]
# (Abbreviated for brevity - same exception handling logic applies)
except Exception as e:
logger.error(f"Worker {worker_id} exception processing watch UUID: {uuid}")
logger.error(str(e))
datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
process_changedetection_results = False
else:
if not datastore.data['watching'].get(uuid):
continue
update_obj['content-type'] = update_handler.fetcher.get_all_headers().get('content-type', '').lower()
if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
update_obj['last_error'] = False
cleanup_error_artifacts(uuid, datastore)
if not datastore.data['watching'].get(uuid):
continue
if process_changedetection_results:
# Extract title if needed
if datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
if not watch['title'] or not len(watch['title']):
try:
update_obj['title'] = html_tools.extract_element(find='title', html_content=update_handler.fetcher.content)
logger.info(f"UUID: {uuid} Extract <title> updated title to '{update_obj['title']}")
except Exception as e:
logger.warning(f"UUID: {uuid} Extract <title> as watch title was enabled, but couldn't find a <title>.")
try:
datastore.update_watch(uuid=uuid, update_obj=update_obj)
if changed_detected or not watch.history_n:
if update_handler.screenshot:
watch.save_screenshot(screenshot=update_handler.screenshot)
if update_handler.xpath_data:
watch.save_xpath_data(data=update_handler.xpath_data)
# Ensure unique timestamp for history
if watch.newest_history_key and int(fetch_start_time) == int(watch.newest_history_key):
logger.warning(f"Timestamp {fetch_start_time} already exists, waiting 1 seconds")
fetch_start_time += 1
await asyncio.sleep(1)
watch.save_history_text(contents=contents,
timestamp=int(fetch_start_time),
snapshot_id=update_obj.get('previous_md5', 'none'))
empty_pages_are_a_change = datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if update_handler.fetcher.content or (not update_handler.fetcher.content and empty_pages_are_a_change):
watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=int(fetch_start_time))
# Send notifications on second+ check
if watch.history_n >= 2:
logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
if not watch.get('notification_muted'):
await send_content_changed_notification(uuid, notification_q, datastore)
except Exception as e:
logger.critical(f"Worker {worker_id} exception in process_changedetection_results")
logger.critical(str(e))
datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Always record attempt count
count = watch.get('check_count', 0) + 1
# Record server header
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
except Exception as e:
pass
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
current_uuid = None
# Mark UUID as no longer being processed
worker_handler.set_uuid_processing(uuid, processing=False)
# Send completion signal
if watch:
logger.info(f"Worker {worker_id} sending completion signal for UUID {watch['uuid']}")
watch_check_update.send(watch_uuid=watch['uuid'])
update_handler = None
logger.debug(f"Worker {worker_id} completed watch {uuid} in {time.time()-fetch_start_time:.2f}s")
# Yield control to other coroutines
await asyncio.sleep(0.01)
except Exception as e:
2025-05-30 14:14:31 +00:00
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.error(f"Worker {worker_id} traceback:", exc_info=True)
2025-05-30 13:49:41 +00:00
# Make sure to mark UUID as completed even on error
if uuid:
2025-05-30 14:14:31 +00:00
try:
worker_handler.set_uuid_processing(uuid, processing=False)
# Also update the watch with error information
if datastore and uuid in datastore.data['watching']:
datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Worker error: {str(e)}"})
except Exception as cleanup_error:
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")
2025-05-30 13:49:41 +00:00
current_uuid = None
2025-05-30 14:14:31 +00:00
# Brief pause before continuing to avoid tight error loops
await asyncio.sleep(1.0)
2025-05-30 13:49:41 +00:00
# Check if we should exit
if app.config.exit.is_set():
break
logger.info(f"Worker {worker_id} shutting down")
def cleanup_error_artifacts(uuid, datastore):
"""Helper function to clean up error artifacts"""
cleanup_files = ["last-error-screenshot.png", "last-error.txt"]
for f in cleanup_files:
full_path = os.path.join(datastore.datastore_path, uuid, f)
if os.path.isfile(full_path):
os.unlink(full_path)
async def send_content_changed_notification(watch_uuid, notification_q, datastore):
"""Helper function to queue notifications (kept sync for now)"""
# Note: This uses the original sync notification logic since notifications
# are handled by a separate thread. Could be made async later if needed.
try:
# Import here to avoid circular imports
from changedetectionio.update_worker import update_worker
# Create temporary worker instance just for notification methods
temp_worker = update_worker(None, notification_q, None, datastore)
temp_worker.datastore = datastore
temp_worker.notification_q = notification_q
temp_worker.send_content_changed_notification(watch_uuid)
except Exception as e:
logger.error(f"Error sending notification for {watch_uuid}: {e}")