Fetching pages - Custom browser endpoints should not have default proxy info added

2024-02-12 19:05:10 +01:00 · 2024-02-12 19:05:10 +01:00 · ccb42bcb12
commit ccb42bcb12
--- a/changedetectionio/content_fetchers/exceptions/__init__.py
+++ b/changedetectionio/content_fetchers/exceptions/__init__.py
@@ -36,6 +36,13 @@ class BrowserConnectError(Exception):
        logger.error(f"Browser connection error {msg}")
        return
 class BrowserFetchTimedOut(Exception):
    msg = ''
    def __init__(self, msg):
        self.msg = msg
        logger.error(f"Browser processing took too long - {msg}")
        return
 class BrowserStepsStepException(Exception):
    def __init__(self, step_n, original_e):
        self.step_n = step_n
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -6,7 +6,7 @@ from urllib.parse import urlparse
 from loguru import logger
 from changedetectionio.content_fetchers.base import Fetcher
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, BrowserConnectError
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
 class fetcher(Fetcher):
@@ -221,14 +221,21 @@ class fetcher(Fetcher):
    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
            current_include_filters=None, is_binary=False):
        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
        # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only
-        asyncio.run(self.main(
+        try:
-            url=url,
+            asyncio.run(asyncio.wait_for(self.main(
-            timeout=timeout,
+                url=url,
-            request_headers=request_headers,
+                timeout=timeout,
-            request_body=request_body,
+                request_headers=request_headers,
-            request_method=request_method,
+                request_body=request_body,
-            ignore_status_codes=ignore_status_codes,
+                request_method=request_method,
-            current_include_filters=current_include_filters,
+                ignore_status_codes=ignore_status_codes,
-            is_binary=is_binary
+                current_include_filters=current_include_filters,
-        ))
+                is_binary=is_binary
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
--- a/changedetectionio/processors/__init__.py
+++ b/changedetectionio/processors/__init__.py
@@ -75,8 +75,12 @@ class difference_detection_processor():
        proxy_url = None
        if preferred_proxy_id:
-            proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
+            # Custom browser endpoints should not have a proxy added
-            logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
+            if not preferred_proxy_id.startswith('ui-'):
                proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
                logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
            else:
                logger.debug(f"Skipping adding proxy data when custom Browser endpoint is specified.")
        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
        # When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -369,6 +369,12 @@ class update_worker(threading.Thread):
                                                                }
                                                    )
                        process_changedetection_results = False
                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
                        self.datastore.update_watch(uuid=uuid,
                                                    update_obj={'last_error': e.msg
                                                                }
                                                    )
                        process_changedetection_results = False
                    except content_fetchers.exceptions.BrowserStepsStepException as e:
                        if not self.datastore.data['watching'].get(uuid):