# changedetection.io/changedetectionio/content_fetchers/webdriver_selenium.py

import os
import time

from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher

class fetcher(Fetcher):
    """Fetch a page through a remote Selenium WebDriver session using Chrome options.

    The remote endpoint comes from ``custom_browser_connection_url`` when given,
    otherwise from the ``WEBDRIVER_URL`` environment variable, otherwise from a
    built-in default. A single proxy URL is picked from several possible sources
    and handed to Chrome via ``--proxy-server=``.
    """

    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    proxy = None
    # Proxy URL passed to Chrome as --proxy-server=...; stays None when no source supplies one.
    proxy_url = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the WebDriver endpoint and the proxy to use.

        Args:
            proxy_override: Proxy URL that takes precedence over every other
                proxy source when it is non-empty.
            custom_browser_connection_url: WebDriver endpoint to use instead of
                the ``WEBDRIVER_URL`` environment variable / built-in default.
        """
        super().__init__()

        if not custom_browser_connection_url:
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in quotes.
            # An empty (or quotes-only) WEBDRIVER_URL is treated as unset, matching how
            # fetcher_description above treats it, so we never try to connect to ''.
            env_url = os.getenv("WEBDRIVER_URL", "").strip('"')
            self.browser_connection_url = env_url or 'http://browser-chrome:4444/wd/hub'
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        ##### PROXY SETUP #####

        # Ordered from lowest to highest priority: the last usable entry wins.
        # system_http_proxy / system_https_proxy are not defined in this class;
        # presumably they are provided by the Fetcher base class - TODO confirm.
        proxy_sources = [
            self.system_http_proxy,
            self.system_https_proxy,
            os.getenv('webdriver_proxySocks'),
            os.getenv('webdriver_socksProxy'),
            os.getenv('webdriver_proxyHttp'),
            os.getenv('webdriver_httpProxy'),
            os.getenv('webdriver_proxyHttps'),
            os.getenv('webdriver_httpsProxy'),
            os.getenv('webdriver_sslProxy'),
            proxy_override,  # last one should override
        ]
        # The built in selenium proxy handling is super unreliable!!! so we just grab
        # whichever proxy setting we can find and throw it in --proxy-server=
        for candidate in proxy_sources:
            # Strip before testing so a whitespace-only value cannot replace a
            # valid proxy found earlier with an empty string.
            candidate = candidate.strip() if candidate else ''
            if candidate:
                self.proxy_url = candidate

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False,
            empty_pages_are_a_change=False):
        """Load ``url`` in a remote Chrome session and store the rendered result on ``self``.

        On success the following attributes are set: ``status_code`` (always 200,
        the real status is not available here), ``content`` (page source),
        ``headers`` (empty dict) and ``screenshot`` (PNG bytes).

        Environment variables read:
            CHROME_OPTIONS: one Chrome command line argument per line.
            WEBDRIVER_PAGELOAD_TIMEOUT: page load timeout in seconds (default 45).
            WEBDRIVER_DELAY_BEFORE_CONTENT_READY: wait in seconds (default 5).

        Args:
            url: Address to load.
            timeout, request_headers, request_body, request_method,
            ignore_status_codes, current_include_filters, is_binary,
            empty_pages_are_a_change: Not used by this method; presumably kept
                so the signature matches the other fetchers - TODO confirm.

        Raises:
            Exception: Whatever Selenium raises is propagated to the caller; the
                browser session is always closed first.
        """
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
        # request_body, request_method unused for now, until some magic in the future happens.

        # Read and parse the environment once, up front, so that a malformed
        # integer fails before a browser session has been opened.
        chrome_options_env = os.getenv("CHROME_OPTIONS", "")
        page_load_timeout = int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45))
        content_ready_delay = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))

        options = ChromeOptions()

        # Load Chrome options from env: one argument per non-blank line.
        for line in chrome_options_env.splitlines():
            opt = line.strip()
            if opt:
                options.add_argument(opt)

        # 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
        # 2. selenium-wire can't be used because the websocket version conflicts with pypeteer-ng
        # 3. selenium only allows ONE runner at a time by default!
        # 4. driver must use quit() or it will continue to block/hold the selenium process!!

        if self.proxy_url:
            options.add_argument(f'--proxy-server={self.proxy_url}')

        driver = None
        try:
            # Create the RemoteConnection and set timeout (e.g., 30 seconds)
            remote_connection = RemoteConnection(
                self.browser_connection_url,
            )
            remote_connection.set_timeout(30)  # seconds

            # Now create the driver with the RemoteConnection
            driver = RemoteWebDriver(
                command_executor=remote_connection,
                options=options
            )
            driver.set_page_load_timeout(page_load_timeout)

            driver.get(url)

            # Only force a window size when the user did not pick one themselves.
            if "--window-size" not in chrome_options_env:
                driver.set_window_size(1280, 1024)

            # NOTE(review): implicitly_wait() configures Selenium's element lookup
            # timeout; the actual pause before reading the page is the sleep below.
            driver.implicitly_wait(content_ready_delay)

            if self.webdriver_js_execute_code is not None:
                driver.execute_script(self.webdriver_js_execute_code)
                # Selenium doesn't automatically wait for actions as well as Playwright, so wait again
                driver.implicitly_wait(content_ready_delay)

            # @todo - how to check this? is it possible?
            self.status_code = 200
            # @todo somehow we should try to get this working for WebDriver
            # raise EmptyReply(url=url, status_code=r.status_code)

            # @todo - dom wait loaded?
            time.sleep(content_ready_delay + self.render_extract_delay)
            self.content = driver.page_source
            self.headers = {}
            self.screenshot = driver.get_screenshot_as_png()
        finally:
            # Always release the remote session - on success, on any error and on
            # interruption - otherwise the Selenium runner stays blocked (see note 4).
            if driver is not None:
                driver.quit()