changedetection.io/changedetectionio/content_fetchers/webdriver_selenium.py

135 wiersze
5.0 KiB
Python

import os
import time
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
class fetcher(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else:
fetcher_description = "WebDriver Chrome/Javascript"
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
else:
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
##### PROXY SETUP #####
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
from selenium.webdriver.chrome.options import Options as ChromeOptions
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
# Load Chrome options from env
CHROME_OPTIONS = [
line.strip()
for line in os.getenv("CHROME_OPTIONS", "").strip().splitlines()
if line.strip()
]
for opt in CHROME_OPTIONS:
options.add_argument(opt)
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
try:
driver.get(url)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
driver.quit()