kopia lustrzana https://github.com/dgtlmoon/changedetection.io
135 wiersze
5.0 KiB
Python
135 wiersze
5.0 KiB
Python
import os
|
|
import time
|
|
|
|
from loguru import logger
|
|
from changedetectionio.content_fetchers.base import Fetcher
|
|
|
|
class fetcher(Fetcher):
    """Fetch a page by driving Chrome through a remote Selenium WebDriver endpoint.

    The endpoint is taken from the ``WEBDRIVER_URL`` environment variable
    (default ``http://browser-chrome:4444/wd/hub``) unless a custom connection
    URL is passed to the constructor.

    NOTE(review): ``system_http_proxy``, ``system_https_proxy``,
    ``webdriver_js_execute_code`` and ``render_extract_delay`` are read from
    ``self`` but not defined in this class - presumably they are provided by
    the ``Fetcher`` base class; confirm there.
    """

    # Human readable label, computed once when the class body is executed.
    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    proxy = None
    # Proxy address handed to Chrome via --proxy-server=, or None for no proxy.
    proxy_url = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Choose the WebDriver endpoint and the proxy to use.

        Args:
            proxy_override: Optional proxy address; when non-blank it takes
                precedence over every environment-provided proxy.
            custom_browser_connection_url: Optional WebDriver endpoint that
                replaces the ``WEBDRIVER_URL`` / default endpoint.
        """
        super().__init__()

        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        ##### PROXY SETUP #####

        proxy_sources = [
            self.system_http_proxy,
            self.system_https_proxy,
            os.getenv('webdriver_proxySocks'),
            os.getenv('webdriver_socksProxy'),
            os.getenv('webdriver_proxyHttp'),
            os.getenv('webdriver_httpProxy'),
            os.getenv('webdriver_proxyHttps'),
            os.getenv('webdriver_httpsProxy'),
            os.getenv('webdriver_sslProxy'),
            proxy_override,  # last one should override
        ]
        # The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
        for candidate in proxy_sources:
            # Strip before testing so a whitespace-only value cannot wipe out
            # a valid proxy found earlier in the list.
            cleaned = candidate.strip() if candidate else ""
            if cleaned:
                # Later sources win: keep overwriting with each usable value.
                self.proxy_url = cleaned

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False,
            empty_pages_are_a_change=False):
        """Load ``url`` in the remote browser and store the rendered result on ``self``.

        On success sets ``self.content`` (page source), ``self.screenshot``
        (PNG bytes), ``self.headers`` (always an empty dict) and
        ``self.status_code`` (always 200, the real status is not available).

        Only ``url`` is used by this implementation; the remaining parameters
        are accepted but not read here.

        Environment variables read:
            CHROME_OPTIONS: one Chrome command line argument per line.
            WEBDRIVER_PAGELOAD_TIMEOUT: page load timeout in seconds (default 45).
            WEBDRIVER_DELAY_BEFORE_CONTENT_READY: wait in seconds (default 5).

        Raises:
            Exception: anything raised while creating or driving the browser
                is propagated after the driver has been shut down.
        """
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
        # request_body, request_method unused for now, until some magic in the future happens.

        # Read the environment once, up front, so a malformed value fails
        # before a browser session has been opened.
        chrome_options_env = os.getenv("CHROME_OPTIONS", "")
        content_ready_delay = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))

        options = ChromeOptions()

        # Load Chrome options from env, one argument per non-blank line
        for raw_line in chrome_options_env.strip().splitlines():
            argument = raw_line.strip()
            if argument:
                options.add_argument(argument)

        # 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
        # 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
        # 3. selenium only allows ONE runner at a time by default!
        # 4. driver must use quit() or it will continue to block/hold the selenium process!!

        if self.proxy_url:
            options.add_argument(f'--proxy-server={self.proxy_url}')

        driver = None
        try:
            # Create the RemoteConnection and set timeout (e.g., 30 seconds)
            remote_connection = RemoteConnection(
                self.browser_connection_url,
            )
            remote_connection.set_timeout(30)  # seconds

            # Now create the driver with the RemoteConnection
            driver = RemoteWebDriver(
                command_executor=remote_connection,
                options=options
            )

            driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))

            driver.get(url)

            # Only force a window size when the user did not pick one themselves.
            if "--window-size" not in chrome_options_env:
                driver.set_window_size(1280, 1024)

            driver.implicitly_wait(content_ready_delay)

            if self.webdriver_js_execute_code is not None:
                driver.execute_script(self.webdriver_js_execute_code)
                # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
                driver.implicitly_wait(content_ready_delay)

            # @todo - how to check this? is it possible?
            self.status_code = 200
            # @todo somehow we should try to get this working for WebDriver
            # raise EmptyReply(url=url, status_code=r.status_code)

            # @todo - dom wait loaded?
            time.sleep(content_ready_delay + self.render_extract_delay)
            self.content = driver.page_source
            self.headers = {}
            self.screenshot = driver.get_screenshot_as_png()
        finally:
            # Always release the remote session - on success, on error and on
            # interruption - otherwise the selenium process stays blocked.
            if driver is not None:
                driver.quit()
|
|
|