import json
import os
from urllib.parse import urlparse

from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable


def capture_full_page(page):
    """Capture a full-page JPEG screenshot of `page`, scrolling through it in
    viewport-sized chunks and stitching them together when the page is taller
    than a single capture allows. Returns the raw JPEG bytes."""
    import os
    import time
    from multiprocessing import Process, Pipe

    start = time.time()

    page_height = page.evaluate("document.documentElement.scrollHeight")
    page_width = page.evaluate("document.documentElement.scrollWidth")
    original_viewport = page.viewport_size

    logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")

    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Size that won't cause the GPU to overflow
    screenshot_chunks = []
    y = 0
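
    # For illustration: with a hypothetical page_height of 20000px and a step_size
    # of 8000px, chunks are captured at y = 0, 8000 and 16000, then stitched
    # together below (subject to the SCREENSHOT_MAX_TOTAL_HEIGHT cap)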

    if page_height > page.viewport_size['height']:
        if page_height < step_size:
            step_size = page_height  # In case the page is bigger than the default viewport but smaller than the proposed step size
        logger.debug(f"Setting bigger viewport to step through large page W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
        # Set the viewport to a larger size to capture more content at once
        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

    # Capture screenshots in chunks up to the max total height
    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
        page.request_gc()
        page.evaluate(f"window.scrollTo(0, {y})")
        page.request_gc()
        screenshot_chunks.append(page.screenshot(
            type="jpeg",
            full_page=False,
            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
        ))
        y += step_size
        page.request_gc()

    # Restore the original viewport size
    page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})

    # If we have multiple chunks, stitch them together
    if len(screenshot_chunks) > 1:
        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
        # Stitch in a separate process so the memory used by the (potentially very
        # large) image buffers is returned to the OS when the worker exits
        parent_conn, child_conn = Pipe()
        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
        p.start()
        screenshot = parent_conn.recv_bytes()
        p.join()
        logger.debug(
            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")

        # Explicit cleanup
        del screenshot_chunks
        del p
        del parent_conn, child_conn
        screenshot_chunks = None

        return screenshot

    logger.debug(
        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Single chunk taken in {time.time() - start:.2f}s")

    return screenshot_chunks[0]
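
# A minimal usage sketch for capture_full_page() (an illustration only - it
# assumes a recent Playwright build where Page.request_gc() is available, and
# the URL/output path are placeholders):
#
#   from playwright.sync_api import sync_playwright
#
#   with sync_playwright() as p:
#       browser = p.chromium.launch()
#       page = browser.new_page()
#       page.goto("https://example.com")
#       jpeg_bytes = capture_full_page(page)  # raw JPEG bytes
#       with open("full_page.jpg", "wb") as f:
#           f.write(jpeg_bytes)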


class fetcher(Fetcher):
    fetcher_description = "Playwright {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    # Configs for proxy setup
    # In the ENV vars these are prefixed with "playwright_proxy_", for example "playwright_proxy_server"
    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']

    proxy = None
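
    # Example environment (hypothetical values) for an authenticated proxy:
    #   playwright_proxy_server=http://proxy.example.com:3128
    #   playwright_proxy_username=someuser
    #   playwright_proxy_password=somepass
    #   playwright_proxy_bypass=localhost,127.0.0.1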

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url
        else:
            # Fall back to fetching from the system
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')

        # If any proxy settings are enabled, then we should set up the proxy object
        proxy_args = {}
        for k in self.playwright_proxy_settings_mappings:
            v = os.getenv('playwright_proxy_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        if proxy_args:
            self.proxy = proxy_args

        # Allow per-watch proxy selection override
        if proxy_override:
            self.proxy = {'server': proxy_override}

        if self.proxy:
            # Playwright needs separate username and password values
            parsed = urlparse(self.proxy.get('server'))
            if parsed.username:
                self.proxy['username'] = parsed.username
                self.proxy['password'] = parsed.password
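
    # For example, a hypothetical proxy_override of 'http://user:pass@proxy.example.com:3128'
    # gains separate proxy['username'] / proxy['password'] keys above, since Playwright
    # expects the credentials split out rather than embedded in the server URL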

    def screenshot_step(self, step_n=''):
        super().screenshot_step(step_n=step_n)
        screenshot = capture_full_page(page=self.page)

        if self.browser_steps_screenshot_path is not None:
            destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
            logger.debug(f"Saving step screenshot to {destination}")
            with open(destination, 'wb') as f:
                f.write(screenshot)

    def save_step_html(self, step_n):
        super().save_step_html(step_n=step_n)
        content = self.page.content()
        destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
        logger.debug(f"Saving step HTML to {destination}")
        with open(destination, 'w') as f:
            f.write(content)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False,
            empty_pages_are_a_change=False):

        from playwright.sync_api import sync_playwright
        import playwright._impl._errors
        import time

        self.delete_browser_steps_screenshots()
        response = None

        with sync_playwright() as p:
            browser_type = getattr(p, self.browser_type)

            # Seemed to cause a connection Exception even tho I can see it connect
            # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
            # 60,000 connection timeout only
            browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)

            # SOCKS5 with authentication is not supported (yet)
            # https://github.com/microsoft/playwright/issues/10567

            # Set the user agent to prevent Cloudflare from blocking the browser
            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
                accept_downloads=False,  # Should never be needed
                bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
                extra_http_headers=request_headers,
                ignore_https_errors=True,
                proxy=self.proxy,
                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),  # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
                user_agent=manage_user_agent(headers=request_headers),
            )

            self.page = context.new_page()

            # Listen for all console events and handle errors
            self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

            # Re-use as much code from browser steps as possible so it's the same
            from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface

            browsersteps_interface = steppable_browser_interface(start_url=url)
            browsersteps_interface.page = self.page

            response = browsersteps_interface.action_goto_url(value=url)

            if response is None:
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Response object from the browser communication was none")
                raise EmptyReply(url=url, status_code=None)

            # Only read the headers once we know the response actually exists
            self.headers = response.all_headers()

            try:
                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
            except playwright._impl._errors.TimeoutError as e:
                context.close()
                browser.close()
                # This can be ok, we will try to grab what we could retrieve
                pass
            except Exception as e:
                logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}")
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
            self.page.wait_for_timeout(extra_wait * 1000)
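            # For example: the default WEBDRIVER_DELAY_BEFORE_CONTENT_READY of 5 plus a
            # hypothetical render_extract_delay of 2 gives extra_wait = 7, i.e. a 7000ms pause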

            try:
                self.status_code = response.status
            except Exception as e:
                # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
                logger.critical("Response from the browser/Playwright did not have a status_code! Response follows.")
                logger.critical(response)
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            if self.status_code != 200 and not ignore_status_codes:
                screenshot = capture_full_page(self.page)
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                context.close()
                browser.close()
                raise EmptyReply(url=url, status_code=response.status)

            # Run Browser Steps here
            if self.browser_steps_get_valid_steps():
                self.iterate_browser_steps(start_url=url)

            self.page.wait_for_timeout(extra_wait * 1000)

            now = time.time()

            # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
            if current_include_filters is not None:
                self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
            else:
                self.page.evaluate("var include_filters=''")

            # request_gc before and after evaluate to free up memory
            # @todo browsersteps etc
            self.page.request_gc()

            MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
            self.xpath_data = self.page.evaluate(XPATH_ELEMENT_JS, {
                "visualselector_xpath_selectors": visualselector_xpath_selectors,
                "max_height": MAX_TOTAL_HEIGHT
            })
            self.page.request_gc()
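            # xpath_data now holds the geometry (position/size) of the elements matched
            # by visualselector_xpath_selectors, capped at max_height - this is what the
            # visual selector overlay is built from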

            self.instock_data = self.page.evaluate(INSTOCK_DATA_JS)
            self.page.request_gc()

            self.content = self.page.content()
            self.page.request_gc()

            logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")

            # Bug 3 in Playwright screenshot handling
            # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
            # JPEG is better here because the screenshots can be very, very large

            # Screenshots also travel via the ws:// (websocket), meaning that the binary data is base64 encoded,
            # which will significantly increase the IO size between the server and client; it's recommended to use the lowest
            # acceptable screenshot quality here
            try:
                # The actual screenshot - this is always base64 encoded and needs decoding! Horrible! Huge CPU usage
                self.screenshot = capture_full_page(page=self.page)
            except Exception as e:
                # It's likely the screenshot was too long/big and something crashed
                raise ScreenshotUnavailable(url=url, status_code=self.status_code)
            finally:
                # Request garbage collection one more time before closing
                try:
                    self.page.request_gc()
                except:
                    pass

                # Clean up resources properly
                try:
                    self.page.close()
                except:
                    pass
                self.page = None

                try:
                    context.close()
                except:
                    pass
                context = None

                try:
                    browser.close()
                except:
                    pass
                browser = None