BrowserSteps - Speed up scraping, refactor screenshot handling for very long pages (#2999)

pull/3012/head
dgtlmoon 2025-02-27 16:52:38 +01:00 zatwierdzone przez GitHub
rodzic 4bf560256b
commit 849c5b2293
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
6 zmienionych plików z 179 dodań i 69 usunięć

Wyświetl plik

@ -45,8 +45,12 @@ jobs:
- name: Test that the basic pip built package runs without error - name: Test that the basic pip built package runs without error
run: | run: |
set -ex set -ex
pip3 install dist/changedetection.io*.whl ls -alR
# Find and install the first .whl file
find dist -type f -name "*.whl" -exec pip3 install {} \; -quit
changedetection.io -d /tmp -p 10000 & changedetection.io -d /tmp -p 10000 &
sleep 3 sleep 3
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null

Wyświetl plik

@ -22,7 +22,10 @@ from loguru import logger
browsersteps_sessions = {} browsersteps_sessions = {}
io_interface_context = None io_interface_context = None
import json
import base64
import hashlib
from flask import Response
def construct_blueprint(datastore: ChangeDetectionStore): def construct_blueprint(datastore: ChangeDetectionStore):
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates") browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
@ -160,14 +163,13 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if not browsersteps_sessions.get(browsersteps_session_id): if not browsersteps_sessions.get(browsersteps_session_id):
return make_response('No session exists under that ID', 500) return make_response('No session exists under that ID', 500)
is_last_step = False
# Actions - step/apply/etc, do the thing and return state # Actions - step/apply/etc, do the thing and return state
if request.method == 'POST': if request.method == 'POST':
# @todo - should always be an existing session # @todo - should always be an existing session
step_operation = request.form.get('operation') step_operation = request.form.get('operation')
step_selector = request.form.get('selector') step_selector = request.form.get('selector')
step_optional_value = request.form.get('optional_value') step_optional_value = request.form.get('optional_value')
step_n = int(request.form.get('step_n'))
is_last_step = strtobool(request.form.get('is_last_step')) is_last_step = strtobool(request.form.get('is_last_step'))
# @todo try.. accept.. nice errors not popups.. # @todo try.. accept.. nice errors not popups..
@ -182,16 +184,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Try to find something of value to give back to the user # Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401) return make_response(str(e).splitlines()[0], 401)
# Get visual selector ready/update its data (also use the current filter info from the page?)
# When the last 'apply' button was pressed
# @todo this adds overhead because the xpath selection is happening twice
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
if is_last_step and u:
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
watch = datastore.data['watching'].get(uuid)
if watch:
watch.save_screenshot(screenshot=screenshot)
watch.save_xpath_data(data=xpath_data)
# if not this_session.page: # if not this_session.page:
# cleanup_playwright_session() # cleanup_playwright_session()
@ -199,31 +191,35 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Screenshots and other info only needed on requesting a step (POST) # Screenshots and other info only needed on requesting a step (POST)
try: try:
state = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state() (screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
if is_last_step:
watch = datastore.data['watching'].get(uuid)
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
if watch and u:
watch.save_screenshot(screenshot=screenshot)
watch.save_xpath_data(data=xpath_data)
except playwright._impl._api_types.Error as e: except playwright._impl._api_types.Error as e:
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401) return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
except Exception as e:
return make_response("Error fetching screenshot and element data - " + str(e), 401)
# Use send_file() which is way faster than read/write loop on bytes # SEND THIS BACK TO THE BROWSER
import json
from tempfile import mkstemp
from flask import send_file
tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")
output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format( output = {
base64.b64encode(state[0]).decode('ascii')), "screenshot": f"data:image/jpeg;base64,{base64.b64encode(screenshot).decode('ascii')}",
'xpath_data': state[1], "xpath_data": xpath_data,
'session_age_start': browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start, "session_age_start": browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
'browser_time_remaining': round(remaining) "browser_time_remaining": round(remaining)
}) }
json_data = json.dumps(output)
with os.fdopen(tmp_fd, 'w') as f: # Generate an ETag (hash of the response body)
f.write(output) etag_hash = hashlib.md5(json_data.encode('utf-8')).hexdigest()
response = make_response(send_file(path_or_file=tmp_file, # Create the response with ETag
mimetype='application/json; charset=UTF-8', response = Response(json_data, mimetype="application/json; charset=UTF-8")
etag=True)) response.set_etag(etag_hash)
# No longer needed
os.unlink(tmp_file)
return response return response

Wyświetl plik

@ -1,14 +1,15 @@
#!/usr/bin/env python3
import os import os
import time import time
import re import re
from random import randint from random import randint
from loguru import logger from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.base import manage_user_agent from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.safe_jinja import render as jinja_render from changedetectionio.safe_jinja import render as jinja_render
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end # Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
# 0- off, 1- on # 0- off, 1- on
browser_step_ui_config = {'Choose one': '0 0', browser_step_ui_config = {'Choose one': '0 0',
@ -279,6 +280,7 @@ class browsersteps_live_ui(steppable_browser_interface):
logger.debug(f"Time to browser setup {time.time()-now:.2f}s") logger.debug(f"Time to browser setup {time.time()-now:.2f}s")
self.page.wait_for_timeout(1 * 1000) self.page.wait_for_timeout(1 * 1000)
def mark_as_closed(self): def mark_as_closed(self):
logger.debug("Page closed, cleaning up..") logger.debug("Page closed, cleaning up..")
@ -296,39 +298,30 @@ class browsersteps_live_ui(steppable_browser_interface):
now = time.time() now = time.time()
self.page.wait_for_timeout(1 * 1000) self.page.wait_for_timeout(1 * 1000)
# The actual screenshot
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
full_height = self.page.evaluate("document.documentElement.scrollHeight")
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
screenshot = capture_stitched_together_full_page(self.page)
else:
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")
now = time.time()
self.page.evaluate("var include_filters=''") self.page.evaluate("var include_filters=''")
# Go find the interactive elements # Go find the interactive elements
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers? # @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span' elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements) xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
# So the JS will find the smallest one first # So the JS will find the smallest one first
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True) xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
logger.debug(f"Time to complete get_current_state of browser {time.time()-now:.2f}s") logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s")
# except
# playwright._impl._api_types.Error: Browser closed. # playwright._impl._api_types.Error: Browser closed.
# @todo show some countdown timer? # @todo show some countdown timer?
return (screenshot, xpath_data) return (screenshot, xpath_data)
def request_visualselector_data(self):
    """
    Collect fresh VisualSelector data (screenshot + element data) from the live page.

    Resets the page-side ``include_filters`` variable to an empty string, runs the
    packaged ``xpath_element_scraper.js`` with its ``%ELEMENTS%`` placeholder replaced
    by ``visualselector_xpath_selectors``, then takes a full-page JPEG screenshot whose
    quality comes from the ``SCREENSHOT_QUALITY`` environment variable (default 72).

    @todo refactor and remove duplicate code, add include_filters
    :return: tuple of (screenshot, xpath_data)
    """
    import importlib.resources
    from changedetectionio.content_fetchers import visualselector_xpath_selectors

    # No include filters are applied here, the scraper JS reads this page-level variable
    self.page.evaluate("var include_filters=''")

    # Load the scraper script shipped with the package and fill in the element selector list
    scraper_resource = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js')
    scraper_js = scraper_resource.read_text().replace('%ELEMENTS%', visualselector_xpath_selectors)
    xpath_data = self.page.evaluate("async () => {" + scraper_js + "}")

    # Screenshot is taken after the element scrape, same order as before
    jpeg_quality = int(os.getenv("SCREENSHOT_QUALITY", 72))
    screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=jpeg_quality)

    return (screenshot, xpath_data)

Wyświetl plik

@ -0,0 +1,104 @@
# Pages with a vertical height longer than this will use the 'stitch together' method.
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
# The size at which we will switch to stitching method
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000
from loguru import logger
def capture_stitched_together_full_page(page):
    """
    Screenshot a very tall page in viewport-sized chunks and stitch them into one JPEG.

    The viewport is resized per chunk, the window is scrolled to the chunk offset and a
    (non full-page) screenshot is taken. At most ``SCREENSHOT_SIZE_STITCH_THRESHOLD*4``
    pixels of height are captured; when the page is taller than that, a red warning
    banner is drawn across the top of the stitched image. The original viewport size is
    always restored, even when capturing fails.

    JPEG quality for the chunks and the final image is read from the
    ``SCREENSHOT_QUALITY`` environment variable (default 30).

    :param page: browser page object providing ``viewport_size``, ``evaluate()``,
                 ``set_viewport_size()`` and ``screenshot()`` (presumably a Playwright
                 sync ``Page`` - confirm against callers)
    :return: the stitched screenshot as JPEG encoded bytes
    """
    import io
    import os
    import time
    from PIL import Image, ImageDraw, ImageFont

    MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4  # Maximum total height for the final image (When in stitch mode)
    MAX_CHUNK_HEIGHT = 4000  # Height per screenshot chunk
    WARNING_TEXT_HEIGHT = 20  # Height of the warning text overlay

    # Read once so every chunk and the final image are encoded with the same quality
    jpeg_quality = int(os.getenv("SCREENSHOT_QUALITY", 30))

    # Save the original viewport size
    original_viewport = page.viewport_size
    now = time.time()

    try:
        viewport = page.viewport_size
        page_height = page.evaluate("document.documentElement.scrollHeight")

        # Limit the total capture height
        capture_height = min(page_height, MAX_TOTAL_HEIGHT)

        images = []
        total_captured_height = 0

        for offset in range(0, capture_height, MAX_CHUNK_HEIGHT):
            # Size the chunk to what is actually left to capture. Using a full-size
            # chunk for the tail would ask the browser to scroll past the end of the
            # page, so the last chunk would repeat content already captured and the
            # stitched image would be taller than the page.
            chunk_height = min(MAX_CHUNK_HEIGHT, capture_height - offset)

            # Adjust viewport size for this chunk
            page.set_viewport_size({"width": viewport["width"], "height": chunk_height})

            # Scroll to the correct position
            page.evaluate(f"window.scrollTo(0, {offset})")

            # Capture screenshot chunk
            screenshot_bytes = page.screenshot(type='jpeg', quality=jpeg_quality)
            images.append(Image.open(io.BytesIO(screenshot_bytes)))

            total_captured_height += chunk_height

        # Create the final stitched image
        stitched_image = Image.new('RGB', (viewport["width"], total_captured_height))
        y_offset = 0

        # Stitch the screenshot chunks together
        for img in images:
            stitched_image.paste(img, (0, y_offset))
            y_offset += img.height

        logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s")

        # Overlay warning text if the screenshot was trimmed
        if page_height > MAX_TOTAL_HEIGHT:
            draw = ImageDraw.Draw(stitched_image)
            warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"

            # Load font (default system font if Arial is unavailable)
            try:
                font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT)  # Arial (Windows/Mac)
            except IOError:
                font = ImageFont.load_default()  # Default font if Arial not found

            # Get text bounding box (correct method for newer Pillow versions)
            text_bbox = draw.textbbox((0, 0), warning_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]  # Calculate text width
            text_height = text_bbox[3] - text_bbox[1]  # Calculate text height

            # Define background rectangle (top of the image)
            draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white")

            # Center text horizontally within the warning area
            text_x = (viewport["width"] - text_width) // 2
            text_y = (WARNING_TEXT_HEIGHT - text_height) // 2

            # Draw the warning text in red
            draw.text((text_x, text_y), warning_text, fill="red", font=font)

        # Encode the stitched image to JPEG bytes
        output = io.BytesIO()
        stitched_image.save(output, format="JPEG", quality=jpeg_quality)
        screenshot = output.getvalue()

    finally:
        # Restore the original viewport size
        page.set_viewport_size(original_viewport)

    return screenshot

Wyświetl plik

@ -4,6 +4,7 @@ from urllib.parse import urlparse
from loguru import logger from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
@ -89,6 +90,7 @@ class fetcher(Fetcher):
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
import playwright._impl._errors import playwright._impl._errors
from changedetectionio.content_fetchers import visualselector_xpath_selectors from changedetectionio.content_fetchers import visualselector_xpath_selectors
import time
self.delete_browser_steps_screenshots() self.delete_browser_steps_screenshots()
response = None response = None
@ -179,6 +181,7 @@ class fetcher(Fetcher):
self.page.wait_for_timeout(extra_wait * 1000) self.page.wait_for_timeout(extra_wait * 1000)
now = time.time()
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc) # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
if current_include_filters is not None: if current_include_filters is not None:
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
@ -190,6 +193,8 @@ class fetcher(Fetcher):
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
self.content = self.page.content() self.content = self.page.content()
logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s")
# Bug 3 in Playwright screenshot handling # Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large # JPEG is better here because the screenshots can be very very large
@ -199,10 +204,15 @@ class fetcher(Fetcher):
# acceptable screenshot quality here # acceptable screenshot quality here
try: try:
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
self.screenshot = self.page.screenshot(type='jpeg', full_height = self.page.evaluate("document.documentElement.scrollHeight")
full_page=True,
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)), if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
) logger.warning(
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
self.screenshot = capture_stitched_together_full_page(self.page)
else:
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
except Exception as e: except Exception as e:
# It's likely the screenshot was too long/big and something crashed # It's likely the screenshot was too long/big and something crashed
raise ScreenshotUnavailable(url=url, status_code=self.status_code) raise ScreenshotUnavailable(url=url, status_code=self.status_code)

Wyświetl plik

@ -41,7 +41,7 @@ const findUpTag = (el) => {
// Strategy 1: If it's an input, with name, and there's only one, prefer that // Strategy 1: If it's an input, with name, and there's only one, prefer that
if (el.name !== undefined && el.name.length) { if (el.name !== undefined && el.name.length) {
var proposed = el.tagName + "[name=" + el.name + "]"; var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
var proposed_element = window.document.querySelectorAll(proposed); var proposed_element = window.document.querySelectorAll(proposed);
if (proposed_element.length) { if (proposed_element.length) {
if (proposed_element.length === 1) { if (proposed_element.length === 1) {
@ -102,13 +102,15 @@ function collectVisibleElements(parent, visibleElements) {
const children = parent.children; const children = parent.children;
for (let i = 0; i < children.length; i++) { for (let i = 0; i < children.length; i++) {
const child = children[i]; const child = children[i];
const computedStyle = window.getComputedStyle(child);
if ( if (
child.nodeType === Node.ELEMENT_NODE && child.nodeType === Node.ELEMENT_NODE &&
window.getComputedStyle(child).display !== 'none' && computedStyle.display !== 'none' &&
window.getComputedStyle(child).visibility !== 'hidden' && computedStyle.visibility !== 'hidden' &&
child.offsetWidth >= 0 && child.offsetWidth >= 0 &&
child.offsetHeight >= 0 && child.offsetHeight >= 0 &&
window.getComputedStyle(child).contentVisibility !== 'hidden' computedStyle.contentVisibility !== 'hidden'
) { ) {
// If the child is an element and is visible, recursively collect visible elements // If the child is an element and is visible, recursively collect visible elements
collectVisibleElements(child, visibleElements); collectVisibleElements(child, visibleElements);
@ -173,6 +175,7 @@ visibleElementsArray.forEach(function (element) {
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
const computedStyle = window.getComputedStyle(element);
size_pos.push({ size_pos.push({
xpath: xpath_result, xpath: xpath_result,
@ -184,10 +187,10 @@ visibleElementsArray.forEach(function (element) {
tagName: (element.tagName) ? element.tagName.toLowerCase() : '', tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
// tagtype used by Browser Steps // tagtype used by Browser Steps
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: window.getComputedStyle(element).cursor === "pointer", isClickable: computedStyle.cursor === "pointer",
// Used by the keras trainer // Used by the keras trainer
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontSize: computedStyle.getPropertyValue('font-size'),
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), fontWeight: computedStyle.getPropertyValue('font-weight'),
hasDigitCurrency: hasDigitCurrency, hasDigitCurrency: hasDigitCurrency,
label: label, label: label,
}); });