kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Fetcher - Use bigger screenshot chunks to speed up page screenshot (#3107)
rodzic
96dc49e229
commit
45a030bac6
|
@ -7,13 +7,13 @@ import os
|
||||||
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
|
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
|
||||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
|
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
|
||||||
|
|
||||||
SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
|
SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
|
||||||
SCREENSHOT_DEFAULT_QUALITY = 40
|
SCREENSHOT_DEFAULT_QUALITY = 40
|
||||||
|
|
||||||
# Maximum total height for the final image (When in stitch mode).
|
# Maximum total height for the final image (When in stitch mode).
|
||||||
# We limit this to 16000px due to the huge amount of RAM that was being used
|
# We limit this to 20000px by default due to the huge amount of RAM that was being used
|
||||||
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
|
# Example: 20000 × 1400 × 3 = 84,000,000 bytes ≈ 80.1 MB (not including buffers in PIL etc)
|
||||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||||
|
|
||||||
# The size at which we will switch to stitching method, when below this (and
|
# The size at which we will switch to stitching method, when below this (and
|
||||||
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
|
# SCREENSHOT_MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
|
||||||
|
|
|
@ -5,13 +5,10 @@ from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
||||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
|
||||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def capture_full_page(page):
|
def capture_full_page(page):
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
@ -20,84 +17,56 @@ def capture_full_page(page):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
page_height = page.evaluate("document.documentElement.scrollHeight")
|
page_height = page.evaluate("document.documentElement.scrollHeight")
|
||||||
|
page_width = page.evaluate("document.documentElement.scrollWidth")
|
||||||
|
original_viewport = page.viewport_size
|
||||||
|
|
||||||
logger.debug(f"Playwright viewport size {page.viewport_size}")
|
logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")
|
||||||
|
|
||||||
############################################################
|
# Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
|
||||||
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
|
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
|
||||||
############################################################
|
|
||||||
|
|
||||||
# Optimization to avoid unnecessary stitching if we can avoid it
|
|
||||||
# Use the default screenshot method for smaller pages to take advantage
|
|
||||||
# of GPU and native playwright screenshot optimizations
|
|
||||||
# - No PIL needed here, no danger of memory leaks, no sub process required
|
|
||||||
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
|
|
||||||
logger.debug("Using default screenshot method")
|
|
||||||
page.request_gc()
|
|
||||||
screenshot = page.screenshot(
|
|
||||||
type="jpeg",
|
|
||||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
|
||||||
full_page=True,
|
|
||||||
)
|
|
||||||
page.request_gc()
|
|
||||||
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
|
|
||||||
return screenshot
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
###################################################################################
|
|
||||||
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
|
|
||||||
###################################################################################
|
|
||||||
# - PIL can easily allocate memory and not release it cleanly
|
|
||||||
# - Fetching screenshot from playwright seems OK
|
|
||||||
# Image.new is leaky even with .close()
|
|
||||||
# So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"Using stitching method for large screenshot because page height exceeds threshold"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Limit the total capture height
|
|
||||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
|
||||||
|
|
||||||
# Calculate number of chunks needed using ORIGINAL viewport height
|
|
||||||
num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
|
|
||||||
screenshot_chunks = []
|
screenshot_chunks = []
|
||||||
|
y = 0
|
||||||
|
|
||||||
# Track cumulative paste position
|
# If page height is larger than current viewport, use a larger viewport for better capturing
|
||||||
y_offset = 0
|
if page_height > page.viewport_size['height']:
|
||||||
for _ in range(num_chunks):
|
# Set viewport to a larger size to capture more content at once
|
||||||
|
page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
|
||||||
|
|
||||||
|
# Capture screenshots in chunks up to the max total height
|
||||||
|
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
|
||||||
page.request_gc()
|
page.request_gc()
|
||||||
page.evaluate(f"window.scrollTo(0, {y_offset})")
|
page.evaluate(f"window.scrollTo(0, {y})")
|
||||||
page.request_gc()
|
page.request_gc()
|
||||||
h = min(page.viewport_size['height'], capture_height - y_offset)
|
|
||||||
screenshot_chunks.append(page.screenshot(
|
screenshot_chunks.append(page.screenshot(
|
||||||
type="jpeg",
|
type="jpeg",
|
||||||
clip={
|
full_page=False,
|
||||||
"x": 0,
|
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
|
||||||
"y": 0,
|
|
||||||
"width": page.viewport_size['width'],
|
|
||||||
"height": h,
|
|
||||||
},
|
|
||||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
|
||||||
))
|
))
|
||||||
|
y += step_size
|
||||||
y_offset += h # maybe better to inspect the image here?
|
|
||||||
page.request_gc()
|
page.request_gc()
|
||||||
|
|
||||||
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
|
# Restore original viewport size
|
||||||
|
page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})
|
||||||
|
|
||||||
|
# If we have multiple chunks, stitch them together
|
||||||
|
if len(screenshot_chunks) > 1:
|
||||||
|
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||||
|
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
|
||||||
parent_conn, child_conn = Pipe()
|
parent_conn, child_conn = Pipe()
|
||||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
|
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
|
||||||
p.start()
|
p.start()
|
||||||
result = parent_conn.recv_bytes()
|
screenshot = parent_conn.recv_bytes()
|
||||||
p.join()
|
p.join()
|
||||||
|
logger.debug(
|
||||||
|
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||||
|
|
||||||
screenshot_chunks = None
|
screenshot_chunks = None
|
||||||
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
|
return screenshot
|
||||||
|
|
||||||
return result
|
logger.debug(
|
||||||
|
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||||
|
|
||||||
|
return screenshot_chunks[0]
|
||||||
|
|
||||||
|
|
||||||
class fetcher(Fetcher):
|
class fetcher(Fetcher):
|
||||||
|
@ -292,6 +261,7 @@ class fetcher(Fetcher):
|
||||||
self.page.request_gc()
|
self.page.request_gc()
|
||||||
|
|
||||||
self.content = self.page.content()
|
self.content = self.page.content()
|
||||||
|
self.page.request_gc()
|
||||||
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
|
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
|
||||||
|
|
||||||
# Bug 3 in Playwright screenshot handling
|
# Bug 3 in Playwright screenshot handling
|
||||||
|
@ -317,4 +287,11 @@ class fetcher(Fetcher):
|
||||||
|
|
||||||
# Clean up resources properly
|
# Clean up resources properly
|
||||||
context.close()
|
context.close()
|
||||||
|
context = None
|
||||||
|
|
||||||
|
self.page.close()
|
||||||
|
self.page = None
|
||||||
|
|
||||||
browser.close()
|
browser.close()
|
||||||
|
borwser = None
|
||||||
|
|
||||||
|
|
|
@ -7,10 +7,11 @@ from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
|
||||||
|
SCREENSHOT_MAX_TOTAL_HEIGHT
|
||||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
|
||||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
BrowserConnectError
|
||||||
|
|
||||||
|
|
||||||
# Bug 3 in Playwright screenshot handling
|
# Bug 3 in Playwright screenshot handling
|
||||||
|
@ -27,71 +28,53 @@ async def capture_full_page(page):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
page_height = await page.evaluate("document.documentElement.scrollHeight")
|
page_height = await page.evaluate("document.documentElement.scrollHeight")
|
||||||
|
page_width = await page.evaluate("document.documentElement.scrollWidth")
|
||||||
|
original_viewport = page.viewport
|
||||||
|
|
||||||
logger.debug(f"Puppeteer viewport size {page.viewport}")
|
logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")
|
||||||
|
|
||||||
############################################################
|
# Bug 3 in Playwright screenshot handling
|
||||||
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
|
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||||
############################################################
|
# JPEG is better here because the screenshots can be very very large
|
||||||
|
|
||||||
# Optimization to avoid unnecessary stitching if we can avoid it
|
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||||
# Use the default screenshot method for smaller pages to take advantage
|
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||||
# of GPU and native playwright screenshot optimizations
|
# acceptable screenshot quality here
|
||||||
# - No PIL needed here, no danger of memory leaks, no sub process required
|
|
||||||
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
|
|
||||||
logger.debug("Using default screenshot method")
|
|
||||||
await page.evaluate(f"window.scrollTo(0, 0)")
|
|
||||||
screenshot = await page.screenshot(
|
|
||||||
type_="jpeg",
|
|
||||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
|
||||||
fullPage=True,
|
|
||||||
)
|
|
||||||
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
|
|
||||||
return screenshot
|
|
||||||
|
|
||||||
###################################################################################
|
|
||||||
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
|
|
||||||
###################################################################################
|
|
||||||
# - PIL can easily allocate memory and not release it cleanly
|
|
||||||
# - Fetching screenshot from playwright seems OK
|
|
||||||
# Image.new is leaky even with .close()
|
|
||||||
# So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
|
|
||||||
|
|
||||||
logger.debug(
|
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
|
||||||
"Using stitching method for large screenshot because page height exceeds threshold"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Limit the total capture height
|
|
||||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
|
||||||
|
|
||||||
# Calculate number of chunks needed using ORIGINAL viewport height
|
|
||||||
num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
|
|
||||||
screenshot_chunks = []
|
screenshot_chunks = []
|
||||||
|
y = 0
|
||||||
|
if page_height > page.viewport['height']:
|
||||||
|
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
|
||||||
|
|
||||||
# Track cumulative paste position
|
|
||||||
y_offset = 0
|
|
||||||
for _ in range(num_chunks):
|
|
||||||
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
|
||||||
h = min(page.viewport['height'], capture_height - y_offset)
|
|
||||||
screenshot_chunks.append(await page.screenshot(
|
|
||||||
type_="jpeg",
|
|
||||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
|
||||||
))
|
|
||||||
|
|
||||||
y_offset += h # maybe better to inspect the image here?
|
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
|
||||||
|
await page.evaluate(f"window.scrollTo(0, {y})")
|
||||||
|
screenshot_chunks.append(await page.screenshot(type_='jpeg',
|
||||||
|
fullPage=False,
|
||||||
|
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
|
||||||
|
y += step_size
|
||||||
|
|
||||||
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
|
await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
|
||||||
|
|
||||||
|
if len(screenshot_chunks) > 1:
|
||||||
|
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||||
|
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
|
||||||
parent_conn, child_conn = Pipe()
|
parent_conn, child_conn = Pipe()
|
||||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
|
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
|
||||||
p.start()
|
p.start()
|
||||||
result = parent_conn.recv_bytes()
|
screenshot = parent_conn.recv_bytes()
|
||||||
p.join()
|
p.join()
|
||||||
|
logger.debug(
|
||||||
|
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||||
|
|
||||||
screenshot_chunks = None
|
screenshot_chunks = None
|
||||||
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
|
return screenshot
|
||||||
|
|
||||||
return result
|
logger.debug(
|
||||||
|
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||||
|
return screenshot_chunks[0]
|
||||||
|
|
||||||
|
|
||||||
class fetcher(Fetcher):
|
class fetcher(Fetcher):
|
||||||
|
|
Ładowanie…
Reference in New Issue