kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Refactor image saving with forked process to reduce memory usage, improvements to xpath scraper handling (#3099)
rodzic
cdfb3f206c
commit
4269079c54
|
@ -4,7 +4,7 @@ import re
|
|||
from random import randint
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers.helpers import capture_full_page
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT
|
||||
from changedetectionio.content_fetchers.base import manage_user_agent
|
||||
from changedetectionio.safe_jinja import render as jinja_render
|
||||
|
||||
|
@ -293,12 +293,16 @@ class browsersteps_live_ui(steppable_browser_interface):
|
|||
def get_current_state(self):
|
||||
"""Return the screenshot and interactive elements mapping, generally always called after action_()"""
|
||||
import importlib.resources
|
||||
import json
|
||||
# because we for now only run browser steps in playwright mode (not puppeteer mode)
|
||||
from changedetectionio.content_fetchers.playwright import capture_full_page
|
||||
|
||||
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
|
||||
|
||||
now = time.time()
|
||||
self.page.wait_for_timeout(1 * 1000)
|
||||
|
||||
screenshot = capture_full_page(self.page)
|
||||
screenshot = capture_full_page(page=self.page)
|
||||
|
||||
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")
|
||||
|
||||
|
@ -306,13 +310,21 @@ class browsersteps_live_ui(steppable_browser_interface):
|
|||
self.page.evaluate("var include_filters=''")
|
||||
# Go find the interactive elements
|
||||
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
|
||||
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
||||
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
|
||||
|
||||
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
|
||||
self.page.request_gc()
|
||||
|
||||
scan_elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
||||
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
xpath_data = json.loads(self.page.evaluate(xpath_element_js, {
|
||||
"visualselector_xpath_selectors": scan_elements,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
}))
|
||||
self.page.request_gc()
|
||||
|
||||
# So the JS will find the smallest one first
|
||||
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
|
||||
logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s")
|
||||
logger.debug(f"Time to scrape xPath element data in browser {time.time()-now:.2f}s")
|
||||
|
||||
# playwright._impl._api_types.Error: Browser closed.
|
||||
# @todo show some countdown timer?
|
||||
|
|
|
@ -7,11 +7,29 @@ import os
|
|||
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
|
||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
|
||||
|
||||
SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
|
||||
SCREENSHOT_DEFAULT_QUALITY = 40
|
||||
|
||||
# Maximum total height for the final image (When in stitch mode).
|
||||
# We limit this to 16000px due to the huge amount of RAM that was being used
|
||||
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
|
||||
# The size at which we will switch to stitching method, when below this (and
|
||||
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
|
||||
# screenshot method.
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000
|
||||
|
||||
# available_fetchers() will scan this implementation looking for anything starting with html_
|
||||
# this information is used in the form selections
|
||||
from changedetectionio.content_fetchers.requests import fetcher as html_requests
|
||||
|
||||
|
||||
import importlib.resources
|
||||
XPATH_ELEMENT_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
|
||||
INSTOCK_DATA_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
|
||||
|
||||
|
||||
def available_fetchers():
|
||||
# See the if statement at the bottom of this file for how we switch between playwright and webdriver
|
||||
import inspect
|
||||
|
|
|
@ -63,11 +63,6 @@ class Fetcher():
|
|||
# Time ONTOP of the system defined env minimum time
|
||||
render_extract_delay = 0
|
||||
|
||||
def __init__(self):
|
||||
import importlib.resources
|
||||
self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
|
||||
self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
|
||||
|
||||
@abstractmethod
|
||||
def get_error(self):
|
||||
return self.error
|
||||
|
@ -143,6 +138,7 @@ class Fetcher():
|
|||
logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...")
|
||||
self.screenshot_step("before-" + str(step_n))
|
||||
self.save_step_html("before-" + str(step_n))
|
||||
|
||||
try:
|
||||
optional_value = step['optional_value']
|
||||
selector = step['selector']
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
# Pages with a vertical height longer than this will use the 'stitch together' method.
|
||||
|
||||
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
|
||||
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
|
||||
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
|
||||
|
||||
from loguru import logger
|
||||
|
||||
def capture_full_page(page):
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
# Maximum total height for the final image (When in stitch mode).
|
||||
# We limit this to 16000px due to the huge amount of RAM that was being used
|
||||
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", 16000))
|
||||
|
||||
# The size at which we will switch to stitching method, when below this (and
|
||||
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
|
||||
# screenshot method.
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000
|
||||
|
||||
WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay
|
||||
|
||||
# Save the original viewport size
|
||||
original_viewport = page.viewport_size
|
||||
start = time.time()
|
||||
|
||||
stitched_image = None
|
||||
|
||||
try:
|
||||
viewport_width = original_viewport["width"]
|
||||
viewport_height = original_viewport["height"]
|
||||
|
||||
page_height = page.evaluate("document.documentElement.scrollHeight")
|
||||
|
||||
# Optimization to avoid unnecessary stitching if we can avoid it
|
||||
# Use the default screenshot method for smaller pages to take advantage
|
||||
# of GPU and native playwright screenshot optimizations
|
||||
if (
|
||||
page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD
|
||||
and page_height < MAX_TOTAL_HEIGHT
|
||||
):
|
||||
logger.debug("Using default screenshot method")
|
||||
screenshot = page.screenshot(
|
||||
type="jpeg",
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
|
||||
full_page=True,
|
||||
)
|
||||
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
|
||||
return screenshot
|
||||
|
||||
logger.debug(
|
||||
"Using stitching method for large screenshot because page height exceeds threshold"
|
||||
)
|
||||
|
||||
# Limit the total capture height
|
||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
||||
|
||||
# Calculate number of chunks needed using ORIGINAL viewport height
|
||||
num_chunks = (capture_height + viewport_height - 1) // viewport_height
|
||||
|
||||
# Create the final image upfront to avoid holding all chunks in memory
|
||||
stitched_image = Image.new("RGB", (viewport_width, capture_height))
|
||||
|
||||
# Track cumulative paste position
|
||||
y_offset = 0
|
||||
|
||||
for _ in range(num_chunks):
|
||||
# Scroll to position (no viewport resizing)
|
||||
page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||
|
||||
# Capture only the visible area using clip
|
||||
with io.BytesIO(
|
||||
page.screenshot(
|
||||
type="jpeg",
|
||||
clip={
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": viewport_width,
|
||||
"height": min(viewport_height, capture_height - y_offset),
|
||||
},
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
|
||||
)
|
||||
) as buf:
|
||||
with Image.open(buf) as img:
|
||||
img.load()
|
||||
stitched_image.paste(img, (0, y_offset))
|
||||
y_offset += img.height
|
||||
|
||||
logger.debug(f"Screenshot stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
# Overlay warning text if the screenshot was trimmed
|
||||
if capture_height < page_height:
|
||||
draw = ImageDraw.Draw(stitched_image)
|
||||
warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"
|
||||
|
||||
# Load font (default system font if Arial is unavailable)
|
||||
try:
|
||||
font = ImageFont.truetype(
|
||||
"arial.ttf", WARNING_TEXT_HEIGHT
|
||||
) # Arial (Windows/Mac)
|
||||
except IOError:
|
||||
font = ImageFont.load_default() # Default font if Arial not found
|
||||
|
||||
# Get text bounding box (correct method for newer Pillow versions)
|
||||
text_bbox = draw.textbbox((0, 0), warning_text, font=font)
|
||||
text_width = text_bbox[2] - text_bbox[0] # Calculate text width
|
||||
text_height = text_bbox[3] - text_bbox[1] # Calculate text height
|
||||
|
||||
# Define background rectangle (top of the image)
|
||||
draw.rectangle(
|
||||
[(0, 0), (viewport_width, WARNING_TEXT_HEIGHT)], fill="white"
|
||||
)
|
||||
|
||||
# Center text horizontally within the warning area
|
||||
text_x = (viewport_width - text_width) // 2
|
||||
text_y = (WARNING_TEXT_HEIGHT - text_height) // 2
|
||||
|
||||
# Draw the warning text in red
|
||||
draw.text((text_x, text_y), warning_text, fill="red", font=font)
|
||||
|
||||
# Save final image
|
||||
with io.BytesIO() as output:
|
||||
stitched_image.save(
|
||||
output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))
|
||||
)
|
||||
screenshot = output.getvalue()
|
||||
|
||||
finally:
|
||||
# Restore the original viewport size
|
||||
page.set_viewport_size(original_viewport)
|
||||
if stitched_image is not None:
|
||||
stitched_image.close()
|
||||
|
||||
return screenshot
|
|
@ -4,10 +4,102 @@ from urllib.parse import urlparse
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers.helpers import capture_full_page
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
||||
|
||||
|
||||
|
||||
def capture_full_page(page):
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
start = time.time()
|
||||
|
||||
page_height = page.evaluate("document.documentElement.scrollHeight")
|
||||
|
||||
logger.debug(f"Playwright viewport size {page.viewport_size}")
|
||||
|
||||
############################################################
|
||||
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
|
||||
############################################################
|
||||
|
||||
# Optimization to avoid unnecessary stitching if we can avoid it
|
||||
# Use the default screenshot method for smaller pages to take advantage
|
||||
# of GPU and native playwright screenshot optimizations
|
||||
# - No PIL needed here, no danger of memory leaks, no sub process required
|
||||
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
|
||||
logger.debug("Using default screenshot method")
|
||||
page.request_gc()
|
||||
screenshot = page.screenshot(
|
||||
type="jpeg",
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
||||
full_page=True,
|
||||
)
|
||||
page.request_gc()
|
||||
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
|
||||
return screenshot
|
||||
|
||||
|
||||
|
||||
###################################################################################
|
||||
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
|
||||
###################################################################################
|
||||
# - PIL can easily allocate memory and not release it cleanly
|
||||
# - Fetching screenshot from playwright seems OK
|
||||
# Image.new is leaky even with .close()
|
||||
# So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
|
||||
|
||||
logger.debug(
|
||||
"Using stitching method for large screenshot because page height exceeds threshold"
|
||||
)
|
||||
|
||||
# Limit the total capture height
|
||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
||||
|
||||
# Calculate number of chunks needed using ORIGINAL viewport height
|
||||
num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
|
||||
screenshot_chunks = []
|
||||
|
||||
# Track cumulative paste position
|
||||
y_offset = 0
|
||||
for _ in range(num_chunks):
|
||||
|
||||
page.request_gc()
|
||||
page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||
page.request_gc()
|
||||
h = min(page.viewport_size['height'], capture_height - y_offset)
|
||||
screenshot_chunks.append(page.screenshot(
|
||||
type="jpeg",
|
||||
clip={
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": page.viewport_size['width'],
|
||||
"height": h,
|
||||
},
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
||||
))
|
||||
|
||||
y_offset += h # maybe better to inspect the image here?
|
||||
page.request_gc()
|
||||
|
||||
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
|
||||
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
|
||||
p.start()
|
||||
result = parent_conn.recv_bytes()
|
||||
p.join()
|
||||
|
||||
screenshot_chunks = None
|
||||
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class fetcher(Fetcher):
|
||||
fetcher_description = "Playwright {}/Javascript".format(
|
||||
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
|
||||
|
@ -60,7 +152,8 @@ class fetcher(Fetcher):
|
|||
|
||||
def screenshot_step(self, step_n=''):
|
||||
super().screenshot_step(step_n=step_n)
|
||||
screenshot = capture_full_page(self.page)
|
||||
screenshot = capture_full_page(page=self.page)
|
||||
|
||||
|
||||
if self.browser_steps_screenshot_path is not None:
|
||||
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
|
||||
|
@ -89,7 +182,6 @@ class fetcher(Fetcher):
|
|||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import playwright._impl._errors
|
||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||
import time
|
||||
self.delete_browser_steps_screenshots()
|
||||
response = None
|
||||
|
@ -185,13 +277,22 @@ class fetcher(Fetcher):
|
|||
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
|
||||
else:
|
||||
self.page.evaluate("var include_filters=''")
|
||||
self.page.request_gc()
|
||||
|
||||
self.xpath_data = self.page.evaluate(
|
||||
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
|
||||
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
||||
# request_gc before and after evaluate to free up memory
|
||||
# @todo browsersteps etc
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
self.xpath_data = self.page.evaluate(XPATH_ELEMENT_JS, {
|
||||
"visualselector_xpath_selectors": visualselector_xpath_selectors,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
})
|
||||
self.page.request_gc()
|
||||
|
||||
self.instock_data = self.page.evaluate(INSTOCK_DATA_JS)
|
||||
self.page.request_gc()
|
||||
|
||||
self.content = self.page.content()
|
||||
logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s")
|
||||
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
|
||||
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
|
@ -202,11 +303,18 @@ class fetcher(Fetcher):
|
|||
# acceptable screenshot quality here
|
||||
try:
|
||||
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
|
||||
self.screenshot = capture_full_page(self.page)
|
||||
self.screenshot = capture_full_page(page=self.page)
|
||||
|
||||
except Exception as e:
|
||||
# It's likely the screenshot was too long/big and something crashed
|
||||
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
|
||||
finally:
|
||||
# Request garbage collection one more time before closing
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clean up resources properly
|
||||
context.close()
|
||||
browser.close()
|
||||
|
|
|
@ -6,8 +6,93 @@ from urllib.parse import urlparse
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||
|
||||
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
|
||||
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||
# acceptable screenshot quality here
|
||||
async def capture_full_page(page):
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
start = time.time()
|
||||
|
||||
page_height = await page.evaluate("document.documentElement.scrollHeight")
|
||||
|
||||
logger.debug(f"Puppeteer viewport size {page.viewport}")
|
||||
|
||||
############################################################
|
||||
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
|
||||
############################################################
|
||||
|
||||
# Optimization to avoid unnecessary stitching if we can avoid it
|
||||
# Use the default screenshot method for smaller pages to take advantage
|
||||
# of GPU and native playwright screenshot optimizations
|
||||
# - No PIL needed here, no danger of memory leaks, no sub process required
|
||||
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
|
||||
logger.debug("Using default screenshot method")
|
||||
await page.evaluate(f"window.scrollTo(0, 0)")
|
||||
screenshot = await page.screenshot(
|
||||
type_="jpeg",
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
||||
fullPage=True,
|
||||
)
|
||||
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
|
||||
return screenshot
|
||||
|
||||
###################################################################################
|
||||
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
|
||||
###################################################################################
|
||||
# - PIL can easily allocate memory and not release it cleanly
|
||||
# - Fetching screenshot from playwright seems OK
|
||||
# Image.new is leaky even with .close()
|
||||
# So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
|
||||
|
||||
logger.debug(
|
||||
"Using stitching method for large screenshot because page height exceeds threshold"
|
||||
)
|
||||
|
||||
# Limit the total capture height
|
||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
||||
|
||||
# Calculate number of chunks needed using ORIGINAL viewport height
|
||||
num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
|
||||
screenshot_chunks = []
|
||||
|
||||
# Track cumulative paste position
|
||||
y_offset = 0
|
||||
for _ in range(num_chunks):
|
||||
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||
h = min(page.viewport['height'], capture_height - y_offset)
|
||||
screenshot_chunks.append(await page.screenshot(
|
||||
type_="jpeg",
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
|
||||
))
|
||||
|
||||
y_offset += h # maybe better to inspect the image here?
|
||||
|
||||
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
|
||||
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
|
||||
p.start()
|
||||
result = parent_conn.recv_bytes()
|
||||
p.join()
|
||||
|
||||
screenshot_chunks = None
|
||||
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class fetcher(Fetcher):
|
||||
fetcher_description = "Puppeteer/direct {}/Javascript".format(
|
||||
|
@ -79,7 +164,6 @@ class fetcher(Fetcher):
|
|||
empty_pages_are_a_change
|
||||
):
|
||||
|
||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||
self.delete_browser_steps_screenshots()
|
||||
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
|
||||
|
||||
|
@ -181,11 +265,10 @@ class fetcher(Fetcher):
|
|||
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
||||
|
||||
if self.status_code != 200 and not ignore_status_codes:
|
||||
screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=True,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
screenshot = await capture_full_page(page=self.page)
|
||||
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
||||
|
||||
content = await self.page.content
|
||||
|
||||
if not empty_pages_are_a_change and len(content.strip()) == 0:
|
||||
|
@ -203,46 +286,31 @@ class fetcher(Fetcher):
|
|||
|
||||
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
|
||||
# Setup the xPath/VisualSelector scraper
|
||||
if current_include_filters is not None:
|
||||
if current_include_filters:
|
||||
js = json.dumps(current_include_filters)
|
||||
await self.page.evaluate(f"var include_filters={js}")
|
||||
else:
|
||||
await self.page.evaluate(f"var include_filters=''")
|
||||
|
||||
self.xpath_data = await self.page.evaluate(
|
||||
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
|
||||
self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
|
||||
"visualselector_xpath_selectors": visualselector_xpath_selectors,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
})
|
||||
if not self.xpath_data:
|
||||
raise Exception(f"Content Fetcher > xPath scraper failed. Please report this URL so we can fix it :)")
|
||||
|
||||
self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS)
|
||||
|
||||
self.content = await self.page.content
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
# JPEG is better here because the screenshots can be very very large
|
||||
|
||||
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||
# acceptable screenshot quality here
|
||||
try:
|
||||
self.screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=True,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
except Exception as e:
|
||||
logger.error("Error fetching screenshot")
|
||||
# // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
||||
# // @ todo after text extract, we can place some overlay text with red background to say 'croppped'
|
||||
logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot')
|
||||
try:
|
||||
self.screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=False,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
except Exception as e:
|
||||
logger.error('ERROR: Failed to get viewport-only reduced screenshot :(')
|
||||
pass
|
||||
finally:
|
||||
# It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
|
||||
logger.success(f"Fetching '{url}' complete, closing page")
|
||||
await self.page.close()
|
||||
logger.success(f"Fetching '{url}' complete, closing browser")
|
||||
await browser.close()
|
||||
self.screenshot = await capture_full_page(page=self.page)
|
||||
|
||||
# It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
|
||||
logger.success(f"Fetching '{url}' complete, closing page")
|
||||
await self.page.close()
|
||||
logger.success(f"Fetching '{url}' complete, closing browser")
|
||||
await browser.close()
|
||||
logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.")
|
||||
|
||||
async def main(self, **kwargs):
|
||||
|
|
|
@ -1,190 +0,0 @@
|
|||
module.exports = async ({page, context}) => {
|
||||
|
||||
var {
|
||||
url,
|
||||
execute_js,
|
||||
user_agent,
|
||||
extra_wait_ms,
|
||||
req_headers,
|
||||
include_filters,
|
||||
xpath_element_js,
|
||||
screenshot_quality,
|
||||
proxy_username,
|
||||
proxy_password,
|
||||
disk_cache_dir,
|
||||
no_cache_list,
|
||||
block_url_list,
|
||||
} = context;
|
||||
|
||||
await page.setBypassCSP(true)
|
||||
await page.setExtraHTTPHeaders(req_headers);
|
||||
|
||||
if (user_agent) {
|
||||
await page.setUserAgent(user_agent);
|
||||
}
|
||||
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
|
||||
|
||||
await page.setDefaultNavigationTimeout(0);
|
||||
|
||||
if (proxy_username) {
|
||||
// Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
||||
// https://github.com/puppeteer/puppeteer/issues/676 ?
|
||||
// https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
||||
// https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
|
||||
await page.authenticate({
|
||||
username: proxy_username,
|
||||
password: proxy_password
|
||||
});
|
||||
}
|
||||
|
||||
await page.setViewport({
|
||||
width: 1024,
|
||||
height: 768,
|
||||
deviceScaleFactor: 1,
|
||||
});
|
||||
|
||||
await page.setRequestInterception(true);
|
||||
if (disk_cache_dir) {
|
||||
console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
|
||||
}
|
||||
const fs = require('fs');
|
||||
const crypto = require('crypto');
|
||||
|
||||
function file_is_expired(file_path) {
|
||||
if (!fs.existsSync(file_path)) {
|
||||
return true;
|
||||
}
|
||||
var stats = fs.statSync(file_path);
|
||||
const now_date = new Date();
|
||||
const expire_seconds = 300;
|
||||
if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {
|
||||
console.log("CACHE EXPIRED: " + file_path);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
page.on('request', async (request) => {
|
||||
// General blocking of requests that waste traffic
|
||||
if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort();
|
||||
|
||||
if (disk_cache_dir) {
|
||||
const url = request.url();
|
||||
const key = crypto.createHash('md5').update(url).digest("hex");
|
||||
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
||||
|
||||
// https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
|
||||
|
||||
if (fs.existsSync(dir_path + key)) {
|
||||
console.log("* CACHE HIT , using - " + dir_path + key + " - " + url);
|
||||
const cached_data = fs.readFileSync(dir_path + key);
|
||||
// @todo headers can come from dir_path+key+".meta" json file
|
||||
request.respond({
|
||||
status: 200,
|
||||
//contentType: 'text/html', //@todo
|
||||
body: cached_data
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
request.continue();
|
||||
});
|
||||
|
||||
|
||||
if (disk_cache_dir) {
|
||||
page.on('response', async (response) => {
|
||||
const url = response.url();
|
||||
// Basic filtering for sane responses
|
||||
if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {
|
||||
console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url);
|
||||
return;
|
||||
}
|
||||
if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) {
|
||||
console.log("Skipping (no_cache_list) - " + url);
|
||||
return;
|
||||
}
|
||||
if (url.toLowerCase().includes('data:')) {
|
||||
console.log("Skipping (embedded-data) - " + url);
|
||||
return;
|
||||
}
|
||||
response.buffer().then(buffer => {
|
||||
if (buffer.length > 100) {
|
||||
console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType());
|
||||
|
||||
const key = crypto.createHash('md5').update(url).digest("hex");
|
||||
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
||||
|
||||
if (!fs.existsSync(dir_path)) {
|
||||
fs.mkdirSync(dir_path, {recursive: true})
|
||||
}
|
||||
|
||||
if (fs.existsSync(dir_path + key)) {
|
||||
if (file_is_expired(dir_path + key)) {
|
||||
fs.writeFileSync(dir_path + key, buffer);
|
||||
}
|
||||
} else {
|
||||
fs.writeFileSync(dir_path + key, buffer);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const r = await page.goto(url, {
|
||||
waitUntil: 'load'
|
||||
});
|
||||
|
||||
await page.waitForTimeout(1000);
|
||||
await page.waitForTimeout(extra_wait_ms);
|
||||
|
||||
if (execute_js) {
|
||||
await page.evaluate(execute_js);
|
||||
await page.waitForTimeout(200);
|
||||
}
|
||||
|
||||
var xpath_data;
|
||||
var instock_data;
|
||||
try {
|
||||
// Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
|
||||
// (Once the old playwright is removed)
|
||||
xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
|
||||
instock_data = await page.evaluate(() => {%instock_scrape_code%});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
|
||||
// Wrap it here (for now)
|
||||
|
||||
var b64s = false;
|
||||
try {
|
||||
b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
||||
if (!b64s) {
|
||||
// @todo after text extract, we can place some overlay text with red background to say 'croppped'
|
||||
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
|
||||
try {
|
||||
b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
}
|
||||
|
||||
var html = await page.content();
|
||||
return {
|
||||
data: {
|
||||
'content': html,
|
||||
'headers': r.headers(),
|
||||
'instock_data': instock_data,
|
||||
'screenshot': b64s,
|
||||
'status_code': r.status(),
|
||||
'xpath_data': xpath_data
|
||||
},
|
||||
type: 'application/json',
|
||||
};
|
||||
};
|
|
@ -1,229 +1,220 @@
|
|||
// Restock Detector
|
||||
// (c) Leigh Morresi dgtlmoon@gmail.com
|
||||
//
|
||||
// Assumes the product is in stock to begin with, unless the following appears above the fold ;
|
||||
// - outOfStockTexts appears above the fold (out of stock)
|
||||
// - negateOutOfStockRegex (really is in stock)
|
||||
async () => {
|
||||
|
||||
function isItemInStock() {
|
||||
// @todo Pass these in so the same list can be used in non-JS fetchers
|
||||
const outOfStockTexts = [
|
||||
' أخبرني عندما يتوفر',
|
||||
'0 in stock',
|
||||
'actuellement indisponible',
|
||||
'agotado',
|
||||
'article épuisé',
|
||||
'artikel zurzeit vergriffen',
|
||||
'as soon as stock is available',
|
||||
'ausverkauft', // sold out
|
||||
'available for back order',
|
||||
'awaiting stock',
|
||||
'back in stock soon',
|
||||
'back-order or out of stock',
|
||||
'backordered',
|
||||
'benachrichtigt mich', // notify me
|
||||
'brak na stanie',
|
||||
'brak w magazynie',
|
||||
'coming soon',
|
||||
'currently have any tickets for this',
|
||||
'currently unavailable',
|
||||
'dieser artikel ist bald wieder verfügbar',
|
||||
'dostępne wkrótce',
|
||||
'en rupture',
|
||||
'en rupture de stock',
|
||||
'épuisé',
|
||||
'esgotado',
|
||||
'indisponible',
|
||||
'indisponível',
|
||||
'isn\'t in stock right now',
|
||||
'isnt in stock right now',
|
||||
'isn’t in stock right now',
|
||||
'item is no longer available',
|
||||
'let me know when it\'s available',
|
||||
'mail me when available',
|
||||
'message if back in stock',
|
||||
'mevcut değil',
|
||||
'nachricht bei',
|
||||
'nicht auf lager',
|
||||
'nicht lagernd',
|
||||
'nicht lieferbar',
|
||||
'nicht verfügbar',
|
||||
'nicht vorrätig',
|
||||
'nicht zur verfügung',
|
||||
'nie znaleziono produktów',
|
||||
'niet beschikbaar',
|
||||
'niet leverbaar',
|
||||
'niet op voorraad',
|
||||
'no disponible',
|
||||
'non disponibile',
|
||||
'non disponible',
|
||||
'no longer in stock',
|
||||
'no tickets available',
|
||||
'not available',
|
||||
'not currently available',
|
||||
'not in stock',
|
||||
'notify me when available',
|
||||
'notify me',
|
||||
'notify when available',
|
||||
'não disponível',
|
||||
'não estamos a aceitar encomendas',
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'plus disponible',
|
||||
'prodotto esaurito',
|
||||
'produkt niedostępny',
|
||||
'rupture',
|
||||
'sold out',
|
||||
'sold-out',
|
||||
'stok habis',
|
||||
'stok kosong',
|
||||
'stok varian ini habis',
|
||||
'stokta yok',
|
||||
'temporarily out of stock',
|
||||
'temporarily unavailable',
|
||||
'there were no search results for',
|
||||
'this item is currently unavailable',
|
||||
'tickets unavailable',
|
||||
'tidak dijual',
|
||||
'tidak tersedia',
|
||||
'tijdelijk uitverkocht',
|
||||
'tiket tidak tersedia',
|
||||
'tükendi',
|
||||
'unavailable nearby',
|
||||
'unavailable tickets',
|
||||
'vergriffen',
|
||||
'vorbestellen',
|
||||
'vorbestellung ist bald möglich',
|
||||
'we don\'t currently have any',
|
||||
'we couldn\'t find any products that match',
|
||||
'we do not currently have an estimate of when this product will be back in stock.',
|
||||
'we don\'t know when or if this item will be back in stock.',
|
||||
'we were not able to find a match',
|
||||
'when this arrives in stock',
|
||||
'zur zeit nicht an lager',
|
||||
'品切れ',
|
||||
'已售',
|
||||
'已售完',
|
||||
'품절'
|
||||
];
|
||||
function isItemInStock() {
|
||||
// @todo Pass these in so the same list can be used in non-JS fetchers
|
||||
const outOfStockTexts = [
|
||||
' أخبرني عندما يتوفر',
|
||||
'0 in stock',
|
||||
'actuellement indisponible',
|
||||
'agotado',
|
||||
'article épuisé',
|
||||
'artikel zurzeit vergriffen',
|
||||
'as soon as stock is available',
|
||||
'ausverkauft', // sold out
|
||||
'available for back order',
|
||||
'awaiting stock',
|
||||
'back in stock soon',
|
||||
'back-order or out of stock',
|
||||
'backordered',
|
||||
'benachrichtigt mich', // notify me
|
||||
'brak na stanie',
|
||||
'brak w magazynie',
|
||||
'coming soon',
|
||||
'currently have any tickets for this',
|
||||
'currently unavailable',
|
||||
'dieser artikel ist bald wieder verfügbar',
|
||||
'dostępne wkrótce',
|
||||
'en rupture',
|
||||
'en rupture de stock',
|
||||
'épuisé',
|
||||
'esgotado',
|
||||
'indisponible',
|
||||
'indisponível',
|
||||
'isn\'t in stock right now',
|
||||
'isnt in stock right now',
|
||||
'isn’t in stock right now',
|
||||
'item is no longer available',
|
||||
'let me know when it\'s available',
|
||||
'mail me when available',
|
||||
'message if back in stock',
|
||||
'mevcut değil',
|
||||
'nachricht bei',
|
||||
'nicht auf lager',
|
||||
'nicht lagernd',
|
||||
'nicht lieferbar',
|
||||
'nicht verfügbar',
|
||||
'nicht vorrätig',
|
||||
'nicht zur verfügung',
|
||||
'nie znaleziono produktów',
|
||||
'niet beschikbaar',
|
||||
'niet leverbaar',
|
||||
'niet op voorraad',
|
||||
'no disponible',
|
||||
'non disponibile',
|
||||
'non disponible',
|
||||
'no longer in stock',
|
||||
'no tickets available',
|
||||
'not available',
|
||||
'not currently available',
|
||||
'not in stock',
|
||||
'notify me when available',
|
||||
'notify me',
|
||||
'notify when available',
|
||||
'não disponível',
|
||||
'não estamos a aceitar encomendas',
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'plus disponible',
|
||||
'prodotto esaurito',
|
||||
'produkt niedostępny',
|
||||
'rupture',
|
||||
'sold out',
|
||||
'sold-out',
|
||||
'stok habis',
|
||||
'stok kosong',
|
||||
'stok varian ini habis',
|
||||
'stokta yok',
|
||||
'temporarily out of stock',
|
||||
'temporarily unavailable',
|
||||
'there were no search results for',
|
||||
'this item is currently unavailable',
|
||||
'tickets unavailable',
|
||||
'tidak dijual',
|
||||
'tidak tersedia',
|
||||
'tijdelijk uitverkocht',
|
||||
'tiket tidak tersedia',
|
||||
'tükendi',
|
||||
'unavailable nearby',
|
||||
'unavailable tickets',
|
||||
'vergriffen',
|
||||
'vorbestellen',
|
||||
'vorbestellung ist bald möglich',
|
||||
'we don\'t currently have any',
|
||||
'we couldn\'t find any products that match',
|
||||
'we do not currently have an estimate of when this product will be back in stock.',
|
||||
'we don\'t know when or if this item will be back in stock.',
|
||||
'we were not able to find a match',
|
||||
'when this arrives in stock',
|
||||
'zur zeit nicht an lager',
|
||||
'品切れ',
|
||||
'已售',
|
||||
'已售完',
|
||||
'품절'
|
||||
];
|
||||
|
||||
|
||||
const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
|
||||
const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
|
||||
|
||||
function getElementBaseText(element) {
|
||||
// .textContent can include text from children which may give the wrong results
|
||||
// scan only immediate TEXT_NODEs, which will be a child of the element
|
||||
var text = "";
|
||||
for (var i = 0; i < element.childNodes.length; ++i)
|
||||
if (element.childNodes[i].nodeType === Node.TEXT_NODE)
|
||||
text += element.childNodes[i].textContent;
|
||||
return text.toLowerCase().trim();
|
||||
}
|
||||
function getElementBaseText(element) {
|
||||
// .textContent can include text from children which may give the wrong results
|
||||
// scan only immediate TEXT_NODEs, which will be a child of the element
|
||||
var text = "";
|
||||
for (var i = 0; i < element.childNodes.length; ++i)
|
||||
if (element.childNodes[i].nodeType === Node.TEXT_NODE)
|
||||
text += element.childNodes[i].textContent;
|
||||
return text.toLowerCase().trim();
|
||||
}
|
||||
|
||||
const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
|
||||
const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
|
||||
|
||||
// The out-of-stock or in-stock-text is generally always above-the-fold
|
||||
// and often below-the-fold is a list of related products that may or may not contain trigger text
|
||||
// so it's good to filter to just the 'above the fold' elements
|
||||
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
|
||||
// The out-of-stock or in-stock-text is generally always above-the-fold
|
||||
// and often below-the-fold is a list of related products that may or may not contain trigger text
|
||||
// so it's good to filter to just the 'above the fold' elements
|
||||
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
|
||||
|
||||
console.log("Scanning %ELEMENTS%");
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
visibleElements.push(parent);
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
visibleElements.push(parent);
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
window.getComputedStyle(child).display !== 'none' &&
|
||||
window.getComputedStyle(child).visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
window.getComputedStyle(child).contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
window.getComputedStyle(child).display !== 'none' &&
|
||||
window.getComputedStyle(child).visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
window.getComputedStyle(child).contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const elementsToScan = [];
|
||||
collectVisibleElements(document.body, elementsToScan);
|
||||
const elementsToScan = [];
|
||||
collectVisibleElements(document.body, elementsToScan);
|
||||
|
||||
var elementText = "";
|
||||
var elementText = "";
|
||||
|
||||
// REGEXS THAT REALLY MEAN IT'S IN STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// REGEXS THAT REALLY MEAN IT'S IN STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
|
||||
elementText = "";
|
||||
try {
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// try which ones could mean its in stock
|
||||
if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
|
||||
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
|
||||
return 'Possibly in stock';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
elementText = "";
|
||||
try {
|
||||
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
elementText = "";
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// try which ones could mean its in stock
|
||||
if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
|
||||
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
|
||||
return 'Possibly in stock';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
elementText = "";
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// and these mean its out of stock
|
||||
for (const outOfStockText of outOfStockTexts) {
|
||||
if (elementText.includes(outOfStockText)) {
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
||||
return outOfStockText; // item is out of stock
|
||||
if (elementText.length) {
|
||||
// and these mean its out of stock
|
||||
for (const outOfStockText of outOfStockTexts) {
|
||||
if (elementText.includes(outOfStockText)) {
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
||||
return outOfStockText; // item is out of stock
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
|
||||
return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
|
||||
}
|
||||
|
||||
console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
|
||||
return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
|
||||
}
|
||||
|
||||
// returns the element text that makes it think it's out of stock
|
||||
return isItemInStock().trim()
|
||||
|
||||
|
||||
return isItemInStock().trim()
|
||||
}
|
||||
|
|
|
@ -1,285 +1,285 @@
|
|||
// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
|
||||
// All rights reserved.
|
||||
async (options) => {
|
||||
|
||||
// @file Scrape the page looking for elements of concern (%ELEMENTS%)
|
||||
// http://matatk.agrip.org.uk/tests/position-and-width/
|
||||
// https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
|
||||
//
|
||||
// Some pages like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis
|
||||
// will automatically force a scroll somewhere, so include the position offset
|
||||
// Lets hope the position doesnt change while we iterate the bbox's, but this is better than nothing
|
||||
var scroll_y = 0;
|
||||
try {
|
||||
scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
let visualselector_xpath_selectors = options.visualselector_xpath_selectors
|
||||
let max_height = options.max_height
|
||||
|
||||
|
||||
// Include the getXpath script directly, easier than fetching
|
||||
function getxpath(e) {
|
||||
var n = e;
|
||||
if (n && n.id) return '//*[@id="' + n.id + '"]';
|
||||
for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
|
||||
for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
|
||||
for (d = n.nextSibling; d;) {
|
||||
if (d.nodeName === n.nodeName) {
|
||||
r = !0;
|
||||
break
|
||||
}
|
||||
d = d.nextSibling
|
||||
}
|
||||
o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
|
||||
}
|
||||
return o.length ? "/" + o.reverse().join("/") : ""
|
||||
}
|
||||
|
||||
const findUpTag = (el) => {
|
||||
let r = el
|
||||
chained_css = [];
|
||||
depth = 0;
|
||||
|
||||
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
||||
if (el.name !== undefined && el.name.length) {
|
||||
var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
|
||||
var proposed_element = window.document.querySelectorAll(proposed);
|
||||
if (proposed_element.length) {
|
||||
if (proposed_element.length === 1) {
|
||||
return proposed;
|
||||
} else {
|
||||
// Some sites change ID but name= stays the same, we can hit it if we know the index
|
||||
// Find all the elements that match and work out the input[n]
|
||||
var n = Array.from(proposed_element).indexOf(el);
|
||||
// Return a Playwright selector for nthinput[name=zipcode]
|
||||
return proposed + " >> nth=" + n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
|
||||
while (r.parentNode) {
|
||||
if (depth === 5) {
|
||||
break;
|
||||
}
|
||||
if ('' !== r.id) {
|
||||
chained_css.unshift("#" + CSS.escape(r.id));
|
||||
final_selector = chained_css.join(' > ');
|
||||
// Be sure theres only one, some sites have multiples of the same ID tag :-(
|
||||
if (window.document.querySelectorAll(final_selector).length === 1) {
|
||||
return final_selector;
|
||||
}
|
||||
return null;
|
||||
} else {
|
||||
chained_css.unshift(r.tagName.toLowerCase());
|
||||
}
|
||||
r = r.parentNode;
|
||||
depth += 1;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
|
||||
|
||||
var size_pos = [];
|
||||
// after page fetch, inject this JS
|
||||
// build a map of all elements and their positions (maybe that only include text?)
|
||||
var bbox;
|
||||
console.log("Scanning %ELEMENTS%");
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
const tagName = parent.tagName.toLowerCase();
|
||||
if ("%ELEMENTS%".split(',').includes(tagName)) {
|
||||
visibleElements.push(parent);
|
||||
}
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
const computedStyle = window.getComputedStyle(child);
|
||||
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
computedStyle.display !== 'none' &&
|
||||
computedStyle.visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
computedStyle.contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create an array to hold the visible elements
|
||||
const visibleElementsArray = [];
|
||||
|
||||
// Call collectVisibleElements with the starting parent element
|
||||
collectVisibleElements(document.body, visibleElementsArray);
|
||||
|
||||
|
||||
visibleElementsArray.forEach(function (element) {
|
||||
|
||||
bbox = element.getBoundingClientRect();
|
||||
|
||||
// Skip really small ones, and where width or height ==0
|
||||
if (bbox['width'] * bbox['height'] < 10) {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't include elements that are offset from canvas
|
||||
if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
|
||||
return
|
||||
}
|
||||
|
||||
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
|
||||
// it should not traverse when we know we can anchor off just an ID one level up etc..
|
||||
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
|
||||
|
||||
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
|
||||
xpath_result = false;
|
||||
var scroll_y = 0;
|
||||
try {
|
||||
var d = findUpTag(element);
|
||||
if (d) {
|
||||
xpath_result = d;
|
||||
}
|
||||
scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
// You could swap it and default to getXpath and then try the smarter one
|
||||
// default back to the less intelligent one
|
||||
if (!xpath_result) {
|
||||
try {
|
||||
// I've seen on FB and eBay that this doesnt work
|
||||
// ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
|
||||
xpath_result = getxpath(element);
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
return
|
||||
|
||||
// Include the getXpath script directly, easier than fetching
|
||||
function getxpath(e) {
|
||||
var n = e;
|
||||
if (n && n.id) return '//*[@id="' + n.id + '"]';
|
||||
for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
|
||||
for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
|
||||
for (d = n.nextSibling; d;) {
|
||||
if (d.nodeName === n.nodeName) {
|
||||
r = !0;
|
||||
break
|
||||
}
|
||||
d = d.nextSibling
|
||||
}
|
||||
o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
|
||||
}
|
||||
return o.length ? "/" + o.reverse().join("/") : ""
|
||||
}
|
||||
|
||||
const findUpTag = (el) => {
|
||||
let r = el
|
||||
chained_css = [];
|
||||
depth = 0;
|
||||
|
||||
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
||||
if (el.name !== undefined && el.name.length) {
|
||||
var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
|
||||
var proposed_element = window.document.querySelectorAll(proposed);
|
||||
if (proposed_element.length) {
|
||||
if (proposed_element.length === 1) {
|
||||
return proposed;
|
||||
} else {
|
||||
// Some sites change ID but name= stays the same, we can hit it if we know the index
|
||||
// Find all the elements that match and work out the input[n]
|
||||
var n = Array.from(proposed_element).indexOf(el);
|
||||
// Return a Playwright selector for nthinput[name=zipcode]
|
||||
return proposed + " >> nth=" + n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
|
||||
while (r.parentNode) {
|
||||
if (depth === 5) {
|
||||
break;
|
||||
}
|
||||
if ('' !== r.id) {
|
||||
chained_css.unshift("#" + CSS.escape(r.id));
|
||||
final_selector = chained_css.join(' > ');
|
||||
// Be sure theres only one, some sites have multiples of the same ID tag :-(
|
||||
if (window.document.querySelectorAll(final_selector).length === 1) {
|
||||
return final_selector;
|
||||
}
|
||||
return null;
|
||||
} else {
|
||||
chained_css.unshift(r.tagName.toLowerCase());
|
||||
}
|
||||
r = r.parentNode;
|
||||
depth += 1;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
|
||||
var size_pos = [];
|
||||
// after page fetch, inject this JS
|
||||
// build a map of all elements and their positions (maybe that only include text?)
|
||||
var bbox;
|
||||
console.log(`Scanning for "${visualselector_xpath_selectors}"`);
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
const tagName = parent.tagName.toLowerCase();
|
||||
if (visualselector_xpath_selectors.split(',').includes(tagName)) {
|
||||
visibleElements.push(parent);
|
||||
}
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
const computedStyle = window.getComputedStyle(child);
|
||||
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
computedStyle.display !== 'none' &&
|
||||
computedStyle.visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
computedStyle.contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
||||
// Create an array to hold the visible elements
|
||||
const visibleElementsArray = [];
|
||||
|
||||
let text = element.textContent.trim().slice(0, 30).trim();
|
||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
||||
}
|
||||
// Call collectVisibleElements with the starting parent element
|
||||
collectVisibleElements(document.body, visibleElementsArray);
|
||||
|
||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
||||
const computedStyle = window.getComputedStyle(element);
|
||||
|
||||
size_pos.push({
|
||||
xpath: xpath_result,
|
||||
width: Math.round(bbox['width']),
|
||||
height: Math.round(bbox['height']),
|
||||
left: Math.floor(bbox['left']),
|
||||
top: Math.floor(bbox['top']) + scroll_y,
|
||||
// tagName used by Browser Steps
|
||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||
// tagtype used by Browser Steps
|
||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||
isClickable: computedStyle.cursor === "pointer",
|
||||
// Used by the keras trainer
|
||||
fontSize: computedStyle.getPropertyValue('font-size'),
|
||||
fontWeight: computedStyle.getPropertyValue('font-weight'),
|
||||
hasDigitCurrency: hasDigitCurrency,
|
||||
label: label,
|
||||
visibleElementsArray.forEach(function (element) {
|
||||
|
||||
bbox = element.getBoundingClientRect();
|
||||
|
||||
// Skip really small ones, and where width or height ==0
|
||||
if (bbox['width'] * bbox['height'] < 10) {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't include elements that are offset from canvas
|
||||
if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
|
||||
return
|
||||
}
|
||||
|
||||
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
|
||||
// it should not traverse when we know we can anchor off just an ID one level up etc..
|
||||
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
|
||||
|
||||
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
|
||||
xpath_result = false;
|
||||
try {
|
||||
var d = findUpTag(element);
|
||||
if (d) {
|
||||
xpath_result = d;
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
// You could swap it and default to getXpath and then try the smarter one
|
||||
// default back to the less intelligent one
|
||||
if (!xpath_result) {
|
||||
try {
|
||||
// I've seen on FB and eBay that this doesnt work
|
||||
// ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
|
||||
xpath_result = getxpath(element);
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
||||
|
||||
let text = element.textContent.trim().slice(0, 30).trim();
|
||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
||||
}
|
||||
|
||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6))) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text);
|
||||
const computedStyle = window.getComputedStyle(element);
|
||||
|
||||
if (Math.floor(bbox['top']) + scroll_y > max_height) {
|
||||
return
|
||||
}
|
||||
|
||||
size_pos.push({
|
||||
xpath: xpath_result,
|
||||
width: Math.round(bbox['width']),
|
||||
height: Math.round(bbox['height']),
|
||||
left: Math.floor(bbox['left']),
|
||||
top: Math.floor(bbox['top']) + scroll_y,
|
||||
// tagName used by Browser Steps
|
||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||
// tagtype used by Browser Steps
|
||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||
isClickable: computedStyle.cursor === "pointer",
|
||||
// Used by the keras trainer
|
||||
fontSize: computedStyle.getPropertyValue('font-size'),
|
||||
fontWeight: computedStyle.getPropertyValue('font-weight'),
|
||||
hasDigitCurrency: hasDigitCurrency,
|
||||
label: label,
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
|
||||
// Inject the current one set in the include_filters, which may be a CSS rule
|
||||
// used for displaying the current one in VisualSelector, where its not one we generated.
|
||||
if (include_filters.length) {
|
||||
let results;
|
||||
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
||||
for (const f of include_filters) {
|
||||
bbox = false;
|
||||
q = false;
|
||||
if (include_filters.length) {
|
||||
let results;
|
||||
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
||||
for (const f of include_filters) {
|
||||
bbox = false;
|
||||
q = false;
|
||||
|
||||
if (!f.length) {
|
||||
console.log("xpath_element_scraper: Empty filter, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
// is it xpath?
|
||||
if (f.startsWith('/') || f.startsWith('xpath')) {
|
||||
var qry_f = f.replace(/xpath(:|\d:)/, '')
|
||||
console.log("[xpath] Scanning for included filter " + qry_f)
|
||||
let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
||||
results = [];
|
||||
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
||||
results.push(xpathResult.snapshotItem(i));
|
||||
}
|
||||
} else {
|
||||
console.log("[css] Scanning for included filter " + f)
|
||||
console.log("[css] Scanning for included filter " + f);
|
||||
results = document.querySelectorAll(f);
|
||||
if (!f.length) {
|
||||
console.log("xpath_element_scraper: Empty filter, skipping");
|
||||
continue;
|
||||
}
|
||||
} catch (e) {
|
||||
// Maybe catch DOMException and alert?
|
||||
console.log("xpath_element_scraper: Exception selecting element from filter " + f);
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
if (results != null && results.length) {
|
||||
|
||||
// Iterate over the results
|
||||
results.forEach(node => {
|
||||
// Try to resolve //something/text() back to its /something so we can atleast get the bounding box
|
||||
try {
|
||||
if (typeof node.nodeName == 'string' && node.nodeName === '#text') {
|
||||
node = node.parentElement
|
||||
try {
|
||||
// is it xpath?
|
||||
if (f.startsWith('/') || f.startsWith('xpath')) {
|
||||
var qry_f = f.replace(/xpath(:|\d:)/, '')
|
||||
console.log("[xpath] Scanning for included filter " + qry_f)
|
||||
let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
||||
results = [];
|
||||
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
||||
results.push(xpathResult.snapshotItem(i));
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: #text resolver")
|
||||
}
|
||||
|
||||
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
|
||||
if (typeof node.getBoundingClientRect == 'function') {
|
||||
bbox = node.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
|
||||
} else {
|
||||
console.log("[css] Scanning for included filter " + f)
|
||||
console.log("[css] Scanning for included filter " + f);
|
||||
results = document.querySelectorAll(f);
|
||||
}
|
||||
} catch (e) {
|
||||
// Maybe catch DOMException and alert?
|
||||
console.log("xpath_element_scraper: Exception selecting element from filter " + f);
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
if (results != null && results.length) {
|
||||
|
||||
// Iterate over the results
|
||||
results.forEach(node => {
|
||||
// Try to resolve //something/text() back to its /something so we can atleast get the bounding box
|
||||
try {
|
||||
// Try and see we can find its ownerElement
|
||||
bbox = node.ownerElement.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
|
||||
if (typeof node.nodeName == 'string' && node.nodeName === '#text') {
|
||||
node = node.parentElement
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
||||
console.log("xpath_element_scraper: #text resolver")
|
||||
}
|
||||
}
|
||||
|
||||
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||
size_pos.push({
|
||||
xpath: f,
|
||||
width: parseInt(bbox['width']),
|
||||
height: parseInt(bbox['height']),
|
||||
left: parseInt(bbox['left']),
|
||||
top: parseInt(bbox['top']) + scroll_y,
|
||||
highlight_as_custom_filter: true
|
||||
});
|
||||
}
|
||||
});
|
||||
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
|
||||
if (typeof node.getBoundingClientRect == 'function') {
|
||||
bbox = node.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
|
||||
} else {
|
||||
try {
|
||||
// Try and see we can find its ownerElement
|
||||
bbox = node.ownerElement.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
||||
}
|
||||
}
|
||||
|
||||
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||
size_pos.push({
|
||||
xpath: f,
|
||||
width: parseInt(bbox['width']),
|
||||
height: parseInt(bbox['height']),
|
||||
left: parseInt(bbox['left']),
|
||||
top: parseInt(bbox['top']) + scroll_y,
|
||||
highlight_as_custom_filter: true
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
|
||||
// so that we dont select the wrapping element by mistake and be unable to select what we want
|
||||
size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
|
||||
size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
|
||||
|
||||
// browser_width required for proper scaling in the frontend
|
||||
// Return as a string to save playwright for juggling thousands of objects
|
||||
return JSON.stringify({'size_pos': size_pos, 'browser_width': window.innerWidth});
|
||||
}
|
||||
|
||||
// Window.width required for proper scaling in the frontend
|
||||
return {'size_pos': size_pos, 'browser_width': window.innerWidth};
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
# Pages with a vertical height longer than this will use the 'stitch together' method.
|
||||
|
||||
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
|
||||
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
|
||||
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, SCREENSHOT_DEFAULT_QUALITY
|
||||
|
||||
|
||||
def stitch_images_worker(pipe_conn, chunks_bytes, original_page_height, capture_height):
|
||||
import os
|
||||
import io
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
try:
|
||||
|
||||
# Load images from byte chunks
|
||||
images = [Image.open(io.BytesIO(b)) for b in chunks_bytes]
|
||||
total_height = sum(im.height for im in images)
|
||||
max_width = max(im.width for im in images)
|
||||
|
||||
# Create stitched image
|
||||
stitched = Image.new('RGB', (max_width, total_height))
|
||||
y_offset = 0
|
||||
for im in images:
|
||||
stitched.paste(im, (0, y_offset))
|
||||
y_offset += im.height
|
||||
|
||||
# Draw caption on top (overlaid, not extending canvas)
|
||||
draw = ImageDraw.Draw(stitched)
|
||||
|
||||
|
||||
caption_text = f"WARNING: Screenshot was {original_page_height}px but trimmed to {capture_height}px because it was too long"
|
||||
padding = 10
|
||||
font_size = 35
|
||||
font_color = (255, 0, 0)
|
||||
background_color = (255, 255, 255)
|
||||
|
||||
|
||||
# Try to load a proper font
|
||||
try:
|
||||
font = ImageFont.truetype("arial.ttf", font_size)
|
||||
except IOError:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
bbox = draw.textbbox((0, 0), caption_text, font=font)
|
||||
text_width = bbox[2] - bbox[0]
|
||||
text_height = bbox[3] - bbox[1]
|
||||
|
||||
# Draw white rectangle background behind text
|
||||
rect_top = 0
|
||||
rect_bottom = text_height + 2 * padding
|
||||
draw.rectangle([(0, rect_top), (max_width, rect_bottom)], fill=background_color)
|
||||
|
||||
# Draw text centered horizontally, 10px padding from top of the rectangle
|
||||
text_x = (max_width - text_width) // 2
|
||||
text_y = padding
|
||||
draw.text((text_x, text_y), caption_text, font=font, fill=font_color)
|
||||
|
||||
# Encode and send image
|
||||
output = io.BytesIO()
|
||||
stitched.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)))
|
||||
pipe_conn.send_bytes(output.getvalue())
|
||||
|
||||
stitched.close()
|
||||
except Exception as e:
|
||||
pipe_conn.send(f"error:{e}")
|
||||
finally:
|
||||
pipe_conn.close()
|
||||
|
||||
|
|
@ -394,7 +394,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
response.headers['Content-Type'] = 'application/json'
|
||||
response.headers['Content-Encoding'] = 'deflate'
|
||||
else:
|
||||
logger.error(f'Request elements.deflate at "{watch_directory}" but was notfound.')
|
||||
logger.error(f'Request elements.deflate at "{watch_directory}" but was not found.')
|
||||
abort(404)
|
||||
|
||||
if response:
|
||||
|
|
|
@ -553,7 +553,10 @@ class model(watch_base):
|
|||
self.ensure_data_dir_exists()
|
||||
|
||||
with open(target_path, 'wb') as f:
|
||||
f.write(zlib.compress(json.dumps(data).encode()))
|
||||
if not isinstance(data, str):
|
||||
f.write(zlib.compress(json.dumps(data).encode()))
|
||||
else:
|
||||
f.write(zlib.compress(data.encode()))
|
||||
f.close()
|
||||
|
||||
# Save as PNG, PNG is larger but better for doing visual diff in the future
|
||||
|
|
|
@ -592,6 +592,7 @@ class update_worker(threading.Thread):
|
|||
|
||||
self.current_uuid = None # Done
|
||||
self.q.task_done()
|
||||
update_handler = None
|
||||
logger.debug(f"Watch {uuid} done in {time.time()-fetch_start_time:.2f}s")
|
||||
|
||||
# Give the CPU time to interrupt
|
||||
|
|
Ładowanie…
Reference in New Issue