kopia lustrzana https://github.com/dgtlmoon/changedetection.io
BrowserSteps - Speed up scraping, refactor screenshot handling for very long pages (#2999)
rodzic
4bf560256b
commit
849c5b2293
|
@ -45,8 +45,12 @@ jobs:
|
||||||
- name: Test that the basic pip built package runs without error
|
- name: Test that the basic pip built package runs without error
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
set -ex
|
||||||
pip3 install dist/changedetection.io*.whl
|
ls -alR
|
||||||
|
|
||||||
|
# Find and install the first .whl file
|
||||||
|
find dist -type f -name "*.whl" -exec pip3 install {} \; -quit
|
||||||
changedetection.io -d /tmp -p 10000 &
|
changedetection.io -d /tmp -p 10000 &
|
||||||
|
|
||||||
sleep 3
|
sleep 3
|
||||||
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
|
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
|
||||||
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null
|
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null
|
||||||
|
|
|
@ -22,7 +22,10 @@ from loguru import logger
|
||||||
|
|
||||||
browsersteps_sessions = {}
|
browsersteps_sessions = {}
|
||||||
io_interface_context = None
|
io_interface_context = None
|
||||||
|
import json
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
from flask import Response
|
||||||
|
|
||||||
def construct_blueprint(datastore: ChangeDetectionStore):
|
def construct_blueprint(datastore: ChangeDetectionStore):
|
||||||
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
|
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
|
||||||
|
@ -160,14 +163,13 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||||
if not browsersteps_sessions.get(browsersteps_session_id):
|
if not browsersteps_sessions.get(browsersteps_session_id):
|
||||||
return make_response('No session exists under that ID', 500)
|
return make_response('No session exists under that ID', 500)
|
||||||
|
|
||||||
|
is_last_step = False
|
||||||
# Actions - step/apply/etc, do the thing and return state
|
# Actions - step/apply/etc, do the thing and return state
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
# @todo - should always be an existing session
|
# @todo - should always be an existing session
|
||||||
step_operation = request.form.get('operation')
|
step_operation = request.form.get('operation')
|
||||||
step_selector = request.form.get('selector')
|
step_selector = request.form.get('selector')
|
||||||
step_optional_value = request.form.get('optional_value')
|
step_optional_value = request.form.get('optional_value')
|
||||||
step_n = int(request.form.get('step_n'))
|
|
||||||
is_last_step = strtobool(request.form.get('is_last_step'))
|
is_last_step = strtobool(request.form.get('is_last_step'))
|
||||||
|
|
||||||
# @todo try.. accept.. nice errors not popups..
|
# @todo try.. accept.. nice errors not popups..
|
||||||
|
@ -182,16 +184,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||||
# Try to find something of value to give back to the user
|
# Try to find something of value to give back to the user
|
||||||
return make_response(str(e).splitlines()[0], 401)
|
return make_response(str(e).splitlines()[0], 401)
|
||||||
|
|
||||||
# Get visual selector ready/update its data (also use the current filter info from the page?)
|
|
||||||
# When the last 'apply' button was pressed
|
|
||||||
# @todo this adds overhead because the xpath selection is happening twice
|
|
||||||
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
|
|
||||||
if is_last_step and u:
|
|
||||||
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
|
|
||||||
watch = datastore.data['watching'].get(uuid)
|
|
||||||
if watch:
|
|
||||||
watch.save_screenshot(screenshot=screenshot)
|
|
||||||
watch.save_xpath_data(data=xpath_data)
|
|
||||||
|
|
||||||
# if not this_session.page:
|
# if not this_session.page:
|
||||||
# cleanup_playwright_session()
|
# cleanup_playwright_session()
|
||||||
|
@ -199,31 +191,35 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||||
|
|
||||||
# Screenshots and other info only needed on requesting a step (POST)
|
# Screenshots and other info only needed on requesting a step (POST)
|
||||||
try:
|
try:
|
||||||
state = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
|
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
|
||||||
|
if is_last_step:
|
||||||
|
watch = datastore.data['watching'].get(uuid)
|
||||||
|
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
|
||||||
|
if watch and u:
|
||||||
|
watch.save_screenshot(screenshot=screenshot)
|
||||||
|
watch.save_xpath_data(data=xpath_data)
|
||||||
|
|
||||||
except playwright._impl._api_types.Error as e:
|
except playwright._impl._api_types.Error as e:
|
||||||
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
|
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
|
||||||
|
except Exception as e:
|
||||||
|
return make_response("Error fetching screenshot and element data - " + str(e), 401)
|
||||||
|
|
||||||
# Use send_file() which is way faster than read/write loop on bytes
|
# SEND THIS BACK TO THE BROWSER
|
||||||
import json
|
|
||||||
from tempfile import mkstemp
|
|
||||||
from flask import send_file
|
|
||||||
tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")
|
|
||||||
|
|
||||||
output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format(
|
output = {
|
||||||
base64.b64encode(state[0]).decode('ascii')),
|
"screenshot": f"data:image/jpeg;base64,{base64.b64encode(screenshot).decode('ascii')}",
|
||||||
'xpath_data': state[1],
|
"xpath_data": xpath_data,
|
||||||
'session_age_start': browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
|
"session_age_start": browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
|
||||||
'browser_time_remaining': round(remaining)
|
"browser_time_remaining": round(remaining)
|
||||||
})
|
}
|
||||||
|
json_data = json.dumps(output)
|
||||||
|
|
||||||
with os.fdopen(tmp_fd, 'w') as f:
|
# Generate an ETag (hash of the response body)
|
||||||
f.write(output)
|
etag_hash = hashlib.md5(json_data.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
response = make_response(send_file(path_or_file=tmp_file,
|
# Create the response with ETag
|
||||||
mimetype='application/json; charset=UTF-8',
|
response = Response(json_data, mimetype="application/json; charset=UTF-8")
|
||||||
etag=True))
|
response.set_etag(etag_hash)
|
||||||
# No longer needed
|
|
||||||
os.unlink(tmp_file)
|
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from random import randint
|
from random import randint
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
|
||||||
from changedetectionio.content_fetchers.base import manage_user_agent
|
from changedetectionio.content_fetchers.base import manage_user_agent
|
||||||
from changedetectionio.safe_jinja import render as jinja_render
|
from changedetectionio.safe_jinja import render as jinja_render
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
|
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
|
||||||
# 0- off, 1- on
|
# 0- off, 1- on
|
||||||
browser_step_ui_config = {'Choose one': '0 0',
|
browser_step_ui_config = {'Choose one': '0 0',
|
||||||
|
@ -279,6 +280,7 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||||
logger.debug(f"Time to browser setup {time.time()-now:.2f}s")
|
logger.debug(f"Time to browser setup {time.time()-now:.2f}s")
|
||||||
self.page.wait_for_timeout(1 * 1000)
|
self.page.wait_for_timeout(1 * 1000)
|
||||||
|
|
||||||
|
|
||||||
def mark_as_closed(self):
|
def mark_as_closed(self):
|
||||||
logger.debug("Page closed, cleaning up..")
|
logger.debug("Page closed, cleaning up..")
|
||||||
|
|
||||||
|
@ -296,39 +298,30 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||||
now = time.time()
|
now = time.time()
|
||||||
self.page.wait_for_timeout(1 * 1000)
|
self.page.wait_for_timeout(1 * 1000)
|
||||||
|
|
||||||
# The actual screenshot
|
|
||||||
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
|
|
||||||
|
|
||||||
|
full_height = self.page.evaluate("document.documentElement.scrollHeight")
|
||||||
|
|
||||||
|
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
|
||||||
|
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
|
||||||
|
screenshot = capture_stitched_together_full_page(self.page)
|
||||||
|
else:
|
||||||
|
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
|
||||||
|
|
||||||
|
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
self.page.evaluate("var include_filters=''")
|
self.page.evaluate("var include_filters=''")
|
||||||
# Go find the interactive elements
|
# Go find the interactive elements
|
||||||
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
|
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
|
||||||
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
||||||
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
|
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
|
||||||
|
|
||||||
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
|
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
|
||||||
# So the JS will find the smallest one first
|
# So the JS will find the smallest one first
|
||||||
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
|
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
|
||||||
logger.debug(f"Time to complete get_current_state of browser {time.time()-now:.2f}s")
|
logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s")
|
||||||
# except
|
|
||||||
# playwright._impl._api_types.Error: Browser closed.
|
# playwright._impl._api_types.Error: Browser closed.
|
||||||
# @todo show some countdown timer?
|
# @todo show some countdown timer?
|
||||||
return (screenshot, xpath_data)
|
return (screenshot, xpath_data)
|
||||||
|
|
||||||
def request_visualselector_data(self):
|
|
||||||
"""
|
|
||||||
Does the same that the playwright operation in content_fetcher does
|
|
||||||
This is used to just bump the VisualSelector data so it' ready to go if they click on the tab
|
|
||||||
@todo refactor and remove duplicate code, add include_filters
|
|
||||||
:param xpath_data:
|
|
||||||
:param screenshot:
|
|
||||||
:param current_include_filters:
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
import importlib.resources
|
|
||||||
self.page.evaluate("var include_filters=''")
|
|
||||||
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
|
|
||||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
|
||||||
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
|
|
||||||
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
|
|
||||||
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
|
||||||
|
|
||||||
return (screenshot, xpath_data)
|
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
|
||||||
|
# Pages with a vertical height longer than this will use the 'stitch together' method.
|
||||||
|
|
||||||
|
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
|
||||||
|
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
|
||||||
|
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
|
||||||
|
|
||||||
|
|
||||||
|
# The size at which we will switch to stitching method
|
||||||
|
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
def capture_stitched_together_full_page(page):
    """Capture a very tall page as one JPEG by stitching viewport-sized chunks.

    The viewport is resized to each chunk's height, the page is scrolled to
    the chunk's offset, and a (non full-page) screenshot is taken.  The chunks
    are pasted top-to-bottom into one image.  At most
    ``SCREENSHOT_SIZE_STITCH_THRESHOLD * 4`` pixels of page height are
    captured; when the page is taller than that, a warning banner is drawn
    across the top of the result.

    :param page: Page object exposing ``viewport_size``, ``set_viewport_size()``,
                 ``evaluate()`` and ``screenshot()`` (the callers pass a
                 Playwright page).
    :return: JPEG-encoded bytes of the stitched screenshot.

    JPEG quality is read from the ``SCREENSHOT_QUALITY`` environment variable
    (default 30).  The original viewport size is restored before returning,
    also when an error is raised.
    """
    import io
    import os
    import time
    from PIL import Image, ImageDraw, ImageFont

    MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4  # Maximum total height for the final image (When in stitch mode)
    MAX_CHUNK_HEIGHT = 4000  # Height per screenshot chunk
    WARNING_TEXT_HEIGHT = 20  # Height of the warning text overlay

    # Save the original viewport size so it can be restored afterwards
    original_viewport = page.viewport_size
    # Parse the quality setting once; it is used for every chunk and the final image
    jpeg_quality = int(os.getenv("SCREENSHOT_QUALITY", 30))
    now = time.time()

    try:
        page_width = original_viewport["width"]
        page_height = page.evaluate("document.documentElement.scrollHeight")

        # Limit the total capture height
        capture_height = min(page_height, MAX_TOTAL_HEIGHT)

        # The final size is known up front, so every chunk can be pasted as soon
        # as it is captured instead of keeping all chunks in memory.
        stitched_image = Image.new('RGB', (page_width, capture_height))
        y_offset = 0

        for offset in range(0, capture_height, MAX_CHUNK_HEIGHT):
            # Never capture past the end of the page: a viewport taller than the
            # remaining height makes the browser clamp the scroll position, so
            # the last chunk would repeat content that was already captured and
            # the stitched image would end up taller than the page itself.
            chunk_height = min(MAX_CHUNK_HEIGHT, capture_height - offset)

            # Adjust viewport size for this chunk
            page.set_viewport_size({"width": page_width, "height": chunk_height})

            # Scroll to the correct position
            page.evaluate(f"window.scrollTo(0, {offset})")

            # Capture screenshot chunk and stitch it in right away
            screenshot_bytes = page.screenshot(type='jpeg', quality=jpeg_quality)
            with Image.open(io.BytesIO(screenshot_bytes)) as chunk_image:
                stitched_image.paste(chunk_image, (0, y_offset))
                y_offset += chunk_image.height

        logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s")

        # Overlay warning text if the screenshot was trimmed
        if page_height > MAX_TOTAL_HEIGHT:
            draw = ImageDraw.Draw(stitched_image)
            warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"

            # Prefer Arial, fall back to Pillow's built-in font when it is not installed
            try:
                font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT)
            except OSError:
                font = ImageFont.load_default()

            # Measure the rendered text so it can be centred
            text_bbox = draw.textbbox((0, 0), warning_text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            # White background strip across the top of the image
            draw.rectangle([(0, 0), (page_width, WARNING_TEXT_HEIGHT)], fill="white")

            # Centre the text inside the strip
            text_x = (page_width - text_width) // 2
            text_y = (WARNING_TEXT_HEIGHT - text_height) // 2

            # Draw the warning text in red
            draw.text((text_x, text_y), warning_text, fill="red", font=font)

        # Encode the final image
        output = io.BytesIO()
        stitched_image.save(output, format="JPEG", quality=jpeg_quality)
        screenshot = output.getvalue()

    finally:
        # Restore the original viewport size (nothing to restore when the page
        # was created without a fixed viewport)
        if original_viewport:
            page.set_viewport_size(original_viewport)

    return screenshot
|
|
@ -4,6 +4,7 @@ from urllib.parse import urlparse
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
|
||||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
||||||
|
|
||||||
|
@ -89,6 +90,7 @@ class fetcher(Fetcher):
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
import playwright._impl._errors
|
import playwright._impl._errors
|
||||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||||
|
import time
|
||||||
self.delete_browser_steps_screenshots()
|
self.delete_browser_steps_screenshots()
|
||||||
response = None
|
response = None
|
||||||
|
|
||||||
|
@ -179,6 +181,7 @@ class fetcher(Fetcher):
|
||||||
|
|
||||||
self.page.wait_for_timeout(extra_wait * 1000)
|
self.page.wait_for_timeout(extra_wait * 1000)
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
|
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
|
||||||
if current_include_filters is not None:
|
if current_include_filters is not None:
|
||||||
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
|
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
|
||||||
|
@ -190,6 +193,8 @@ class fetcher(Fetcher):
|
||||||
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
||||||
|
|
||||||
self.content = self.page.content()
|
self.content = self.page.content()
|
||||||
|
logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s")
|
||||||
|
|
||||||
# Bug 3 in Playwright screenshot handling
|
# Bug 3 in Playwright screenshot handling
|
||||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||||
# JPEG is better here because the screenshots can be very very large
|
# JPEG is better here because the screenshots can be very very large
|
||||||
|
@ -199,10 +204,15 @@ class fetcher(Fetcher):
|
||||||
# acceptable screenshot quality here
|
# acceptable screenshot quality here
|
||||||
try:
|
try:
|
||||||
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
|
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
|
||||||
self.screenshot = self.page.screenshot(type='jpeg',
|
full_height = self.page.evaluate("document.documentElement.scrollHeight")
|
||||||
full_page=True,
|
|
||||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)),
|
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
|
||||||
)
|
logger.warning(
|
||||||
|
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
|
||||||
|
self.screenshot = capture_stitched_together_full_page(self.page)
|
||||||
|
else:
|
||||||
|
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# It's likely the screenshot was too long/big and something crashed
|
# It's likely the screenshot was too long/big and something crashed
|
||||||
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
|
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
|
||||||
|
|
|
@ -41,7 +41,7 @@ const findUpTag = (el) => {
|
||||||
|
|
||||||
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
||||||
if (el.name !== undefined && el.name.length) {
|
if (el.name !== undefined && el.name.length) {
|
||||||
var proposed = el.tagName + "[name=" + el.name + "]";
|
var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
|
||||||
var proposed_element = window.document.querySelectorAll(proposed);
|
var proposed_element = window.document.querySelectorAll(proposed);
|
||||||
if (proposed_element.length) {
|
if (proposed_element.length) {
|
||||||
if (proposed_element.length === 1) {
|
if (proposed_element.length === 1) {
|
||||||
|
@ -102,13 +102,15 @@ function collectVisibleElements(parent, visibleElements) {
|
||||||
const children = parent.children;
|
const children = parent.children;
|
||||||
for (let i = 0; i < children.length; i++) {
|
for (let i = 0; i < children.length; i++) {
|
||||||
const child = children[i];
|
const child = children[i];
|
||||||
|
const computedStyle = window.getComputedStyle(child);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
child.nodeType === Node.ELEMENT_NODE &&
|
child.nodeType === Node.ELEMENT_NODE &&
|
||||||
window.getComputedStyle(child).display !== 'none' &&
|
computedStyle.display !== 'none' &&
|
||||||
window.getComputedStyle(child).visibility !== 'hidden' &&
|
computedStyle.visibility !== 'hidden' &&
|
||||||
child.offsetWidth >= 0 &&
|
child.offsetWidth >= 0 &&
|
||||||
child.offsetHeight >= 0 &&
|
child.offsetHeight >= 0 &&
|
||||||
window.getComputedStyle(child).contentVisibility !== 'hidden'
|
computedStyle.contentVisibility !== 'hidden'
|
||||||
) {
|
) {
|
||||||
// If the child is an element and is visible, recursively collect visible elements
|
// If the child is an element and is visible, recursively collect visible elements
|
||||||
collectVisibleElements(child, visibleElements);
|
collectVisibleElements(child, visibleElements);
|
||||||
|
@ -173,6 +175,7 @@ visibleElementsArray.forEach(function (element) {
|
||||||
|
|
||||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
||||||
|
const computedStyle = window.getComputedStyle(element);
|
||||||
|
|
||||||
size_pos.push({
|
size_pos.push({
|
||||||
xpath: xpath_result,
|
xpath: xpath_result,
|
||||||
|
@ -184,10 +187,10 @@ visibleElementsArray.forEach(function (element) {
|
||||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||||
// tagtype used by Browser Steps
|
// tagtype used by Browser Steps
|
||||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||||
isClickable: window.getComputedStyle(element).cursor === "pointer",
|
isClickable: computedStyle.cursor === "pointer",
|
||||||
// Used by the keras trainer
|
// Used by the keras trainer
|
||||||
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
|
fontSize: computedStyle.getPropertyValue('font-size'),
|
||||||
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
|
fontWeight: computedStyle.getPropertyValue('font-weight'),
|
||||||
hasDigitCurrency: hasDigitCurrency,
|
hasDigitCurrency: hasDigitCurrency,
|
||||||
label: label,
|
label: label,
|
||||||
});
|
});
|
||||||
|
|
Ładowanie…
Reference in New Issue