Mirror of https://github.com/dgtlmoon/changedetection.io
Puppeteer fetcher - Issue a Page.sendStop on frame load in case the browser is waiting for other data (#3232)
parent 37019355fd
commit 47eb874f47
@@ -51,7 +51,15 @@ async def capture_full_page(page):
     await page.setViewport({'width': page.viewport['width'], 'height': step_size})
 
     while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
-        await page.evaluate(f"window.scrollTo(0, {y})")
+        # better than scrollTo incase they override it in the page
+        await page.evaluate(
+            """(y) => {
+                document.documentElement.scrollTop = y;
+                document.body.scrollTop = y;
+            }""",
+            y
+        )
+
         screenshot_chunks.append(await page.screenshot(type_='jpeg',
                                                        fullPage=False,
                                                        quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
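The first hunk replaces the window.scrollTo call with a direct assignment to document.documentElement.scrollTop and document.body.scrollTop, passing the offset as an evaluate argument, so a page that overrides window.scrollTo can no longer break chunked screenshotting. A minimal standalone sketch of the difference, assuming the stock pyppeteer package and a throwaway in-memory page (none of this is the commit's code):

# Standalone sketch, not part of the commit: shows why writing scrollTop directly
# is more robust than calling window.scrollTo, which a page can override.
# Assumes the stock pyppeteer package and a throwaway page built with setContent().
import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    # A tall page that clobbers window.scrollTo, as some sites do.
    await page.setContent(
        "<body style='height:5000px'><script>window.scrollTo = () => {};</script></body>")

    # Old approach: relies on window.scrollTo, so here it silently does nothing.
    await page.evaluate("window.scrollTo(0, 1000)")
    print(await page.evaluate("document.documentElement.scrollTop"))  # 0

    # New approach from the hunk above: assign scrollTop directly.
    await page.evaluate(
        """(y) => {
            document.documentElement.scrollTop = y;
            document.body.scrollTop = y;
        }""",
        1000)
    print(await page.evaluate("document.documentElement.scrollTop"))  # 1000

    await browser.close()

asyncio.run(main())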
@@ -149,7 +157,11 @@ class fetcher(Fetcher):
             ):
         import re
         self.delete_browser_steps_screenshots()
-        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
+        n = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
+        extra_wait = min(n, 15)
+
+        logger.debug(f"Extra wait set to {extra_wait}s, requested was {n}s.")
+
 
         from pyppeteer import Pyppeteer
         pyppeteer_instance = Pyppeteer()
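The second hunk caps the total pre-fetch wait at 15 seconds and logs both the clamped and the requested values. A tiny sketch of just that clamping, with a made-up render_extract_delay value and no changedetection.io imports:

# Sketch of the clamping behaviour only; render_extract_delay is a hypothetical value here.
import os

os.environ["WEBDRIVER_DELAY_BEFORE_CONTENT_READY"] = "25"    # operator asks for 25s
render_extract_delay = 5                                     # hypothetical per-watch delay

n = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + render_extract_delay
extra_wait = min(n, 15)
print(f"Extra wait set to {extra_wait}s, requested was {n}s.")
# -> Extra wait set to 15s, requested was 30s.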
@@ -165,7 +177,7 @@ class fetcher(Fetcher):
         except websockets.exceptions.InvalidURI:
             raise BrowserConnectError(msg=f"Error connecting to the browser, check your browser connection address (should be ws:// or wss://")
         except Exception as e:
-            raise BrowserConnectError(msg=f"Error connecting to the browser {str(e)}")
+            raise BrowserConnectError(msg=f"Error connecting to the browser - Exception '{str(e)}'")
 
         # Better is to launch chrome with the URL as arg
         # non-headless - newPage() will launch an extra tab/window, .browser should already contain 1 page/tab
@@ -227,13 +239,35 @@ class fetcher(Fetcher):
         # browsersteps_interface = steppable_browser_interface()
         # browsersteps_interface.page = self.page
 
-        response = await self.page.goto(url, waitUntil="load")
-
-        if response is None:
-            await self.page.close()
-            await browser.close()
-            logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
-            raise EmptyReply(url=url, status_code=None)
+        async def handle_frame_navigation(event):
+            logger.debug(f"Frame navigated: {event}")
+            w = extra_wait - 2 if extra_wait > 4 else 2
+            logger.debug(f"Waiting {w} seconds before calling Page.stopLoading...")
+            await asyncio.sleep(w)
+            logger.debug("Issuing stopLoading command...")
+            await self.page._client.send('Page.stopLoading')
+            logger.debug("stopLoading command sent!")
+
+        self.page._client.on('Page.frameStartedNavigating', lambda event: asyncio.create_task(handle_frame_navigation(event)))
+        self.page._client.on('Page.frameStartedLoading', lambda event: asyncio.create_task(handle_frame_navigation(event)))
+        self.page._client.on('Page.frameStoppedLoading', lambda event: logger.debug(f"Frame stopped loading: {event}"))
+
+        response = None
+        attempt=0
+        while not response:
+            logger.debug(f"Attempting page fetch {url} attempt {attempt}")
+            response = await self.page.goto(url)
+            await asyncio.sleep(1 + extra_wait)
+            if response:
+                break
+            if not response:
+                logger.warning("Page did not fetch! trying again!")
+            if response is None and attempt>=2:
+                await self.page.close()
+                await browser.close()
+                logger.warning(f"Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content) exiting attempt {attempt}")
+                raise EmptyReply(url=url, status_code=None)
+            attempt+=1
 
         self.headers = response.headers
 
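This hunk is the core of the change: the old single goto with waitUntil="load" could hang on pages that never finish loading, so the commit registers CDP frame-load events on the page's devtools session and issues Page.stopLoading after a grace period, then wraps goto in a retry loop. The sketch below shows the same listener-plus-stopLoading pattern in isolation; it assumes the stock pyppeteer launch(), an arbitrary example URL and a fixed 5 second grace period rather than the fetcher class above.

# Standalone sketch of the Page.stopLoading pattern, not the commit's code.
# Assumptions: stock pyppeteer, https://example.com as a stand-in URL, fixed 5s grace period.
import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()

    async def stop_loading_soon(event):
        # Give the main document a few seconds, then tell Chromium to stop waiting
        # for whatever is still in flight (slow images, trackers, endless XHRs...).
        await asyncio.sleep(5)
        await page._client.send('Page.stopLoading')

    # page._client is the raw CDP session; Page.frameStartedLoading fires when the
    # main frame (or an iframe) begins loading.
    page._client.on('Page.frameStartedLoading',
                    lambda event: asyncio.create_task(stop_loading_soon(event)))

    response = await page.goto('https://example.com')
    print(response.status if response else 'no response')
    await browser.close()

asyncio.run(main())

Because goto can still return None when loading is cut short, the committed code retries it in a loop, sleeping 1 + extra_wait between attempts and raising EmptyReply after three failed attempts.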
@@ -276,7 +310,6 @@ class fetcher(Fetcher):
         # if self.browser_steps_get_valid_steps():
         #     self.iterate_browser_steps()
 
-        await asyncio.sleep(1 + extra_wait)
 
         # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
         # Setup the xPath/VisualSelector scraper