Mirror of https://github.com/dgtlmoon/changedetection.io
Fetcher - Experimental fetcher improvements (Code TidyUp, Improve tests, revert to old playwright when using BrowserSteps for now) (#1564)

Parent: 690cf4acc9
Commit: d939882dde

@@ -58,9 +58,9 @@ jobs:
           # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
           docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
 
-      - name: Test with puppeteer fetcher
+      - name: Test with puppeteer fetcher and disk cache
         run: |
-          docker run --rm -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+          docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
           # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
 
       - name: Test proxy interaction
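
Only the PUPPETEER_DISK_CACHE environment variable is new in this step. As a rough illustration (not part of the commit), a hypothetical Python helper showing how these variables are read on the fetcher side, following the os.getenv() calls in the hunks below:

    # Hypothetical helper, for illustration only - mirrors the os.getenv() usage in this commit.
    import os

    def puppeteer_fetch_settings():
        # Set to any value to route fetches through the browserless /function puppeteer code
        use_puppeteer_fetch = bool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'))
        # Path to the optional disk cache (ending in /), disabled when unset
        disk_cache_dir = os.getenv("PUPPETEER_DISK_CACHE", False)
        return use_puppeteer_fetch, disk_cache_dir

    print(puppeteer_fetch_settings())
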
@@ -287,168 +287,18 @@ class base_html_playwright(Fetcher):
                           current_include_filters=None,
                           is_binary=False):
 
+        from pkg_resources import resource_string
+
         extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
-        xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
 
-        code = f"""module.exports = async ({{ page, context }}) => {{
-
-          var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;
-
-          await page.setBypassCSP(true)
-          await page.setExtraHTTPHeaders(req_headers);
-          await page.setUserAgent(user_agent);
-          // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
-
-          await page.setDefaultNavigationTimeout(0);
-
-          if(proxy_username) {{
-            await page.authenticate({{
-              username: proxy_username,
-              password: proxy_password
-            }});
-          }}
-
-          await page.setViewport({{
-            width: 1024,
-            height: 768,
-            deviceScaleFactor: 1,
-          }});
-
-          // Very primitive disk cache - USE WITH EXTREME CAUTION
-          // Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
-          if ( disk_cache_dir ) {{
-
-            await page.setRequestInterception(true);
-
-            console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
-            const fs = require('fs');
-            const crypto = require('crypto');
-            function file_is_expired(file_path) {{
-              if (!fs.existsSync(dir_path+key)) {{
-                return true;
-              }}
-              var stats = fs.statSync(file_path);
-              const now_date = new Date();
-              const expire_seconds = 300;
-              if ( (now_date/1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {{
-                console.log("CACHE EXPIRED: "+file_path);
-                return true;
-              }}
-              return false;
-
-            }}
-
-            page.on('request', async (request) => {{
-
-              // if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
-              const url = request.url();
-              const key = crypto.createHash('md5').update(url).digest("hex");
-              const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
-
-              // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
-
-              if (fs.existsSync(dir_path+key)) {{
-                file_is_expired(dir_path+key);
-                console.log("Cache exists "+dir_path+key+ " - "+url);
-                const cached_data = fs.readFileSync(dir_path+key);
-                request.respond({{
-                  status: 200,
-                  //contentType: 'text/html', //@todo
-                  body: cached_data
-                }});
-                return;
-              }}
-              request.continue();
-            }});
-
-            page.on('response', async (response) => {{
-              const url = response.url();
-              // @todo - check response size()
-              console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
-
-              if(response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200 ) {{
-                console.log("Skipping- "+url);
-                return;
-              }}
-
-              const key = crypto.createHash('md5').update(url).digest("hex");
-              const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
-              const data = await response.text();
-              if (!fs.existsSync(dir_path)) {{
-                fs.mkdirSync(dir_path, {{ recursive: true }})
-              }}
-
-              var expired = false;
-              if (fs.existsSync(dir_path+key)) {{
-                if (file_is_expired(dir_path+key)) {{
-                  fs.writeFileSync(dir_path+key, data);
-                }}
-              }} else {{
-                fs.writeFileSync(dir_path+key, data);
-              }}
-            }});
-          }}
-
-          const r = await page.goto(url, {{
-            waitUntil: 'load'
-          }});
-
-          await page.waitForTimeout(1000);
-          await page.waitForTimeout(extra_wait_ms);
-
-          if(execute_js) {{
-            await page.evaluate(execute_js);
-            await page.waitForTimeout(200);
-          }}
-
-          var xpath_data;
-          var instock_data;
-          try {{
-            xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
-            instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
-          }} catch (e) {{
-            console.log(e);
-          }}
-
-          // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
-          // Wrap it here (for now)
-
-          var b64s = false;
-          try {{
-            b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
-          }} catch (e) {{
-            console.log(e);
-          }}
-
-          // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
-          if (!b64s) {{
-            // @todo after text extract, we can place some overlay text with red background to say 'croppped'
-            console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
-            try {{
-              b64s = await page.screenshot({{ encoding: "base64", quality: screenshot_quality, type: 'jpeg' }});
-            }} catch (e) {{
-              console.log(e);
-            }}
-          }}
-
-          var html = await page.content();
-          return {{
-            data: {{
-              'content': html,
-              'headers': r.headers(),
-              'instock_data': instock_data,
-              'screenshot': b64s,
-              'status_code': r.status(),
-              'xpath_data': xpath_data
-            }},
-            type: 'application/json',
-          }};
-        }};"""
+        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
+        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
+        # In the future inject this is a proper JS package
+        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
+        code = code.replace('%instock_scrape_code%', self.instock_data_js)
 
         from requests.exceptions import ConnectTimeout, ReadTimeout
-        wait_browserless_seconds = 120
+        wait_browserless_seconds = 240
 
         browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
         from urllib.parse import urlparse
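
The main change above: the JavaScript fetch code is no longer built as an inline f-string but shipped as a package resource with %...% placeholders that are substituted at fetch time. A minimal, self-contained Python sketch of that substitution pattern (the template string here is only a stand-in for res/puppeteer_fetch.js):

    # Stand-in template - the real file is res/puppeteer_fetch.js with the same markers.
    js_template = "module.exports = async ({page, context}) => { %xpath_scrape_code% };"

    # Stand-in scrape snippet - in the fetcher this is self.xpath_element_js / self.instock_data_js.
    xpath_scrape_code = "return document.title;"

    code = js_template.replace('%xpath_scrape_code%', xpath_scrape_code)
    print(code)  # module.exports = async ({page, context}) => { return document.title; };
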
@@ -475,7 +325,9 @@ class base_html_playwright(Fetcher):
                 json={
                     "code": code,
                     "context": {
-                        'disk_cache_dir': False, # or path to disk cache
+                        # Very primitive disk cache - USE WITH EXTREME CAUTION
+                        # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
                         'execute_js': self.webdriver_js_execute_code,
                         'extra_wait_ms': extra_wait_ms,
                         'include_filters': current_include_filters,
@@ -484,14 +336,26 @@ class base_html_playwright(Fetcher):
                         'url': url,
                         'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
                         'proxy_username': self.proxy.get('username','') if self.proxy else False,
-                        'proxy_password': self.proxy.get('password','') if self.proxy else False,
+                        'proxy_password': self.proxy.get('password', '') if self.proxy else False,
+                        'no_cache_list': [
+                            'twitter',
+                            '.pdf'
+                        ],
+                        # Could use https://github.com/easylist/easylist here, or install a plugin
+                        'block_url_list': [
+                            'adnxs.com',
+                            'analytics.twitter.com',
+                            'doubleclick.net',
+                            'google-analytics.com',
+                            'googletagmanager',
+                            'trustpilot.com'
+                        ]
                     }
                 },
                 # @todo /function needs adding ws:// to http:// rebuild this
                 url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
                 timeout=wait_browserless_seconds)
 
-            # 'ziparchive::addglob() will throw an instance of error instead of resulting in a fatal error if glob support is not available.'
         except ReadTimeout:
             raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
         except ConnectTimeout:
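
The new no_cache_list and block_url_list entries are plain lowercase substrings; in the puppeteer code (new file below) a URL is blocked, or skipped for caching, when any entry appears anywhere in the lowercased URL. A small Python illustration of the same check (not part of the commit):

    block_url_list = ['adnxs.com', 'analytics.twitter.com', 'doubleclick.net',
                      'google-analytics.com', 'googletagmanager', 'trustpilot.com']

    def is_blocked(url):
        # Mirrors the JS check: block_url_list.some(s => url.toLowerCase().includes(s))
        return any(s in url.lower() for s in block_url_list)

    print(is_blocked("https://www.Google-Analytics.com/analytics.js"))  # True
    print(is_blocked("https://example.com/page.html"))                  # False
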
@@ -535,17 +399,23 @@ class base_html_playwright(Fetcher):
                           current_include_filters=None,
                           is_binary=False):
 
-        if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
-            # Temporary backup solution until we rewrite the playwright code
-            return self.run_fetch_browserless_puppeteer(
-                url,
-                timeout,
-                request_headers,
-                request_body,
-                request_method,
-                ignore_status_codes,
-                current_include_filters,
-                is_binary)
+        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
+        has_browser_steps = self.browser_steps and list(filter(
+            lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
+            self.browser_steps))
+
+        if not has_browser_steps:
+            if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+                # Temporary backup solution until we rewrite the playwright code
+                return self.run_fetch_browserless_puppeteer(
+                    url,
+                    timeout,
+                    request_headers,
+                    request_body,
+                    request_method,
+                    ignore_status_codes,
+                    current_include_filters,
+                    is_binary)
 
         from playwright.sync_api import sync_playwright
         import playwright._impl._api_types
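
As a quick illustration of the new has_browser_steps check (the sample step data below is hypothetical), only configured operations count; unconfigured rows ('Choose one') and the implicit 'Goto site' step are ignored:

    # Illustration only - mirrors the filter() expression added above.
    def real_steps(browser_steps):
        return browser_steps and list(filter(
            lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
            browser_steps))

    print(bool(real_steps([{'operation': 'Goto site'}, {'operation': 'Choose one'}])))    # False -> puppeteer fetch may still be used
    print(bool(real_steps([{'operation': 'Goto site'}, {'operation': 'Click element'}]))) # True  -> falls back to the old playwright path
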
@@ -0,0 +1,179 @@
+module.exports = async ({page, context}) => {
+
+    var {
+        url,
+        execute_js,
+        user_agent,
+        extra_wait_ms,
+        req_headers,
+        include_filters,
+        xpath_element_js,
+        screenshot_quality,
+        proxy_username,
+        proxy_password,
+        disk_cache_dir,
+        no_cache_list,
+        block_url_list,
+    } = context;
+
+    await page.setBypassCSP(true)
+    await page.setExtraHTTPHeaders(req_headers);
+    await page.setUserAgent(user_agent);
+    // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
+
+    await page.setDefaultNavigationTimeout(0);
+
+    if (proxy_username) {
+        await page.authenticate({
+            username: proxy_username,
+            password: proxy_password
+        });
+    }
+
+    await page.setViewport({
+        width: 1024,
+        height: 768,
+        deviceScaleFactor: 1,
+    });
+
+    await page.setRequestInterception(true);
+    if (disk_cache_dir) {
+        console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
+    }
+    const fs = require('fs');
+    const crypto = require('crypto');
+
+    function file_is_expired(file_path) {
+        if (!fs.existsSync(file_path)) {
+            return true;
+        }
+        var stats = fs.statSync(file_path);
+        const now_date = new Date();
+        const expire_seconds = 300;
+        if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {
+            console.log("CACHE EXPIRED: " + file_path);
+            return true;
+        }
+        return false;
+
+    }
+
+    page.on('request', async (request) => {
+        // General blocking of requests that waste traffic
+        if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort();
+
+        if (disk_cache_dir) {
+            const url = request.url();
+            const key = crypto.createHash('md5').update(url).digest("hex");
+            const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+
+            // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
+
+            if (fs.existsSync(dir_path + key)) {
+                console.log("* CACHE HIT , using - " + dir_path + key + " - " + url);
+                const cached_data = fs.readFileSync(dir_path + key);
+                // @todo headers can come from dir_path+key+".meta" json file
+                request.respond({
+                    status: 200,
+                    //contentType: 'text/html', //@todo
+                    body: cached_data
+                });
+                return;
+            }
+        }
+        request.continue();
+    });
+
+
+    if (disk_cache_dir) {
+        page.on('response', async (response) => {
+            const url = response.url();
+            // Basic filtering for sane responses
+            if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {
+                console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url);
+                return;
+            }
+            if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) {
+                console.log("Skipping (no_cache_list) - " + url);
+                return;
+            }
+            response.buffer().then(buffer => {
+                if (buffer.length > 100) {
+                    console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType());
+
+                    const key = crypto.createHash('md5').update(url).digest("hex");
+                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+
+                    if (!fs.existsSync(dir_path)) {
+                        fs.mkdirSync(dir_path, {recursive: true})
+                    }
+
+                    if (fs.existsSync(dir_path + key)) {
+                        if (file_is_expired(dir_path + key)) {
+                            fs.writeFileSync(dir_path + key, buffer);
+                        }
+                    } else {
+                        fs.writeFileSync(dir_path + key, buffer);
+                    }
+                }
+            });
+        });
+    }
+
+    const r = await page.goto(url, {
+        waitUntil: 'load'
+    });
+
+    await page.waitForTimeout(1000);
+    await page.waitForTimeout(extra_wait_ms);
+
+    if (execute_js) {
+        await page.evaluate(execute_js);
+        await page.waitForTimeout(200);
+    }
+
+    var xpath_data;
+    var instock_data;
+    try {
+        // Not sure the best way here, in the future this should be a new package added to npm then run in browserless
+        // (Once the old playwright is removed)
+        xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
+        instock_data = await page.evaluate(() => {%instock_scrape_code%});
+    } catch (e) {
+        console.log(e);
+    }
+
+    // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
+    // Wrap it here (for now)
+
+    var b64s = false;
+    try {
+        b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
+    } catch (e) {
+        console.log(e);
+    }
+
+    // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
+    if (!b64s) {
+        // @todo after text extract, we can place some overlay text with red background to say 'croppped'
+        console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
+        try {
+            b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
+        } catch (e) {
+            console.log(e);
+        }
+    }
+
+    var html = await page.content();
+    return {
+        data: {
+            'content': html,
+            'headers': r.headers(),
+            'instock_data': instock_data,
+            'screenshot': b64s,
+            'status_code': r.status(),
+            'xpath_data': xpath_data
+        },
+        type: 'application/json',
+    };
+};
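
For reference, a short Python sketch (not part of the commit) of how the disk cache above lays files out: the cache key is the MD5 hex digest of the URL, sharded into three single-character directories under disk_cache_dir, and entries are treated as stale after 300 seconds:

    import hashlib

    disk_cache_dir = "/tmp/data/"              # example value, e.g. from PUPPETEER_DISK_CACHE
    url = "https://example.com/product/123"    # example URL

    key = hashlib.md5(url.encode()).hexdigest()
    dir_path = disk_cache_dir + key[0] + '/' + key[1] + '/' + key[2] + '/'
    print(dir_path + key)                      # /tmp/data/<a>/<b>/<c>/<md5 of url>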