kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Fetching - Always record `server` software reply headers (will be used in the future) (#2143)
rodzic
fca40e4d5b
commit
7d96b4ba83
|
@ -51,6 +51,7 @@ class BrowserStepsStepException(Exception):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
# @todo - make base Exception class that announces via logger()
|
||||||
class PageUnloadable(Exception):
|
class PageUnloadable(Exception):
|
||||||
def __init__(self, status_code, url, message, screenshot=False):
|
def __init__(self, status_code, url, message, screenshot=False):
|
||||||
# Set this so we can use it in other parts of the app
|
# Set this so we can use it in other parts of the app
|
||||||
|
@ -389,10 +390,24 @@ class base_html_playwright(Fetcher):
|
||||||
raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
|
raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
|
||||||
else:
|
else:
|
||||||
# 200 Here means that the communication to browserless worked only, not the page state
|
# 200 Here means that the communication to browserless worked only, not the page state
|
||||||
if response.status_code == 200:
|
try:
|
||||||
|
x = response.json()
|
||||||
|
except Exception as e:
|
||||||
|
raise PageUnloadable(url=url, message="Error reading JSON response from browserless")
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.status_code = response.status_code
|
||||||
|
except Exception as e:
|
||||||
|
raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")
|
||||||
|
|
||||||
|
self.headers = x.get('headers')
|
||||||
|
|
||||||
|
if self.status_code != 200 and not ignore_status_codes:
|
||||||
|
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
|
||||||
|
|
||||||
|
if self.status_code == 200:
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
x = response.json()
|
|
||||||
if not x.get('screenshot'):
|
if not x.get('screenshot'):
|
||||||
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
|
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
|
||||||
# https://github.com/puppeteer/puppeteer/issues/1834
|
# https://github.com/puppeteer/puppeteer/issues/1834
|
||||||
|
@ -403,16 +418,10 @@ class base_html_playwright(Fetcher):
|
||||||
if not x.get('content', '').strip():
|
if not x.get('content', '').strip():
|
||||||
raise EmptyReply(url=url, status_code=None)
|
raise EmptyReply(url=url, status_code=None)
|
||||||
|
|
||||||
if x.get('status_code', 200) != 200 and not ignore_status_codes:
|
|
||||||
raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
|
|
||||||
|
|
||||||
self.content = x.get('content')
|
self.content = x.get('content')
|
||||||
self.headers = x.get('headers')
|
|
||||||
self.instock_data = x.get('instock_data')
|
self.instock_data = x.get('instock_data')
|
||||||
self.screenshot = base64.b64decode(x.get('screenshot'))
|
self.screenshot = base64.b64decode(x.get('screenshot'))
|
||||||
self.status_code = x.get('status_code')
|
|
||||||
self.xpath_data = x.get('xpath_data')
|
self.xpath_data = x.get('xpath_data')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Some other error from browserless
|
# Some other error from browserless
|
||||||
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
|
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
|
||||||
|
@ -742,6 +751,8 @@ class html_requests(Fetcher):
|
||||||
if encoding:
|
if encoding:
|
||||||
r.encoding = encoding
|
r.encoding = encoding
|
||||||
|
|
||||||
|
self.headers = r.headers
|
||||||
|
|
||||||
if not r.content or not len(r.content):
|
if not r.content or not len(r.content):
|
||||||
raise EmptyReply(url=url, status_code=r.status_code)
|
raise EmptyReply(url=url, status_code=r.status_code)
|
||||||
|
|
||||||
|
@ -758,7 +769,7 @@ class html_requests(Fetcher):
|
||||||
else:
|
else:
|
||||||
self.content = r.text
|
self.content = r.text
|
||||||
|
|
||||||
self.headers = r.headers
|
|
||||||
self.raw_content = r.content
|
self.raw_content = r.content
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,7 @@ base_config = {
|
||||||
'previous_md5': False,
|
'previous_md5': False,
|
||||||
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
|
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
|
||||||
'proxy': None, # Preferred proxy connection
|
'proxy': None, # Preferred proxy connection
|
||||||
|
'remote_server_reply': None, # From 'server' reply header
|
||||||
'subtractive_selectors': [],
|
'subtractive_selectors': [],
|
||||||
'tag': '', # Old system of text name for a tag, to be removed
|
'tag': '', # Old system of text name for a tag, to be removed
|
||||||
'tags': [], # list of UUIDs to App.Tags
|
'tags': [], # list of UUIDs to App.Tags
|
||||||
|
|
|
@ -255,6 +255,7 @@ class ChangeDetectionStore:
|
||||||
'last_viewed': 0,
|
'last_viewed': 0,
|
||||||
'previous_md5': False,
|
'previous_md5': False,
|
||||||
'previous_md5_before_filters': False,
|
'previous_md5_before_filters': False,
|
||||||
|
'remote_server_reply': None,
|
||||||
'track_ldjson_price_data': None,
|
'track_ldjson_price_data': None,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ def test_setup(live_server):
|
||||||
# Hard to just add more live server URLs when one test is already running (I think)
|
# Hard to just add more live server URLs when one test is already running (I think)
|
||||||
# So we add our test here (was in a different file)
|
# So we add our test here (was in a different file)
|
||||||
def test_headers_in_request(client, live_server):
|
def test_headers_in_request(client, live_server):
|
||||||
#live_server_setup(live_server)
|
#live_server_setup(live_server)
|
||||||
# Add our URL to the import page
|
# Add our URL to the import page
|
||||||
test_url = url_for('test_headers', _external=True)
|
test_url = url_for('test_headers', _external=True)
|
||||||
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||||
|
@ -70,16 +70,17 @@ def test_headers_in_request(client, live_server):
|
||||||
|
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
# Re #137 - Examine the JSON index file, it should have only one set of headers entered
|
# Re #137 - It should have only one set of headers entered
|
||||||
watches_with_headers = 0
|
watches_with_headers = 0
|
||||||
with open('test-datastore/url-watches.json') as f:
|
for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
|
||||||
app_struct = json.load(f)
|
if (len(watch['headers'])):
|
||||||
for uuid in app_struct['watching']:
|
|
||||||
if (len(app_struct['watching'][uuid]['headers'])):
|
|
||||||
watches_with_headers += 1
|
watches_with_headers += 1
|
||||||
|
assert watches_with_headers == 1
|
||||||
|
|
||||||
|
# 'server' http header was automatically recorded
|
||||||
|
for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
|
||||||
|
assert 'custom' in watch.get('remote_server_reply') # added in util.py
|
||||||
|
|
||||||
# Should be only one with headers set
|
|
||||||
assert watches_with_headers==1
|
|
||||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
assert b'Deleted' in res.data
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
|
|
|
@ -175,12 +175,16 @@ def live_server_setup(live_server):
|
||||||
@live_server.app.route('/test-headers')
|
@live_server.app.route('/test-headers')
|
||||||
def test_headers():
|
def test_headers():
|
||||||
|
|
||||||
output= []
|
output = []
|
||||||
|
|
||||||
for header in request.headers:
|
for header in request.headers:
|
||||||
output.append("{}:{}".format(str(header[0]),str(header[1]) ))
|
output.append("{}:{}".format(str(header[0]), str(header[1])))
|
||||||
|
|
||||||
return "\n".join(output)
|
content = "\n".join(output)
|
||||||
|
|
||||||
|
resp = make_response(content, 200)
|
||||||
|
resp.headers['server'] = 'custom'
|
||||||
|
return resp
|
||||||
|
|
||||||
# Just return the body in the request
|
# Just return the body in the request
|
||||||
@live_server.app.route('/test-body', methods=['POST', 'GET'])
|
@live_server.app.route('/test-body', methods=['POST', 'GET'])
|
||||||
|
|
|
@ -491,6 +491,16 @@ class update_worker(threading.Thread):
|
||||||
if self.datastore.data['watching'].get(uuid):
|
if self.datastore.data['watching'].get(uuid):
|
||||||
# Always record that we at least tried
|
# Always record that we at least tried
|
||||||
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
|
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
|
||||||
|
|
||||||
|
# Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
|
||||||
|
try:
|
||||||
|
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
|
||||||
|
self.datastore.update_watch(uuid=uuid,
|
||||||
|
update_obj={'remote_server_reply': server_header}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
|
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
|
||||||
'last_checked': round(time.time()),
|
'last_checked': round(time.time()),
|
||||||
'check_count': count
|
'check_count': count
|
||||||
|
|
|
@ -94,7 +94,8 @@ services:
|
||||||
#
|
#
|
||||||
|
|
||||||
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||||
# Note: works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector) and other issues
|
# Note: Works well but is deprecated, does not fetch full page screenshots (doesn't work with Visual Selector)
|
||||||
|
# Does not report status codes (200, 404, 403) and other issues
|
||||||
# More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
|
# More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
|
||||||
# browser-chrome:
|
# browser-chrome:
|
||||||
# hostname: browser-chrome
|
# hostname: browser-chrome
|
||||||
|
|
Ładowanie…
Reference in New Issue