diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 4568831c..ce051920 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -51,6 +51,7 @@ class BrowserStepsStepException(Exception): return +# @todo - make base Exception class that announces via logger() class PageUnloadable(Exception): def __init__(self, status_code, url, message, screenshot=False): # Set this so we can use it in other parts of the app @@ -389,10 +390,24 @@ class base_html_playwright(Fetcher): raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..") else: # 200 Here means that the communication to browserless worked only, not the page state - if response.status_code == 200: + try: + x = response.json() + except Exception as e: + raise PageUnloadable(url=url, message="Error reading JSON response from browserless") + + try: + self.status_code = response.status_code + except Exception as e: + raise PageUnloadable(url=url, message="Error reading status_code code response from browserless") + + self.headers = x.get('headers') + + if self.status_code != 200 and not ignore_status_codes: + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content','')) + + if self.status_code == 200: import base64 - x = response.json() if not x.get('screenshot'): # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips # https://github.com/puppeteer/puppeteer/issues/1834 @@ -403,16 +418,10 @@ class base_html_playwright(Fetcher): if not x.get('content', '').strip(): raise EmptyReply(url=url, status_code=None) - if x.get('status_code', 200) != 200 and not ignore_status_codes: - raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content']) - self.content = x.get('content') - self.headers = x.get('headers') self.instock_data = x.get('instock_data') self.screenshot = 
base64.b64decode(x.get('screenshot')) - self.status_code = x.get('status_code') self.xpath_data = x.get('xpath_data') - else: # Some other error from browserless raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8')) @@ -742,6 +751,8 @@ class html_requests(Fetcher): if encoding: r.encoding = encoding + self.headers = r.headers + if not r.content or not len(r.content): raise EmptyReply(url=url, status_code=r.status_code) @@ -758,7 +769,7 @@ class html_requests(Fetcher): else: self.content = r.text - self.headers = r.headers + self.raw_content = r.content diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index ec93e24f..d4cf2b39 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -56,6 +56,7 @@ base_config = { 'previous_md5': False, 'previous_md5_before_filters': False, # Used for skipping changedetection entirely 'proxy': None, # Preferred proxy connection + 'remote_server_reply': None, # From 'server' reply header 'subtractive_selectors': [], 'tag': '', # Old system of text name for a tag, to be removed 'tags': [], # list of UUIDs to App.Tags diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 4296515b..768b8af2 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -255,6 +255,7 @@ class ChangeDetectionStore: 'last_viewed': 0, 'previous_md5': False, 'previous_md5_before_filters': False, + 'remote_server_reply': None, 'track_ldjson_price_data': None, }) diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 9586363f..5deaafa8 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -10,7 +10,7 @@ def test_setup(live_server): # Hard to just add more live server URLs when one test is already running (I think) # So we add our test here (was in a different file) def test_headers_in_request(client, live_server): - 
#live_server_setup(live_server) + #live_server_setup(live_server) # Add our URL to the import page test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): @@ -70,16 +70,17 @@ def test_headers_in_request(client, live_server): wait_for_all_checks(client) - # Re #137 - Examine the JSON index file, it should have only one set of headers entered + # Re #137 - It should have only one set of headers entered watches_with_headers = 0 - with open('test-datastore/url-watches.json') as f: - app_struct = json.load(f) - for uuid in app_struct['watching']: - if (len(app_struct['watching'][uuid]['headers'])): + for k, watch in client.application.config.get('DATASTORE').data.get('watching').items(): + if (len(watch['headers'])): watches_with_headers += 1 + assert watches_with_headers == 1 + + # 'server' http header was automatically recorded + for k, watch in client.application.config.get('DATASTORE').data.get('watching').items(): + assert 'custom' in watch.get('remote_server_reply') # added in util.py - # Should be only one with headers set - assert watches_with_headers==1 res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index ed1e424e..5974e47a 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -175,12 +175,16 @@ def live_server_setup(live_server): @live_server.app.route('/test-headers') def test_headers(): - output= [] + output = [] for header in request.headers: - output.append("{}:{}".format(str(header[0]),str(header[1]) )) + output.append("{}:{}".format(str(header[0]), str(header[1]))) - return "\n".join(output) + content = "\n".join(output) + + resp = make_response(content, 200) + resp.headers['server'] = 'custom' + return resp # Just return the body in the request @live_server.app.route('/test-body', methods=['POST', 'GET']) diff --git a/changedetectionio/update_worker.py 
b/changedetectionio/update_worker.py index 469ee6ca..46984089 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -491,6 +491,16 @@ class update_worker(threading.Thread): if self.datastore.data['watching'].get(uuid): # Always record that we atleast tried count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1 + + # Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds + try: + server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255] + self.datastore.update_watch(uuid=uuid, + update_obj={'remote_server_reply': server_header} + ) + except Exception as e: + pass + self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3), 'last_checked': round(time.time()), 'check_count': count diff --git a/docker-compose.yml b/docker-compose.yml index a7e7e166..e9bdf85e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -94,7 +94,8 @@ services: # # Used for fetching pages via Playwright+Chrome where you need Javascript support. - # Note: works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector) and other issues + # Note: Works well but is deprecated, does not fetch full page screenshots (doesn't work with Visual Selector) + # Does not report status codes (200, 404, 403) and other issues # More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/ # browser-chrome: # hostname: browser-chrome