From c6ee6687b5ee643a8c05c838f4556e8f11e72e5d Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 13 Jun 2024 10:50:46 +0200 Subject: [PATCH] Fetching/Requests - Fixing user agent header overrides per-watch of global settings (#2409) --- changedetectionio/content_fetchers/base.py | 2 +- .../content_fetchers/puppeteer.py | 11 ++-- changedetectionio/processors/__init__.py | 18 +++--- changedetectionio/tests/test_request.py | 58 ++++++++++++++++++- 4 files changed, 73 insertions(+), 16 deletions(-) diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index f817341d..1ca6876e 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -28,7 +28,7 @@ def manage_user_agent(headers, current_ua=''): :return: """ # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default - ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None) + ua_in_custom_headers = headers.get('User-Agent') if ua_in_custom_headers: return ua_in_custom_headers diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index 725be3b3..923a2be1 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -115,12 +115,11 @@ class fetcher(Fetcher): # This user agent is similar to what was used when tweaking the evasions in inject_evasions_into_page(..) 
user_agent = None - if request_headers: - user_agent = next((value for key, value in request_headers.items() if key.lower().strip() == 'user-agent'), None) - if user_agent: - await self.page.setUserAgent(user_agent) - # Remove it so it's not sent again with headers after - [request_headers.pop(key) for key in list(request_headers) if key.lower().strip() == 'user-agent'.lower().strip()] + if request_headers and request_headers.get('User-Agent'): + # Request_headers should now be CaseInsensitiveDict + # Remove it so it's not sent again with headers after + user_agent = request_headers.pop('User-Agent').strip() + await self.page.setUserAgent(user_agent) if not user_agent: # Attempt to strip 'HeadlessChrome' etc diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 8702ee5d..9e4ce6b1 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,10 +1,10 @@ from abc import abstractmethod -import os -import hashlib -import re -from copy import deepcopy from changedetectionio.strtobool import strtobool +from copy import deepcopy from loguru import logger +import hashlib +import os +import re class difference_detection_processor(): @@ -21,7 +21,7 @@ class difference_detection_processor(): self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) def call_browser(self): - + from requests.structures import CaseInsensitiveDict # Protect against file:// access if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): @@ -93,14 +93,16 @@ class difference_detection_processor(): self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid')) # Tweak the base config with the per-watch ones - request_headers = self.watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - 
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid'))) + request_headers = CaseInsensitiveDict() ua = self.datastore.data['settings']['requests'].get('default_ua') if ua and ua.get(prefer_fetch_backend): request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)}) + request_headers.update(self.watch.get('headers', {})) + request_headers.update(self.datastore.get_all_base_headers()) + request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid'))) + # https://github.com/psf/requests/issues/4525 # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot # do this by accident. diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index cfbc7825..1ec501fe 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -253,6 +253,62 @@ def test_method_in_request(client, live_server): res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data +# Re #2408 - user-agent override test, also should handle case-insensitive header deduplication +def test_ua_global_override(client, live_server): + # live_server_setup(live_server) + test_url = url_for('test_headers', _external=True) + + res = client.post( + url_for("settings_page"), + data={ + "application-fetch_backend": "html_requests", + "application-minutes_between_check": 180, + "requests-default_ua-html_requests": "html-requests-user-agent" + }, + follow_redirects=True + ) + assert b'Settings updated' in res.data + + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + wait_for_all_checks(client) + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + assert b"html-requests-user-agent" in res.data + # default user-agent should have 
shown by now + # now add a custom one in the headers + + + # Add some headers to a request + res = client.post( + url_for("edit_page", uuid="first"), + data={ + "url": test_url, + "tags": "testtag", + "fetch_backend": 'html_requests', + # Important - also test case-insensitive + "headers": "User-AGent: agent-from-watch"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + wait_for_all_checks(client) + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + assert b"agent-from-watch" in res.data + assert b"html-requests-user-agent" not in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + def test_headers_textfile_in_request(client, live_server): #live_server_setup(live_server) # Add our URL to the import page @@ -333,7 +389,7 @@ def test_headers_textfile_in_request(client, live_server): # Not needed anymore os.unlink('test-datastore/headers.txt') os.unlink('test-datastore/headers-testtag.txt') - os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt') + # The service should echo back the request verb res = client.get( url_for("preview_page", uuid="first"),