diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index cad1b6b8..a497cb16 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -9,7 +9,6 @@ from loguru import logger from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError - class fetcher(Fetcher): fetcher_description = "Puppeteer/direct {}/Javascript".format( os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index b743dbce..2c28cda7 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -30,11 +30,6 @@ class fetcher(Fetcher): if self.browser_steps_get_valid_steps(): raise BrowserStepsInUnsupportedFetcher(url=url) - # Make requests use a more modern looking user-agent - if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None): - request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') - proxies = {} # Allows override the proxy on a per-request basis diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 2d64a227..673be9ca 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -526,6 +526,10 @@ class SingleExtraBrowser(Form): browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... 
wss://oxylabs etc", "size":50}) # @todo do the validation here instead +class DefaultUAInputForm(Form): + html_requests = StringField('Plaintext requests', validators=[validators.Optional()], render_kw={"placeholder": ""}) + if os.getenv("PLAYWRIGHT_DRIVER_URL") or os.getenv("WEBDRIVER_URL"): + html_webdriver = StringField('Chrome requests', validators=[validators.Optional()], render_kw={"placeholder": ""}) # datastore.data['settings']['requests'].. class globalSettingsRequestForm(Form): @@ -537,6 +541,8 @@ class globalSettingsRequestForm(Form): extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5) extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5) + default_ua = FormField(DefaultUAInputForm, label="Default User-Agent overrides") + def validate_extra_proxies(self, extra_validators=None): for e in self.data['extra_proxies']: if e.get('proxy_name') or e.get('proxy_url'): diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 1202d5db..75384f17 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -6,6 +6,7 @@ from changedetectionio.notification import ( ) _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6 +DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' class model(dict): base_config = { @@ -22,6 +23,10 @@ class model(dict): 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, 'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds 'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections + 'default_ua': { + 'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT), + 'html_webdriver': None, + } }, 'application': { # Custom notification content diff --git a/changedetectionio/processors/__init__.py 
b/changedetectionio/processors/__init__.py index e2b54481..8702ee5d 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -97,6 +97,10 @@ class difference_detection_processor(): request_headers.update(self.datastore.get_all_base_headers()) request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid'))) + ua = self.datastore.data['settings']['requests'].get('default_ua') + if ua and ua.get(prefer_fetch_backend): + request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)}) + # https://github.com/psf/requests/issues/4525 # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot # do this by accident. diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 884c617a..afa6b2ae 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -554,7 +554,6 @@ class ChangeDetectionStore: return os.path.isfile(filepath) def get_all_base_headers(self): - from .model.App import parse_headers_from_text_file headers = {} # Global app settings headers.update(self.data['settings'].get('headers', {})) diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index e72c7818..0e3cea34 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -108,8 +108,6 @@

Use the Basic method (default) where your watched sites don't need Javascript to render.

The Chrome/Javascript method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'.

-
- Tip: Connect using Bright Data and Oxylabs Proxies, find out more here.
@@ -121,6 +119,18 @@ {{ render_field(form.application.form.webdriver_delay) }}
+
+ {{ render_field(form.requests.form.default_ua) }} + + Applied to all requests.

+ Note: Simply changing the User-Agent often does not defeat anti-robot technologies; it's important to consider all of the ways that the browser is detected. +
+
+
@@ -190,7 +200,7 @@ nav - + Chrome Chrome Webstore

diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 869ea349..cfbc7825 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -256,12 +256,40 @@ def test_method_in_request(client, live_server): def test_headers_textfile_in_request(client, live_server): #live_server_setup(live_server) # Add our URL to the import page + + webdriver_ua = "Hello fancy webdriver UA 1.0" + requests_ua = "Hello basic requests UA 1.1" + test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') - print ("TEST URL IS ",test_url) + form_data = { + "application-fetch_backend": "html_requests", + "application-minutes_between_check": 180, + "requests-default_ua-html_requests": requests_ua + } + + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + form_data["requests-default_ua-html_webdriver"] = webdriver_ua + + res = client.post( + url_for("settings_page"), + data=form_data, + follow_redirects=True + ) + assert b'Settings updated' in res.data + + res = client.get(url_for("settings_page")) + + # Only when some kind of real browser is setup + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + assert b'requests-default_ua-html_webdriver' in res.data + + # Field should always be there + assert b"requests-default_ua-html_requests" in res.data + # Add the test URL twice, we will check res = client.post( url_for("import_page"), @@ -272,15 +300,14 @@ def test_headers_textfile_in_request(client, live_server): wait_for_all_checks(client) - # Add some headers to a request res = client.post( url_for("edit_page", uuid="first"), data={ - "url": test_url, - "tags": "testtag", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', - "headers": "xxx:ooo\ncool:yeah\r\n"}, + "url": test_url, + "tags": "testtag", + 
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "headers": "xxx:ooo\ncool:yeah\r\n"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -292,7 +319,7 @@ def test_headers_textfile_in_request(client, live_server): with open('test-datastore/headers.txt', 'w') as f: f.write("global-header: nice\r\nnext-global-header: nice") - with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f: + with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f: f.write("watch-header: nice") client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -306,7 +333,7 @@ def test_headers_textfile_in_request(client, live_server): # Not needed anymore os.unlink('test-datastore/headers.txt') os.unlink('test-datastore/headers-testtag.txt') - os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt') + os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt') # The service should echo back the request verb res = client.get( url_for("preview_page", uuid="first"), @@ -319,7 +346,12 @@ def test_headers_textfile_in_request(client, live_server): assert b"Watch-Header:nice" in res.data assert b"Tag-Header:test" in res.data + # Check the custom UA from system settings page made it through + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data + else: + assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data - #unlink headers.txt on start/stop + # unlink headers.txt on start/stop res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) - assert b'Deleted' in res.data \ No newline at end of file + assert b'Deleted' in res.data