kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Ability to set default User-Agent for either fetching types directly in the UI (#2375)
rodzic
a8959be348
commit
f49eb4567f
|
@ -9,7 +9,6 @@ from loguru import logger
|
||||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
||||||
|
|
||||||
|
|
||||||
class fetcher(Fetcher):
|
class fetcher(Fetcher):
|
||||||
fetcher_description = "Puppeteer/direct {}/Javascript".format(
|
fetcher_description = "Puppeteer/direct {}/Javascript".format(
|
||||||
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
|
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
|
||||||
|
|
|
@ -30,11 +30,6 @@ class fetcher(Fetcher):
|
||||||
if self.browser_steps_get_valid_steps():
|
if self.browser_steps_get_valid_steps():
|
||||||
raise BrowserStepsInUnsupportedFetcher(url=url)
|
raise BrowserStepsInUnsupportedFetcher(url=url)
|
||||||
|
|
||||||
# Make requests use a more modern looking user-agent
|
|
||||||
if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
|
|
||||||
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
|
|
||||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
|
|
||||||
|
|
||||||
proxies = {}
|
proxies = {}
|
||||||
|
|
||||||
# Allows override the proxy on a per-request basis
|
# Allows override the proxy on a per-request basis
|
||||||
|
|
|
@ -526,6 +526,10 @@ class SingleExtraBrowser(Form):
|
||||||
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
|
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
|
||||||
# @todo do the validation here instead
|
# @todo do the validation here instead
|
||||||
|
|
||||||
|
class DefaultUAInputForm(Form):
|
||||||
|
html_requests = StringField('Plaintext requests', validators=[validators.Optional()], render_kw={"placeholder": "<default>"})
|
||||||
|
if os.getenv("PLAYWRIGHT_DRIVER_URL") or os.getenv("WEBDRIVER_URL"):
|
||||||
|
html_webdriver = StringField('Chrome requests', validators=[validators.Optional()], render_kw={"placeholder": "<default>"})
|
||||||
|
|
||||||
# datastore.data['settings']['requests']..
|
# datastore.data['settings']['requests']..
|
||||||
class globalSettingsRequestForm(Form):
|
class globalSettingsRequestForm(Form):
|
||||||
|
@ -537,6 +541,8 @@ class globalSettingsRequestForm(Form):
|
||||||
extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
|
extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
|
||||||
extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)
|
extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)
|
||||||
|
|
||||||
|
default_ua = FormField(DefaultUAInputForm, label="Default User-Agent overrides")
|
||||||
|
|
||||||
def validate_extra_proxies(self, extra_validators=None):
|
def validate_extra_proxies(self, extra_validators=None):
|
||||||
for e in self.data['extra_proxies']:
|
for e in self.data['extra_proxies']:
|
||||||
if e.get('proxy_name') or e.get('proxy_url'):
|
if e.get('proxy_name') or e.get('proxy_url'):
|
||||||
|
|
|
@ -6,6 +6,7 @@ from changedetectionio.notification import (
|
||||||
)
|
)
|
||||||
|
|
||||||
_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
|
_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
|
||||||
|
DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
|
||||||
|
|
||||||
class model(dict):
|
class model(dict):
|
||||||
base_config = {
|
base_config = {
|
||||||
|
@ -22,6 +23,10 @@ class model(dict):
|
||||||
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
|
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
|
||||||
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
|
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
|
||||||
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
|
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
|
||||||
|
'default_ua': {
|
||||||
|
'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT),
|
||||||
|
'html_webdriver': None,
|
||||||
|
}
|
||||||
},
|
},
|
||||||
'application': {
|
'application': {
|
||||||
# Custom notification content
|
# Custom notification content
|
||||||
|
|
|
@ -97,6 +97,10 @@ class difference_detection_processor():
|
||||||
request_headers.update(self.datastore.get_all_base_headers())
|
request_headers.update(self.datastore.get_all_base_headers())
|
||||||
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
|
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
|
||||||
|
|
||||||
|
ua = self.datastore.data['settings']['requests'].get('default_ua')
|
||||||
|
if ua and ua.get(prefer_fetch_backend):
|
||||||
|
request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)})
|
||||||
|
|
||||||
# https://github.com/psf/requests/issues/4525
|
# https://github.com/psf/requests/issues/4525
|
||||||
# Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
|
# Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
|
||||||
# do this by accident.
|
# do this by accident.
|
||||||
|
|
|
@ -554,7 +554,6 @@ class ChangeDetectionStore:
|
||||||
return os.path.isfile(filepath)
|
return os.path.isfile(filepath)
|
||||||
|
|
||||||
def get_all_base_headers(self):
|
def get_all_base_headers(self):
|
||||||
from .model.App import parse_headers_from_text_file
|
|
||||||
headers = {}
|
headers = {}
|
||||||
# Global app settings
|
# Global app settings
|
||||||
headers.update(self.data['settings'].get('headers', {}))
|
headers.update(self.data['settings'].get('headers', {}))
|
||||||
|
|
|
@ -108,8 +108,6 @@
|
||||||
<p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
|
<p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
|
||||||
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
|
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
|
||||||
</span>
|
</span>
|
||||||
<br>
|
|
||||||
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>
|
|
||||||
</div>
|
</div>
|
||||||
<fieldset class="pure-group" id="webdriver-override-options" data-visible-for="application-fetch_backend=html_webdriver">
|
<fieldset class="pure-group" id="webdriver-override-options" data-visible-for="application-fetch_backend=html_webdriver">
|
||||||
<div class="pure-form-message-inline">
|
<div class="pure-form-message-inline">
|
||||||
|
@ -121,6 +119,18 @@
|
||||||
{{ render_field(form.application.form.webdriver_delay) }}
|
{{ render_field(form.application.form.webdriver_delay) }}
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
<div class="pure-control-group inline-radio">
|
||||||
|
{{ render_field(form.requests.form.default_ua) }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
Applied to all requests.<br><br>
|
||||||
|
Note: Simply changing the User-Agent often does not defeat anti-robot technologies, it's important to consider <a href="https://changedetection.io/tutorial/what-are-main-types-anti-robot-mechanisms">all of the ways that the browser is detected</a>.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<br>
|
||||||
|
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-pane-inner" id="filters">
|
<div class="tab-pane-inner" id="filters">
|
||||||
|
@ -190,7 +200,7 @@ nav
|
||||||
<a id="chrome-extension-link"
|
<a id="chrome-extension-link"
|
||||||
title="Try our new Chrome Extension!"
|
title="Try our new Chrome Extension!"
|
||||||
href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
|
href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
|
||||||
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}">
|
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}" alt="Chrome">
|
||||||
Chrome Webstore
|
Chrome Webstore
|
||||||
</a>
|
</a>
|
||||||
</p>
|
</p>
|
||||||
|
|
|
@ -256,12 +256,40 @@ def test_method_in_request(client, live_server):
|
||||||
def test_headers_textfile_in_request(client, live_server):
|
def test_headers_textfile_in_request(client, live_server):
|
||||||
#live_server_setup(live_server)
|
#live_server_setup(live_server)
|
||||||
# Add our URL to the import page
|
# Add our URL to the import page
|
||||||
|
|
||||||
|
webdriver_ua = "Hello fancy webdriver UA 1.0"
|
||||||
|
requests_ua = "Hello basic requests UA 1.1"
|
||||||
|
|
||||||
test_url = url_for('test_headers', _external=True)
|
test_url = url_for('test_headers', _external=True)
|
||||||
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||||
# Because its no longer calling back to localhost but from the browser container, set in test-only.yml
|
# Because its no longer calling back to localhost but from the browser container, set in test-only.yml
|
||||||
test_url = test_url.replace('localhost', 'cdio')
|
test_url = test_url.replace('localhost', 'cdio')
|
||||||
|
|
||||||
print ("TEST URL IS ",test_url)
|
form_data = {
|
||||||
|
"application-fetch_backend": "html_requests",
|
||||||
|
"application-minutes_between_check": 180,
|
||||||
|
"requests-default_ua-html_requests": requests_ua
|
||||||
|
}
|
||||||
|
|
||||||
|
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||||
|
form_data["requests-default_ua-html_webdriver"] = webdriver_ua
|
||||||
|
|
||||||
|
res = client.post(
|
||||||
|
url_for("settings_page"),
|
||||||
|
data=form_data,
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b'Settings updated' in res.data
|
||||||
|
|
||||||
|
res = client.get(url_for("settings_page"))
|
||||||
|
|
||||||
|
# Only when some kind of real browser is setup
|
||||||
|
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||||
|
assert b'requests-default_ua-html_webdriver' in res.data
|
||||||
|
|
||||||
|
# Field should always be there
|
||||||
|
assert b"requests-default_ua-html_requests" in res.data
|
||||||
|
|
||||||
# Add the test URL twice, we will check
|
# Add the test URL twice, we will check
|
||||||
res = client.post(
|
res = client.post(
|
||||||
url_for("import_page"),
|
url_for("import_page"),
|
||||||
|
@ -272,15 +300,14 @@ def test_headers_textfile_in_request(client, live_server):
|
||||||
|
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
|
|
||||||
# Add some headers to a request
|
# Add some headers to a request
|
||||||
res = client.post(
|
res = client.post(
|
||||||
url_for("edit_page", uuid="first"),
|
url_for("edit_page", uuid="first"),
|
||||||
data={
|
data={
|
||||||
"url": test_url,
|
"url": test_url,
|
||||||
"tags": "testtag",
|
"tags": "testtag",
|
||||||
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
|
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
|
||||||
"headers": "xxx:ooo\ncool:yeah\r\n"},
|
"headers": "xxx:ooo\ncool:yeah\r\n"},
|
||||||
follow_redirects=True
|
follow_redirects=True
|
||||||
)
|
)
|
||||||
assert b"Updated watch." in res.data
|
assert b"Updated watch." in res.data
|
||||||
|
@ -292,7 +319,7 @@ def test_headers_textfile_in_request(client, live_server):
|
||||||
with open('test-datastore/headers.txt', 'w') as f:
|
with open('test-datastore/headers.txt', 'w') as f:
|
||||||
f.write("global-header: nice\r\nnext-global-header: nice")
|
f.write("global-header: nice\r\nnext-global-header: nice")
|
||||||
|
|
||||||
with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f:
|
with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f:
|
||||||
f.write("watch-header: nice")
|
f.write("watch-header: nice")
|
||||||
|
|
||||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||||
|
@ -306,7 +333,7 @@ def test_headers_textfile_in_request(client, live_server):
|
||||||
# Not needed anymore
|
# Not needed anymore
|
||||||
os.unlink('test-datastore/headers.txt')
|
os.unlink('test-datastore/headers.txt')
|
||||||
os.unlink('test-datastore/headers-testtag.txt')
|
os.unlink('test-datastore/headers-testtag.txt')
|
||||||
os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt')
|
os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt')
|
||||||
# The service should echo back the request verb
|
# The service should echo back the request verb
|
||||||
res = client.get(
|
res = client.get(
|
||||||
url_for("preview_page", uuid="first"),
|
url_for("preview_page", uuid="first"),
|
||||||
|
@ -319,7 +346,12 @@ def test_headers_textfile_in_request(client, live_server):
|
||||||
assert b"Watch-Header:nice" in res.data
|
assert b"Watch-Header:nice" in res.data
|
||||||
assert b"Tag-Header:test" in res.data
|
assert b"Tag-Header:test" in res.data
|
||||||
|
|
||||||
|
# Check the custom UA from system settings page made it through
|
||||||
|
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||||
|
assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data
|
||||||
|
else:
|
||||||
|
assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data
|
||||||
|
|
||||||
#unlink headers.txt on start/stop
|
# unlink headers.txt on start/stop
|
||||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
assert b'Deleted' in res.data
|
assert b'Deleted' in res.data
|
Ładowanie…
Reference in New Issue