kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Fetching - Clarifying how fetchers work with SOCKS5 proxies
rodzic
59578803bf
commit
7debccca73
|
@ -83,6 +83,7 @@ jobs:
|
|||
run: |
|
||||
cd changedetectionio
|
||||
./run_proxy_tests.sh
|
||||
# And again with PLAYWRIGHT_DRIVER_URL=..
|
||||
cd ..
|
||||
|
||||
- name: Test changedetection.io container starts+runs basically without error
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
import hashlib
|
||||
from abc import abstractmethod
|
||||
from distutils.util import strtobool
|
||||
from urllib.parse import urlparse
|
||||
import chardet
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import requests
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
|
||||
|
||||
|
@ -266,7 +269,6 @@ class base_html_playwright(Fetcher):
|
|||
|
||||
if self.proxy:
|
||||
# Playwright needs separate username and password values
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(self.proxy.get('server'))
|
||||
if parsed.username:
|
||||
self.proxy['username'] = parsed.username
|
||||
|
@ -321,14 +323,13 @@ class base_html_playwright(Fetcher):
|
|||
|
||||
# Append proxy connect string
|
||||
if self.proxy:
|
||||
import urllib.parse
|
||||
# Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
|
||||
# Actual authentication handled by Puppeteer/node
|
||||
o = urlparse(self.proxy.get('server'))
|
||||
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
|
||||
# Remove the scheme: socks5:// doesn't always work, and the proxy type will be autodetected anyway
|
||||
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl().replace(f"{o.scheme}://", '', 1))
|
||||
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}&dumpio=true"
|
||||
|
||||
|
||||
try:
|
||||
amp = '&' if '?' in browserless_function_url else '?'
|
||||
response = requests.request(
|
||||
|
@ -347,7 +348,7 @@ class base_html_playwright(Fetcher):
|
|||
'url': url,
|
||||
'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
|
||||
'proxy_username': self.proxy.get('username', '') if self.proxy else False,
|
||||
'proxy_password': self.proxy.get('password', '') if self.proxy else False,
|
||||
'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
|
||||
'no_cache_list': [
|
||||
'twitter',
|
||||
'.pdf'
|
||||
|
@ -416,8 +417,8 @@ class base_html_playwright(Fetcher):
|
|||
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
|
||||
self.browser_steps))
|
||||
|
||||
if not has_browser_steps:
|
||||
if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
|
||||
if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
|
||||
if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
|
||||
# Temporary backup solution until we rewrite the playwright code
|
||||
return self.run_fetch_browserless_puppeteer(
|
||||
url,
|
||||
|
@ -434,6 +435,7 @@ class base_html_playwright(Fetcher):
|
|||
|
||||
self.delete_browser_steps_screenshots()
|
||||
response = None
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser_type = getattr(p, self.browser_type)
|
||||
|
||||
|
@ -442,6 +444,9 @@ class base_html_playwright(Fetcher):
|
|||
# 60,000 ms (60 seconds) timeout, applies to establishing the connection only
|
||||
browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
|
||||
|
||||
# SOCKS5 with authentication is not supported (yet)
|
||||
# https://github.com/microsoft/playwright/issues/10567
|
||||
|
||||
# Set user agent to prevent Cloudflare from blocking the browser
|
||||
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
|
||||
context = browser.new_context(
|
||||
|
@ -478,7 +483,6 @@ class base_html_playwright(Fetcher):
|
|||
print("Content Fetcher > retrying request got error - ", str(e))
|
||||
time.sleep(1)
|
||||
response = self.page.goto(url, wait_until='commit')
|
||||
|
||||
except Exception as e:
|
||||
print("Content Fetcher > Other exception when page.goto", str(e))
|
||||
context.close()
|
||||
|
@ -614,7 +618,6 @@ class base_html_webdriver(Fetcher):
|
|||
from selenium.common.exceptions import WebDriverException
|
||||
# request_body, request_method unused for now, until some magic in the future happens.
|
||||
|
||||
# check env for WEBDRIVER_URL
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.command_executor,
|
||||
desired_capabilities=DesiredCapabilities.CHROME,
|
||||
|
@ -693,6 +696,10 @@ class html_requests(Fetcher):
|
|||
proxies = {}
|
||||
|
||||
# Allows overriding the proxy on a per-request basis
|
||||
|
||||
# https://requests.readthedocs.io/en/latest/user/advanced/#socks
|
||||
# Should also work with `socks5://user:pass@host:port` type syntax.
|
||||
|
||||
if self.proxy_override:
|
||||
proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
|
||||
else:
|
||||
|
|
|
@ -481,7 +481,7 @@ class SingleExtraProxy(Form):
|
|||
|
||||
# maybe better to set some <script>var..
|
||||
proxy_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
|
||||
proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "http://user:pass@...:3128", "size":50})
|
||||
proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50})
|
||||
# @todo do the validation here instead
|
||||
|
||||
# datastore.data['settings']['requests']..
|
||||
|
|
|
@ -18,6 +18,7 @@ module.exports = async ({page, context}) => {
|
|||
|
||||
await page.setBypassCSP(true)
|
||||
await page.setExtraHTTPHeaders(req_headers);
|
||||
|
||||
if (user_agent) {
|
||||
await page.setUserAgent(user_agent);
|
||||
}
|
||||
|
@ -26,6 +27,10 @@ module.exports = async ({page, context}) => {
|
|||
await page.setDefaultNavigationTimeout(0);
|
||||
|
||||
if (proxy_username) {
|
||||
// Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
||||
// https://github.com/puppeteer/puppeteer/issues/676 ?
|
||||
// https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
||||
// https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
|
||||
await page.authenticate({
|
||||
username: proxy_username,
|
||||
password: proxy_password
|
||||
|
|
|
@ -10,6 +10,40 @@ set -x
|
|||
docker run --network changedet-network -d --name squid-one --hostname squid-one --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
|
||||
docker run --network changedet-network -d --name squid-two --hostname squid-two --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
|
||||
|
||||
# SOCKS5 related - start simple Socks5 proxy server
|
||||
# SOCKSTEST=xyz should show in the logs of this service to confirm it fetched
|
||||
docker run --network changedet-network -d --hostname socks5proxy --name socks5proxy -p 1080:1080 -e PROXY_USER=proxy_user123 -e PROXY_PASSWORD=proxy_pass123 serjs/go-socks5-proxy
|
||||
docker run --network changedet-network -d --hostname socks5proxy-noauth -p 1081:1080 --name socks5proxy-noauth serjs/go-socks5-proxy
|
||||
|
||||
echo "---------------------------------- SOCKS5 -------------------"
|
||||
# SOCKS5 related - test from proxies.json
|
||||
docker run --network changedet-network \
|
||||
-v `pwd`/tests/proxy_socks5/proxies.json-example:/app/changedetectionio/test-datastore/proxies.json \
|
||||
--rm \
|
||||
-e "SOCKSTEST=proxiesjson" \
|
||||
test-changedetectionio \
|
||||
bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
|
||||
|
||||
# SOCKS5 related - by manually entering in UI
|
||||
docker run --network changedet-network \
|
||||
--rm \
|
||||
-e "SOCKSTEST=manual" \
|
||||
test-changedetectionio \
|
||||
bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy.py'
|
||||
|
||||
# SOCKS5 related - test from proxies.json via playwright - NOTE: PLAYWRIGHT DOESN'T SUPPORT AUTHENTICATING PROXIES
|
||||
docker run --network changedet-network \
|
||||
-e "SOCKSTEST=manual-playwright" \
|
||||
-v `pwd`/tests/proxy_socks5/proxies.json-example-noauth:/app/changedetectionio/test-datastore/proxies.json \
|
||||
-e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" \
|
||||
--rm \
|
||||
test-changedetectionio \
|
||||
bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
|
||||
|
||||
echo "socks5 server logs"
|
||||
docker logs socks5proxy
|
||||
echo "----------------------------------"
|
||||
|
||||
# Used for configuring a custom proxy URL via the UI
|
||||
docker run --network changedet-network -d \
|
||||
--name squid-custom \
|
||||
|
|
|
@ -42,6 +42,7 @@ class ChangeDetectionStore:
|
|||
self.__data = App.model()
|
||||
self.datastore_path = datastore_path
|
||||
self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
|
||||
print(">>> Datastore path is ", self.json_store_path)
|
||||
self.needs_write = False
|
||||
self.start_time = time.time()
|
||||
self.stop_thread = False
|
||||
|
|
|
@ -230,6 +230,7 @@ nav
|
|||
<div class="pure-control-group">
|
||||
{{ render_field(form.requests.form.extra_proxies) }}
|
||||
<span class="pure-form-message-inline">"Name" will be used for selecting the proxy in the Watch Edit settings</span>
|
||||
<span class="pure-form-message-inline">SOCKS5 proxies with authentication are only supported with 'plain requests' fetcher, for other fetchers you should whitelist the IP access instead</span>
|
||||
</div>
|
||||
</div>
|
||||
<div id="actions">
|
||||
|
|
|
@ -28,8 +28,6 @@ def test_fetch_webdriver_content(client, live_server):
|
|||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
time.sleep(3)
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
|
||||
|
|
|
@ -2,12 +2,11 @@
|
|||
|
||||
import time
|
||||
from flask import url_for
|
||||
from ..util import live_server_setup
|
||||
from ..util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
def test_preferred_proxy(client, live_server):
|
||||
time.sleep(1)
|
||||
live_server_setup(live_server)
|
||||
time.sleep(1)
|
||||
url = "http://chosen.changedetection.io"
|
||||
|
||||
res = client.post(
|
||||
|
@ -20,7 +19,7 @@ def test_preferred_proxy(client, live_server):
|
|||
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(2)
|
||||
wait_for_all_checks(client)
|
||||
res = client.post(
|
||||
url_for("edit_page", uuid="first"),
|
||||
data={
|
||||
|
@ -34,5 +33,5 @@ def test_preferred_proxy(client, live_server):
|
|||
follow_redirects=True
|
||||
)
|
||||
assert b"Updated watch." in res.data
|
||||
time.sleep(2)
|
||||
wait_for_all_checks(client)
|
||||
# Now the request should appear in the second-squid logs
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"socks5proxy": {
|
||||
"label": "socks5proxy",
|
||||
"url": "socks5://proxy_user123:proxy_pass123@socks5proxy:1080"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"socks5proxy": {
|
||||
"label": "socks5proxy",
|
||||
"url": "socks5://socks5proxy-noauth:1080"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
#!/usr/bin/python3
|
||||
import os
|
||||
import time
|
||||
from flask import url_for
|
||||
from changedetectionio.tests.util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
def test_socks5(client, live_server):
    """Add a SOCKS5 proxy through the settings form and fetch a watch through it.

    Steps: register an authenticated SOCKS5 proxy as an "extra proxy" in the
    global settings, create a paused watch, confirm the proxy is offered on
    the edit page, select it and unpause, wait for the check to finish, and
    finally verify the preview page contains the expected marker text.

    The ``SOCKSTEST`` environment variable (if set) is appended to the
    fetched URL as a query-string tag.
    """
    live_server_setup(live_server)

    # Register the proxy in the global settings.
    # The host/credentials must match the SOCKS5 proxy service that the test
    # environment starts (presumably by the proxy test runner script).
    settings_form = {
        "requests-time_between_check-minutes": 180,
        "application-ignore_whitespace": "y",
        "application-fetch_backend": "html_requests",
        "requests-extra_proxies-0-proxy_url": "socks5://proxy_user123:proxy_pass123@socks5proxy:1080",
        "requests-extra_proxies-0-proxy_name": "socks5proxy",
    }
    settings_response = client.post(
        url_for("settings_page"),
        data=settings_form,
        follow_redirects=True,
    )
    assert b"Settings updated." in settings_response.data

    sockstest_tag = os.getenv('SOCKSTEST', '')
    test_url = "https://changedetection.io/CHANGELOG.txt?socks-test-tag=" + sockstest_tag

    # Create the watch in a paused state so the proxy can be chosen first.
    add_response = client.post(
        url_for("form_quick_watch_add"),
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
        follow_redirects=True,
    )
    assert b"Watch added in Paused state, saving will unpause" in add_response.data

    # The proxy configured above should be selectable on the edit page.
    edit_page = client.get(url_for("edit_page", uuid="first", unpause_on_save=1))
    assert b'ui-0socks5proxy' in edit_page.data

    # Use the browser-based fetcher only when a Playwright driver is configured.
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
        chosen_backend = 'html_webdriver'
    else:
        chosen_backend = 'html_requests'

    edit_form = {
        "include_filters": "",
        "fetch_backend": chosen_backend,
        "headers": "",
        "proxy": "ui-0socks5proxy",
        "tags": "",
        "url": test_url,
    }
    save_response = client.post(
        url_for("edit_page", uuid="first", unpause_on_save=1),
        data=edit_form,
        follow_redirects=True,
    )
    assert b"unpaused" in save_response.data
    wait_for_all_checks(client)

    preview = client.get(url_for("preview_page", uuid="first"), follow_redirects=True)

    # Marker text expected in the fetched document (presumably a timestamp
    # offset appearing in the changelog).
    assert b"+0200:" in preview.data
|
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/python3
|
||||
import os
|
||||
import time
|
||||
from flask import url_for
|
||||
from changedetectionio.tests.util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
# should be proxies.json mounted from run_proxy_tests.sh already
|
||||
# -v `pwd`/tests/proxy_socks5/proxies.json-example:/app/changedetectionio/test-datastore/proxies.json
|
||||
def test_socks5_from_proxiesjson_file(client, live_server):
    """Fetch a watch through a SOCKS5 proxy that is defined in proxies.json.

    Expects a proxy keyed ``socks5proxy`` to already be available to the
    application (it must show up as a radio option on both the settings page
    and the watch edit page). The test creates a paused watch, selects that
    proxy, unpauses, waits for the check to finish and verifies the preview
    page contains the expected marker text.

    The ``SOCKSTEST`` environment variable (if set) is appended to the
    fetched URL as a query-string tag.
    """
    live_server_setup(live_server)

    sockstest_tag = os.getenv('SOCKSTEST', '')
    test_url = "https://changedetection.io/CHANGELOG.txt?socks-test-tag=" + sockstest_tag

    # The file-provided proxy should be listed in the global settings.
    settings_page = client.get(url_for("settings_page"))
    assert b'name="requests-proxy" type="radio" value="socks5proxy"' in settings_page.data

    # Create the watch in a paused state so the proxy can be chosen first.
    add_response = client.post(
        url_for("form_quick_watch_add"),
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
        follow_redirects=True,
    )
    assert b"Watch added in Paused state, saving will unpause" in add_response.data

    # The proxy should also be selectable on the watch edit page.
    edit_page = client.get(url_for("edit_page", uuid="first", unpause_on_save=1))
    assert b'name="proxy" type="radio" value="socks5proxy"' in edit_page.data

    # Use the browser-based fetcher only when a Playwright driver is configured.
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
        chosen_backend = 'html_webdriver'
    else:
        chosen_backend = 'html_requests'

    edit_form = {
        "include_filters": "",
        "fetch_backend": chosen_backend,
        "headers": "",
        "proxy": "socks5proxy",
        "tags": "",
        "url": test_url,
    }
    save_response = client.post(
        url_for("edit_page", uuid="first", unpause_on_save=1),
        data=edit_form,
        follow_redirects=True,
    )
    assert b"unpaused" in save_response.data
    wait_for_all_checks(client)

    preview = client.get(url_for("preview_page", uuid="first"), follow_redirects=True)

    # Marker text expected in the fetched document (presumably a timestamp
    # offset appearing in the changelog).
    assert b"+0200:" in preview.data
|
|
@ -16,7 +16,7 @@ validators~=0.21
|
|||
# Set these versions together to avoid a RequestsDependencyWarning
|
||||
# >= 2.26 also adds Brotli support if brotli is installed
|
||||
brotli~=1.0
|
||||
requests[socks] ~=2.28
|
||||
requests[socks]
|
||||
|
||||
urllib3>1.26
|
||||
chardet>2.3.0
|
||||
|
|
Ładowanie…
Reference in New Issue