kopia lustrzana https://github.com/dgtlmoon/changedetection.io
rodzic
fbe20d45cc
commit
98f6f4619f
|
@ -6,7 +6,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import html2text
|
import html2text
|
||||||
from urlextract import URLExtract
|
from urlextract import URLExtract
|
||||||
|
from inscriptis import get_text
|
||||||
|
|
||||||
# Hmm Polymorphism datastore, thread, etc
|
# Hmm Polymorphism datastore, thread, etc
|
||||||
class perform_site_check(Thread):
|
class perform_site_check(Thread):
|
||||||
|
@ -36,7 +36,6 @@ class perform_site_check(Thread):
|
||||||
f.write(output)
|
f.write(output)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
def save_response_stripped_output(self, output):
|
def save_response_stripped_output(self, output):
|
||||||
fname = "{}/{}.stripped.txt".format(self.output_path, self.timestamp)
|
fname = "{}/{}.stripped.txt".format(self.output_path, self.timestamp)
|
||||||
with open(fname, 'w') as f:
|
with open(fname, 'w') as f:
|
||||||
|
@ -47,49 +46,56 @@ class perform_site_check(Thread):
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
|
|
||||||
# Default headers
|
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
|
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
|
||||||
'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,cs;q=0.7'
|
|
||||||
}
|
|
||||||
|
|
||||||
extra_headers = self.datastore.get_val(self.uuid, 'headers')
|
extra_headers = self.datastore.get_val(self.uuid, 'headers')
|
||||||
headers.update(extra_headers)
|
|
||||||
|
# Tweak the base config with the per-watch ones
|
||||||
|
request_headers = self.datastore.data['settings']['headers'].copy()
|
||||||
|
request_headers.update(extra_headers)
|
||||||
|
|
||||||
print("Checking", self.url)
|
print("Checking", self.url)
|
||||||
|
print(request_headers)
|
||||||
|
|
||||||
self.ensure_output_path()
|
self.ensure_output_path()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(self.url, headers=headers, timeout=15, verify=False)
|
timeout = self.datastore.data['settings']['requests']['timeout']
|
||||||
stripped_text_from_html = html2text.html2text(r.text)
|
except KeyError:
|
||||||
|
# @todo yeah this should go back to the default value in store.py, but this whole object should abstract off it
|
||||||
|
timeout = 15
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(self.url,
|
||||||
|
headers=request_headers,
|
||||||
|
timeout=timeout,
|
||||||
|
verify=False)
|
||||||
|
|
||||||
|
stripped_text_from_html = get_text(r.text)
|
||||||
|
|
||||||
|
|
||||||
# @todo This should be a config option.
|
# @todo This should be a config option.
|
||||||
# Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
|
# Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
|
||||||
|
|
||||||
extractor = URLExtract()
|
# inscriptis handles this much cleaner, probably not needed..
|
||||||
urls = extractor.find_urls(stripped_text_from_html)
|
# extractor = URLExtract()
|
||||||
|
# urls = extractor.find_urls(stripped_text_from_html)
|
||||||
# Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
|
# Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
|
||||||
if urls:
|
# if urls:
|
||||||
urls.sort(key=len, reverse=True)
|
# urls.sort(key=len, reverse=True)
|
||||||
|
# for url in urls:
|
||||||
for url in urls:
|
# # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
|
||||||
# Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
|
# if "://" in url:
|
||||||
if "://" in url:
|
# # print ("Stripping link", url)
|
||||||
#print ("Stripping link", url)
|
# stripped_text_from_html = stripped_text_from_html.replace(url, '')
|
||||||
stripped_text_from_html = stripped_text_from_html.replace(url, '')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Usually from networkIO/requests level
|
# Usually from networkIO/requests level
|
||||||
except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e:
|
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
||||||
self.datastore.update_watch(self.uuid, 'last_error', str(e))
|
self.datastore.update_watch(self.uuid, 'last_error', str(e))
|
||||||
print(str(e))
|
print(str(e))
|
||||||
|
|
||||||
except requests.exceptions.MissingSchema:
|
except requests.exceptions.MissingSchema:
|
||||||
print ("Skipping {} due to missing schema/bad url".format(self.uuid))
|
print("Skipping {} due to missing schema/bad url".format(self.uuid))
|
||||||
|
|
||||||
# Usually from html2text level
|
# Usually from html2text level
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError as e:
|
||||||
|
@ -123,6 +129,5 @@ class perform_site_check(Thread):
|
||||||
history.update(dict([(self.timestamp, output_filepath)]))
|
history.update(dict([(self.timestamp, output_filepath)]))
|
||||||
self.datastore.update_watch(self.uuid, 'history', history)
|
self.datastore.update_watch(self.uuid, 'history', history)
|
||||||
|
|
||||||
|
|
||||||
self.datastore.update_watch(self.uuid, 'last_checked', int(time.time()))
|
self.datastore.update_watch(self.uuid, 'last_checked', int(time.time()))
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -10,7 +10,19 @@ class ChangeDetectionStore:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.data = {
|
self.data = {
|
||||||
'watching': {}
|
'watching': {},
|
||||||
|
'settings': {
|
||||||
|
'headers': {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'Accept-Language': 'en-GB,en-US;q=0.9,en;'
|
||||||
|
},
|
||||||
|
'requests': {
|
||||||
|
'timeout': 15, # Default 15 seconds
|
||||||
|
'max_seconds_from_last_check': 3 * 60 * 60 # Default 3 hours
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,10 +38,12 @@ class ChangeDetectionStore:
|
||||||
'history' : {} # Dict of timestamp and output stripped filename
|
'history' : {} # Dict of timestamp and output stripped filename
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open('/datastore/url-watches.json') as json_file:
|
with open('/datastore/url-watches.json') as json_file:
|
||||||
|
from_disk = json.load(json_file)
|
||||||
|
|
||||||
self.data.update(json.load(json_file))
|
self.data.update(from_disk)
|
||||||
|
|
||||||
# Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future.
|
# Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future.
|
||||||
# @todo pretty sure theres a python we todo this with an abstracted(?) object!
|
# @todo pretty sure theres a python we todo this with an abstracted(?) object!
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
{% extends 'base.html' %}
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<div class="edit-form">
|
||||||
|
|
||||||
|
|
||||||
|
<form class="pure-form pure-form-stacked" action="/api/update?uuid={{uuid}}" method="POST">
|
||||||
|
<fieldset>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<label for="url">URL</label>
|
||||||
|
<input type="url" id="url" required="" placeholder="https://..." name="url" value="{{ watch.url}}"
|
||||||
|
size="50"/>
|
||||||
|
<span class="pure-form-message-inline">This is a required field.</span>
|
||||||
|
</div>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<label for="tag">Tag</label>
|
||||||
|
<input type="text" placeholder="tag" size="10" id="tag" name="tag" value="{{ watch.tag}}"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<fieldset class="pure-group">
|
||||||
|
<label for="headers">Extra request headers</label>
|
||||||
|
|
||||||
|
<textarea id=headers name="headers" class="pure-input-1-2" placeholder="Example
|
||||||
|
Cookie: foobar
|
||||||
|
User-Agent: wonderbra 1.0"
|
||||||
|
style="width: 100%;
|
||||||
|
font-family:monospace;
|
||||||
|
white-space: pre;
|
||||||
|
overflow-wrap: normal;
|
||||||
|
overflow-x: scroll;" rows="5">{% for key, value in watch.headers.items() %}{{ key }}: {{ value }}
|
||||||
|
{% endfor %}</textarea>
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
</fieldset>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<button type="submit" class="pure-button pure-button-primary">Save</button>
|
||||||
|
</div>
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<a href="/" class="pure-button button-small button-cancel">Cancel</a>
|
||||||
|
<a href="/api/delete?uuid={{uuid}}"
|
||||||
|
class="pure-button button-small button-error ">Delete</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</fieldset>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% endblock %}
|
|
@ -15,6 +15,7 @@ bleach==3.2.1
|
||||||
html5lib==0.9999999 # via bleach
|
html5lib==0.9999999 # via bleach
|
||||||
timeago
|
timeago
|
||||||
html2text
|
html2text
|
||||||
urlextract
|
inscriptis
|
||||||
|
|
||||||
# @notes
|
# @notes
|
||||||
# - Dont install socketio, it interferes with flask_socketio
|
# - Dont install socketio, it interferes with flask_socketio
|
||||||
|
|
Ładowanie…
Reference in New Issue