kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Ability to Import from Wachete XLSX (or any XLSX) - Wachete alternative made easy (#1921)
rodzic
3b43da35ec
commit
e209d9fba0
|
@ -822,6 +822,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
from . import forms
|
||||
|
||||
if request.method == 'POST':
|
||||
|
||||
from .importer import import_url_list, import_distill_io_json
|
||||
|
||||
# URL List import
|
||||
|
@ -845,11 +846,32 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
for uuid in d_importer.new_uuids:
|
||||
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
|
||||
|
||||
# XLSX importer
|
||||
if request.files and request.files.get('xlsx_file'):
|
||||
file = request.files['xlsx_file']
|
||||
from .importer import import_xlsx_wachete, import_xlsx_custom
|
||||
|
||||
if request.values.get('file_mapping') == 'wachete':
|
||||
w_importer = import_xlsx_wachete()
|
||||
w_importer.run(data=file, flash=flash, datastore=datastore)
|
||||
else:
|
||||
w_importer = import_xlsx_custom()
|
||||
# Building mapping of col # to col # type
|
||||
map = {}
|
||||
for i in range(10):
|
||||
c = request.values.get(f"custom_xlsx[col_{i}]")
|
||||
v = request.values.get(f"custom_xlsx[col_type_{i}]")
|
||||
if c and v:
|
||||
map[int(c)] = v
|
||||
|
||||
w_importer.import_profile = map
|
||||
w_importer.run(data=file, flash=flash, datastore=datastore)
|
||||
|
||||
for uuid in w_importer.new_uuids:
|
||||
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
|
||||
|
||||
form = forms.importForm(formdata=request.form if request.method == 'POST' else None,
|
||||
# data=default,
|
||||
)
|
||||
# Could be some remaining, or we could be on GET
|
||||
form = forms.importForm(formdata=request.form if request.method == 'POST' else None)
|
||||
output = render_template("import.html",
|
||||
form=form,
|
||||
import_url_list_remaining="\n".join(remaining_urls),
|
||||
|
|
|
@ -15,9 +15,14 @@ from wtforms import (
|
|||
validators,
|
||||
widgets
|
||||
)
|
||||
from flask_wtf.file import FileField, FileAllowed
|
||||
from wtforms.fields import FieldList
|
||||
|
||||
from wtforms.validators import ValidationError
|
||||
|
||||
from validators.url import url as url_validator
|
||||
|
||||
|
||||
# default
|
||||
# each select <option data-enabled="enabled-0-0"
|
||||
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
|
||||
|
@ -41,7 +46,7 @@ valid_method = {
|
|||
}
|
||||
|
||||
default_method = 'GET'
|
||||
|
||||
allow_simplehost = not strtobool(os.getenv('BLOCK_SIMPLEHOSTS', 'False'))
|
||||
|
||||
class StringListField(StringField):
|
||||
widget = widgets.TextArea()
|
||||
|
@ -261,19 +266,23 @@ class validateURL(object):
|
|||
self.message = message
|
||||
|
||||
def __call__(self, form, field):
|
||||
import validators
|
||||
# If hosts that only contain alphanumerics are allowed ("localhost" for example)
|
||||
allow_simplehost = not strtobool(os.getenv('BLOCK_SIMPLEHOSTS', 'False'))
|
||||
try:
|
||||
validators.url(field.data.strip(), simple_host=allow_simplehost)
|
||||
except validators.ValidationFailure:
|
||||
message = field.gettext('\'%s\' is not a valid URL.' % (field.data.strip()))
|
||||
raise ValidationError(message)
|
||||
# This should raise a ValidationError() or not
|
||||
validate_url(field.data)
|
||||
|
||||
from .model.Watch import is_safe_url
|
||||
if not is_safe_url(field.data):
|
||||
raise ValidationError('Watch protocol is not permitted by SAFE_PROTOCOL_REGEX')
|
||||
def validate_url(test_url):
    """Raise ``ValidationError`` when *test_url* is not acceptable as a watch URL.

    Two checks run in order: the ``url_validator`` check (called with
    ``simple_host=allow_simplehost``, so hosts that only contain alphanumerics
    such as "localhost" can be allowed), then ``is_safe_url``.  Nothing is
    returned when both checks pass.
    """
    # NOTE(review): confirm that url_validator() really raises
    # validators.ValidationError for a bad URL; if it reports failure through
    # its return value instead, this except branch never runs.
    try:
        url_validator(test_url, simple_host=allow_simplehost)
    except validators.ValidationError:
        # @todo check for xss
        # This should be wtforms.validators.
        raise ValidationError(f"'{test_url}' is not a valid URL.")

    from .model.Watch import is_safe_url

    if is_safe_url(test_url):
        return

    # This should be wtforms.validators.
    raise ValidationError('Watch protocol is not permitted by SAFE_PROTOCOL_REGEX or incorrect URL format')
|
||||
|
||||
class ValidateListRegex(object):
|
||||
"""
|
||||
|
@ -398,6 +407,9 @@ class importForm(Form):
|
|||
from . import processors
|
||||
processor = RadioField(u'Processor', choices=processors.available_processors(), default="text_json_diff")
|
||||
urls = TextAreaField('URLs')
|
||||
xlsx_file = FileField('Upload .xlsx file', validators=[FileAllowed(['xlsx'], 'Must be .xlsx file!')])
|
||||
# A list keeps the options in a fixed order; a set literal has no defined iteration order.
file_mapping = SelectField('File mapping', [validators.DataRequired()], choices=[('wachete', 'Wachete mapping'), ('custom', 'Custom mapping')])
|
||||
|
||||
|
||||
class SingleBrowserStep(Form):
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import time
|
||||
import validators
|
||||
from wtforms import ValidationError
|
||||
|
||||
from changedetectionio.forms import validate_url
|
||||
|
||||
|
||||
class Importer():
|
||||
|
@ -12,6 +15,7 @@ class Importer():
|
|||
self.new_uuids = []
|
||||
self.good = 0
|
||||
self.remaining_data = []
|
||||
self.import_profile = None
|
||||
|
||||
@abstractmethod
|
||||
def run(self,
|
||||
|
@ -132,3 +136,145 @@ class import_distill_io_json(Importer):
|
|||
good += 1
|
||||
|
||||
flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
|
||||
|
||||
class import_xlsx_wachete(Importer):
    """Import watches from a Wachete-style ``.xlsx`` export.

    The first row of the active sheet is read as column titles (compared
    case-insensitively); every following row describes one watch.  The titles
    acted on are ``url``, ``name``, ``xpath``, ``folder`` and
    ``interval (min)``.
    """

    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create one watch per spreadsheet row and report the outcome.

        :param data: Workbook source passed straight to
            ``openpyxl.load_workbook``.
        :param flash: Callable used for user messages, called as
            ``flash(message)`` or ``flash(message, 'error')``.
        :param datastore: Object whose ``add_watch(url=, extras=, tag=,
            write_to_disk_now=)`` creates a watch and returns its UUID.
        :return: ``None``.  UUIDs of the created watches are collected in
            ``self.new_uuids``.
        """
        now = time.time()
        self.new_uuids = []

        from openpyxl import load_workbook

        try:
            wb = load_workbook(data)
        except Exception:
            # @todo narrow this to the errors openpyxl raises for unreadable files
            flash("Unable to read export XLSX file, something wrong with the file?", 'error')
            return

        sheet_obj = wb.active

        # Read the header row once; a missing header cell becomes None and is
        # ignored instead of crashing on .strip().
        header = next(sheet_obj.iter_rows(min_row=1, max_row=1, values_only=True), ())
        titles = [str(title).strip().lower() if title is not None else None for title in header]

        for values in sheet_obj.iter_rows(min_row=2, values_only=True):
            # The first row with an empty first column marks the end of the data.
            if not values or not values[0]:
                break

            # Blank cells are skipped rather than ending the row, so a gap in
            # the middle of a row does not hide the columns after it.
            row_data = {
                title: value
                for title, value in zip(titles, values)
                if title and value is not None and value != ''
            }

            extras = {}
            if row_data.get('xpath'):
                # @todo split by || ?
                extras['include_filters'] = [row_data['xpath']]
            if row_data.get('name'):
                # Stored as a plain string, not wrapped in a list.
                extras['title'] = str(row_data['name']).strip()
            if row_data.get('interval (min)'):
                try:
                    total_minutes = int(row_data['interval (min)'])
                except (TypeError, ValueError):
                    # A bad interval should not abort the whole import; the
                    # watch is still created without a custom interval.
                    print(">> import interval error", row_data['interval (min)'])
                else:
                    hours, minutes = divmod(total_minutes, 60)
                    days, hours = divmod(hours, 24)
                    weeks, days = divmod(days, 7)
                    extras['time_between_check'] = {'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes, 'seconds': 0}

            # At minimum a URL is required.
            url = row_data.get('url')
            if not url:
                continue

            url = str(url).strip()
            try:
                validate_url(url)
            except ValidationError as e:
                print(">> import URL error", url, str(e))
                # Don't bother processing anything else on this row
                continue

            new_uuid = datastore.add_watch(url=url,
                                           extras=extras,
                                           tag=row_data.get('folder'),
                                           write_to_disk_now=False)
            if new_uuid:
                # Straight into the queue.
                self.new_uuids.append(new_uuid)

        flash(
            "{} imported from Wachete .xlsx in {:.2f}s".format(len(self.new_uuids), time.time() - now))
|
||||
|
||||
class import_xlsx_custom(Importer):
    """Import watches from an ``.xlsx`` file using a caller-supplied column mapping.

    ``self.import_profile`` maps a spreadsheet column number to the kind of
    data found in that column: ``url``, ``tag``, ``include_filters``,
    ``interval_minutes``, or any other name, which is stored as-is in the
    watch extras.  Row 1 is treated as a header and skipped.
    """

    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create one watch per spreadsheet row and report the outcome.

        :param data: Workbook source passed straight to
            ``openpyxl.load_workbook``.
        :param flash: Callable used for user messages, called as
            ``flash(message)`` or ``flash(message, 'error')``.
        :param datastore: Object whose ``add_watch(url=, extras=, tag=,
            write_to_disk_now=)`` creates a watch and returns its UUID.
        :return: ``None``.  UUIDs of the created watches are collected in
            ``self.new_uuids``.
        """
        now = time.time()
        self.new_uuids = []

        from openpyxl import load_workbook

        try:
            wb = load_workbook(data)
        except Exception:
            # @todo narrow this to the errors openpyxl raises for unreadable files
            flash("Unable to read export XLSX file, something wrong with the file?", 'error')
            return

        # @todo check at least 2 rows, same in other method

        sheet_obj = wb.active
        from .forms import validate_url

        # No mapping configured means nothing can be mapped; every row is then skipped.
        profile = self.import_profile or {}

        row = 2
        # The first row with an empty first column marks the end of the data.
        while sheet_obj.cell(row=row, column=1).value:
            url = None
            tags = None
            extras = {}

            for col_i, cell_map in profile.items():
                cell_val = sheet_obj.cell(row=row, column=col_i).value
                if cell_val is None:
                    # Empty cell: nothing to import for this column.
                    continue
                # Cells may hold numbers or dates, so normalise before stripping.
                cell_text = str(cell_val).strip()

                if cell_map == 'url':
                    url = cell_text
                    try:
                        validate_url(url)
                    except ValidationError as e:
                        print(">> Import URL error", url, str(e))
                        # Don't bother processing anything else on this row
                        url = None
                        break

                elif cell_map == 'tag':
                    tags = cell_text
                elif cell_map in ('include_filters', 'include_filter'):
                    # 'include_filter' is the value posted by the import form's select box.
                    # @todo validate?
                    extras['include_filters'] = [cell_text]
                elif cell_map == 'interval_minutes':
                    try:
                        total_minutes = int(cell_val)
                    except (TypeError, ValueError):
                        # A bad interval should not abort the whole import.
                        print(">> Import interval error", cell_val)
                        continue
                    hours, minutes = divmod(total_minutes, 60)
                    days, hours = divmod(hours, 24)
                    weeks, days = divmod(days, 7)
                    extras['time_between_check'] = {'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes, 'seconds': 0}
                else:
                    extras[cell_map] = cell_text

            # At minimum a URL is required.
            if url:
                new_uuid = datastore.add_watch(url=url,
                                               extras=extras,
                                               tag=tags,
                                               write_to_disk_now=False)
                if new_uuid:
                    # Straight into the queue.
                    self.new_uuids.append(new_uuid)

            row += 1

        flash(
            "{} imported from custom .xlsx in {:.2f}s".format(len(self.new_uuids), time.time() - now))
|
||||
|
|
|
@ -360,6 +360,8 @@ class ChangeDetectionStore:
|
|||
if write_to_disk_now:
|
||||
self.sync_to_json()
|
||||
|
||||
print("added ", url)
|
||||
|
||||
return new_uuid
|
||||
|
||||
def visualselector_data_is_ready(self, watch_uuid):
|
||||
|
|
|
@ -8,11 +8,12 @@
|
|||
<ul>
|
||||
<li class="tab" id=""><a href="#url-list">URL List</a></li>
|
||||
<li class="tab"><a href="#distill-io">Distill.io</a></li>
|
||||
<li class="tab"><a href="#xlsx">.XLSX & Wachete</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="box-wrap inner">
|
||||
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
|
||||
<form class="pure-form" action="{{url_for('import_page')}}" method="POST" enctype="multipart/form-data">
|
||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
||||
<div class="tab-pane-inner" id="url-list">
|
||||
<legend>
|
||||
|
@ -79,6 +80,42 @@
|
|||
" rows="25">{{ original_distill_json }}</textarea>
|
||||
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="xlsx">
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.xlsx_file, class="processor") }}
|
||||
</div>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.file_mapping, class="processor") }}
|
||||
</div>
|
||||
</fieldset>
|
||||
<div class="pure-control-group">
|
||||
<span class="pure-form-message-inline">
|
||||
Table of custom column and data types mapping for the <strong>Custom mapping</strong> File mapping type.
|
||||
</span>
|
||||
<table style="border: 1px solid #aaa; padding: 0.5rem; border-radius: 4px;">
|
||||
<tr>
|
||||
<td><strong>Column #</strong></td>
|
||||
{% for n in range(4) %}
|
||||
<td><input type="number" name="custom_xlsx[col_{{n}}]" style="width: 4rem;" min="1"></td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Type</strong></td>
|
||||
{% for n in range(4) %}
|
||||
<td><select name="custom_xlsx[col_type_{{n}}]">
|
||||
<option value="" style="color: #aaa"> -- none --</option>
|
||||
<option value="url">URL</option>
|
||||
<option value="title">Title</option>
|
||||
{# Value must match the 'include_filters' mapping type the XLSX importer checks for #}
<option value="include_filters">CSS/xPath filter</option>
|
||||
<option value="tag">Group / Tag name(s)</option>
|
||||
<option value="interval_minutes">Recheck time (minutes)</option>
|
||||
</select></td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
|
||||
</form>
|
||||
|
||||
|
|
Plik binarny nie jest wyświetlany.
|
@ -1,16 +1,19 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
|
||||
from flask import url_for
|
||||
|
||||
from .util import live_server_setup
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
def test_setup(client, live_server):
    """Run ``live_server_setup`` on the ``live_server`` fixture."""
    live_server_setup(live_server)
|
||||
|
||||
def test_import(client, live_server):
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
|
@ -119,3 +122,82 @@ def test_import_distillio(client, live_server):
|
|||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
# Clear flask alerts
|
||||
res = client.get(url_for("index"))
|
||||
|
||||
def test_import_custom_xlsx(client, live_server):
    """Test can upload a excel spreadsheet and the watches are created correctly"""

    #live_server_setup(live_server)
    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, 'import/spreadsheet.xlsx')
    with open(filename, 'rb') as f:
        # Each col_N names a spreadsheet column and col_type_N says what that column holds.
        data = {
            'file_mapping': 'custom',
            'custom_xlsx[col_0]': '1',
            'custom_xlsx[col_1]': '3',
            'custom_xlsx[col_2]': '5',
            'custom_xlsx[col_3]': '4',
            'custom_xlsx[col_type_0]': 'title',
            'custom_xlsx[col_type_1]': 'url',
            'custom_xlsx[col_type_2]': 'include_filters',
            'custom_xlsx[col_type_3]': 'interval_minutes',
            'xlsx_file': (io.BytesIO(f.read()), 'spreadsheet.xlsx')
        }

    res = client.post(
        url_for("import_page"),
        data=data,
        follow_redirects=True,
    )

    assert b'2 imported from custom .xlsx' in res.data

    res = client.get(
        url_for("index")
    )

    assert b'Somesite results ABC' in res.data
    assert b'City news results' in res.data

    # Just find one to check over
    watching = live_server.app.config['DATASTORE'].data['watching']
    matching = [watch for watch in watching.values() if watch.get('title') == 'Somesite results ABC']

    # Without this the per-watch checks below would pass silently when nothing matched.
    assert matching, "No imported watch has the title 'Somesite results ABC'"

    for watch in matching:
        filters = watch.get('include_filters')
        assert filters[0] == '/html[1]/body[1]/div[4]/div[1]/div[1]/div[1]||//*[@id=\'content\']/div[3]/div[1]/div[1]||//*[@id=\'content\']/div[1]'
        assert watch.get('time_between_check') == {'weeks': 0, 'days': 1, 'hours': 6, 'minutes': 24, 'seconds': 0}
|
||||
|
||||
def test_import_watchete_xlsx(client, live_server):
    """Upload the sample spreadsheet with the Wachete mapping and check the created watches."""

    #live_server_setup(live_server)
    spreadsheet_path = os.path.join(os.path.dirname(__file__), 'import/spreadsheet.xlsx')
    with open(spreadsheet_path, 'rb') as handle:
        upload = (io.BytesIO(handle.read()), 'spreadsheet.xlsx')

    res = client.post(
        url_for("import_page"),
        data={'file_mapping': 'wachete', 'xlsx_file': upload},
        follow_redirects=True,
    )
    assert b'2 imported from Wachete .xlsx' in res.data

    res = client.get(url_for("index"))
    for expected in (b'Somesite results ABC', b'City news results'):
        assert expected in res.data

    # Just find one to check over
    # NOTE(review): when no watch has this exact title the loop body never
    # runs and the checks below pass vacuously - confirm that is intended.
    expected_filter = '/html[1]/body[1]/div[4]/div[1]/div[1]/div[1]||//*[@id=\'content\']/div[3]/div[1]/div[1]||//*[@id=\'content\']/div[1]'
    expected_interval = {'weeks': 0, 'days': 1, 'hours': 6, 'minutes': 24, 'seconds': 0}
    for watch in live_server.app.config['DATASTORE'].data['watching'].values():
        if watch.get('title') != 'Somesite results ABC':
            continue
        filters = watch.get('include_filters')
        assert filters[0] == expected_filter
        assert watch.get('time_between_check') == expected_interval
|
||||
|
|
|
@ -59,7 +59,7 @@ werkzeug~=2.0.0
|
|||
# Templating, so far just in the URLs but in the future can be for the notifications also
|
||||
jinja2~=3.1
|
||||
jinja2-time
|
||||
|
||||
openpyxl
|
||||
# https://peps.python.org/pep-0508/#environment-markers
|
||||
# https://github.com/dgtlmoon/changedetection.io/pull/1009
|
||||
jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
|
||||
|
|
Ładowanie…
Reference in New Issue