kopia lustrzana https://github.com/dgtlmoon/changedetection.io
New feature - Simple extract data by regex from all historical watch text into CSV (#1191)
rodzic
b8d5a12ad0
commit
2345b6b558
|
@ -810,7 +810,7 @@ def changedetection_app(config=None, datastore_o=None):
|
||||||
|
|
||||||
return redirect(url_for('index'))
|
return redirect(url_for('index'))
|
||||||
|
|
||||||
@app.route("/diff/<string:uuid>", methods=['GET'])
|
@app.route("/diff/<string:uuid>", methods=['GET', 'POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def diff_history_page(uuid):
|
def diff_history_page(uuid):
|
||||||
|
|
||||||
|
@ -818,6 +818,7 @@ def changedetection_app(config=None, datastore_o=None):
|
||||||
if uuid == 'first':
|
if uuid == 'first':
|
||||||
uuid = list(datastore.data['watching'].keys()).pop()
|
uuid = list(datastore.data['watching'].keys()).pop()
|
||||||
|
|
||||||
|
|
||||||
extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
|
extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
|
||||||
try:
|
try:
|
||||||
watch = datastore.data['watching'][uuid]
|
watch = datastore.data['watching'][uuid]
|
||||||
|
@ -825,6 +826,23 @@ def changedetection_app(config=None, datastore_o=None):
|
||||||
flash("No history found for the specified link, bad link?", "error")
|
flash("No history found for the specified link, bad link?", "error")
|
||||||
return redirect(url_for('index'))
|
return redirect(url_for('index'))
|
||||||
|
|
||||||
|
# For submission of requesting an extract
|
||||||
|
if request.method == 'POST':
|
||||||
|
extract_regex = request.form.get('extract_regex').strip()
|
||||||
|
output = watch.extract_regex_from_all_history(extract_regex)
|
||||||
|
if output:
|
||||||
|
watch_dir = os.path.join(datastore_o.datastore_path, uuid)
|
||||||
|
response = make_response(send_from_directory(directory=watch_dir, path=output, as_attachment=True))
|
||||||
|
response.headers['Content-type'] = 'text/csv'
|
||||||
|
response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
|
||||||
|
response.headers['Pragma'] = 'no-cache'
|
||||||
|
response.headers['Expires'] = 0
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
flash('Nothing matches that RegEx', 'error')
|
||||||
|
redirect(url_for('diff_history_page', uuid=uuid)+'#extract')
|
||||||
|
|
||||||
history = watch.history
|
history = watch.history
|
||||||
dates = list(history.keys())
|
dates = list(history.keys())
|
||||||
|
|
||||||
|
@ -866,24 +884,28 @@ def changedetection_app(config=None, datastore_o=None):
|
||||||
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
|
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
|
||||||
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
|
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
|
||||||
|
|
||||||
|
from changedetectionio import forms
|
||||||
|
extract_form = forms.extractDataForm(request.form)
|
||||||
|
|
||||||
output = render_template("diff.html",
|
output = render_template("diff.html",
|
||||||
watch_a=watch,
|
|
||||||
newest=newest_version_file_contents,
|
|
||||||
previous=previous_version_file_contents,
|
|
||||||
extra_stylesheets=extra_stylesheets,
|
|
||||||
dark_mode=getDarkModeSetting(),
|
|
||||||
versions=dates[:-1], # All except current/last
|
|
||||||
uuid=uuid,
|
|
||||||
newest_version_timestamp=dates[-1],
|
|
||||||
current_previous_version=str(previous_version),
|
|
||||||
current_diff_url=watch['url'],
|
current_diff_url=watch['url'],
|
||||||
|
current_previous_version=str(previous_version),
|
||||||
|
dark_mode=getDarkModeSetting(),
|
||||||
|
extra_stylesheets=extra_stylesheets,
|
||||||
extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
|
extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
|
||||||
left_sticky=True,
|
extract_form=extract_form,
|
||||||
screenshot=screenshot_url,
|
|
||||||
is_html_webdriver=is_html_webdriver,
|
is_html_webdriver=is_html_webdriver,
|
||||||
last_error=watch['last_error'],
|
last_error=watch['last_error'],
|
||||||
|
last_error_screenshot=watch.get_error_snapshot(),
|
||||||
last_error_text=watch.get_error_text(),
|
last_error_text=watch.get_error_text(),
|
||||||
last_error_screenshot=watch.get_error_snapshot()
|
left_sticky=True,
|
||||||
|
newest=newest_version_file_contents,
|
||||||
|
newest_version_timestamp=dates[-1],
|
||||||
|
previous=previous_version_file_contents,
|
||||||
|
screenshot=screenshot_url,
|
||||||
|
uuid=uuid,
|
||||||
|
versions=dates[:-1], # All except current/last
|
||||||
|
watch_a=watch
|
||||||
)
|
)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
|
@ -448,3 +448,9 @@ class globalSettingsForm(Form):
|
||||||
requests = FormField(globalSettingsRequestForm)
|
requests = FormField(globalSettingsRequestForm)
|
||||||
application = FormField(globalSettingsApplicationForm)
|
application = FormField(globalSettingsApplicationForm)
|
||||||
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
||||||
|
|
||||||
|
|
||||||
|
class extractDataForm(Form):
    """Form used on the diff page to request a regex extraction of the watch history as CSV."""

    # Pattern typed in by the user; the view reads it from the POSTed 'extract_regex' value
    extract_regex = StringField(label='RegEx to extract')

    # Submit button, styled as a primary pure-css button
    extract_submit_button = SubmitField(
        label='Extract as CSV',
        render_kw={"class": "pure-button pure-button-primary"},
    )
|
||||||
|
|
||||||
|
|
|
@ -318,3 +318,47 @@ class model(dict):
|
||||||
if os.path.isfile(fname):
|
if os.path.isfile(fname):
|
||||||
return fname
|
return fname
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def extract_regex_from_all_history(self, regex):
    """Apply ``regex`` to every history snapshot and write all matches to a CSV report.

    Each match becomes one CSV row: the history key, that key formatted as a
    local date/time string, then the matched text (or one column per capture
    group when the pattern has several groups).

    The report is written as ``report.csv`` inside ``self.watch_data_dir`` and
    is overwritten on every call that produces at least one match.

    :param regex: Regular expression pattern, applied with ``re.MULTILINE``.
    :return: The report filename (``'report.csv'``) when at least one snapshot
             matched, otherwise ``False``.
    :raises re.error: If ``regex`` is not a valid pattern.
    """
    import csv
    import datetime
    import re

    csv_output_filename = False
    csv_file = None
    csv_writer = None

    try:
        # self.history maps a timestamp key to the path of that snapshot on disk
        for k, fname in self.history.items():
            if not os.path.isfile(fname):
                continue

            # Use a dedicated name for the snapshot handle so it can never be
            # confused with the CSV output handle.
            with open(fname, "r") as snapshot:
                contents = snapshot.read()

            res = re.findall(regex, contents, re.MULTILINE)
            if not res:
                continue

            if csv_writer is None:
                # Created lazily so that no report file is written when nothing matches.
                # A file on the disk can be transferred much faster via flask than a string reply
                csv_output_filename = 'report.csv'
                # newline='' is what the csv module expects; without it rows get
                # an extra '\r' on platforms that translate line endings.
                csv_file = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w', newline='')
                # @todo some headers in the future
                csv_writer = csv.writer(csv_file,
                                        delimiter=',',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL,
                                        )
                csv_writer.writerow(['Epoch seconds', 'Date'])

            date_str = datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')
            for r in res:
                row = [k, date_str]
                if isinstance(r, str):
                    # No group, or exactly one group: findall yields plain strings
                    row.append(r)
                else:
                    # Several groups: findall yields a tuple, one column per group
                    row += r
                csv_writer.writerow(row)
    finally:
        # Always flush and close the report before the caller serves it,
        # including when reading a snapshot or the regex itself raises.
        if csv_file:
            csv_file.close()

    return csv_output_filename
|
||||||
|
|
|
@ -13,6 +13,8 @@ $(document).ready(function () {
|
||||||
} else if (hash_name === '#error-screenshot') {
|
} else if (hash_name === '#error-screenshot') {
|
||||||
$("img#error-screenshot-img").attr('src', error_screenshot_url);
|
$("img#error-screenshot-img").attr('src', error_screenshot_url);
|
||||||
$("#settings").hide();
|
$("#settings").hide();
|
||||||
|
} else if (hash_name === '#extract') {
|
||||||
|
$("#settings").hide();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -132,7 +132,8 @@ html[data-darkmode="true"] {
|
||||||
padding: 2em;
|
padding: 2em;
|
||||||
margin-left: 1em;
|
margin-left: 1em;
|
||||||
margin-right: 1em;
|
margin-right: 1em;
|
||||||
border-radius: 5px;
|
border-radius: 5px; }
|
||||||
|
#diff-ui #text {
|
||||||
font-size: 11px; }
|
font-size: 11px; }
|
||||||
#diff-ui table {
|
#diff-ui table {
|
||||||
table-layout: fixed;
|
table-layout: fixed;
|
||||||
|
|
|
@ -7,7 +7,11 @@
|
||||||
margin-left: 1em;
|
margin-left: 1em;
|
||||||
margin-right: 1em;
|
margin-right: 1em;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
|
|
||||||
|
// The first tab 'text' diff
|
||||||
|
#text {
|
||||||
font-size: 11px;
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
table {
|
table {
|
||||||
table-layout: fixed;
|
table-layout: fixed;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
{% extends 'base.html' %}
|
{% extends 'base.html' %}
|
||||||
|
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<script>
|
<script>
|
||||||
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
|
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
|
||||||
|
@ -58,6 +58,7 @@
|
||||||
{% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
|
{% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
|
||||||
<li class="tab" id=""><a href="#text">Text</a></li>
|
<li class="tab" id=""><a href="#text">Text</a></li>
|
||||||
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
|
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
|
||||||
|
<li class="tab" id="extract-tab"><a href="#extract">Extract Data</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -108,6 +109,37 @@
|
||||||
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
|
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
<div class="tab-pane-inner" id="extract">
|
||||||
|
<form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
|
||||||
|
action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
|
||||||
|
method="POST">
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
|
||||||
|
|
||||||
|
<p>This tool will extract text data from all of the watch history.</p>
|
||||||
|
|
||||||
|
<div class="pure-control-group">
|
||||||
|
{{ render_field(extract_form.extract_regex) }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
A <strong>RegEx</strong> is a pattern that identifies exactly which part inside of the text that you want to extract.<br/>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
For example, to extract only the numbers from text ‐</br>
|
||||||
|
<strong>Raw text</strong>: <code>Temperature <span style="color: red">5.5</span>°C in Sydney</code></br>
|
||||||
|
<strong>RegEx to extract:</strong> <code>Temperature ([0-9\.]+)</code><br/>
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<a href="https://RegExr.com/">Be sure to test your RegEx here.</a>
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
Each RegEx group bracket <code>()</code> will be in its own column, the first column value is always the date.
|
||||||
|
</p>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
{{ render_button(extract_form.extract_submit_button) }}
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
|
|
@ -244,6 +244,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
|
||||||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/>
|
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/>
|
<span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/>
|
||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
from flask import url_for
|
||||||
|
from urllib.request import urlopen
|
||||||
|
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
||||||
|
|
||||||
|
sleep_time_for_fetch_thread = 3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extract_text_from_diff(client, live_server):
    """POSTing a regex to the diff page must return the matches from every snapshot as CSV."""
    import csv
    from io import StringIO

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("Now it's {} seconds since epoch, time flies!".format(str(time.time())))

    live_server_setup(live_server)

    # Add our URL to the import page
    res = client.post(
        url_for("import_page"),
        data={"urls": url_for('test_endpoint', _external=True)},
        follow_redirects=True
    )

    assert b"1 Imported" in res.data
    time.sleep(1)

    # Load in 5 different numbers/changes
    last_date = ""
    for n in range(5):
        # Give the thread time to pick it up
        print("Bumping snapshot and checking.. ", n)
        last_date = str(time.time())
        with open("test-datastore/endpoint-content.txt", "w") as f:
            f.write("Now it's {} seconds since epoch, time flies!".format(last_date))

        client.get(url_for("form_watch_checknow"), follow_redirects=True)
        wait_for_all_checks(client)

    res = client.post(
        url_for("diff_history_page", uuid="first"),
        # Raw string: '\.' is not a valid escape sequence in a normal string literal
        data={"extract_regex": r"Now it's ([0-9\.]+)",
              "extract_submit_button": "Extract as CSV"},
        follow_redirects=False
    )

    assert b'Nothing matches that RegEx' not in res.data
    assert res.content_type == 'text/csv'

    # Read the csv reply as stringio
    f = StringIO(res.data.decode('utf-8'))
    output = list(csv.reader(f, delimiter=','))

    assert output[0][0] == 'Epoch seconds'

    # Header line + 1 origin/first + 5 changes
    assert len(output) == 7

    # We expect to find the last bumped date in the changes in the last field of the spreadsheet
    assert output[6][2] == last_date
    # And nothing else, only that group () of the decimal and .
    assert "time flies" not in output[6][2]
|
Ładowanie…
Reference in New Issue