kopia lustrzana https://github.com/dgtlmoon/changedetection.io
New feature - Simple extract data by regex from all historical watch text into CSV (#1191)
rodzic
b8d5a12ad0
commit
2345b6b558
|
@ -810,7 +810,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
|
||||
return redirect(url_for('index'))
|
||||
|
||||
@app.route("/diff/<string:uuid>", methods=['GET'])
|
||||
@app.route("/diff/<string:uuid>", methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def diff_history_page(uuid):
|
||||
|
||||
|
@ -818,6 +818,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
if uuid == 'first':
|
||||
uuid = list(datastore.data['watching'].keys()).pop()
|
||||
|
||||
|
||||
extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
|
||||
try:
|
||||
watch = datastore.data['watching'][uuid]
|
||||
|
@ -825,6 +826,23 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
flash("No history found for the specified link, bad link?", "error")
|
||||
return redirect(url_for('index'))
|
||||
|
||||
# For submission of requesting an extract
|
||||
if request.method == 'POST':
|
||||
extract_regex = request.form.get('extract_regex').strip()
|
||||
output = watch.extract_regex_from_all_history(extract_regex)
|
||||
if output:
|
||||
watch_dir = os.path.join(datastore_o.datastore_path, uuid)
|
||||
response = make_response(send_from_directory(directory=watch_dir, path=output, as_attachment=True))
|
||||
response.headers['Content-type'] = 'text/csv'
|
||||
response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
|
||||
response.headers['Pragma'] = 'no-cache'
|
||||
response.headers['Expires'] = 0
|
||||
return response
|
||||
|
||||
|
||||
flash('Nothing matches that RegEx', 'error')
|
||||
redirect(url_for('diff_history_page', uuid=uuid)+'#extract')
|
||||
|
||||
history = watch.history
|
||||
dates = list(history.keys())
|
||||
|
||||
|
@ -866,24 +884,28 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
|
||||
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
|
||||
|
||||
from changedetectionio import forms
|
||||
extract_form = forms.extractDataForm(request.form)
|
||||
|
||||
output = render_template("diff.html",
|
||||
watch_a=watch,
|
||||
newest=newest_version_file_contents,
|
||||
previous=previous_version_file_contents,
|
||||
extra_stylesheets=extra_stylesheets,
|
||||
dark_mode=getDarkModeSetting(),
|
||||
versions=dates[:-1], # All except current/last
|
||||
uuid=uuid,
|
||||
newest_version_timestamp=dates[-1],
|
||||
current_previous_version=str(previous_version),
|
||||
current_diff_url=watch['url'],
|
||||
current_previous_version=str(previous_version),
|
||||
dark_mode=getDarkModeSetting(),
|
||||
extra_stylesheets=extra_stylesheets,
|
||||
extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
|
||||
left_sticky=True,
|
||||
screenshot=screenshot_url,
|
||||
extract_form=extract_form,
|
||||
is_html_webdriver=is_html_webdriver,
|
||||
last_error=watch['last_error'],
|
||||
last_error_screenshot=watch.get_error_snapshot(),
|
||||
last_error_text=watch.get_error_text(),
|
||||
last_error_screenshot=watch.get_error_snapshot()
|
||||
left_sticky=True,
|
||||
newest=newest_version_file_contents,
|
||||
newest_version_timestamp=dates[-1],
|
||||
previous=previous_version_file_contents,
|
||||
screenshot=screenshot_url,
|
||||
uuid=uuid,
|
||||
versions=dates[:-1], # All except current/last
|
||||
watch_a=watch
|
||||
)
|
||||
|
||||
return output
|
||||
|
|
|
@ -448,3 +448,9 @@ class globalSettingsForm(Form):
|
|||
requests = FormField(globalSettingsRequestForm)
|
||||
application = FormField(globalSettingsApplicationForm)
|
||||
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
|
||||
|
||||
|
||||
class extractDataForm(Form):
|
||||
extract_regex = StringField('RegEx to extract')
|
||||
extract_submit_button = SubmitField('Extract as CSV', render_kw={"class": "pure-button pure-button-primary"})
|
||||
|
||||
|
|
|
@ -318,3 +318,47 @@ class model(dict):
|
|||
if os.path.isfile(fname):
|
||||
return fname
|
||||
return False
|
||||
|
||||
def extract_regex_from_all_history(self, regex):
|
||||
import csv
|
||||
import re
|
||||
import datetime
|
||||
csv_output_filename = False
|
||||
csv_writer = False
|
||||
f = None
|
||||
|
||||
# self.history will be keyed with the full path
|
||||
for k, fname in self.history.items():
|
||||
if os.path.isfile(fname):
|
||||
with open(fname, "r") as f:
|
||||
contents = f.read()
|
||||
res = re.findall(regex, contents, re.MULTILINE)
|
||||
if res:
|
||||
if not csv_writer:
|
||||
# A file on the disk can be transferred much faster via flask than a string reply
|
||||
csv_output_filename = 'report.csv'
|
||||
f = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w')
|
||||
# @todo some headers in the future
|
||||
#fieldnames = ['Epoch seconds', 'Date']
|
||||
csv_writer = csv.writer(f,
|
||||
delimiter=',',
|
||||
quotechar='"',
|
||||
quoting=csv.QUOTE_MINIMAL,
|
||||
#fieldnames=fieldnames
|
||||
)
|
||||
csv_writer.writerow(['Epoch seconds', 'Date'])
|
||||
# csv_writer.writeheader()
|
||||
|
||||
date_str = datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')
|
||||
for r in res:
|
||||
row = [k, date_str]
|
||||
if isinstance(r, str):
|
||||
row.append(r)
|
||||
else:
|
||||
row+=r
|
||||
csv_writer.writerow(row)
|
||||
|
||||
if f:
|
||||
f.close()
|
||||
|
||||
return csv_output_filename
|
||||
|
|
|
@ -13,6 +13,8 @@ $(document).ready(function () {
|
|||
} else if (hash_name === '#error-screenshot') {
|
||||
$("img#error-screenshot-img").attr('src', error_screenshot_url);
|
||||
$("#settings").hide();
|
||||
} else if (hash_name === '#extract') {
|
||||
$("#settings").hide();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -132,8 +132,9 @@ html[data-darkmode="true"] {
|
|||
padding: 2em;
|
||||
margin-left: 1em;
|
||||
margin-right: 1em;
|
||||
border-radius: 5px;
|
||||
font-size: 11px; }
|
||||
border-radius: 5px; }
|
||||
#diff-ui #text {
|
||||
font-size: 11px; }
|
||||
#diff-ui table {
|
||||
table-layout: fixed;
|
||||
width: 100%; }
|
||||
|
|
|
@ -7,7 +7,11 @@
|
|||
margin-left: 1em;
|
||||
margin-right: 1em;
|
||||
border-radius: 5px;
|
||||
font-size: 11px;
|
||||
|
||||
// The first tab 'text' diff
|
||||
#text {
|
||||
font-size: 11px;
|
||||
}
|
||||
|
||||
table {
|
||||
table-layout: fixed;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{% extends 'base.html' %}
|
||||
|
||||
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
|
||||
{% block content %}
|
||||
<script>
|
||||
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
|
||||
|
@ -58,6 +58,7 @@
|
|||
{% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
|
||||
<li class="tab" id=""><a href="#text">Text</a></li>
|
||||
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
|
||||
<li class="tab" id="extract-tab"><a href="#extract">Extract Data</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
|
@ -108,6 +109,37 @@
|
|||
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="extract">
|
||||
<form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
|
||||
action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
|
||||
method="POST">
|
||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
|
||||
|
||||
<p>This tool will extract text data from all of the watch history.</p>
|
||||
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(extract_form.extract_regex) }}
|
||||
<span class="pure-form-message-inline">
|
||||
A <strong>RegEx</strong> is a pattern that identifies exactly which part inside of the text that you want to extract.<br/>
|
||||
|
||||
<p>
|
||||
For example, to extract only the numbers from text ‐</br>
|
||||
<strong>Raw text</strong>: <code>Temperature <span style="color: red">5.5</span>°C in Sydney</code></br>
|
||||
<strong>RegEx to extract:</strong> <code>Temperature ([0-9\.]+)</code><br/>
|
||||
</p>
|
||||
<p>
|
||||
<a href="https://RegExr.com/">Be sure to test your RegEx here.</a>
|
||||
</p>
|
||||
<p>
|
||||
Each RegEx group bracket <code>()</code> will be in its own column, the first column value is always the date.
|
||||
</p>
|
||||
</span>
|
||||
</div>
|
||||
<div class="pure-control-group">
|
||||
{{ render_button(extract_form.extract_submit_button) }}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
|
|
|
@ -244,6 +244,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
|
|||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/>
|
||||
{% endif %}
|
||||
<span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/>
|
||||
|
||||
<ul>
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from urllib.request import urlopen
|
||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
||||
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
|
||||
|
||||
def test_check_extract_text_from_diff(client, live_server):
|
||||
import time
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("Now it's {} seconds since epoch, time flies!".format(str(time.time())))
|
||||
|
||||
live_server_setup(live_server)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": url_for('test_endpoint', _external=True)},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
time.sleep(1)
|
||||
|
||||
# Load in 5 different numbers/changes
|
||||
last_date=""
|
||||
for n in range(5):
|
||||
# Give the thread time to pick it up
|
||||
print("Bumping snapshot and checking.. ", n)
|
||||
last_date = str(time.time())
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("Now it's {} seconds since epoch, time flies!".format(last_date))
|
||||
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.post(
|
||||
url_for("diff_history_page", uuid="first"),
|
||||
data={"extract_regex": "Now it's ([0-9\.]+)",
|
||||
"extract_submit_button": "Extract as CSV"},
|
||||
follow_redirects=False
|
||||
)
|
||||
|
||||
assert b'Nothing matches that RegEx' not in res.data
|
||||
assert res.content_type == 'text/csv'
|
||||
|
||||
# Read the csv reply as stringio
|
||||
from io import StringIO
|
||||
import csv
|
||||
|
||||
f = StringIO(res.data.decode('utf-8'))
|
||||
reader = csv.reader(f, delimiter=',')
|
||||
output=[]
|
||||
|
||||
for row in reader:
|
||||
output.append(row)
|
||||
|
||||
assert output[0][0] == 'Epoch seconds'
|
||||
|
||||
# Header line + 1 origin/first + 5 changes
|
||||
assert(len(output) == 7)
|
||||
|
||||
# We expect to find the last bumped date in the changes in the last field of the spreadsheet
|
||||
assert(output[6][2] == last_date)
|
||||
# And nothing else, only that group () of the decimal and .
|
||||
assert "time flies" not in output[6][2]
|
Ładowanie…
Reference in New Issue