New feature - Simple extract data by regex from all historical watch text into CSV (#1191)

2022-12-05 14:48:03 +01:00 · 2022-12-05 14:48:03 +01:00 · 2345b6b558
commit 2345b6b558
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -810,7 +810,7 @@ def changedetection_app(config=None, datastore_o=None):
        return redirect(url_for('index'))
-    @app.route("/diff/<string:uuid>", methods=['GET'])
+    @app.route("/diff/<string:uuid>", methods=['GET', 'POST'])
    @login_required
    def diff_history_page(uuid):
@ -818,6 +818,7 @@ def changedetection_app(config=None, datastore_o=None):
        if uuid == 'first':
            uuid = list(datastore.data['watching'].keys()).pop()
        extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
        try:
            watch = datastore.data['watching'][uuid]
@ -825,6 +826,23 @@ def changedetection_app(config=None, datastore_o=None):
            flash("No history found for the specified link, bad link?", "error")
            return redirect(url_for('index'))
        # For submission of requesting an extract
        if request.method == 'POST':
            extract_regex = request.form.get('extract_regex').strip()
            output = watch.extract_regex_from_all_history(extract_regex)
            if output:
                watch_dir = os.path.join(datastore_o.datastore_path, uuid)
                response = make_response(send_from_directory(directory=watch_dir, path=output, as_attachment=True))
                response.headers['Content-type'] = 'text/csv'
                response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
                response.headers['Pragma'] = 'no-cache'
                response.headers['Expires'] = 0
                return response
            flash('Nothing matches that RegEx', 'error')
            redirect(url_for('diff_history_page', uuid=uuid)+'#extract')
        history = watch.history
        dates = list(history.keys())
@ -866,24 +884,28 @@ def changedetection_app(config=None, datastore_o=None):
        is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
                    watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
        from changedetectionio import forms
        extract_form = forms.extractDataForm(request.form)
        output = render_template("diff.html",
                                 watch_a=watch,
                                 newest=newest_version_file_contents,
                                 previous=previous_version_file_contents,
                                 extra_stylesheets=extra_stylesheets,
                                 dark_mode=getDarkModeSetting(),
                                 versions=dates[:-1], # All except current/last
                                 uuid=uuid,
                                 newest_version_timestamp=dates[-1],
                                 current_previous_version=str(previous_version),
                                 current_diff_url=watch['url'],
                                 current_previous_version=str(previous_version),
                                 dark_mode=getDarkModeSetting(),
                                 extra_stylesheets=extra_stylesheets,
                                 extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
-                                 left_sticky=True,
+                                 extract_form=extract_form,
                                 screenshot=screenshot_url,
                                 is_html_webdriver=is_html_webdriver,
                                 last_error=watch['last_error'],
                                 last_error_screenshot=watch.get_error_snapshot(),
                                 last_error_text=watch.get_error_text(),
-                                 last_error_screenshot=watch.get_error_snapshot()
+                                 left_sticky=True,
                                 newest=newest_version_file_contents,
                                 newest_version_timestamp=dates[-1],
                                 previous=previous_version_file_contents,
                                 screenshot=screenshot_url,
                                 uuid=uuid,
                                 versions=dates[:-1], # All except current/last
                                 watch_a=watch
                                 )
        return output
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -448,3 +448,9 @@ class globalSettingsForm(Form):
    requests = FormField(globalSettingsRequestForm)
    application = FormField(globalSettingsApplicationForm)
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
 class extractDataForm(Form):
    """Form with a single RegEx text input and an 'Extract as CSV' submit button."""

    # Text input holding the pattern to extract
    extract_regex = StringField(label='RegEx to extract')

    # Submit button, styled as a primary pure-css button
    extract_submit_button = SubmitField(
        label='Extract as CSV',
        render_kw={"class": "pure-button pure-button-primary"},
    )
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -318,3 +318,47 @@ class model(dict):
        if os.path.isfile(fname):
            return fname
        return False
    def extract_regex_from_all_history(self, regex):
        """Run ``regex`` over every history snapshot and write the matches to a CSV file.

        Every entry of ``self.history`` whose file exists on disk is read and
        searched with ``re.findall`` (``re.MULTILINE``).  Each match becomes one
        CSV row: the history key, that key rendered as a local date/time string,
        then the matched text (one column per capture group when the pattern
        has several groups).

        The report is written to ``report.csv`` inside ``self.watch_data_dir``
        and is only created once the first match is found.

        :param regex: Regular expression pattern to search for.
        :return: The CSV file name (``'report.csv'``) when at least one match
                 was found, otherwise ``False``.
        :raises re.error: Propagated from ``re.findall`` when ``regex`` is not
                          a valid pattern.
        """
        import csv
        import re
        import datetime

        csv_output_filename = False
        csv_file = None
        csv_writer = None

        try:
            # Keys are passed to int() as epoch seconds; values are the snapshot file paths
            for k, fname in self.history.items():
                if not os.path.isfile(fname):
                    continue

                # Use a dedicated name for the read handle so it can never be
                # confused with (or rebound over) the CSV output handle.
                with open(fname, "r") as snapshot:
                    contents = snapshot.read()

                res = re.findall(regex, contents, re.MULTILINE)
                if not res:
                    continue

                if csv_writer is None:
                    # A file on the disk can be transferred much faster via flask than a string reply
                    csv_output_filename = 'report.csv'
                    # newline='' is what the csv module expects for files it writes to
                    csv_file = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w', newline='')
                    csv_writer = csv.writer(csv_file,
                                            delimiter=',',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL,
                                            )
                    # @todo some headers in the future (only the two fixed columns are named)
                    csv_writer.writerow(['Epoch seconds', 'Date'])

                date_str = datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')
                for r in res:
                    row = [k, date_str]
                    if isinstance(r, str):
                        # Zero or one capture group: findall yields plain strings
                        row.append(r)
                    else:
                        # Several capture groups: findall yields a tuple, one column each
                        row.extend(r)
                    csv_writer.writerow(row)
        finally:
            # Always flush and close the report before the caller tries to serve it
            if csv_file is not None:
                csv_file.close()

        return csv_output_filename
--- a/changedetectionio/static/js/diff-overview.js
+++ b/changedetectionio/static/js/diff-overview.js
@ -13,6 +13,8 @@ $(document).ready(function () {
        } else if (hash_name === '#error-screenshot') {
            $("img#error-screenshot-img").attr('src', error_screenshot_url);
            $("#settings").hide();
        } else if (hash_name === '#extract') {
            $("#settings").hide();
        }
--- a/changedetectionio/static/styles/diff.css
+++ b/changedetectionio/static/styles/diff.css
@ -132,7 +132,8 @@ html[data-darkmode="true"] {
  padding: 2em;
  margin-left: 1em;
  margin-right: 1em;
-  border-radius: 5px;
+  border-radius: 5px; }
  #diff-ui #text {
    font-size: 11px; }
  #diff-ui table {
    table-layout: fixed;
--- a/changedetectionio/static/styles/scss/diff.scss
+++ b/changedetectionio/static/styles/scss/diff.scss
@ -7,7 +7,11 @@
  margin-left: 1em;
  margin-right: 1em;
  border-radius: 5px;
  // The first tab 'text' diff
  #text {
    font-size: 11px;
  }
  table {
    table-layout: fixed;
--- a/changedetectionio/templates/diff.html
+++ b/changedetectionio/templates/diff.html
@ -1,5 +1,5 @@
 {% extends 'base.html' %}
-
+{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
 {% block content %}
 <script>
    const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
@ -58,6 +58,7 @@
        {% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
        <li class="tab" id=""><a href="#text">Text</a></li>
        <li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
        <li class="tab" id="extract-tab"><a href="#extract">Extract Data</a></li>
    </ul>
 </div>
@ -108,6 +109,37 @@
           <strong>Screenshot requires Playwright/WebDriver enabled</strong>
         {% endif %}
     </div>
    <div class="tab-pane-inner" id="extract">
        <form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
              action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
              method="POST">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
            <p>This tool will extract text data from all of the watch history.</p>
            <div class="pure-control-group">
                {{ render_field(extract_form.extract_regex) }}
                <span class="pure-form-message-inline">
                    A <strong>RegEx</strong> is a pattern that identifies exactly which part of the text you want to extract.<br/>
                    <p>
                        For example, to extract only the numbers from text &dash;<br/>
                        <strong>Raw text</strong>: <code>Temperature <span style="color: red">5.5</span>°C in Sydney</code><br/>
                        <strong>RegEx to extract:</strong> <code>Temperature ([0-9\.]+)</code><br/>
                    </p>
                    <p>
                        <a href="https://RegExr.com/">Be sure to test your RegEx here.</a>
                    </p>
                    <p>
                        Each RegEx group bracket <code>()</code> will be in its own column, the first column value is always the date.
                    </p>
                </span>
            </div>
            <div class="pure-control-group">
                {{ render_button(extract_form.extract_submit_button) }}
            </div>
        </form>
    </div>
 </div>
 <script>
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -244,6 +244,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
                          <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
                        {% endif %}
                        <span class="pure-form-message-inline">One rule per line, <i>any</i> rule that matches will be used.<br/>
                    <ul>
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
--- a/changedetectionio/tests/test_extract_csv.py
+++ b/changedetectionio/tests/test_extract_csv.py
@ -0,0 +1,70 @@
 #!/usr/bin/python3
 import time
 from flask import url_for
 from urllib.request import urlopen
 from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 sleep_time_for_fetch_thread = 3
 def test_check_extract_text_from_diff(client, live_server):
    import time
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("Now it's {} seconds since epoch, time flies!".format(str(time.time())))
    live_server_setup(live_server)
    # Add our URL to the import page
    res = client.post(
        url_for("import_page"),
        data={"urls": url_for('test_endpoint', _external=True)},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(1)
    # Load in 5 different numbers/changes
    last_date=""
    for n in range(5):
        # Give the thread time to pick it up
        print("Bumping snapshot and checking.. ", n)
        last_date = str(time.time())
        with open("test-datastore/endpoint-content.txt", "w") as f:
            f.write("Now it's {} seconds since epoch, time flies!".format(last_date))
        client.get(url_for("form_watch_checknow"), follow_redirects=True)
        wait_for_all_checks(client)
    res = client.post(
        url_for("diff_history_page", uuid="first"),
        data={"extract_regex": "Now it's ([0-9\.]+)",
              "extract_submit_button": "Extract as CSV"},
        follow_redirects=False
    )
    assert b'Nothing matches that RegEx' not in res.data
    assert res.content_type == 'text/csv'
    # Read the csv reply as stringio
    from io import StringIO
    import csv
    f = StringIO(res.data.decode('utf-8'))
    reader = csv.reader(f, delimiter=',')
    output=[]
    for row in reader:
        output.append(row)
    assert output[0][0] == 'Epoch seconds'
    # Header line + 1 origin/first + 5 changes
    assert(len(output) == 7)
    # We expect to find the last bumped date in the changes in the last field of the spreadsheet
    assert(output[6][2] == last_date)
    # And nothing else, only that group () of the decimal and .
    assert "time flies" not in output[6][2]