New feature - Simple extract data by regex from all historical watch text into CSV (#1191)

2022-12-05 14:48:03 +01:00 · 2022-12-05 14:48:03 +01:00 · 2345b6b558
commit 2345b6b558
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -810,7 +810,7 @@ def changedetection_app(config=None, datastore_o=None):

        return redirect(url_for('index'))

-    @app.route("/diff/<string:uuid>", methods=['GET'])
+    @app.route("/diff/<string:uuid>", methods=['GET', 'POST'])
    @login_required
    def diff_history_page(uuid):

@ -818,6 +818,7 @@ def changedetection_app(config=None, datastore_o=None):
        if uuid == 'first':
            uuid = list(datastore.data['watching'].keys()).pop()

+
        extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
        try:
            watch = datastore.data['watching'][uuid]
@ -825,6 +826,23 @@ def changedetection_app(config=None, datastore_o=None):
            flash("No history found for the specified link, bad link?", "error")
            return redirect(url_for('index'))

+        # For submission of requesting an extract
+        if request.method == 'POST':
+            # A missing field must not crash on None.strip(); an empty pattern would match everywhere, so skip it
+            extract_regex = (request.form.get('extract_regex') or '').strip()
+            output = watch.extract_regex_from_all_history(extract_regex) if extract_regex else False
+            if output:
+                watch_dir = os.path.join(datastore_o.datastore_path, uuid)
+                response = make_response(send_from_directory(directory=watch_dir, path=output, as_attachment=True))
+                response.headers['Content-type'] = 'text/csv'
+                response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
+                response.headers['Pragma'] = 'no-cache'
+                response.headers['Expires'] = 0
+                return response
+            # Nothing matched: the redirect has to be returned, otherwise it is discarded and the page renders anyway
+            flash('Nothing matches that RegEx', 'error')
+            return redirect(url_for('diff_history_page', uuid=uuid)+'#extract')
+
        history = watch.history
        dates = list(history.keys())

@ -866,24 +884,28 @@ def changedetection_app(config=None, datastore_o=None):
        is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
                    watch.get('fetch_backend', None) is None and system_uses_webdriver) else False

+        from changedetectionio import forms
+        extract_form = forms.extractDataForm(request.form)
+
        output = render_template("diff.html",
-                                 watch_a=watch,
-                                 newest=newest_version_file_contents,
-                                 previous=previous_version_file_contents,
-                                 extra_stylesheets=extra_stylesheets,
-                                 dark_mode=getDarkModeSetting(),
-                                 versions=dates[:-1], # All except current/last
-                                 uuid=uuid,
-                                 newest_version_timestamp=dates[-1],
-                                 current_previous_version=str(previous_version),
                                 current_diff_url=watch['url'],
+                                 current_previous_version=str(previous_version),
+                                 dark_mode=getDarkModeSetting(),
+                                 extra_stylesheets=extra_stylesheets,
                                 extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
-                                 left_sticky=True,
-                                 screenshot=screenshot_url,
+                                 extract_form=extract_form,
                                 is_html_webdriver=is_html_webdriver,
                                 last_error=watch['last_error'],
+                                 last_error_screenshot=watch.get_error_snapshot(),
                                 last_error_text=watch.get_error_text(),
-                                 last_error_screenshot=watch.get_error_snapshot()
+                                 left_sticky=True,
+                                 newest=newest_version_file_contents,
+                                 newest_version_timestamp=dates[-1],
+                                 previous=previous_version_file_contents,
+                                 screenshot=screenshot_url,
+                                 uuid=uuid,
+                                 versions=dates[:-1], # All except current/last
+                                 watch_a=watch
                                 )

        return output
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -448,3 +448,9 @@ class globalSettingsForm(Form):
    requests = FormField(globalSettingsRequestForm)
    application = FormField(globalSettingsApplicationForm)
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
+
+
+class extractDataForm(Form):
+    """Form for the 'Extract Data' tab of the diff page: one RegEx input plus the 'Extract as CSV' submit button."""
+    extract_regex = StringField('RegEx to extract')
+    extract_submit_button = SubmitField('Extract as CSV', render_kw={"class": "pure-button pure-button-primary"})
+
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -318,3 +318,47 @@ class model(dict):
        if os.path.isfile(fname):
            return fname
        return False
+
+    def extract_regex_from_all_history(self, regex):
+        """Write every match of `regex` found in the history snapshots to a CSV report.
+
+        Each match becomes one row: the history key, that key formatted as a date/time,
+        then the matched text (one column per group when the pattern has several groups).
+        :param regex: pattern passed to re.findall() with re.MULTILINE
+        :return: the CSV filename inside self.watch_data_dir, or False when nothing matched
+        """
+        import csv
+        import re
+        import datetime
+        csv_output_filename = False
+        csv_writer = False
+        csv_file = None
+        try:
+            # self.history will be keyed with the full path
+            for k, fname in self.history.items():
+                if not os.path.isfile(fname):
+                    continue
+                # Distinct name from csv_file, so the report handle is never shadowed and lost
+                with open(fname, "r") as snapshot:
+                    contents = snapshot.read()
+                res = re.findall(regex, contents, re.MULTILINE)
+                if not res:
+                    continue
+                if not csv_writer:
+                    # A file on the disk can be transferred much faster via flask than a string reply
+                    csv_output_filename = 'report.csv'
+                    # newline='' is what the csv module asks for, so row endings are written untranslated
+                    csv_file = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w', newline='')
+                    # @todo some headers in the future
+                    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+                    csv_writer.writerow(['Epoch seconds', 'Date'])
+                date_str = datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')
+                for r in res:
+                    # findall() yields a str for zero/one group and a tuple for several groups
+                    matched_columns = [r] if isinstance(r, str) else list(r)
+                    csv_writer.writerow([k, date_str] + matched_columns)
+        finally:
+            if csv_file:
+                csv_file.close()
+
+        return csv_output_filename
--- a/changedetectionio/static/js/diff-overview.js
+++ b/changedetectionio/static/js/diff-overview.js
@ -13,6 +13,8 @@ $(document).ready(function () {
        } else if (hash_name === '#error-screenshot') {
            $("img#error-screenshot-img").attr('src', error_screenshot_url);
            $("#settings").hide();
+        } else if (hash_name === '#extract') {
+            $("#settings").hide();
        }


--- a/changedetectionio/static/styles/diff.css
+++ b/changedetectionio/static/styles/diff.css
@ -132,8 +132,9 @@ html[data-darkmode="true"] {
  padding: 2em;
  margin-left: 1em;
  margin-right: 1em;
-  border-radius: 5px;
-  font-size: 11px; }
+  border-radius: 5px; }
+  #diff-ui #text {
+    font-size: 11px; }
  #diff-ui table {
    table-layout: fixed;
    width: 100%; }
--- a/changedetectionio/static/styles/scss/diff.scss
+++ b/changedetectionio/static/styles/scss/diff.scss
@ -7,7 +7,11 @@
  margin-left: 1em;
  margin-right: 1em;
  border-radius: 5px;
-  font-size: 11px;
+
+  // The first tab 'text' diff
+  #text {
+    font-size: 11px;
+  }

  table {
    table-layout: fixed;
--- a/changedetectionio/templates/diff.html
+++ b/changedetectionio/templates/diff.html
@ -1,5 +1,5 @@
 {% extends 'base.html' %}
-
+{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
 {% block content %}
 <script>
    const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
@ -58,6 +58,7 @@
        {% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
        <li class="tab" id=""><a href="#text">Text</a></li>
        <li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
+        <li class="tab" id="extract-tab"><a href="#extract">Extract Data</a></li>
    </ul>
 </div>

@ -108,6 +109,37 @@
           <strong>Screenshot requires Playwright/WebDriver enabled</strong>
         {% endif %}
     </div>
+    {# Extract tab: posts a RegEx back to the diff page, which replies with a CSV download #}
+    <div class="tab-pane-inner" id="extract">
+        <form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
+              action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
+              method="POST">
+            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
+
+            <p>This tool will extract text data from all of the watch history.</p>
+
+            <div class="pure-control-group">
+                {{ render_field(extract_form.extract_regex) }}
+                <span class="pure-form-message-inline">
+                    A <strong>RegEx</strong> is a pattern that identifies exactly which part inside of the text that you want to extract.<br/>
+
+                    <p>
+                        For example, to extract only the numbers from text &dash;<br/>
+                        <strong>Raw text</strong>: <code>Temperature <span style="color: red">5.5</span>°C in Sydney</code><br/>
+                        <strong>RegEx to extract:</strong> <code>Temperature ([0-9\.]+)</code><br/>
+                    </p>
+                    <p>
+                        <a href="https://RegExr.com/">Be sure to test your RegEx here.</a>
+                    </p>
+                    <p>
+                        Each RegEx group bracket <code>()</code> will be in its own column, the first column value is always the date.
+                    </p>
+                </span>
+            </div>
+            <div class="pure-control-group">
+                {{ render_button(extract_form.extract_submit_button) }}
+            </div>
+        </form>
+    </div>
 </div>

 <script>
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -244,6 +244,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
                          <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
                        {% endif %}
                        <span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/>
+
                    <ul>
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
--- a/changedetectionio/tests/test_extract_csv.py
+++ b/changedetectionio/tests/test_extract_csv.py
@ -0,0 +1,70 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from urllib.request import urlopen
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
+
+sleep_time_for_fetch_thread = 3
+
+
+
+def test_check_extract_text_from_diff(client, live_server):
+    """POSTing a RegEx to the diff page must return a CSV holding the matched group of each snapshot."""
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("Now it's {} seconds since epoch, time flies!".format(str(time.time())))
+
+    live_server_setup(live_server)
+
+    # Add our URL to the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": url_for('test_endpoint', _external=True)},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+    time.sleep(1)
+
+    # Load in 5 different numbers/changes
+    last_date = ""
+    for n in range(5):
+        # Give the thread time to pick it up
+        print("Bumping snapshot and checking.. ", n)
+        last_date = str(time.time())
+        with open("test-datastore/endpoint-content.txt", "w") as f:
+            f.write("Now it's {} seconds since epoch, time flies!".format(last_date))
+
+        client.get(url_for("form_watch_checknow"), follow_redirects=True)
+        wait_for_all_checks(client)
+
+    # Raw string: "\." is a regex escape here, not an (invalid, deprecated) Python string escape
+    res = client.post(
+        url_for("diff_history_page", uuid="first"),
+        data={"extract_regex": r"Now it's ([0-9\.]+)",
+              "extract_submit_button": "Extract as CSV"},
+        follow_redirects=False
+    )
+
+    assert b'Nothing matches that RegEx' not in res.data
+    assert res.content_type == 'text/csv'
+
+    # Read the csv reply as stringio
+    from io import StringIO
+    import csv
+
+    f = StringIO(res.data.decode('utf-8'))
+    reader = csv.reader(f, delimiter=',')
+    output = list(reader)
+
+    assert output[0][0] == 'Epoch seconds'
+
+    # Header line + 1 origin/first + 5 changes
+    assert len(output) == 7
+
+    # We expect to find the last bumped date in the changes in the last field of the spreadsheet
+    assert output[6][2] == last_date
+    # And nothing else, only that group () of the decimal and .
+    assert "time flies" not in output[6][2]