Re #154 Ldjson extract parse (#158)

* Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md
2021-07-25 07:02:19 +02:00 · 2021-07-25 07:02:19 +02:00 · e2304b2ce0
commit e2304b2ce0
--- a/README.md
+++ b/README.md
@ -102,6 +102,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
 ![image](https://user-images.githubusercontent.com/275001/125165995-d9ea5580-e1dc-11eb-8030-f0deced2661a.png)
 #### Parse JSON embedded in HTML!
 When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. 
 ```
 <html>
 ...
 <script type="application/ld+json">
  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","price": 23.50 }
 </script>
 ```  
 `json:$.price` would give `23.5` (the value is re-serialised as JSON, so the trailing zero is dropped), or you can extract the whole structure
 ### Proxy
 A proxy for ChangeDetection.io can be configured by setting environment the 
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@ -92,27 +92,8 @@ class perform_site_check():
            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
                if 'json:' in css_filter_rule:
-                    # POC hack, @todo rename vars, see how it fits in with the javascript version
+                    stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
                    import json
                    from jsonpath_ng import jsonpath, parse
                    json_data = json.loads(html)
                    jsonpath_expression = parse(css_filter_rule.replace('json:', ''))
                    match = jsonpath_expression.find(json_data)
                    s = []
                    # More than one result, we will return it as a JSON list.
                    if len(match) > 1:
                        for i in match:
                            s.append(i.value)
                    # Single value, use just the value, as it could be later used in a token in notifications.
                    if len(match) == 1:
                        s = match[0].value
                    stripped_text_from_html = json.dumps(s, indent=4)
                    is_html = False
                else:
                    # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                    html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
--- a/backend/html_tools.py
+++ b/backend/html_tools.py
@ -1,6 +1,12 @@
 import json
 from bs4 import BeautifulSoup
 from jsonpath_ng import parse
 class JSONNotFound(ValueError):
    """Signals that no usable JSON could be located for a filter rule."""

    def __init__(self, msg):
        # Delegate straight to ValueError so str(exc) and exc.args carry msg.
        super().__init__(msg)
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def css_filter(css_filter, html_content):
    soup = BeautifulSoup(html_content, "html.parser")
@ -24,3 +30,54 @@ def extract_element(find='title', html_content=''):
    return element_text
 #
 def _parse_json(json_data, jsonpath_filter):
    """Apply a JSONPath rule to decoded JSON and return the result as text.

    :param json_data: Already-decoded JSON (the output of ``json.loads``).
    :param jsonpath_filter: JSONPath expression; any ``json:`` marker in it
        is removed before the expression is compiled.
    :return: The match serialised with ``json.dumps(..., indent=4)``. A single
        match is serialised as the bare value, several matches as a JSON list.
    :raises JSONNotFound: When the expression matches nothing.
    """
    jsonpath_rule = jsonpath_filter.replace('json:', '')
    match = parse(jsonpath_rule).find(json_data)

    # Decide on the match list, not on the extracted value: a legitimate
    # single result such as 0, false, "", [] or null is falsy and must not be
    # reported as "not found".
    if not match:
        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(jsonpath_rule))

    if len(match) == 1:
        # Single value, use just the value, as it could be later used in a token in notifications.
        s = match[0].value
    else:
        # More than one result, we will return it as a JSON list.
        s = [i.value for i in match]

    return json.dumps(s, indent=4)
 def extract_json_as_string(content, jsonpath_filter):
    """Extract the JSON selected by a JSONPath rule from JSON or HTML content.

    The content is first treated as a plain JSON document. If it does not
    decode, it is parsed as HTML and every ``<script>`` element is tried in
    document order; the first one that holds decodable JSON *and* matches the
    rule wins.

    :param content: Raw JSON text, or HTML that may embed JSON inside
        ``<script>`` elements (for example ``application/ld+json``).
    :param jsonpath_filter: JSONPath expression, optionally carrying the
        ``json:`` marker.
    :return: The matching JSON as a string, or ``False`` when the document has
        ``<script>`` elements but none of them contains decodable JSON.
    :raises JSONNotFound: When the document has no ``<script>`` element at
        all, or when JSON was decoded but nothing matched the rule.
    """
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
    try:
        return _parse_json(json.loads(content), jsonpath_filter)
    except json.JSONDecodeError:
        pass

    soup = BeautifulSoup(content, 'html.parser')
    bs_result = soup.findAll('script')
    if not bs_result:
        raise JSONNotFound("No parsable JSON found in this document")

    # Remember the first "decoded but did not match" error so it can still be
    # raised if no later <script> blob satisfies the rule.
    first_miss = None

    # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
    for result in bs_result:
        # .string is None/empty for script tags without a single text child
        # (e.g. <script src=...>); json.loads(None) would raise TypeError.
        if not result.string:
            continue

        try:
            json_data = json.loads(result.string)
        except json.JSONDecodeError:
            # Just skip it
            continue

        try:
            return _parse_json(json_data, jsonpath_filter)
        except JSONNotFound as e:
            # Valid JSON, but not the blob we are after; keep looking.
            if first_miss is None:
                first_miss = e

    if first_miss is not None:
        raise first_miss

    # No <script> element contained decodable JSON.
    return False
--- a/backend/tests/test_jsonpath_selector.py
+++ b/backend/tests/test_jsonpath_selector.py
@ -4,6 +4,42 @@ import time
 from flask import url_for
 from . util import live_server_setup
 def test_unittest_inline_html_extract():
    """Check JSON extraction from inline HTML, from plain JSON, and from junk input."""
    # So lets pretend that the JSON we want is inside some HTML
    content="""
    <html>
    food and stuff and more
    <script>
    alert('nothing really good here');
    </script>
    <script type="application/ld+json">
  xx {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
 </script>
 <body>
 and it can also be repeated
 <script type="application/ld+json">
  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
 </script>
 <h4>ok</h4>
 </body>
 </html>
    """
    from .. import html_tools

    # See that we can find the second <script> one, which is not broken, and matches our filter
    text = html_tools.extract_json_as_string(content, "$.offers.price")
    assert text == "23.5"

    # Content that is already plain JSON is filtered directly
    text = html_tools.extract_json_as_string('{"id":5}', "$.id")
    assert text == "5"

    # Input that is neither JSON nor HTML containing a <script> tag must be
    # reported with JSONNotFound (try/except/else avoids needing a new import).
    try:
        html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
    except html_tools.JSONNotFound:
        pass
    else:
        raise AssertionError("Expected JSONNotFound for content without any JSON")
 def test_setup(live_server):
    live_server_setup(live_server)