kopia lustrzana https://github.com/dgtlmoon/changedetection.io
* Use parsable JSON hiding in <script type="application/ld+json"> where possible; if it matches the filter rule, use it. * Update README.md (pull/159/head)
rodzic
b87236ea20
commit
e2304b2ce0
14
README.md
14
README.md
|
@ -102,6 +102,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
#### Parse JSON embedded in HTML!
|
||||||
|
|
||||||
|
When you enable a `json:` filter, you can even automatically extract and parse JSON embedded inside an HTML page! This is amazingly handy for sites that build their content from JSON, such as many e-commerce websites.
|
||||||
|
|
||||||
|
```
|
||||||
|
<html>
|
||||||
|
...
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","price": 23.50 }
|
||||||
|
</script>
|
||||||
|
```
|
||||||
|
|
||||||
|
`json:$.price` would give `23.5` (the value is re-serialised as JSON, so the trailing zero of `23.50` is dropped), or you can extract the whole structure.
|
||||||
|
|
||||||
### Proxy
|
### Proxy
|
||||||
|
|
||||||
A proxy for ChangeDetection.io can be configured by setting environment the
|
A proxy for ChangeDetection.io can be configured by setting environment the
|
||||||
|
|
|
@ -92,27 +92,8 @@ class perform_site_check():
|
||||||
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
|
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
|
||||||
if css_filter_rule and len(css_filter_rule.strip()):
|
if css_filter_rule and len(css_filter_rule.strip()):
|
||||||
if 'json:' in css_filter_rule:
|
if 'json:' in css_filter_rule:
|
||||||
# POC hack, @todo rename vars, see how it fits in with the javascript version
|
stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
|
||||||
import json
|
|
||||||
from jsonpath_ng import jsonpath, parse
|
|
||||||
|
|
||||||
json_data = json.loads(html)
|
|
||||||
jsonpath_expression = parse(css_filter_rule.replace('json:', ''))
|
|
||||||
match = jsonpath_expression.find(json_data)
|
|
||||||
s = []
|
|
||||||
|
|
||||||
# More than one result, we will return it as a JSON list.
|
|
||||||
if len(match) > 1:
|
|
||||||
for i in match:
|
|
||||||
s.append(i.value)
|
|
||||||
|
|
||||||
# Single value, use just the value, as it could be later used in a token in notifications.
|
|
||||||
if len(match) == 1:
|
|
||||||
s = match[0].value
|
|
||||||
|
|
||||||
stripped_text_from_html = json.dumps(s, indent=4)
|
|
||||||
is_html = False
|
is_html = False
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||||
html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
|
html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
|
||||||
|
|
|
@ -1,6 +1,12 @@
|
||||||
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from jsonpath_ng import parse
|
||||||
|
|
||||||
|
|
||||||
|
class JSONNotFound(ValueError):
    """Raised when no JSON satisfying the requested rule can be located."""

    def __init__(self, msg):
        # Hand the message to ValueError so str(exc) and exc.args carry it.
        super().__init__(msg)
|
||||||
|
|
||||||
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
||||||
def css_filter(css_filter, html_content):
|
def css_filter(css_filter, html_content):
|
||||||
soup = BeautifulSoup(html_content, "html.parser")
|
soup = BeautifulSoup(html_content, "html.parser")
|
||||||
|
@ -24,3 +30,54 @@ def extract_element(find='title', html_content=''):
|
||||||
|
|
||||||
return element_text
|
return element_text
|
||||||
|
|
||||||
|
#
|
||||||
|
def _parse_json(json_data, jsonpath_filter):
    """Apply a JSONPath rule to decoded JSON and return the result as JSON text.

    Args:
        json_data: Already-decoded JSON (for example the output of
            ``json.loads``).
        jsonpath_filter: The JSONPath rule. Any ``json:`` marker in it is
            removed before the rule is parsed.

    Returns:
        The matched data serialised with ``json.dumps(..., indent=4)``.
        A single match is serialised as the bare value (so it can be used
        as a token in notifications); several matches are serialised as a
        JSON list of the matched values.

    Raises:
        JSONNotFound: If the rule matches nothing in ``json_data``.
    """
    jsonpath_rule = jsonpath_filter.replace('json:', '')
    matches = parse(jsonpath_rule).find(json_data)

    # Decide "not found" from the list of matches, not from the matched
    # value: a legitimately matched 0, false, "", [] or null is falsy and
    # must still be returned rather than reported as missing.
    if not matches:
        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(jsonpath_rule))

    if len(matches) == 1:
        # Single value, use just the value, as it could be later used in a token in notifications.
        result = matches[0].value
    else:
        # More than one result, we will return it as a JSON list.
        result = [match.value for match in matches]

    return json.dumps(result, indent=4)
|
||||||
|
|
||||||
|
def extract_json_as_string(content, jsonpath_filter):
    """Return the JSON selected by ``jsonpath_filter`` from ``content`` as text.

    ``content`` is first treated as a plain JSON document. If it does not
    decode as JSON it is parsed as HTML and every ``<script>`` element is
    tried in document order; the first one whose text decodes as JSON *and*
    matches the rule wins.

    Args:
        content: A JSON document, or HTML that embeds JSON inside
            ``<script>`` elements (e.g. ``application/ld+json``).
        jsonpath_filter: The JSONPath rule, passed through to ``_parse_json``.

    Returns:
        The string produced by ``_parse_json`` for the first matching source.

    Raises:
        JSONNotFound: If the document has no ``<script>`` elements, none of
            them holds decodable JSON, or no decodable JSON matches the rule.
    """
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
    try:
        json_data = json.loads(content)
    except json.JSONDecodeError:
        pass
    else:
        return _parse_json(json_data, jsonpath_filter)

    soup = BeautifulSoup(content, 'html.parser')
    script_tags = soup.find_all('script')

    if not script_tags:
        raise JSONNotFound("No parsable JSON found in this document")

    # Remember a rule miss so the final error says "rule did not match"
    # rather than "no JSON" when decodable JSON was actually present.
    rule_miss = None

    # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
    for tag in script_tags:
        # .string is None when the tag has no single text child (for
        # example <script src="..."></script>); json.loads(None) would
        # raise TypeError, so skip such tags up front.
        if not tag.string:
            continue

        try:
            json_data = json.loads(tag.string)
        except json.JSONDecodeError:
            # Not JSON (ordinary JavaScript, broken blob, ...), just skip it
            continue

        try:
            return _parse_json(json_data, jsonpath_filter)
        except JSONNotFound as e:
            # Valid JSON but the rule does not match it; a later blob may.
            rule_miss = e

    if rule_miss is not None:
        raise rule_miss

    raise JSONNotFound("No parsable JSON found in this document")
|
||||||
|
|
|
@ -4,6 +4,42 @@ import time
|
||||||
from flask import url_for
|
from flask import url_for
|
||||||
from . util import live_server_setup
|
from . util import live_server_setup
|
||||||
|
|
||||||
|
def test_unittest_inline_html_extract():
    """Check JSON extraction from embedded <script> blobs and from plain JSON.

    Covers three paths of ``html_tools.extract_json_as_string``: skipping
    non-JSON and broken script blobs to reach a matching one, handling a
    document that is itself JSON, and raising ``JSONNotFound`` when the
    input holds no JSON at all.
    """
    # So lets pretend that the JSON we want is inside some HTML
    content = """
<html>

food and stuff and more
<script>
alert('nothing really good here');
</script>

<script type="application/ld+json">
xx {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
</script>
<body>
and it can also be repeated
<script type="application/ld+json">
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
</script>
<h4>ok</h4>
</body>
</html>

"""
    from .. import html_tools

    # See that we can find the second <script> one, which is not broken, and matches our filter
    text = html_tools.extract_json_as_string(content, "$.offers.price")
    assert text == "23.5"

    # A document that is plain JSON is filtered directly
    text = html_tools.extract_json_as_string('{"id":5}', "$.id")
    assert text == "5"

    # Input with no JSON and no <script> elements must raise JSONNotFound.
    # try/except/else keeps this check free of any extra test-framework import.
    try:
        html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
    except html_tools.JSONNotFound:
        pass
    else:
        raise AssertionError("JSONNotFound was not raised for input without any JSON")
|
||||||
|
|
||||||
|
|
||||||
def test_setup(live_server):
|
def test_setup(live_server):
|
||||||
live_server_setup(live_server)
|
live_server_setup(live_server)
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue