kopia lustrzana https://github.com/dgtlmoon/changedetection.io
* Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.mdpull/159/head
rodzic
b87236ea20
commit
e2304b2ce0
14
README.md
14
README.md
|
@ -102,6 +102,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
|
|||
|
||||

|
||||
|
||||
#### Parse JSON embedded in HTML!
|
||||
|
||||
When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
|
||||
|
||||
```
|
||||
<html>
|
||||
...
|
||||
<script type="application/ld+json">
|
||||
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","price": 23.50 }
|
||||
</script>
|
||||
```
|
||||
|
||||
`json:$.price` would give `23.5` (the number is re-serialised, so the trailing zero is dropped), or you can extract the whole structure.
|
||||
|
||||
### Proxy
|
||||
|
||||
A proxy for ChangeDetection.io can be configured by setting environment the
|
||||
|
|
|
@ -92,27 +92,8 @@ class perform_site_check():
|
|||
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
|
||||
if css_filter_rule and len(css_filter_rule.strip()):
|
||||
if 'json:' in css_filter_rule:
|
||||
# POC hack, @todo rename vars, see how it fits in with the javascript version
|
||||
import json
|
||||
from jsonpath_ng import jsonpath, parse
|
||||
|
||||
json_data = json.loads(html)
|
||||
jsonpath_expression = parse(css_filter_rule.replace('json:', ''))
|
||||
match = jsonpath_expression.find(json_data)
|
||||
s = []
|
||||
|
||||
# More than one result, we will return it as a JSON list.
|
||||
if len(match) > 1:
|
||||
for i in match:
|
||||
s.append(i.value)
|
||||
|
||||
# Single value, use just the value, as it could be later used in a token in notifications.
|
||||
if len(match) == 1:
|
||||
s = match[0].value
|
||||
|
||||
stripped_text_from_html = json.dumps(s, indent=4)
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
|
||||
is_html = False
|
||||
|
||||
else:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from jsonpath_ng import parse
|
||||
|
||||
|
||||
class JSONNotFound(ValueError):
    """Raised when no JSON matching the requested filter can be located."""

    def __init__(self, msg):
        # Delegate to ValueError so the message ends up in ``args`` / ``str()``.
        super().__init__(msg)
|
||||
|
||||
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
||||
def css_filter(css_filter, html_content):
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
|
@ -24,3 +30,54 @@ def extract_element(find='title', html_content=''):
|
|||
|
||||
return element_text
|
||||
|
||||
#
|
||||
def _parse_json(json_data, jsonpath_filter):
    """Apply a JSONPath rule to decoded JSON and return the result as a JSON string.

    :param json_data: Already-decoded JSON data (the output of ``json.loads``).
    :param jsonpath_filter: JSONPath rule; any ``json:`` marker in it is removed
        before the rule is compiled.
    :return: The matched data serialised with ``json.dumps(..., indent=4)``.
        A single match is serialised as the bare value, several matches are
        serialised as a list of the matched values.
    :raises JSONNotFound: When the rule matches nothing in ``json_data``.
    """
    rule = jsonpath_filter.replace('json:', '')
    matches = parse(rule).find(json_data)

    # Decide on the list of matches, not on the matched value: a legitimately
    # falsy value (0, false, "", null, [] or {}) is still a successful match.
    if not matches:
        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(rule))

    if len(matches) == 1:
        # Single value, use just the value, as it could be later used in a token in notifications.
        result = matches[0].value
    else:
        # More than one result, we will return it as a JSON list.
        result = [found.value for found in matches]

    return json.dumps(result, indent=4)
|
||||
|
||||
def extract_json_as_string(content, jsonpath_filter):
    """Return the data selected by a JSONPath rule from JSON, or from JSON embedded in HTML.

    ``content`` is first decoded as a JSON document. If that fails it is parsed
    as HTML and every ``<script>`` element is tried in document order; the
    first one that both decodes as JSON and matches the rule is used.

    :param content: JSON text, or HTML text carrying JSON inside ``<script>`` elements.
    :param jsonpath_filter: JSONPath rule, handed unchanged to ``_parse_json``.
    :return: The string produced by ``_parse_json`` for the first matching source.
    :raises JSONNotFound: When ``content`` is JSON but the rule matches nothing,
        when the HTML has no ``<script>`` elements, or when none of the
        ``<script>`` elements holds JSON that matches the rule.
    """
    # Try to parse/filter out the JSON, if we get some parser error, then maybe
    # it's embedded <script type=ldjson>
    try:
        document = json.loads(content)
    except json.JSONDecodeError:
        # Not a plain JSON document; fall through to the <script> search below.
        pass
    else:
        return _parse_json(document, jsonpath_filter)

    scripts = BeautifulSoup(content, 'html.parser').find_all('script')
    if not scripts:
        raise JSONNotFound("No parsable JSON found in this document")

    # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
    for script in scripts:
        # ``.string`` can be None (for example a <script src=...> without an
        # inline body); json.loads() would raise TypeError on that.
        if not script.string:
            continue

        try:
            candidate = json.loads(script.string)
        except json.JSONDecodeError:
            # Plain JavaScript or broken JSON, just skip it.
            continue

        try:
            return _parse_json(candidate, jsonpath_filter)
        except JSONNotFound:
            # Valid JSON, but the rule does not match it; a later <script>
            # may still match, so keep looking instead of giving up here.
            continue

    # Raise instead of returning a non-string sentinel so that every failure
    # path reports the problem the same way.
    raise JSONNotFound(
        "No JSON matching the rule {} could be found in this document".format(
            jsonpath_filter.replace('json:', '')))
|
||||
|
|
|
@ -4,6 +4,42 @@ import time
|
|||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
|
||||
def test_unittest_inline_html_extract():
    """Exercise html_tools.extract_json_as_string on embedded JSON, plain JSON and non-JSON input."""
    # So lets pretend that the JSON we want is inside some HTML
    content = """
    <html>

    food and stuff and more
    <script>
    alert('nothing really good here');
    </script>

    <script type="application/ld+json">
    xx {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
    </script>
    <body>
    and it can also be repeated
    <script type="application/ld+json">
    {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
    </script>
    <h4>ok</h4>
    </body>
    </html>

    """
    from .. import html_tools

    # See that we can find the second <script> one, which is not broken, and matches our filter
    text = html_tools.extract_json_as_string(content, "$.offers.price")
    assert text == "23.5"

    # A plain JSON document is filtered directly, without any HTML parsing.
    text = html_tools.extract_json_as_string('{"id":5}', "$.id")
    assert text == "5"

    # Content that is neither JSON nor HTML containing a <script> element must
    # be reported with JSONNotFound rather than returning a value.
    try:
        html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
    except html_tools.JSONNotFound:
        pass
    else:
        raise AssertionError("JSONNotFound was not raised for content without any JSON")
|
||||
|
||||
|
||||
def test_setup(live_server):
    """Pass the ``live_server`` fixture to ``live_server_setup`` before the remaining tests run."""
    live_server_setup(live_server)
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue