kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Automatically offer to track LD+JSON product price data (#1204)
rodzic
f7bb8a0afa
commit
b58fd995b5
|
@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
|
|||
import changedetectionio.blueprint.browser_steps as browser_steps
|
||||
app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
|
||||
|
||||
import changedetectionio.blueprint.price_data_follower as price_data_follower
|
||||
app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
|
||||
|
||||
|
||||
# @todo handle ctrl break
|
||||
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
|
||||
threading.Thread(target=notification_runner).start()
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
|
||||
from distutils.util import strtobool
|
||||
from flask import Blueprint, flash, redirect, url_for
|
||||
from flask_login import login_required
|
||||
from changedetectionio.store import ChangeDetectionStore
|
||||
|
||||
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the blueprint with the accept/reject endpoints for LD+JSON price tracking.

    Both endpoints record the user's choice in the watch's
    'track_ldjson_price_data' field ('accepted' or 'rejected') and redirect.

    :param datastore: store whose data['watching'][uuid] entries are updated.
    :return: the configured 'price_data_follower' Blueprint.
    """
    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    # Decorators apply bottom-up, so `route` has to be the outermost one:
    # with `login_required` on top, the route registers the *unwrapped* view
    # and the login check is never executed.
    # NOTE(review): an unknown uuid presumably raises KeyError here (HTTP 500) - confirm
    # whether a friendlier response is wanted.
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    @login_required
    def accept(uuid):
        """Start following the embedded price data, then go and recheck the watch."""
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
        return redirect(url_for("form_watch_checknow", uuid=uuid))

    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    @login_required
    def reject(uuid):
        """Remember that the offer was declined and return to the overview."""
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
        return redirect(url_for("index"))

    return price_data_follower_blueprint
|
||||
|
||||
|
|
@ -2,7 +2,6 @@ import hashlib
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib3
|
||||
|
||||
from changedetectionio import content_fetcher, html_tools
|
||||
|
@ -140,7 +139,7 @@ class perform_site_check():
|
|||
is_html = False
|
||||
is_json = False
|
||||
|
||||
include_filters_rule = watch.get('include_filters', [])
|
||||
include_filters_rule = deepcopy(watch.get('include_filters', []))
|
||||
# include_filters_rule = watch['include_filters']
|
||||
subtractive_selectors = watch.get(
|
||||
"subtractive_selectors", []
|
||||
|
@ -148,6 +147,10 @@ class perform_site_check():
|
|||
"global_subtractive_selectors", []
|
||||
)
|
||||
|
||||
# Inject a virtual LD+JSON price tracker rule
|
||||
if watch.get('track_ldjson_price_data'):
|
||||
include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
|
||||
|
||||
has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
|
||||
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
||||
|
||||
|
@ -173,9 +176,13 @@ class perform_site_check():
|
|||
# Don't run get_text or xpath/css filters on plaintext
|
||||
stripped_text_from_html = html_content
|
||||
else:
|
||||
# Does it have some ld+json price data? used for easier monitoring
|
||||
update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
|
||||
|
||||
# Then we assume HTML
|
||||
if has_filter_rule:
|
||||
html_content = ""
|
||||
|
||||
for filter_rule in include_filters_rule:
|
||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
||||
|
|
|
@ -10,6 +10,10 @@ import re
|
|||
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
|
||||
TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"
|
||||
|
||||
# 'price' , 'lowPrice', 'highPrice' are usually under here
|
||||
# all of those may or may not appear on different websites
|
||||
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
|
||||
|
||||
class JSONNotFound(ValueError):
    """Raised when no usable JSON could be located in the content being filtered."""

    def __init__(self, msg):
        # Delegate to ValueError so the message ends up in .args / str()
        super().__init__(msg)
|
||||
|
@ -127,8 +131,10 @@ def _get_stripped_text_from_json_match(match):
|
|||
|
||||
return stripped_text_from_html
|
||||
|
||||
def extract_json_as_string(content, json_filter):
|
||||
|
||||
# content - json
|
||||
# json_filter - ie json:$..price
|
||||
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
|
||||
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
|
||||
stripped_text_from_html = False
|
||||
|
||||
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
|
||||
|
@ -139,7 +145,12 @@ def extract_json_as_string(content, json_filter):
|
|||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||
s = []
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
bs_result = soup.findAll('script')
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
bs_result = soup.findAll('script', {"type": "application/ld+json"})
|
||||
else:
|
||||
bs_result = soup.findAll('script')
|
||||
|
||||
|
||||
if not bs_result:
|
||||
raise JSONNotFound("No parsable JSON found in this document")
|
||||
|
@ -156,7 +167,14 @@ def extract_json_as_string(content, json_filter):
|
|||
continue
|
||||
else:
|
||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||
if stripped_text_from_html:
|
||||
if ensure_is_ldjson_info_type:
|
||||
# Could sometimes be list, string or something else random
|
||||
if isinstance(json_data, dict):
|
||||
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
|
||||
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
|
||||
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
|
||||
break
|
||||
elif stripped_text_from_html:
|
||||
break
|
||||
|
||||
if not stripped_text_from_html:
|
||||
|
@ -243,6 +261,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
|||
|
||||
return text_content
|
||||
|
||||
|
||||
# Does LD+JSON exist with a @type=='product' and an 'offers' section (where the price data usually lives)?
|
||||
def has_ldjson_product_info(content):
    """Tell whether the content carries LD+JSON product offer data.

    Runs the LD_JSON_PRODUCT_OFFER_SELECTOR filter through
    extract_json_as_string(), restricted to ld+json of type "product".

    :param content: the fetched document to inspect.
    :return: True when the filter produced something, otherwise False.
    """
    try:
        offers = extract_json_as_string(
            content=content,
            json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR,
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine - the document simply has no matching JSON
        return False

    return bool(offers)
|
||||
|
||||
|
||||
def workarounds_for_obfuscations(content):
|
||||
"""
|
||||
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
|
||||
|
|
|
@ -26,6 +26,8 @@ class model(dict):
|
|||
'extract_title_as_title': False,
|
||||
'fetch_backend': None,
|
||||
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
||||
'has_ldjson_price_data': None,
|
||||
'track_ldjson_price_data': None,
|
||||
'headers': {}, # Extra headers to send
|
||||
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||
'include_filters': [],
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg width="83.39" height="89.648" enable-background="new 0 0 122.406 122.881" version="1.1" viewBox="0 0 83.39 89.648" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(5e-4 -33.234)"><path d="m44.239 42.946-39.111 39.896 34.908 34.91 39.09-39.876-1.149-34.931zm-0.91791 42.273c0.979-0.979 1.507-1.99 1.577-3.027 0.077-1.043-0.248-2.424-0.967-4.135-0.725-1.717-1.348-3.346-1.87-4.885s-0.814-3.014-0.897-4.432c-0.07-1.42 0.134-2.768 0.624-4.045 0.477-1.279 1.348-2.545 2.607-3.804 2.099-2.099 4.535-3.123 7.314-3.065 2.773 0.063 5.457 1.158 8.04 3.294l2.881 3.034c1.946 2.607 2.799 5.33 2.557 8.166-0.235 2.83-1.532 5.426-3.893 7.785l-6.296-6.297c1.291-1.291 2.035-2.531 2.238-3.727 0.191-1.197-0.165-2.252-1.081-3.168-0.821-0.82-1.717-1.195-2.69-1.139-0.967 0.064-1.908 0.547-2.817 1.457-0.922 0.922-1.393 1.914-1.412 2.977s0.306 2.416 0.973 4.064c0.661 1.652 1.24 3.25 1.736 4.801 0.496 1.553 0.782 3.035 0.858 4.445 0.076 1.426-0.127 2.787-0.591 4.104-0.477 1.316-1.336 2.596-2.588 3.848-2.125 2.125-4.522 3.186-7.212 3.18s-5.311-1.063-7.855-3.16l-3.747 3.746-2.964-2.965 3.766-3.764c-2.423-2.996-3.568-5.998-3.447-9.02 0.127-3.014 1.476-5.813 4.045-8.383l6.278 6.277c-1.412 1.412-2.175 2.799-2.277 4.16-0.108 1.367 0.414 2.627 1.571 3.783 0.839 0.84 1.755 1.26 2.741 1.242 0.985-0.017 1.92-0.47 2.798-1.347zm21.127-46.435h17.457c-0.0269 2.2368 0.69936 16.025 0.69936 16.025l0.785 23.858c0.019 0.609-0.221 1.164-0.619 1.564l5e-3 4e-3 -41.236 42.022c-0.82213 0.8378-2.175 0.83-3.004 0l-37.913-37.91c-0.83-0.83-0.83-2.176 0-3.006l41.236-42.021c0.39287-0.42671 1.502-0.53568 1.502-0.53568zm18.011 11.59c-59.392-29.687-29.696-14.843 0 0z"/></g></svg>
|
Po Szerokość: | Wysokość: | Rozmiar: 1.7 KiB |
|
@ -1009,3 +1009,30 @@ ul {
|
|||
border-radius: 5px;
|
||||
color: var(--color-warning);
|
||||
}
|
||||
|
||||
/* automatic price following helpers */
|
||||
.tracking-ldjson-price-data {
|
||||
background-color: var(--color-background-button-green);
|
||||
color: #000;
|
||||
padding: 3px;
|
||||
border-radius: 3px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.ldjson-price-track-offer {
|
||||
a.pure-button {
|
||||
border-radius: 3px;
|
||||
padding: 3px;
|
||||
background-color: var(--color-background-button-green);
|
||||
}
|
||||
|
||||
font-weight: bold;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
.price-follow-tag-icon {
|
||||
display: inline-block;
|
||||
height: 0.8rem;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
|
|
|
@ -945,3 +945,24 @@ ul {
|
|||
display: inline;
|
||||
height: 26px;
|
||||
vertical-align: middle; }
|
||||
|
||||
/* automatic price following helpers */
|
||||
.tracking-ldjson-price-data {
|
||||
background-color: var(--color-background-button-green);
|
||||
color: #000;
|
||||
padding: 3px;
|
||||
border-radius: 3px;
|
||||
white-space: nowrap; }
|
||||
|
||||
.ldjson-price-track-offer {
|
||||
font-weight: bold;
|
||||
font-style: italic; }
|
||||
.ldjson-price-track-offer a.pure-button {
|
||||
border-radius: 3px;
|
||||
padding: 3px;
|
||||
background-color: var(--color-background-button-green); }
|
||||
|
||||
.price-follow-tag-icon {
|
||||
display: inline-block;
|
||||
height: 0.8rem;
|
||||
vertical-align: middle; }
|
||||
|
|
|
@ -250,12 +250,15 @@ class ChangeDetectionStore:
|
|||
def clear_watch_history(self, uuid):
|
||||
import pathlib
|
||||
|
||||
self.__data['watching'][uuid].update(
|
||||
{'last_checked': 0,
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'last_notification_error': False,
|
||||
'last_error': False})
|
||||
self.__data['watching'][uuid].update({
|
||||
'last_checked': 0,
|
||||
'has_ldjson_price_data': None,
|
||||
'last_error': False,
|
||||
'last_notification_error': False,
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'track_ldjson_price_data': None,
|
||||
})
|
||||
|
||||
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
|
||||
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
|
||||
|
|
|
@ -88,9 +88,9 @@
|
|||
</td>
|
||||
<td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
|
||||
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
|
||||
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" /></a>
|
||||
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" title="Create a link to share watch config with others" /></a>
|
||||
|
||||
{%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}
|
||||
{%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
|
||||
|
||||
{% if watch.last_error is defined and watch.last_error != False %}
|
||||
<div class="fetch-error">{{ watch.last_error }}</div>
|
||||
|
@ -98,6 +98,12 @@
|
|||
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}
|
||||
<div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
|
||||
{% endif %}
|
||||
{% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %}
|
||||
<div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
|
||||
{% endif %}
|
||||
{% if watch['track_ldjson_price_data'] == 'accepted' %}
|
||||
<span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="price-follow-tag-icon"/> Price</span>
|
||||
{% endif %}
|
||||
{% if not active_tag %}
|
||||
<span class="watch-tag-list">{{ watch.tag}}</span>
|
||||
{% endif %}
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
|
||||
|
||||
def set_response_with_ldjson():
    """Write the test endpoint page that embeds a schema.org Product as LD+JSON."""
    page = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
<script type="application/ld+json">
{
"@context":"https://schema.org/",
"@type":"Product",
"@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
"name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
"brand":{
"@type":"Brand",
"name":"APPLE"
},
"image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
"description":"You dont need it",
"mpn":"111111",
"sku":"22222",
"offers":{
"@type":"AggregateOffer",
"lowPrice":8097000,
"highPrice":8099900,
"priceCurrency":"COP",
"offers":[
{
"@type":"Offer",
"price":8097000,
"priceCurrency":"COP",
"availability":"http://schema.org/InStock",
"sku":"102375961",
"itemCondition":"http://schema.org/NewCondition",
"seller":{
"@type":"Organization",
"name":"ajax"
}
}
],
"offerCount":1
}
}
</script>
</body>
</html>
"""

    # The file written here is what the test run later fetches as the watched page
    with open("test-datastore/endpoint-content.txt", "w") as fh:
        fh.write(page)
|
||||
|
||||
def set_response_without_ldjson():
    """Write the test endpoint page as plain HTML with no LD+JSON script in it."""
    page = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
</body>
</html>
"""

    # Overwrites whatever page a previous helper call stored
    with open("test-datastore/endpoint-content.txt", "w") as fh:
        fh.write(page)
|
||||
|
||||
# Embedded LD+JSON price data should be offered for tracking, and only that data followed once accepted
|
||||
def test_check_ldjson_price_autodetect(client, live_server):
    """Check the LD+JSON price-tracking offer from detection through to acceptance.

    First part: a page with embedded product data must show the offer; after
    accepting it, the offer is replaced by the tracking badge and the latest
    snapshot holds the price data rather than the page text.
    Second part: a page without LD+JSON must not show the offer.
    """
    live_server_setup(live_server)

    # Give the endpoint time to spin up
    time.sleep(1)

    set_response_with_ldjson()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)

    # Should get a notice that it's available
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' in res.data

    # Accept it
    uuid = extract_UUID_from_client(client)

    # follow_redirects is an option of client.get(); handed to url_for() it would
    # only be appended to the URL as a query-string argument and nothing is followed
    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
    time.sleep(2)

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)
    # Offer should be gone
    res = client.get(url_for("index"))
    assert b'Embedded price data' not in res.data
    assert b'tracking-ldjson-price-data' in res.data

    # and last snapshot (via API) should be just the price
    api_key = extract_api_key_from_UI(client)
    res = client.get(
        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
        headers={'x-api-key': api_key},
    )

    # Should see this (dont know where the whitespace came from)
    assert b'"highPrice": 8099900' in res.data
    # And not this cause its not the ld-json
    assert b"So let's see what happens" not in res.data

    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

    ##########################################################################################
    # And we shouldnt see the offer
    set_response_without_ldjson()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' not in res.data

    ##########################################################################################
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
Ładowanie…
Reference in New Issue