Re #2945 - Handle/Strip UTF-8 ByteOrderMark in JSON strings correctly (fixes `"Exception: No parsable JSON found in this document" ` error) (#2947)

pull/2948/head
dgtlmoon 2025-02-07 22:19:23 +01:00 zatwierdzone przez GitHub
rodzic dbd4adf23a
commit 55da48f719
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
2 zmienionych plików z 18 dodań i 3 usunięć

Wyświetl plik

@ -1,5 +1,6 @@
from typing import List
from loguru import logger
from lxml import etree
from typing import List
import json
import re
@ -298,8 +299,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
try:
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
except json.JSONDecodeError as e:
logger.warning(str(e))
# Foreach <script json></script> blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole <body>

Wyświetl plik

@ -514,3 +514,15 @@ def test_check_jq_ext_filter(client, live_server, measure_memory_usage):
def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage):
from .. import html_tools
# JSON string with BOM and correct double-quoted keys
json_str = '\ufeff{"name": "José", "emoji": "😊", "language": "中文", "greeting": "Привет"}'
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(json_str, "json:$.name")
assert text == '"José"'