Re #2945 - Handle/Strip UTF-8 ByteOrderMark in JSON strings correctly (fixes `"Exception: No parsable JSON found in this document" ` error) (#2947)

2025-02-07 22:19:23 +01:00 · 2025-02-07 22:19:23 +01:00 · 55da48f719
commit 55da48f719
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -1,5 +1,6 @@
-from typing import List
+from loguru import logger
 from lxml import etree
+from typing import List
 import json
 import re

@ -298,8 +299,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
    try:
-        stripped_text_from_html = _parse_json(json.loads(content), json_filter)
-    except json.JSONDecodeError:
+        # .lstrip("\ufeff") strips a leading UTF-8 Byte Order Mark (BOM) so json.loads() can parse the rest of the text
+        stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
+    except json.JSONDecodeError as e:
+        logger.warning(str(e))

        # Foreach <script json></script> blob.. just return the first that matches json_filter
        # As a last resort, try to parse the whole <body>
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@ -514,3 +514,15 @@ def test_check_jq_ext_filter(client, live_server, measure_memory_usage):
 def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage):
    if jq_support:
        check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
+
+def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage):
+    from .. import html_tools
+
+    # JSON string with BOM and correct double-quoted keys
+    json_str = '\ufeff{"name": "José", "emoji": "😊", "language": "中文", "greeting": "Привет"}'
+
+    # The leading BOM should be stripped so the string parses as plain JSON and the filter matches
+    text = html_tools.extract_json_as_string(json_str, "json:$.name")
+    assert text == '"José"'
+
+