Revert memory strategy change for html_to_text (Was hanging under high concurrency setups)

pull/3162/merge
dgtlmoon 2025-05-09 09:44:02 +02:00 committed by GitHub
parent 8df61f5eaa
commit a9ca511004
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 58 additions and 50 deletions

View file

@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
 
     """Converts html string to a string with just the text. If ignoring
     rendering anchor tag content is enable, anchor tag content are also
     included in the text
 
     :param html_content: string with html content
     :param render_anchor_tag_content: boolean flag indicating whether to extract
     hyperlinks (the anchor tag content) together with text. This refers to the
     'href' inside 'a' tags.
     Anchor tag content is rendered in the following manner:
     '[ text ](anchor tag content)'
     :return: extracted text from the HTML
     """
 
     # if anchor tag content flag is set to True define a config for
     # extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
     # otherwise set config to None/default
     else:
         parser_config = None
 
     # RSS Mode - Inscriptis will treat `title` as something else.
     # Make it as a regular block display element (//item/title)
     # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+
+    return text_content
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):

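For context on the hang being reverted: the removed implementation blocks on parent_conn.recv() with no timeout, so a child process that dies before sending its result (OOM-killed, or deadlocked because fork() from a multi-threaded parent cloned a held lock) leaves the fetch worker waiting forever. Below is a minimal sketch of what a timeout-guarded variant could look like; this is hypothetical, not what the commit does (the commit simply runs inscriptis in-process again), and the names _sub_worker / convert_with_timeout are illustrative only:

    from multiprocessing import Process, Pipe

    def _sub_worker(conn, html_content):
        # Child process: convert and send the result back over the pipe.
        # (Stand-in for the real inscriptis get_text() call.)
        conn.send(html_content.upper())
        conn.close()

    def convert_with_timeout(html_content, timeout=10):
        parent_conn, child_conn = Pipe()
        p = Process(target=_sub_worker, args=(child_conn, html_content))
        p.start()
        # poll() bounds the wait; a plain recv() blocks forever if the
        # child never sends (crash, OOM-kill, deadlock after fork).
        result = parent_conn.recv() if parent_conn.poll(timeout) else None
        p.join(timeout=1)
        if p.is_alive():
            p.terminate()  # don't leave a hung child behind
        return result

    if __name__ == '__main__':
        print(convert_with_timeout('<p>hello</p>'))

Note that the restored html_to_text keeps a timeout=10 parameter in its signature even though the in-process implementation no longer uses it.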
View file

@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py

View file

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
 
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
 
     i=0
+    # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
         i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
         assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
 
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
     assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
     json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
     with open(json_db_file, 'r') as f:
         assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

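The watch count now scales with the worker pool, so the consistency check exercises every fetch worker at once. A standalone snippet mirroring the test's range arithmetic (example values in the comments are assumptions):

    import os

    workers = int(os.getenv("FETCH_WORKERS", 10))
    r = range(1, 10 + workers)

    # FETCH_WORKERS unset -> range(1, 20)  -> 19 watches
    # FETCH_WORKERS=130   -> range(1, 140) -> 139 watches, more than one per worker
    print(len(r))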
View file

@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()
 
-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
-    attempt=0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
-            break
-        logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
-        time.sleep(1)
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
+    attempt = 0
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
+                break
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
 
         attempt += 1
         time.sleep(1)
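The core of the rewritten helper is a debounce: a single empty poll is not enough, because a worker can pop the last queue item and still be processing it, so the helper only returns once the queue has stayed empty and all threads idle for required_empty_duration seconds. The same pattern in isolation, as a hypothetical stand-alone helper (wait_until_quiet is not the repo's API):

    import time

    def wait_until_quiet(is_busy, required_empty_duration=2.0,
                         max_attempts=60, poll_interval=1.0):
        """Return True once is_busy() has reported False continuously for
        required_empty_duration seconds; False if max_attempts runs out."""
        quiet_since = None
        for _ in range(max_attempts):
            if not is_busy():
                if quiet_since is None:
                    quiet_since = time.time()   # start the quiet timer
                elif time.time() - quiet_since >= required_empty_duration:
                    return True                 # quiet long enough -> settled
            else:
                quiet_since = None              # activity seen -> reset the timer
            time.sleep(poll_interval)
        return False

    # Usage sketch against the same signals the real helper reads:
    # wait_until_quiet(lambda: update_q.qsize() > 0
    #                          or any(t.current_uuid for t in running_update_threads))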