Revert multiprocess memory management in html_to_text(); it was unreliable under high concurrency

2025-05-08 18:09:47 +02:00 · 2025-05-08 18:09:47 +02:00 · 1ec86bd38d
commit 1ec86bd38d
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -436,7 +436,10 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
    return re.sub(pattern, repl, html_content)


-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+
+
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):

    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
@@ -472,19 +475,7 @@ def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=F
        html_content = re.sub(r'</title>', r'</h1>', html_content)

    text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+    return text_content

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -58,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
            assert len(tmp_history) == 1, "History.txt should contain 1 line"

        # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
+
        # Find the snapshot one
        for fname in files_in_watch_dir:
            if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +75,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):

        assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"

-
    json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
    with open(json_db_file, 'r') as f:
        assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"