Revert multiprocess memory management, was unreliable under high concurrency

pull/3185/head
dgtlmoon 2025-05-08 18:09:47 +02:00
rodzic 8df61f5eaa
commit 1ec86bd38d
2 zmienionych plików z 7 dodań i 17 usunięć

Wyświetl plik

@ -436,7 +436,10 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content) return re.sub(pattern, repl, html_content)
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from inscriptis import get_text from inscriptis import get_text
from inscriptis.model.config import ParserConfig from inscriptis.model.config import ParserConfig
@ -472,19 +475,7 @@ def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=F
html_content = re.sub(r'</title>', r'</h1>', html_content) html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config) text_content = get_text(html_content, config=parser_config)
conn.send(text_content) return text_content
conn.close()
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from multiprocessing import Process, Pipe
parent_conn, child_conn = Pipe()
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
p.start()
text = parent_conn.recv()
p.join()
return text
# Does LD+JSON exist with a @type=='product' and a .price set anywhere? # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content): def has_ldjson_product_info(content):

Wyświetl plik

@ -58,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(tmp_history) == 1, "History.txt should contain 1 line" assert len(tmp_history) == 1, "History.txt should contain 1 line"
# Should be two files,. the history.txt , and the snapshot.txt # Should be two files,. the history.txt , and the snapshot.txt
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
w))
# Find the snapshot one # Find the snapshot one
for fname in files_in_watch_dir: for fname in files_in_watch_dir:
if fname != 'history.txt' and 'html' not in fname: if fname != 'history.txt' and 'html' not in fname:
@ -75,7 +75,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot" assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r') as f: with open(json_db_file, 'r') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved" assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"