kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Revert multiprocess memory management, was unreliable under high concurrency
rodzic
8df61f5eaa
commit
1ec86bd38d
|
@ -436,7 +436,10 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
|||
return re.sub(pattern, repl, html_content)
|
||||
|
||||
|
||||
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
|
||||
|
||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
|
@ -472,19 +475,7 @@ def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=F
|
|||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
conn.send(text_content)
|
||||
conn.close()
|
||||
|
||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
|
||||
p.start()
|
||||
text = parent_conn.recv()
|
||||
p.join()
|
||||
return text
|
||||
return text_content
|
||||
|
||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||
def has_ldjson_product_info(content):
|
||||
|
|
|
@ -58,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
|
|||
assert len(tmp_history) == 1, "History.txt should contain 1 line"
|
||||
|
||||
# Should be two files,. the history.txt , and the snapshot.txt
|
||||
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
|
||||
w))
|
||||
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
|
||||
|
||||
# Find the snapshot one
|
||||
for fname in files_in_watch_dir:
|
||||
if fname != 'history.txt' and 'html' not in fname:
|
||||
|
@ -75,7 +75,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
|
|||
|
||||
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
|
||||
|
||||
|
||||
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
|
||||
with open(json_db_file, 'r') as f:
|
||||
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
|
||||
|
|
Ładowanie…
Reference in New Issue