Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110)

pull/3113/head
dgtlmoon 2025-04-11 18:18:29 +02:00 zatwierdzone przez GitHub
rodzic cfb4decf67
commit 3a583a4e5d
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
2 zmienionych plików z 39 dodań i 8 usunięć

Wyświetl plik

@ -59,7 +59,10 @@ def capture_full_page(page):
p.join()
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
# Explicit cleanup
del screenshot_chunks
del p
del parent_conn, child_conn
screenshot_chunks = None
return screenshot
@ -286,12 +289,28 @@ class fetcher(Fetcher):
pass
# Clean up resources properly
context.close()
context = None
try:
self.page.request_gc()
except:
pass
self.page.close()
try:
self.page.close()
except:
pass
self.page = None
browser.close()
borwser = None
try:
context.close()
except:
pass
context = None
try:
browser.close()
except:
pass
browser = None

Wyświetl plik

@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content)
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
conn.send(text_content)
conn.close()
return text_content
# NOTE: anything built on libxml, html5lib, etc. will cause a small memory leak in the native library implementation, outside of Python's control
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
    """Convert HTML to text by running ``html_to_text_sub_worker`` in a child process.

    The conversion is delegated to a short-lived subprocess and the result is
    sent back over a pipe, so the parsing work does not happen in the calling
    process.

    Args:
        html_content: The HTML markup to convert.
        render_anchor_tag_content: Forwarded unchanged to the worker.
        is_rss: Forwarded unchanged to the worker.

    Returns:
        The object the worker sent back over the pipe.

    Raises:
        EOFError: If the worker exits without sending a result.
    """
    from multiprocessing import Process, Pipe

    parent_conn, child_conn = Pipe()
    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
    p.start()
    # Drop the parent's copy of the child's end. While the parent still holds
    # it open, recv() can never see end-of-file and would block forever if the
    # worker died before sending anything.
    child_conn.close()
    try:
        # Receive before join(): a worker that is still writing a large result
        # into the pipe cannot exit until the parent has read it.
        text = parent_conn.recv()
    finally:
        # Release the pipe and reap the child on both success and failure.
        parent_conn.close()
        p.join()
    return text
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):