kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110)
rodzic
cfb4decf67
commit
3a583a4e5d
|
@ -59,7 +59,10 @@ def capture_full_page(page):
|
||||||
p.join()
|
p.join()
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||||
|
# Explicit cleanup
|
||||||
|
del screenshot_chunks
|
||||||
|
del p
|
||||||
|
del parent_conn, child_conn
|
||||||
screenshot_chunks = None
|
screenshot_chunks = None
|
||||||
return screenshot
|
return screenshot
|
||||||
|
|
||||||
|
@ -286,12 +289,28 @@ class fetcher(Fetcher):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Clean up resources properly
|
# Clean up resources properly
|
||||||
context.close()
|
try:
|
||||||
context = None
|
self.page.request_gc()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
self.page.close()
|
self.page.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
self.page = None
|
self.page = None
|
||||||
|
|
||||||
browser.close()
|
try:
|
||||||
borwser = None
|
context.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
context = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
browser.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
browser = None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||||
|
|
||||||
return re.sub(pattern, repl, html_content)
|
return re.sub(pattern, repl, html_content)
|
||||||
|
|
||||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
|
|
||||||
|
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||||
|
|
||||||
from inscriptis import get_text
|
from inscriptis import get_text
|
||||||
from inscriptis.model.config import ParserConfig
|
from inscriptis.model.config import ParserConfig
|
||||||
|
|
||||||
|
@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||||
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
text_content = get_text(html_content, config=parser_config)
|
||||||
|
conn.send(text_content)
|
||||||
|
conn.close()
|
||||||
|
|
||||||
return text_content
|
# NOTE: anything built on libxml, html5lib, etc. causes a small memory leak in the native library implementation, outside Python - so html_to_text() below runs the conversion in a subprocess.
|
||||||
|
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
    """Convert HTML to plain text inside a short-lived child process.

    The actual conversion is done by ``html_to_text_sub_worker``, which is
    started as a ``multiprocessing.Process`` and sends its result back over a
    pipe. Running it out-of-process means memory leaked by the native HTML
    parsing libraries is returned to the OS when the worker exits.

    Args:
        html_content: The HTML document to convert.
        render_anchor_tag_content: Forwarded unchanged to the worker.
        is_rss: Forwarded unchanged to the worker.

    Returns:
        The text produced by the worker.

    Raises:
        RuntimeError: If the worker exits without sending a result (for
            example because it crashed), instead of blocking forever.
    """
    from multiprocessing import Process, Pipe

    parent_conn, child_conn = Pipe()
    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
    p.start()

    # Drop the parent's copy of the worker end of the pipe. While the parent
    # still holds it open, recv() can never see EOF, so a worker that dies
    # before sending would leave this call blocked indefinitely.
    child_conn.close()

    try:
        text = parent_conn.recv()
    except EOFError:
        # Reap the worker first so that exitcode is populated for the message.
        p.join()
        raise RuntimeError(
            f"html_to_text worker exited without returning a result (exit code {p.exitcode})"
        ) from None
    finally:
        # Always release the pipe and reap the child, on success and failure.
        parent_conn.close()
        p.join()

    return text
|
||||||
|
|
||||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||||
def has_ldjson_product_info(content):
|
def has_ldjson_product_info(content):
|
||||||
|
|
Ładowanie…
Reference in New Issue