Avoid pickling issues

pull/3185/head
dgtlmoon 2025-05-08 18:15:16 +02:00
rodzic 1ec86bd38d
commit e38f264750
1 zmienionych plików z 47 dodań i 32 usunięć

Wyświetl plik

@@ -436,46 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content) return re.sub(pattern, repl, html_content)
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False): import os
def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
from inscriptis import get_text from inscriptis import get_text
from inscriptis.model.config import ParserConfig from inscriptis.model.config import ParserConfig
try:
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
else:
parser_config = None
"""Converts html string to a string with just the text. If ignoring if is_rss:
rendering anchor tag content is enable, anchor tag content are also html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
included in the text html_content = re.sub(r'</title>', r'</h1>', html_content)
:param html_content: string with html content text_content = get_text(html_content, config=parser_config)
:param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the
'href' inside 'a' tags.
Anchor tag content is rendered in the following manner:
'[ text ](anchor tag content)'
:return: extracted text from the HTML
"""
# if anchor tag content flag is set to True define a config for
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
# otherwise set config to None/default
else:
parser_config = None
# RSS Mode - Inscriptis will treat `title` as something else. with open(temp_file_path, "w", encoding="utf-8") as f:
# Make it as a regular block display element (//item/title) f.write(text_content)
# This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config) except Exception as e:
return text_content # Write error to file so the parent can read it
with open(temp_file_path, "w", encoding="utf-8") as f:
f.write(f"[ERROR] {e}")
import tempfile
from multiprocessing import Process
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
    """Convert an HTML string to plain text in a separate process.

    The conversion itself is done by ``html_to_text_sub_worker`` in a child
    process; the result is handed back through a temporary file rather than
    through a return value.
    NOTE(review): presumably the child process is used to contain the native
    library memory leak mentioned above, and the file hand-off avoids the
    pickling issues named in the commit title - confirm with the author.

    :param html_content: string with html content
    :param render_anchor_tag_content: boolean flag, forwarded to the worker,
        indicating whether hyperlinks should be rendered together with text
    :param is_rss: boolean flag, forwarded to the worker, enabling RSS mode
    :param timeout: seconds to wait for the worker before terminating it
    :return: whatever the worker wrote to the temporary file. That is the
        extracted text, or a string starting with ``[ERROR]`` when the worker
        caught an exception. If the worker was terminated on timeout before
        writing, this is normally an empty string.
    """
    # delete=False: the file must outlive this handle so the child process can
    # open it by path; only the name is needed on this side.
    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
        temp_file_path = tmp_file.name

    # Everything after the file exists runs under one try/finally, so the
    # temp file is removed even when creating/starting/joining the process fails.
    try:
        p = Process(
            target=html_to_text_sub_worker,
            args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
        )
        p.start()
        p.join(timeout)

        # Still running after the timeout: stop it and reap it before reading.
        if p.is_alive():
            p.terminate()
            p.join()

        with open(temp_file_path, "r", encoding="utf-8") as f:
            return f.read()
    finally:
        os.remove(temp_file_path)
# Does LD+JSON exist with a @type=='product' and a .price set anywhere? # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content): def has_ldjson_product_info(content):