Avoid pickling issues

2025-05-08 18:15:16 +02:00 · 2025-05-08 18:15:16 +02:00 · e38f264750
commit e38f264750
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -436,46 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
    return re.sub(pattern, repl, html_content)
 # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+import os
 def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
    try:
        if render_anchor_tag_content:
            parser_config = ParserConfig(
                annotation_rules={"a": ["hyperlink"]},
                display_links=True
            )
        else:
            parser_config = None
-    """Converts html string to a string with just the text. If ignoring
+        if is_rss:
-    rendering anchor tag content is enable, anchor tag content are also
+            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
-    included in the text
+            html_content = re.sub(r'</title>', r'</h1>', html_content)
-    :param html_content: string with html content
+        text_content = get_text(html_content, config=parser_config)
    :param render_anchor_tag_content: boolean flag indicating whether to extract
    hyperlinks (the anchor tag content) together with text. This refers to the
    'href' inside 'a' tags.
    Anchor tag content is rendered in the following manner:
    '[ text ](anchor tag content)'
    :return: extracted text from the HTML
    """
    #  if anchor tag content flag is set to True define a config for
    #  extracting this content
    if render_anchor_tag_content:
        parser_config = ParserConfig(
            annotation_rules={"a": ["hyperlink"]},
            display_links=True
        )
    # otherwise set config to None/default
    else:
        parser_config = None
-    # RSS Mode - Inscriptis will treat `title` as something else.
+        with open(temp_file_path, "w", encoding="utf-8") as f:
-    # Make it as a regular block display element (//item/title)
+            f.write(text_content)
    # This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)
-    text_content = get_text(html_content, config=parser_config)
+    except Exception as e:
-    return text_content
+        # Write error to file so the parent can read it
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(f"[ERROR] {e}")
 import tempfile
 from multiprocessing import Process
 def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
    """Convert an HTML string to plain text using a short-lived child process.

    The conversion itself is done by ``html_to_text_sub_worker`` in a separate
    ``multiprocessing.Process``. The worker hands its result back through a
    temporary file whose path is passed to it, and this function reads that
    file once the worker has finished or been stopped.

    :param html_content: string with html content
    :param render_anchor_tag_content: forwarded to the worker; when true the
        anchor tag content ('href' of 'a' tags) is rendered with the text
    :param is_rss: forwarded to the worker to enable its RSS handling
    :param timeout: seconds to wait for the worker before terminating it
    :return: whatever the worker wrote to the temporary file. This is an empty
        string if the worker was terminated before writing anything.
        NOTE(review): the worker appears to report failures as a
        "[ERROR] ..." string in the same file, so callers receive that text
        as the result - confirm this is intended.
    """
    # Only the path is needed here; the (empty) file is closed straight away
    # and kept on disk (delete=False) so the child process can write to it.
    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
        temp_file_path = tmp_file.name

    # Everything after the file exists is guarded, so the temporary file is
    # removed even when creating, starting or joining the process raises.
    try:
        p = Process(
            target=html_to_text_sub_worker,
            args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
        )
        p.start()
        p.join(timeout)

        # Still running after the timeout: stop it and reap the child.
        if p.is_alive():
            p.terminate()
            p.join()

        with open(temp_file_path, "r", encoding="utf-8") as f:
            return f.read()
    finally:
        os.remove(temp_file_path)
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):