Revert memory strategy change for html_to_text (Was hanging under high concurrency setups)

pull/3162/merge
dgtlmoon 2025-05-09 09:44:02 +02:00 committed by GitHub
parent 8df61f5eaa
commit a9ca511004
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 58 additions and 50 deletions

View file

@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
 
     """Converts html string to a string with just the text. If ignoring
     rendering anchor tag content is enable, anchor tag content are also
     included in the text
 
     :param html_content: string with html content
     :param render_anchor_tag_content: boolean flag indicating whether to extract
     hyperlinks (the anchor tag content) together with text. This refers to the
     'href' inside 'a' tags.
     Anchor tag content is rendered in the following manner:
     '[ text ](anchor tag content)'
     :return: extracted text from the HTML
     """
 
     # if anchor tag content flag is set to True define a config for
     # extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
     # otherwise set config to None/default
     else:
         parser_config = None
 
     # RSS Mode - Inscriptis will treat `title` as something else.
     # Make it as a regular block display element (//item/title)
     # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+
+    return text_content
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):

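For context on the hang being reverted: the removed implementation blocks on parent_conn.recv() with no timeout, so a child process that dies before sending its result (OOM-killed, or deadlocked because fork() from a multi-threaded parent cloned a held lock) leaves the fetch worker waiting forever. Below is a minimal sketch of what a timeout-guarded variant could look like; this is hypothetical, not what the commit does (the commit simply runs inscriptis in-process again), and the names _sub_worker / convert_with_timeout are illustrative only:

    from multiprocessing import Process, Pipe

    def _sub_worker(conn, html_content):
        # Child process: convert and send the result back over the pipe.
        # (Stand-in for the real inscriptis get_text() call.)
        conn.send(html_content.upper())
        conn.close()

    def convert_with_timeout(html_content, timeout=10):
        parent_conn, child_conn = Pipe()
        p = Process(target=_sub_worker, args=(child_conn, html_content))
        p.start()
        # poll() bounds the wait; a plain recv() blocks forever if the
        # child never sends (crash, OOM-kill, deadlock after fork).
        result = parent_conn.recv() if parent_conn.poll(timeout) else None
        p.join(timeout=1)
        if p.is_alive():
            p.terminate()  # don't leave a hung child behind
        return result

    if __name__ == '__main__':
        print(convert_with_timeout('<p>hello</p>'))

Note that the restored html_to_text keeps a timeout=10 parameter in its signature even though the in-process implementation no longer uses it.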
View file

@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py

View file

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
 
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
 
     i=0
+    # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
         i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
         assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
 
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
     assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
     json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
     with open(json_db_file, 'r') as f:
         assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

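The watch count now scales with the worker pool, so the consistency check exercises every fetch worker at once. A standalone snippet mirroring the test's range arithmetic (example values in the comments are assumptions):

    import os

    workers = int(os.getenv("FETCH_WORKERS", 10))
    r = range(1, 10 + workers)

    # FETCH_WORKERS unset -> range(1, 20)  -> 19 watches
    # FETCH_WORKERS=130   -> range(1, 140) -> 139 watches, more than one per worker
    print(len(r))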
View file

@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()
 
-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
-    attempt=0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
-            break
-        logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
-        time.sleep(1)
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
+    attempt = 0
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
+                break
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
 
         attempt += 1
         time.sleep(1)
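The core of the rewritten helper is a debounce: a single empty poll is not enough, because a worker can pop the last queue item and still be processing it, so the helper only returns once the queue has stayed empty and all threads idle for required_empty_duration seconds. The same pattern in isolation, as a hypothetical stand-alone helper (wait_until_quiet is not the repo's API):

    import time

    def wait_until_quiet(is_busy, required_empty_duration=2.0,
                         max_attempts=60, poll_interval=1.0):
        """Return True once is_busy() has reported False continuously for
        required_empty_duration seconds; False if max_attempts runs out."""
        quiet_since = None
        for _ in range(max_attempts):
            if not is_busy():
                if quiet_since is None:
                    quiet_since = time.time()   # start the quiet timer
                elif time.time() - quiet_since >= required_empty_duration:
                    return True                 # quiet long enough -> settled
            else:
                quiet_since = None              # activity seen -> reset the timer
            time.sleep(poll_interval)
        return False

    # Usage sketch against the same signals the real helper reads:
    # wait_until_quiet(lambda: update_q.qsize() > 0
    #                          or any(t.current_uuid for t in running_update_threads))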