From 199a96625b3ca60f2f6f8e16495c79b49a0c23f9 Mon Sep 17 00:00:00 2001
From: Ivan Habunek <ivan@habunek.com>
Date: Sat, 4 Nov 2023 07:40:56 +0100
Subject: [PATCH] Extract HTML parsing into parse_html helper

---
 toot/tui/richtext.py   |  5 ++---
 toot/utils/__init__.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py
index b1a4c68..66c1f27 100644
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@@ -2,11 +2,10 @@ import re
 import urwid
 import unicodedata
 
-from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag
 from toot.tui.constants import PALETTE
 from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
-from toot.utils import urlencode_url
+from toot.utils import parse_html, urlencode_url
 from typing import List, Tuple
 from urwid.util import decompose_tagmarkup
 
@@ -23,7 +22,7 @@ class ContentParser:
         """Convert html to urwid widgets"""
         widgets: List[urwid.Widget] = []
         html = unicodedata.normalize("NFKC", html)
-        soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
+        soup = parse_html(html)
         first_tag = True
         for e in soup.body or soup:
             if isinstance(e, NavigableString):
diff --git a/toot/utils/__init__.py b/toot/utils/__init__.py
index f0fda9e..c4afa7f 100644
--- a/toot/utils/__init__.py
+++ b/toot/utils/__init__.py
@@ -23,17 +23,19 @@ def str_bool_nullable(b):
     return None if b is None else str_bool(b)
 
 
-def get_text(html):
-    """Converts html to text, strips all tags."""
-
+def parse_html(html: str) -> BeautifulSoup:  # swaps "&apos;" for "'" and parses with the "html.parser" backend
     # Ignore warnings made by BeautifulSoup, if passed something that looks like
     # a file (e.g. a dot which matches current dict), it will warn that the file
     # should be opened instead of passing a filename.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
-        text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
+        return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
 
-    return unicodedata.normalize('NFKC', text)
+
+def get_text(html):
+    """Return the text content of html with all tags removed, NFKC-normalized."""
+    soup = parse_html(html)
+    return unicodedata.normalize("NFKC", soup.get_text())
 
 
 def html_to_paragraphs(html):