From 199a96625b3ca60f2f6f8e16495c79b49a0c23f9 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Sat, 4 Nov 2023 07:40:56 +0100 Subject: [PATCH] Extract parsing html --- toot/tui/richtext.py | 5 ++--- toot/utils/__init__.py | 12 +++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index b1a4c68..66c1f27 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -2,11 +2,10 @@ import re import urwid import unicodedata -from bs4 import BeautifulSoup from bs4.element import NavigableString, Tag from toot.tui.constants import PALETTE from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets -from toot.utils import urlencode_url +from toot.utils import parse_html, urlencode_url from typing import List, Tuple from urwid.util import decompose_tagmarkup @@ -23,7 +22,7 @@ class ContentParser: """Convert html to urwid widgets""" widgets: List[urwid.Widget] = [] html = unicodedata.normalize("NFKC", html) - soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser") + soup = parse_html(html) first_tag = True for e in soup.body or soup: if isinstance(e, NavigableString): diff --git a/toot/utils/__init__.py b/toot/utils/__init__.py index f0fda9e..c4afa7f 100644 --- a/toot/utils/__init__.py +++ b/toot/utils/__init__.py @@ -23,17 +23,19 @@ def str_bool_nullable(b): return None if b is None else str_bool(b) -def get_text(html): - """Converts html to text, strips all tags.""" - +def parse_html(html: str) -> BeautifulSoup: # Ignore warnings made by BeautifulSoup, if passed something that looks like # a file (e.g. a dot which matches current dict), it will warn that the file # should be opened instead of passing a filename. 
with warnings.catch_warnings(): warnings.simplefilter("ignore") - text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text() + return BeautifulSoup(html.replace("&apos;", "'"), "html.parser") - return unicodedata.normalize('NFKC', text) + +def get_text(html): + """Converts html to text, strips all tags.""" + text = parse_html(html).get_text() + return unicodedata.normalize("NFKC", text) def html_to_paragraphs(html):