Extract parsing html

pull/412/head
Ivan Habunek 2023-11-04 07:40:56 +01:00
rodzic d91c73520e
commit 199a96625b
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: F5F0623FF5EBCB3D
2 zmienionych plików z 9 dodań i 8 usunięć

Wyświetl plik

@@ -2,11 +2,10 @@ import re
import urwid import urwid
import unicodedata import unicodedata
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag from bs4.element import NavigableString, Tag
from toot.tui.constants import PALETTE from toot.tui.constants import PALETTE
from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
from toot.utils import urlencode_url from toot.utils import parse_html, urlencode_url
from typing import List, Tuple from typing import List, Tuple
from urwid.util import decompose_tagmarkup from urwid.util import decompose_tagmarkup
@@ -23,7 +22,7 @@ class ContentParser:
"""Convert html to urwid widgets""" """Convert html to urwid widgets"""
widgets: List[urwid.Widget] = [] widgets: List[urwid.Widget] = []
html = unicodedata.normalize("NFKC", html) html = unicodedata.normalize("NFKC", html)
soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser") soup = parse_html(html)
first_tag = True first_tag = True
for e in soup.body or soup: for e in soup.body or soup:
if isinstance(e, NavigableString): if isinstance(e, NavigableString):

Wyświetl plik

@@ -23,17 +23,19 @@ def str_bool_nullable(b):
return None if b is None else str_bool(b) return None if b is None else str_bool(b)
def get_text(html): def parse_html(html: str) -> BeautifulSoup:
"""Converts html to text, strips all tags."""
# Ignore warnings made by BeautifulSoup, if passed something that looks like # Ignore warnings made by BeautifulSoup, if passed something that looks like
# a file (e.g. a dot which matches current dict), it will warn that the file # a file (e.g. a dot which matches current dict), it will warn that the file
# should be opened instead of passing a filename. # should be opened instead of passing a filename.
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore") warnings.simplefilter("ignore")
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text() return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
return unicodedata.normalize('NFKC', text)
def get_text(html):
"""Converts html to text, strips all tags."""
text = parse_html(html).get_text()
return unicodedata.normalize("NFKC", text)
def html_to_paragraphs(html): def html_to_paragraphs(html):