Extract parsing html

2023-11-04 07:40:56 +01:00 · 2023-11-04 07:40:56 +01:00 · 199a96625b
parent d91c73520e
commit 199a96625b
2 changed files with 9 additions and 8 deletions
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@@ -2,11 +2,10 @@ import re
 import urwid
 import unicodedata

-from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag
 from toot.tui.constants import PALETTE
 from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
-from toot.utils import urlencode_url
+from toot.utils import parse_html, urlencode_url
 from typing import List, Tuple
 from urwid.util import decompose_tagmarkup

@@ -23,7 +22,7 @@ class ContentParser:
        """Convert html to urwid widgets"""
        widgets: List[urwid.Widget] = []
        html = unicodedata.normalize("NFKC", html)
-        soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
+        soup = parse_html(html)
        first_tag = True
        for e in soup.body or soup:
            if isinstance(e, NavigableString):
--- a/toot/utils/init.py
+++ b/toot/utils/init.py
@@ -23,17 +23,19 @@ def str_bool_nullable(b):
    return None if b is None else str_bool(b)


-def get_text(html):
-    """Converts html to text, strips all tags."""
-
+def parse_html(html: str) -> BeautifulSoup:
    # Ignore warnings made by BeautifulSoup: if passed something that looks like
    # a file (e.g. a dot, which matches the current directory), it will warn that
    # the file should be opened instead of passing a filename.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
-        text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
+        return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")

-    return unicodedata.normalize('NFKC', text)
+
+def get_text(html):
+    """Converts html to text, strips all tags."""
+    text = parse_html(html).get_text()
+    return unicodedata.normalize("NFKC", text)


 def html_to_paragraphs(html):