Added a workaround for statuses with malformed HTML

We see this problem with statuses from Pixelfed servers. Per the Mastodon API spec, the `content` field of a status is supposed to be HTML, but Pixelfed often sends statuses that begin with plain text and may include embedded anchor tags etc. within that text. This confuses the BeautifulSoup HTML parser and results in bad rendering artifacts. This workaround detects the above condition and attempts to fix it by wrapping the status content in <p></p>. This converts it to nominally valid HTML (or at least HTML that BeautifulSoup can parse).
2023-05-12 20:24:16 -04:00 · 2023-05-12 20:24:16 -04:00 · baa50ca889
parent 4a5db5bed8
commit baa50ca889
1 changed files with 19 additions and 9 deletions
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@ -20,19 +20,29 @@ class ContentParser:

        """Parse a limited subset of HTML and create urwid widgets."""

-    def html_to_widgets(self, html) -> List[urwid.Widget]:
+    def html_to_widgets(self, html, recovery_attempt = False) -> List[urwid.Widget]:
        """Convert html to urwid widgets"""
        widgets: List[urwid.Widget] = []
        html = unicodedata.normalize("NFKC", html)
        soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
+        first_tag = True
        for e in soup.body or soup:
            if isinstance(e, NavigableString):
-                continue
-            name = e.name
-            # First, look for a custom tag handler method in this class
-            # If that fails, fall back to inline_tag_to_text handler
-            method = getattr(self, "_" + name, self.inline_tag_to_text)
-            markup = method(e)  # either returns a Widget, or plain text
+                if first_tag and not recovery_attempt:
+                    # if our first "tag" is a navigable string
+                    # the HTML is out of spec, doesn't start with a tag,
+                    # we see this in content from Pixelfed servers.
+                    # attempt a fix by wrapping the HTML with <p></p>
+                    return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt = True)
+                else:
+                    continue
+            else:
+                first_tag = False
+                name = e.name
+                # First, look for a custom tag handler method in this class
+                # If that fails, fall back to inline_tag_to_text handler
+                method = getattr(self, "_" + name, self.inline_tag_to_text)
+                markup = method(e)  # either returns a Widget, or plain text

            if not isinstance(markup, urwid.Widget):
                # plaintext, so create a padded text widget
@ -72,13 +82,13 @@ class ContentParser:
        TRANSFORM = {
            # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
            re.compile(r'(^.+)\x03(.+$)'):
-                lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr[0], g[1]))),
+                lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr, g[1]))),
        }
        markup_list = []

        for run in markup:
            if isinstance(run, tuple):
-                txt, attr = decompose_tagmarkup(run)
+                txt, attr_list = decompose_tagmarkup(run)
                m = re.match(r'(^.+)\x03(.+$)', txt)
                if m:
                    markup_list.append(parse_text(txt, TRANSFORM,