From baa50ca8893f0de295601dc33e11662e57faba60 Mon Sep 17 00:00:00 2001 From: Daniel Schwarz Date: Fri, 12 May 2023 20:24:16 -0400 Subject: [PATCH] Added a workaround for statuses with malformed HTML We see this problem with statuses from Pixelfed servers. Per the Mastodon API spec, the content tag is supposed to be HTML, but Pixelfed sends statuses that often start as plain text. They may include embedded anchor tags etc. within the text. This confuses BeautifulSoup HTML parsers and results in bad rendering artifacts. This workaround detects the above condition and attempts to fix it by surrounding the status in <p></p>. This converts it to nominally valid HTML (at least, parseable by BeautifulSoup.) --- toot/tui/richtext.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index 26d9dbc..45a5e34 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -20,19 +20,29 @@ class ContentParser: """Parse a limited subset of HTML and create urwid widgets.""" - def html_to_widgets(self, html) -> List[urwid.Widget]: + def html_to_widgets(self, html, recovery_attempt = False) -> List[urwid.Widget]: """Convert html to urwid widgets""" widgets: List[urwid.Widget] = [] html = unicodedata.normalize("NFKC", html) soup = BeautifulSoup(html.replace("'", "'"), "html.parser") + first_tag = True for e in soup.body or soup: if isinstance(e, NavigableString): - continue - name = e.name - # First, look for a custom tag handler method in this class - # If that fails, fall back to inline_tag_to_text handler - method = getattr(self, "_" + name, self.inline_tag_to_text) - markup = method(e) # either returns a Widget, or plain text + if first_tag and not recovery_attempt: + # if our first "tag" is a navigable string + # the HTML is out of spec, doesn't start with a tag, + # we see this in content from Pixelfed servers. + # attempt a fix by wrapping the HTML with <p></p>
+ return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt = True) + else: + continue + else: + first_tag = False + name = e.name + # First, look for a custom tag handler method in this class + # If that fails, fall back to inline_tag_to_text handler + method = getattr(self, "_" + name, self.inline_tag_to_text) + markup = method(e) # either returns a Widget, or plain text if not isinstance(markup, urwid.Widget): # plaintext, so create a padded text widget @@ -72,13 +82,13 @@ class ContentParser: TRANSFORM = { # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget re.compile(r'(^.+)\x03(.+$)'): - lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr[0], g[1]))), + lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr, g[1]))), } markup_list = [] for run in markup: if isinstance(run, tuple): - txt, attr = decompose_tagmarkup(run) + txt, attr_list = decompose_tagmarkup(run) m = re.match(r'(^.+)\x03(.+$)', txt) if m: markup_list.append(parse_text(txt, TRANSFORM,