Added a workaround for statuses with malformed HTML

We see this problem with statuses from Pixelfed servers.
Per the Mastodon API spec, the content tag is supposed to be
HTML, but Pixelfed sends statuses that often start as plain text.
They may include embedded anchor tags etc. within the text.
This confuses BeautifulSoup HTML parsers and results in bad
rendering artifacts.

This workaround detects the above condition and attempts to fix it by
surrounding the status in <p></p>. This converts it to nominally
valid HTML (at least, parseable by BeautifulSoup.)
This commit is contained in:
Daniel Schwarz 2023-05-12 20:24:16 -04:00
parent 4a5db5bed8
commit baa50ca889
1 changed files with 19 additions and 9 deletions

View File

@ -20,19 +20,29 @@ class ContentParser:
"""Parse a limited subset of HTML and create urwid widgets."""
def html_to_widgets(self, html) -> List[urwid.Widget]:
def html_to_widgets(self, html, recovery_attempt = False) -> List[urwid.Widget]:
"""Convert html to urwid widgets"""
widgets: List[urwid.Widget] = []
html = unicodedata.normalize("NFKC", html)
soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
first_tag = True
for e in soup.body or soup:
if isinstance(e, NavigableString):
continue
name = e.name
# First, look for a custom tag handler method in this class
# If that fails, fall back to inline_tag_to_text handler
method = getattr(self, "_" + name, self.inline_tag_to_text)
markup = method(e) # either returns a Widget, or plain text
if first_tag and not recovery_attempt:
# if our first "tag" is a navigable string
# the HTML is out of spec, doesn't start with a tag,
# we see this in content from Pixelfed servers.
# attempt a fix by wrapping the HTML with <p></p>
return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt = True)
else:
continue
else:
first_tag = False
name = e.name
# First, look for a custom tag handler method in this class
# If that fails, fall back to inline_tag_to_text handler
method = getattr(self, "_" + name, self.inline_tag_to_text)
markup = method(e) # either returns a Widget, or plain text
if not isinstance(markup, urwid.Widget):
# plaintext, so create a padded text widget
@ -72,13 +82,13 @@ class ContentParser:
TRANSFORM = {
# convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
re.compile(r'(^.+)\x03(.+$)'):
lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr[0], g[1]))),
lambda g: (len(g[1]), urwid.Filler(Hyperlink(g[2], attr, g[1]))),
}
markup_list = []
for run in markup:
if isinstance(run, tuple):
txt, attr = decompose_tagmarkup(run)
txt, attr_list = decompose_tagmarkup(run)
m = re.match(r'(^.+)\x03(.+$)', txt)
if m:
markup_list.append(parse_text(txt, TRANSFORM,