Make HTML class handling more sensible

If the class name appears in the constants.py PALETTE entry, it is
honored. Otherwise, the class is ignored and the tag is handled
as a generic tag of that type.  This allows hashtag anchors
to be highlighted, and URL anchors to be styled differently
regardless of the strange class markup that Akkoma adds to URL
anchors
This commit is contained in:
Dan Schwarz 2023-04-04 19:43:37 -04:00 committed by Daniel Schwarz
parent 84663d6dca
commit 486cd6c7f9
2 changed files with 27 additions and 22 deletions

View File

@ -40,10 +40,7 @@ PALETTE = [
('white_bold', 'white,bold', ''),
# HTML tag styling
# note, anchor styling is often overridden
# by class names in Mastodon statuses
# so you won't see the italics.
('a', ',italics', ''),
('a', '', ''),
('em', ',italics', ''),
('i', ',italics', ''),
('strong', ',bold', ''),

View File

@ -4,18 +4,23 @@ richtext
from typing import List, Tuple
import urwid
import unicodedata
from .constants import PALETTE
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
class ContentParser:
def __init__(self):
self.palette_names = []
for p in PALETTE:
self.palette_names.append(p[0])
"""Parse a limited subset of HTML and create urwid widgets."""
def html_to_widgets(self, html) -> List[urwid.Widget]:
"""Convert html to urwid widgets"""
widgets: List[urwid.Widget] = []
html = unicodedata.normalize('NFKC', html)
html = unicodedata.normalize("NFKC", html)
soup = BeautifulSoup(html.replace("'", "'"), "html.parser")
for e in soup.body or soup:
if isinstance(e, NavigableString):
@ -108,20 +113,18 @@ class ContentParser:
"""Get the class name and translate to a
name suitable for use as an urwid
text attribute name"""
# TODO: think about whitelisting allowed classes,
# or blacklisting classes we do not want.
# Classes to whitelist: "mention" "hashtag"
# used in anchor tags
# Classes to blacklist: "invisible" used in Akkoma
# anchor titles
if "class" in tag.attrs:
clss = tag.attrs["class"]
if len(clss) > 0:
style_name = "class_" + "_".join(clss)
return style_name
# return the class name, only if we
# find it as a defined palette name
if style_name in self.palette_names:
return style_name
style_name = tag.name
# fallback to returning the tag name
return tag.name
# Tag handlers start here.
# Tags not explicitly listed are "supported" by
@ -136,12 +139,12 @@ class ContentParser:
def _a(self, tag) -> Tuple:
markups = self.process_inline_tag_children(tag)
if not markups:
return(tag.name, "")
return (tag.name, "")
# hashtag anchors have a class of "mention hashtag"
# we'll return style "class_mention_hashtag"
# in that case; set this up in constants.py
# to control highlighting of hashtags
# in that case; see corresponding palette entry
# in constants.py controlling hashtag highlighting
return (self.get_urwid_attr_name(tag), markups)
@ -216,12 +219,15 @@ class ContentParser:
if "class" in tag.attrs:
style_name = self.get_urwid_attr_name(tag)
elif tag.parent:
style_name = self.get_urwid_attr_name(tag.parent)
else:
style_name = tag.name
if style_name != "span":
# unique class name matches an entry in our palette
return (style_name, markups)
return (style_name, markups)
if tag.parent:
return (self.get_urwid_attr_name(tag.parent), markups)
else:
# fallback
return ("span", markups)
def _ul(self, tag) -> urwid.Widget:
return self.list_widget(tag, ordered=False)
@ -241,7 +247,9 @@ class ContentParser:
("li", [str(i), ". ", markup])
) # 1. foo, 2. bar, etc.
else:
txt = urwid.Text(("li", ["\N{bullet} ", markup])) # * foo, * bar, etc.
txt = urwid.Text(
("li", ["\N{bullet} ", markup])
) # * foo, * bar, etc.
widgets.append(txt)
else:
if ordered: