Support to display a limited set of HTML tags

HTML tag support is aligned with Mastodon 4.2 supported tags.
This code introduces a soft dependency on the urwidgets library.
If urwidgets is not available, HTML tags are still supported,
but hyperlinks are not underlined using the OSC 8 terminal
feature (on supported terminals).
This commit is contained in:
Daniel Schwarz 2023-09-22 21:32:19 -04:00
parent f6e454956b
commit 0f39b1087f
13 changed files with 619 additions and 54 deletions

View File

@ -1,4 +1,5 @@
[flake8]
exclude=build,tests,tmp,venv,toot/tui/scroll.py
ignore=E128,W503
per-file-ignores=toot/tui/stubs/urwidgets.py:F401
max-line-length=120

View File

@ -143,7 +143,6 @@ class TUI(urwid.Frame):
def run(self):
self.loop.set_alarm_in(0, lambda *args: self.async_load_instance())
self.loop.set_alarm_in(0, lambda *args: self.async_load_followed_accounts())
self.loop.set_alarm_in(0, lambda *args: self.async_load_followed_tags())
self.loop.set_alarm_in(0, lambda *args: self.async_load_timeline(
is_initial=True, timeline_name="home"))
self.loop.run()
@ -339,22 +338,6 @@ class TUI(urwid.Frame):
self.run_in_thread(_load_accounts, done_callback=_done_accounts)
def async_load_followed_tags(self):
def _load_tag_list():
try:
return api.followed_tags(self.app, self.user)
except ApiError:
# not supported by all Mastodon servers so fail silently if necessary
return []
def _done_tag_list(tags):
if len(tags) > 0:
self.followed_tags = [t["name"] for t in tags]
else:
self.followed_tags = []
self.run_in_thread(_load_tag_list, done_callback=_done_tag_list)
def refresh_footer(self, timeline):
"""Show status details in footer."""
status, index, count = timeline.get_focused_status_with_counts()

View File

@ -57,6 +57,29 @@ PALETTE = [
('dim', 'dark gray', ''),
('highlight', 'yellow', ''),
('success', 'dark green', ''),
# HTML tag styling
('a', ',italics', '', 'italics'),
# em tag is mapped to i
('i', ',italics', '', 'italics'),
# strong tag is mapped to b
('b', ',bold', '', 'bold'),
# special case for bold + italic nested tags
('bi', ',bold,italics', '', ',bold,italics'),
('u', ',underline', '', ',underline'),
('del', ',strikethrough', '', ',strikethrough'),
('code', 'light gray, standout', '', ',standout'),
('pre', 'light gray, standout', '', ',standout'),
('blockquote', 'light gray', '', ''),
('h1', ',bold', '', ',bold'),
('h2', ',bold', '', ',bold'),
('h3', ',bold', '', ',bold'),
('h4', ',bold', '', ',bold'),
('h5', ',bold', '', ',bold'),
('h6', ',bold', '', ',bold'),
('class_mention_hashtag', 'light cyan', '', ''),
('class_hashtag', 'light cyan', '', ''),
]
VISIBILITY_OPTIONS = [

View File

@ -4,10 +4,10 @@ import urwid
import webbrowser
from toot import __version__
from toot.utils import format_content
from .utils import highlight_hashtags, highlight_keys
from .widgets import Button, EditBox, SelectableText
from toot import api
from toot.tui.utils import highlight_keys
from toot.tui.widgets import Button, EditBox, SelectableText
from toot.tui.richtext import ContentParser
class StatusSource(urwid.Padding):
@ -255,6 +255,8 @@ class Account(urwid.ListBox):
super().__init__(walker)
def generate_contents(self, account, relationship=None, last_action=None):
parser = ContentParser()
if self.last_action and not self.last_action.startswith("Confirm"):
yield Button(f"Confirm {self.last_action}", on_press=take_action, user_data=self)
yield Button("Cancel", on_press=cancel_action, user_data=self)
@ -279,8 +281,10 @@ class Account(urwid.ListBox):
if account["note"]:
yield urwid.Divider()
for line in format_content(account["note"]):
yield urwid.Text(highlight_hashtags(line, followed_tags=set()))
widgetlist = parser.html_to_widgets(account["note"])
for line in widgetlist:
yield (line)
yield urwid.Divider()
yield urwid.Text(["ID: ", ("highlight", f"{account['id']}")])
@ -312,8 +316,11 @@ class Account(urwid.ListBox):
name = field["name"].title()
yield urwid.Divider()
yield urwid.Text([("bold", f"{name.rstrip(':')}"), ":"])
for line in format_content(field["value"]):
yield urwid.Text(highlight_hashtags(line, followed_tags=set()))
widgetlist = parser.html_to_widgets(field["value"])
for line in widgetlist:
yield (line)
if field["verified_at"]:
yield urwid.Text(("success", "✓ Verified"))

View File

@ -2,11 +2,9 @@ import urwid
from toot import api
from toot.exceptions import ApiError
from toot.utils import format_content
from toot.utils.datetime import parse_datetime
from .utils import highlight_hashtags
from .widgets import Button, CheckBox, RadioButton
from .richtext import ContentParser
class Poll(urwid.ListBox):
@ -87,8 +85,12 @@ class Poll(urwid.ListBox):
def generate_contents(self, status):
yield urwid.Divider()
for line in format_content(status.data["content"]):
yield urwid.Text(highlight_hashtags(line, set()))
parser = ContentParser()
widgetlist = parser.html_to_widgets(status.data["content"])
for line in widgetlist:
yield (line)
yield urwid.Divider()
yield self.build_linebox(self.generate_poll_detail())

457
toot/tui/richtext.py Normal file
View File

@ -0,0 +1,457 @@
"""
richtext
"""
from typing import List, Tuple
import re
import urwid
import unicodedata
from .constants import PALETTE
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from .stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
from urwid.util import decompose_tagmarkup
from toot.utils import urlencode_url
class ContentParser:
def __init__(self):
    """Set up a parser for a limited subset of HTML that creates urwid widgets.

    Collects the name of every PALETTE entry in self.palette_names.
    """
    # the first element of each palette entry is its name
    self.palette_names = [entry[0] for entry in PALETTE]
def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]:
    """Convert html to urwid widgets

    Every top-level tag is converted to one widget; consecutive widgets
    are separated by a blank Divider.  Top-level text nodes are skipped,
    except that content which does not start with a block tag is wrapped
    in <p></p> and parsed once more.

    html: the HTML source to convert.
    recovery_attempt: True when this call is the re-parse of the wrapped
        content, so that the wrapping is attempted only once.

    Returns the list of widgets (empty when nothing was converted).
    """
    # NOTE: update this tuple if Mastodon starts supporting more block tags
    block_tags = ("p", "pre", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6")

    widgets: List[urwid.Widget] = []
    html = unicodedata.normalize("NFKC", html)
    # turn the &apos; entity into a literal apostrophe before parsing
    soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
    first_tag = True
    for e in soup.body or soup:
        if isinstance(e, NavigableString):
            if first_tag and not recovery_attempt:
                # if our first "tag" is a navigable string
                # the HTML is out of spec, doesn't start with a tag,
                # we see this in content from Pixelfed servers.
                # attempt a fix by wrapping the HTML with <p></p>
                return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)
            continue

        name = e.name
        # if our HTML starts with a tag, but not a block tag
        # the HTML is out of spec. Attempt a fix by wrapping the
        # HTML with <p></p>
        if first_tag and not recovery_attempt and name not in block_tags:
            return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)

        # First, look for a custom tag handler method in this class
        # If that fails, fall back to inline_tag_to_text handler
        method = getattr(self, "_" + name, self.inline_tag_to_text)
        markup = method(e)  # either returns a Widget, or plain text
        first_tag = False

        if not isinstance(markup, urwid.Widget):
            # plaintext, so create a padded text widget
            txt = self.text_to_widget("", markup)
            markup = urwid.Padding(
                txt,
                align="left",
                width=("relative", 100),
                min_width=None,
            )
        widgets.append(markup)
        # separate top level widgets with a blank line
        widgets.append(urwid.Divider(" "))
    return widgets[:-1]  # but suppress the last blank line
def inline_tag_to_text(self, tag) -> Tuple:
    """Build an urwid markup tuple for an inline tag.

    The tag name is used as the attribute and the recursively converted
    children as the text; a tag without children gets an empty string.
    """
    children = self.process_inline_tag_children(tag)
    return (tag.name, children if children else "")
def process_inline_tag_children(self, tag) -> List:
    """Convert the children of a tag to a list of markup text.

    A child tag is converted by the handler method named "_<tag name>",
    or by inline_tag_to_text when this class has no such method; any
    other child is passed through unchanged.
    """
    def convert(node):
        if not isinstance(node, Tag):
            return node
        handler = getattr(self, "_" + node.name, self.inline_tag_to_text)
        return handler(node)

    return [convert(node) for node in tag.children]
def text_to_widget(self, attr, markup) -> urwid.Widget:
    """Create a text widget from a list of urwid markup.

    Without urwidgets the markup is wrapped in a plain urwid.Text using
    attr.  With urwidgets, every tuple run whose text holds an anchor
    title and an href separated by ASCII ETX (0x03) is passed through
    parse_text, and the collected runs are returned in a TextEmbed.
    Note that attr is not used in the urwidgets case.
    """
    if not has_urwidgets:
        return urwid.Text((attr, markup))

    TRANSFORM = {
        # convert "title<ETX>href" text to Hyperlink widgets for nesting in a TextEmbed widget
        # NOTE(review): g appears to hold the whole match at index 0 and the
        # two groups (title, href) at 1 and 2 - confirm against urwidgets
        # anchor_attr is resolved when the lambda is called; it is assigned
        # in the loop below before parse_text is invoked
        re.compile(r"(^.+)\x03(.+$)"): lambda g: (
            len(g[1]),
            urwid.Filler(Hyperlink(g[2], anchor_attr, g[1])),
        ),
    }
    markup_list = []

    for run in markup:
        if isinstance(run, tuple):
            txt, attr_list = decompose_tagmarkup(run)
            # find anchor titles with an ETX separator followed by href
            m = re.match(r"(^.+)\x03(.+$)", txt)
            if m:
                anchor_attr = self.get_best_anchor_attr(attr_list)
                markup_list.append(
                    parse_text(
                        txt,
                        TRANSFORM,
                        lambda pattern, groups, span: TRANSFORM[pattern](groups),
                    )
                )
            else:
                # a tuple run without an embedded href is kept as is
                markup_list.append(run)
        else:
            # anything that is not a tuple is kept as is
            markup_list.append(run)

    return TextEmbed(markup_list)
def process_block_tag_children(self, tag) -> List[urwid.Widget]:
    """Convert the children of a block tag to a list of widgets.

    Child tags are converted by their "_<tag name>" handler method, or
    by inline_tag_to_text when there is none.  Results that are widgets
    are kept as they are; all text and markup found before the first
    such widget is combined into one text widget placed in front of
    them, and everything found after it into one text widget placed
    behind them.
    """
    leading = []   # markup seen before the first nested widget
    trailing = []  # markup seen once a nested widget was found
    nested = []    # widgets returned by child tag handlers

    for node in tag.children:
        if isinstance(node, Tag):
            # child is a nested tag; process using custom method
            # or default to inline_tag_to_text
            handler = getattr(self, "_" + node.name, self.inline_tag_to_text)
            item = handler(node)
            if isinstance(item, urwid.Widget):
                nested.append(item)
                continue
        else:
            # child is text
            item = node
        (trailing if nested else leading).append(item)

    result = []
    if leading:
        result.append(self.text_to_widget(tag.name, leading))
    result.extend(nested)
    if trailing:
        result.append(self.text_to_widget(tag.name, trailing))
    return result
def get_urwid_attr_name(self, tag) -> str:
    """Translate the tag's class attribute to an urwid text attribute name.

    The name is "class_" followed by the class values joined with "_".
    It is returned only if it is a defined palette name; otherwise the
    tag name is returned.
    """
    css_classes = tag.attrs["class"] if "class" in tag.attrs else ()
    if len(css_classes) > 0:
        candidate = "class_" + "_".join(css_classes)
        if candidate in self.palette_names:
            return candidate
    # fallback to returning the tag name
    return tag.name
# Tag handlers start here.
# Tags not explicitly listed are "supported" by
# rendering as text.
# Inline tags return a list of marked up text for urwid.Text
# Block tags return urwid.Widget
def basic_block_tag_handler(self, tag) -> urwid.Widget:
    """Handle a block tag that needs no special treatment.

    The widgets built from the tag's children are stacked in a Pile.
    """
    children = self.process_block_tag_children(tag)
    return urwid.Pile(children)
def get_best_anchor_attr(self, attrib_list) -> str:
    """Pick the attribute name used to display an anchor.

    Returns "" for an empty attrib_list, the first supported class based
    attribute name found in it, or "a" when it contains none.
    """
    if not attrib_list:
        return ""
    # flatten() only unpacks tuples: for a list it yields the list
    # itself, so flat_al[0] is then attrib_list again
    flat_al = list(flatten(attrib_list))

    for a in flat_al[0]:
        # ref: https://docs.joinmastodon.org/spec/activitypub/
        # these are the class names (translated to attrib names)
        # that we can support for display
        try:
            if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]:
                return a[0]
        except KeyError:
            # presumably raised when the entry is a tag object (the _a
            # handler passes [tag]) rather than a tuple - TODO confirm
            continue

    return "a"
def _a(self, tag) -> Tuple:
    """anchor tag handler

    Returns an (attribute name, title) tuple.  The title is the text of
    the anchor's children; when the anchor has an href and urwidgets is
    available, the url-encoded href is appended to the title after an
    ETX character.  An anchor without children yields (tag.name, "").
    """
    markups = self.process_inline_tag_children(tag)
    if not markups:
        return (tag.name, "")

    # an anchor may come without an href attribute; treat that as "no link"
    href = tag.attrs.get("href")
    title, attrib_list = decompose_tagmarkup(markups)
    if not attrib_list:
        attrib_list = [tag]
    if href and has_urwidgets:
        # only if we have urwidgets loaded for OSC 8 hyperlinks:
        # urlencode the path and query portions of the URL
        href = urlencode_url(href)
        # use ASCII ETX (end of text) as a
        # delimiter between the title and the HREF
        title += f"\x03{href}"

    attr = self.get_best_anchor_attr(attrib_list)

    if attr == "a":
        # didn't find an attribute to use
        # in the child markup, so let's
        # try the anchor tag's own attributes
        attr = self.get_urwid_attr_name(tag)

    # hashtag anchors have a class of "mention hashtag"
    # or "hashtag"
    # we'll return style "class_mention_hashtag"
    # or "class_hashtag"
    # in that case; see corresponding palette entry
    # in constants.py controlling hashtag highlighting
    return (attr, title)
def _blockquote(self, tag) -> urwid.Widget:
    """blockquote tag handler

    The widgets built from the tag's children are stacked, padded by one
    column on the left and right, put in a LineBox whose border
    characters are all empty strings, and mapped to the "blockquote"
    attribute.
    """
    quoted = urwid.Pile(self.process_block_tag_children(tag))
    padded = urwid.Padding(
        quoted,
        align="left",
        width=("relative", 100),
        min_width=None,
        left=1,
        right=1,
    )
    # every corner and line character of the box is set to ""
    border_parts = ("tlcorner", "tline", "lline", "trcorner", "blcorner", "rline", "bline", "brcorner")
    boxed = urwid.LineBox(padded, **{part: "" for part in border_parts})
    return urwid.Pile([urwid.AttrMap(boxed, "blockquote")])
def _br(self, tag) -> Tuple:
return ("br", "\n")
def _em(self, tag) -> Tuple:
# to simplify the number of palette entries
# translate EM to I (italic)
markups = self.process_inline_tag_children(tag)
if not markups:
return ("i", "")
# special case processing for bold and italic
for parent in tag.parents:
if parent.name == "b" or parent.name == "strong":
return ("bi", markups)
return ("i", markups)
def _ol(self, tag) -> urwid.Widget:
    """ordered list tag handler

    Numbers every direct <li> child, taking the list's "start" and
    "reversed" attributes and an item's "value" attribute into account,
    and stacks the resulting rows in a Pile.
    """
    step = -1 if tag.has_attr("reversed") else 1

    # numbering begins at 1 unless ol start= holds a usable integer
    number = 1
    if tag.has_attr("start") and len(tag.attrs["start"]) > 0:
        try:
            number = int(tag.attrs["start"])
        except ValueError:
            pass

    rows = []
    for item in tag.find_all("li", recursive=False):
        handler = getattr(self, "_li", self.inline_tag_to_text)
        content = handler(item)

        # li value= attribute will change the item number
        # it also overrides any ol start= attribute
        if item.has_attr("value") and len(item.attrs["value"]) > 0:
            try:
                number = int(item.attrs["value"])
            except ValueError:
                pass

        label = [str(number), ". "]  # 1. foo, 2. bar, etc.
        if isinstance(content, urwid.Widget):
            # the number goes in its own column next to the item's widget
            marker = self.text_to_widget("li", label)
            rows.append(
                urwid.Columns([marker, ("weight", 9999, content)], dividechars=1, min_width=3)
            )
        else:
            rows.append(self.text_to_widget("li", label + [content]))

        number += step

    return urwid.Pile(rows)
def _pre(self, tag) -> urwid.Widget:
    """preformatted text tag handler

    The <PRE> tag spec says that text should not wrap, but horizontal
    screen space is at a premium and we have no horizontal scroll bar,
    so wrapping is allowed.
    """
    # a blank line first, then the widgets built from the children
    rows = [urwid.Divider(" "), *self.process_block_tag_children(tag)]
    padded = urwid.Padding(
        urwid.Pile(rows),
        align="left",
        width=("relative", 100),
        min_width=None,
        left=1,
        right=1,
    )
    return urwid.Pile([urwid.AttrMap(padded, "pre")])
def _span(self, tag) -> Tuple:
markups = self.process_inline_tag_children(tag)
if not markups:
return (tag.name, "")
# span inherits its parent's class definition
# unless it has a specific class definition
# of its own
if "class" in tag.attrs:
# uncomment the following code to hide all HTML marked
# invisible (generally, the http:// prefix of URLs)
# could be a user preference, it's only advisable if
# the terminal supports OCS 8 hyperlinks (and that's not
# automatically detectable)
# if "invisible" in tag.attrs["class"]:
# return (tag.name, "")
style_name = self.get_urwid_attr_name(tag)
if style_name != "span":
# unique class name matches an entry in our palette
return (style_name, markups)
if tag.parent:
return (self.get_urwid_attr_name(tag.parent), markups)
else:
# fallback
return ("span", markups)
def _strong(self, tag) -> Tuple:
# to simplify the number of palette entries
# translate STRONG to B (bold)
markups = self.process_inline_tag_children(tag)
if not markups:
return ("b", "")
# special case processing for bold and italic
for parent in tag.parents:
if parent.name == "i" or parent.name == "em":
return ("bi", markups)
return ("b", markups)
def _ul(self, tag) -> urwid.Widget:
    """unordered list tag handler

    Prefixes every direct <li> child with a bullet and stacks the
    resulting rows in a Pile.
    """
    bullet = "\N{bullet} "
    rows = []

    for item in tag.find_all("li", recursive=False):
        handler = getattr(self, "_li", self.inline_tag_to_text)
        content = handler(item)

        if isinstance(content, urwid.Widget):
            # the bullet goes in its own column next to the item's widget
            marker = self.text_to_widget("li", [bullet])
            rows.append(
                urwid.Columns([marker, ("weight", 9999, content)], dividechars=1, min_width=3)
            )
        else:
            # * foo, * bar, etc.
            rows.append(self.text_to_widget("li", [bullet, content]))

    return urwid.Pile(rows)
# These tags are handled identically to others
# the only difference being the tag name used for
# urwid attribute mapping
# (handlers are looked up by name as "_" + tag name)
_b = _strong  # <b> uses the <strong> handler
_div = basic_block_tag_handler
_i = _em  # <i> uses the <em> handler
_li = basic_block_tag_handler
# Glitch-soc and Pleroma allow <H1>...<H6> in content
# Mastodon (PR #23913) does not; header tags are converted to <P><STRONG></STRONG></P>
_h1 = _h2 = _h3 = _h4 = _h5 = _h6 = basic_block_tag_handler
_p = basic_block_tag_handler
def flatten(data):
    """Yield the non-tuple items of arbitrarily nested tuples, in order.

    Anything that is not a tuple (including a list) is yielded as is.
    """
    if not isinstance(data, tuple):
        yield data
        return
    for item in data:
        yield from flatten(item)

View File

@ -0,0 +1,30 @@
__all__ = ("Hyperlink",)
import urwid
class Hyperlink(urwid.WidgetWrap):
    """Stub with the same constructor arguments as the Hyperlink widget.

    It does nothing; it only allows the name to be imported.
    """

    def __init__(self, uri, attr, text):
        """Accept the arguments and ignore them."""

    def render(self, size, focus):
        """Render nothing; always returns None."""
        return None
class HyperlinkCanvas(urwid.Canvas):
    """Stub canvas that reports a size of zero columns and zero rows."""

    def __init__(self, uri: str, text_canv: urwid.TextCanvas):
        """Accept the arguments and ignore them."""

    def cols(self):
        """Return the number of columns, which is always 0."""
        return 0

    def content(self, *args, **kwargs):
        """Yield a single row holding only None."""
        yield [None]

    def rows(self):
        """Return the number of rows, which is always 0."""
        return 0

View File

@ -0,0 +1,29 @@
__all__ = ("parse_text", "TextEmbed")
import urwid
class TextEmbed(urwid.Text):
    """Stub text widget whose methods do nothing."""

    def get_text(self):
        """Always return None."""
        return None

    def render(self, size, focus):
        """Render nothing; always returns None."""
        return None

    def set_text(self, markup):
        """Ignore the given markup."""

    def set_wrap_mode(self, mode):
        """Ignore the given wrap mode."""
def parse_text(text, patterns, repl, *repl_args, **repl_kwargs):
    """Stub that ignores all of its arguments and returns None."""
    return None

View File

@ -0,0 +1,8 @@
# If urwidgets is loaded use it; otherwise use our stubs
try:
from urwidgets import Hyperlink, TextEmbed, parse_text
has_urwidgets = True
except ImportError:
from .stub_hyperlink import Hyperlink
from .stub_text_embed import TextEmbed, parse_text
has_urwidgets = False

View File

@ -1,18 +1,21 @@
import logging
import re
import urwid
import webbrowser
from typing import List, Optional
from toot.tui import app
from toot.utils import format_content
from toot.utils.datetime import parse_datetime, time_ago
from toot.utils.language import language_name
from .entities import Status
from .scroll import Scrollable, ScrollBar
from .utils import highlight_hashtags, highlight_keys
from .widgets import SelectableText, SelectableColumns
from toot.entities import Status
from toot.tui.scroll import Scrollable, ScrollBar
from toot.tui.utils import highlight_keys
from toot.tui.widgets import SelectableText, SelectableColumns
from toot.tui.richtext import ContentParser
from toot.utils import urlencode_url
from toot.tui.stubs.urwidgets import Hyperlink, TextEmbed, parse_text, has_urwidgets
logger = logging.getLogger("toot")
@ -310,7 +313,6 @@ class Timeline(urwid.Columns):
class StatusDetails(urwid.Pile):
def __init__(self, timeline: Timeline, status: Optional[Status]):
self.status = status
self.followed_tags = timeline.tui.followed_tags
self.followed_accounts = timeline.tui.followed_accounts
reblogged_by = status.author if status and status.reblog else None
@ -318,6 +320,20 @@ class StatusDetails(urwid.Pile):
if status else ())
return super().__init__(widget_list)
def linkify_content(self, text) -> urwid.Widget:
    """Create a text widget in which http[s] URLs are hyperlinks.

    Without urwidgets the text is returned as a plain urwid.Text with
    the "link" attribute.  Otherwise the text is passed through
    parse_text with a transform that builds a Hyperlink widget, and the
    result is wrapped in a left aligned TextEmbed.
    """
    if not has_urwidgets:
        return urwid.Text(("link", text))

    def to_hyperlink(g):
        # g[1] is the URL; the link target is url-encoded, the displayed text is not
        return (len(g[1]), urwid.Filler(Hyperlink(urlencode_url(g[1]), "link", g[1])))

    # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
    TRANSFORM = {re.compile(r'(https?://[^\s]+)'): to_hyperlink}

    embedded = parse_text(
        text,
        TRANSFORM,
        lambda pattern, groups, span: TRANSFORM[pattern](groups),
    )
    return TextEmbed([embedded], align='left')
def content_generator(self, status, reblogged_by):
if reblogged_by:
text = "{} boosted".format(reblogged_by.display_name or reblogged_by.username)
@ -340,8 +356,12 @@ class StatusDetails(urwid.Pile):
yield ("pack", urwid.Text(("content_warning", "Marked as sensitive. Press S to view.")))
else:
content = status.original.translation if status.original.show_translation else status.data["content"]
for line in format_content(content):
yield ("pack", urwid.Text(highlight_hashtags(line, self.followed_tags)))
parser = ContentParser()
widgetlist = parser.html_to_widgets(content)
for line in widgetlist:
yield (line)
media = status.data["media_attachments"]
if media:
@ -350,7 +370,7 @@ class StatusDetails(urwid.Pile):
yield ("pack", urwid.Text([("bold", "Media attachment"), " (", m["type"], ")"]))
if m["description"]:
yield ("pack", urwid.Text(m["description"]))
yield ("pack", urwid.Text(("link", m["url"])))
yield ("pack", self.linkify_content(m["url"]))
poll = status.original.data.get("poll")
if poll:
@ -410,7 +430,7 @@ class StatusDetails(urwid.Pile):
if card["description"]:
yield urwid.Text(card["description"].strip())
yield urwid.Text("")
yield urwid.Text(("link", card["url"]))
yield self.linkify_content(card["url"])
def poll_generator(self, poll):
for idx, option in enumerate(poll["options"]):

8
toot/tui/urwidgets.py Normal file
View File

@ -0,0 +1,8 @@
# If urwidgets is loaded use it; otherwise use our stubs
try:
from urwidgets import Hyperlink, TextEmbed, parse_text # noqa: F401
has_urwidgets = True
except ImportError:
from .stub_hyperlink import Hyperlink # noqa: F401
from .stub_text_embed import TextEmbed, parse_text # noqa: F401
has_urwidgets = False

View File

@ -35,21 +35,6 @@ def highlight_keys(text, high_attr, low_attr=""):
return list(_gen())
def highlight_hashtags(line, followed_tags, attr="hashtag", followed_attr="hashtag_followed"):
hline = []
for p in re.split(HASHTAG_PATTERN, line):
if p.startswith("#"):
if p[1:].lower() in (t.lower() for t in followed_tags):
hline.append((followed_attr, p))
else:
hline.append((attr, p))
else:
hline.append(p)
return hline
def show_media(paths):
"""
Attempt to open an image viewer to show given media files.

View File

@ -10,6 +10,7 @@ from bs4 import BeautifulSoup
from typing import Dict
from toot.exceptions import ConsoleError
from urllib.parse import urlparse, urlencode, quote, unquote
def str_bool(b):
@ -186,3 +187,14 @@ def _warn_scheme_deprecated():
"instead write:",
" toot instance http://unsafehost.com\n"
]))
def urlencode_url(url):
    """Return url with its path and query string percent-encoded.

    Existing percent-escapes are decoded before encoding, so a URL that
    is already encoded is not encoded twice.  Scheme, host, path
    parameters (";params") and fragment are left as they are.

    In the query string "+" is read as a space and written back as
    "%20"; a field without "=" is written back with an empty value
    ("?flag" becomes "?flag=").
    """
    from urllib.parse import parse_qsl

    parsed_url = urlparse(url)

    # unencode before encoding, to prevent double-urlencoding
    encoded_path = quote(unquote(parsed_url.path), safe="-._~()'!*:@,;+&=/")

    # parse_qsl decodes every key and value; urlencode then encodes each
    # of them exactly once.  A list of pairs keeps repeated keys and order.
    query_pairs = parse_qsl(parsed_url.query, keep_blank_values=True)
    encoded_query = urlencode(query_pairs, safe="-._~()'!*:@,;?/", quote_via=quote)

    return parsed_url._replace(path=encoded_path, query=encoded_query).geturl()