import re
import urwid
import unicodedata

from bs4.element import NavigableString, Tag
from toot.tui.constants import PALETTE
from toot.utils import parse_html, urlencode_url
from typing import List, Tuple
from urwid.util import decompose_tagmarkup
from urwidgets import Hyperlink, TextEmbed

# names of all palette entries; used to decide whether a CSS class
# can be used directly as an urwid attribute name
STYLE_NAMES = [p[0] for p in PALETTE]

# NOTE: update this list if Mastodon starts supporting more block tags
BLOCK_TAGS = ["p", "pre", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"]


def html_to_widgets(html, recovery_attempt=False) -> List[urwid.Widget]:
    """Convert html to urwid widgets

    The HTML is NFKC-normalized, parsed, and each top level element is
    rendered to a widget. Top level widgets are separated by a blank
    Divider line (no trailing Divider is returned).

    If the content does not start with a block tag (see BLOCK_TAGS), it is
    wrapped in <p></p> and converted again, once; `recovery_attempt` marks
    that second pass so that the wrapping cannot recurse indefinitely.
    """
    widgets: List[urwid.Widget] = []
    html = unicodedata.normalize("NFKC", html)
    soup = parse_html(html)
    first_tag = True
    for e in soup.body or soup:
        if isinstance(e, NavigableString):
            if first_tag and not recovery_attempt:
                # if our first "tag" is a navigable string
                # the HTML is out of spec, doesn't start with a tag,
                # we see this in content from Pixelfed servers.
                # attempt a fix by wrapping the HTML with <p></p>
                return html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)
            else:
                # stray top level text after the first tag is dropped
                continue
        else:
            name = e.name
            # if our HTML starts with a tag, but not a block tag
            # the HTML is out of spec. Attempt a fix by wrapping the
            # HTML with <p></p>
            if (first_tag and not recovery_attempt and name not in BLOCK_TAGS):
                return html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)

            markup = render(name, e)
            first_tag = False

            if not isinstance(markup, urwid.Widget):
                # plaintext, so create a padded text widget
                txt = text_to_widget("", markup)
                markup = urwid.Padding(
                    txt,
                    align="left",
                    width=("relative", 100),
                    min_width=None,
                )
            widgets.append(markup)
            # separate top level widgets with a blank line
            widgets.append(urwid.Divider(" "))
    return widgets[:-1]  # but suppress the last blank line


def url_to_widget(url: str):
    """Wrap a bare URL in a TextEmbed, as a Hyperlink when possible"""
    try:
        widget = len(url), urwid.Filler(Hyperlink(url, "link", url))
    except ValueError:
        # Hyperlink rejected the URL; show it as plain text instead
        widget = len(url), urwid.Filler(urwid.Text(url))  # don't style as link
    return TextEmbed(widget)


def inline_tag_to_text(tag) -> Tuple:
    """Convert html tag to plain text with tag as attributes recursively"""
    markups = process_inline_tag_children(tag)
    if not markups:
        return (tag.name, "")
    return (tag.name, markups)


def process_inline_tag_children(tag) -> List:
    """Recursively retrieve all children
    and convert to a list of markup text"""
    markups = []
    for child in tag.children:
        if isinstance(child, Tag):
            markup = render(child.name, child)
            markups.append(markup)
        else:
            # plain text node; used as-is
            markups.append(child)
    return markups


# anchor text as produced by render_anchor: "<title>\x03<href>"
URL_PATTERN = re.compile(r"(^.+)\x03(.+$)")


def text_to_widget(attr, markup) -> urwid.Widget:
    """Convert a list of markup runs to a TextEmbed widget

    Tuple runs whose text contains an ETX separated title and href are
    replaced with an embedded Hyperlink widget; all other runs are passed
    through unchanged. The `attr` parameter is currently unused.
    """
    markup_list = []
    for run in markup:
        if isinstance(run, tuple):
            txt, attr_list = decompose_tagmarkup(run)
            # find anchor titles with an ETX separator followed by href
            match = URL_PATTERN.match(txt)
            if match:
                label, url = match.groups()
                anchor_attr = get_best_anchor_attr(attr_list)
                try:
                    markup_list.append((
                        len(label),
                        urwid.Filler(Hyperlink(url, anchor_attr, label)),
                    ))
                except ValueError:
                    markup_list.append((
                        len(label),
                        urwid.Filler(urwid.Text(url)),  # don't style as link
                    ))
            else:
                markup_list.append(run)
        else:
            markup_list.append(run)

    return TextEmbed(markup_list)


def process_block_tag_children(tag) -> List[urwid.Widget]:
    """Recursively retrieve all children
    and convert to a list of widgets
    any inline tags containing text will be
    converted to Text widgets

    Markup seen before the first nested widget is collected into one text
    widget, markup seen after it into another; the result is ordered as
    [leading text widget, nested widgets..., trailing text widget].
    """

    pre_widget_markups = []
    post_widget_markups = []
    child_widgets = []
    found_nested_widget = False

    for child in tag.children:
        if isinstance(child, Tag):
            # child is a nested tag; process using custom method
            # or default to inline_tag_to_text
            result = render(child.name, child)
            if isinstance(result, urwid.Widget):
                found_nested_widget = True
                child_widgets.append(result)
            else:
                if not found_nested_widget:
                    pre_widget_markups.append(result)
                else:
                    post_widget_markups.append(result)
        else:
            # child is text; append to the appropriate markup list
            if not found_nested_widget:
                pre_widget_markups.append(child)
            else:
                post_widget_markups.append(child)

    widget_list = []
    if pre_widget_markups:
        widget_list.append(text_to_widget(tag.name, pre_widget_markups))

    if child_widgets:
        widget_list += child_widgets

    if post_widget_markups:
        widget_list.append(text_to_widget(tag.name, post_widget_markups))

    return widget_list


def get_urwid_attr_name(tag) -> str:
    """Get the class name and translate to a
    name suitable for use as an urwid
    text attribute name"""

    if "class" in tag.attrs:
        clss = tag.attrs["class"]
        if len(clss) > 0:
            style_name = "class_" + "_".join(clss)
            # return the class name, only if we
            # find it as a defined palette name
            if style_name in STYLE_NAMES:
                return style_name

    # fallback to returning the tag name
    return tag.name


def basic_block_tag_handler(tag) -> urwid.Widget:
    """default for block tags that need no special treatment"""
    return urwid.Pile(process_block_tag_children(tag))


def get_best_anchor_attr(attrib_list) -> str:
    """Pick the attribute name used to style an anchor

    Returns "" for an empty list, the first hashtag/mention class
    attribute found in `attrib_list`, or "a" when none is found.
    """
    if not attrib_list:
        return ""
    # flatten() only descends into tuples, so when attrib_list is a list
    # it comes back as the single item flat_al[0]
    flat_al = list(flatten(attrib_list))

    for a in flat_al[0]:
        # ref: https://docs.joinmastodon.org/spec/activitypub/
        # these are the class names (translated to attrib names)
        # that we can support for display

        try:
            if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]:
                return a[0]
        except KeyError:
            # NOTE(review): presumably raised when the entry is the Tag
            # that render_anchor substitutes for an empty attribute
            # list -- confirm
            continue

    return "a"


def render(attr: str, content: Tag):
    """Dispatch a tag to its handler

    `attr` is the tag name and `content` the tag itself. Returns either
    an urwid widget (block level tags) or a markup tuple (inline tags).
    """
    if attr in ["a"]:
        return render_anchor(content)

    if attr in ["blockquote"]:
        return render_blockquote(content)

    if attr in ["br"]:
        return render_br(content)

    if attr in ["em"]:
        return render_em(content)

    if attr in ["ol"]:
        return render_ol(content)

    if attr in ["pre"]:
        return render_pre(content)

    if attr in ["span"]:
        return render_span(content)

    if attr in ["b", "strong"]:
        return render_strong(content)

    if attr in ["ul"]:
        return render_ul(content)

    # Glitch-soc and Pleroma allow <h1>...<h6> in content
    # Mastodon (PR #23913) does not; header tags are converted to
    # <p><strong></strong></p>

    if attr in ["p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"]:
        return basic_block_tag_handler(content)

    # Fall back to inline_tag_to_text handler
    return inline_tag_to_text(content)


def render_anchor(tag) -> Tuple:
    """anchor tag handler

    Returns (attribute name, title). When the anchor has an href, the
    url-encoded href is appended to the title after an ETX character so
    that text_to_widget can later turn it into a hyperlink.
    """

    markups = process_inline_tag_children(tag)
    if not markups:
        return (tag.name, "")

    # an anchor without an href is rendered as plain styled text
    # instead of raising KeyError
    href = tag.attrs.get("href")
    title, attrib_list = decompose_tagmarkup(markups)
    if not attrib_list:
        attrib_list = [tag]
    if href:
        # urlencode the path and query portions of the URL
        href = urlencode_url(href)
        # use ASCII ETX (end of text) as a
        # delimiter between the title and the HREF
        title += f"\x03{href}"

    attr = get_best_anchor_attr(attrib_list)

    if attr == "a":
        # didn't find an attribute to use
        # in the child markup, so let's
        # try the anchor tag's own attributes

        attr = get_urwid_attr_name(tag)

    # hashtag anchors have a class of "mention hashtag"
    # or "hashtag"
    # we'll return style "class_mention_hashtag"
    # or "class_hashtag"
    # in that case; see corresponding palette entry
    # in constants.py controlling hashtag highlighting

    return (attr, title)


def render_blockquote(tag) -> urwid.Widget:
    """blockquote tag handler; draws a vertical bar left of the content"""
    widget_list = process_block_tag_children(tag)
    blockquote_widget = urwid.LineBox(
        urwid.Padding(
            urwid.Pile(widget_list),
            align="left",
            width=("relative", 100),
            min_width=None,
            left=1,
            right=1,
        ),
        # only the left line of the box is drawn
        tlcorner="",
        tline="",
        lline="│",
        trcorner="",
        blcorner="",
        rline="",
        bline="",
        brcorner="",
    )
    return urwid.Pile([urwid.AttrMap(blockquote_widget, "blockquote")])


def render_br(tag) -> Tuple:
    """line break tag handler"""
    return ("br", "\n")


def render_em(tag) -> Tuple:
    """emphasis tag handler"""
    # to simplify the number of palette entries
    # translate EM to I (italic)
    markups = process_inline_tag_children(tag)
    if not markups:
        return ("i", "")

    # special case processing for bold and italic
    for parent in tag.parents:
        if parent.name == "b" or parent.name == "strong":
            return ("bi", markups)

    return ("i", markups)


def render_ol(tag) -> urwid.Widget:
    """ordered list tag handler

    Honors the ol start= and reversed attributes and the li value=
    attribute; attribute values that are not integers are ignored.
    """

    widgets = []
    items = tag.find_all("li", recursive=False)
    is_reversed = tag.has_attr("reversed")
    increment = -1 if is_reversed else 1

    # a reversed list without a usable start= attribute
    # counts down from the number of items, not from 1
    list_item_num = len(items) if is_reversed else 1

    # get ol start= attribute if present
    if tag.has_attr("start") and len(tag.attrs["start"]) > 0:
        try:
            list_item_num = int(tag.attrs["start"])
        except ValueError:
            pass

    for li in items:
        markup = render("li", li)
        # li value= attribute will change the item number
        # it also overrides any ol start= attribute

        if li.has_attr("value") and len(li.attrs["value"]) > 0:
            try:
                list_item_num = int(li.attrs["value"])
            except ValueError:
                pass

        if not isinstance(markup, urwid.Widget):
            txt = text_to_widget("li", [str(list_item_num), ". ", markup])
            # 1. foo, 2. bar, etc.
            widgets.append(txt)
        else:
            txt = text_to_widget("li", [str(list_item_num), ". "])
            columns = urwid.Columns(
                [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
            )
            widgets.append(columns)

        list_item_num += increment

    return urwid.Pile(widgets)


def render_pre(tag) -> urwid.Widget:
    """preformatted text tag handler"""
    # <PRE> tag spec says that text should not wrap,
    # but horizontal screen space is at a premium
    # and we have no horizontal scroll bar, so allow
    # wrapping.

    widget_list = [urwid.Divider(" ")]
    widget_list += process_block_tag_children(tag)

    pre_widget = urwid.Padding(
        urwid.Pile(widget_list),
        align="left",
        width=("relative", 100),
        min_width=None,
        left=1,
        right=1,
    )
    return urwid.Pile([urwid.AttrMap(pre_widget, "pre")])


def render_span(tag) -> Tuple:
    """span tag handler

    Returns (attribute name, markup). A span uses its own class when that
    class translates to a palette entry; otherwise it takes on the
    attribute name of its parent tag, or "span" if it has no parent.
    """
    children = process_inline_tag_children(tag)
    if not children:
        return (tag.name, "")

    # NOTE: HTML marked with the "invisible" class (generally the
    # http:// prefix of URLs) is currently shown. It could be hidden by
    # returning (tag.name, "") when "invisible" is in tag.attrs["class"].
    # That might be a user preference; it's only advisable if the
    # terminal supports OSC 8 hyperlinks (and that's not automatically
    # detectable)

    if "class" in tag.attrs:
        own_style = get_urwid_attr_name(tag)
        if own_style != "span":
            # unique class name matches an entry in our palette
            return (own_style, children)

    if not tag.parent:
        # fallback
        return ("span", children)

    # no usable class of its own, so inherit from the parent
    return (get_urwid_attr_name(tag.parent), children)


def render_strong(tag) -> Tuple:
    """strong / bold tag handler

    STRONG is translated to B (bold) to keep the number of palette
    entries down. Returns ("bi", markup) when any ancestor is an
    italic tag, otherwise ("b", markup).
    """
    children = process_inline_tag_children(tag)
    if not children:
        return ("b", "")

    # bold nested anywhere inside italic gets the combined style
    inside_italic = any(
        ancestor.name in ("i", "em") for ancestor in tag.parents
    )
    return ("bi" if inside_italic else "b", children)


def render_ul(tag) -> urwid.Widget:
    """unordered list tag handler

    Each direct <li> child becomes one row of the returned Pile,
    prefixed with a bullet character.
    """
    rows = []

    for item in tag.find_all("li", recursive=False):
        rendered = render("li", item)

        if isinstance(rendered, urwid.Widget):
            # widget content: put the bullet in its own narrow column
            bullet = text_to_widget("li", ["\N{bullet} "])
            rows.append(
                urwid.Columns(
                    [bullet, ("weight", 9999, rendered)],
                    dividechars=1,
                    min_width=3,
                )
            )
        else:
            # plain markup: * foo, * bar, etc.
            rows.append(text_to_widget("li", ["\N{bullet} ", rendered]))

    return urwid.Pile(rows)


def flatten(data):
    """Yield the leaves of arbitrarily nested tuples, depth first

    Only tuples are descended into; any other value (including a list)
    is yielded unchanged as a single item.
    """
    if not isinstance(data, tuple):
        yield data
        return

    for item in data:
        yield from flatten(item)