Toot-Mastodon-CLI-TUI-clien.../toot/tui/richtext.py

import re
import urwid
import unicodedata

from bs4.element import NavigableString, Tag
from toot.tui.constants import PALETTE
from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
from toot.utils import parse_html, urlencode_url
from typing import List, Tuple
from urwid.util import decompose_tagmarkup


STYLE_NAMES = [p[0] for p in PALETTE]


class ContentParser:
    """Parse a limited subset of HTML and create urwid widgets."""

    def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]:
        """Convert html to urwid widgets"""
        widgets: List[urwid.Widget] = []
        html = unicodedata.normalize("NFKC", html)
        soup = parse_html(html)
        first_tag = True
        for e in soup.body or soup:
            if isinstance(e, NavigableString):
                if first_tag and not recovery_attempt:
                    # if our first "tag" is a navigable string
                    # the HTML is out of spec, doesn't start with a tag,
                    # we see this in content from Pixelfed servers.
                    # attempt a fix by wrapping the HTML with <p></p>
                    return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)
                else:
                    continue
            else:
                name = e.name
                # if our HTML starts with a tag, but not a block tag
                # the HTML is out of spec. Attempt a fix by wrapping the
                # HTML with <p></p>
                if (
                    first_tag
                    and not recovery_attempt
                    and name
                    not in (
                        "p",
                        "pre",
                        "li",
                        "blockquote",
                        "h1",
                        "h2",
                        "h3",
                        "h4",
                        "h5",
                        "h6",
                    )  # NOTE: update this list if Mastodon starts supporting more block tags
                ):
                    return self.html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)

                # First, look for a custom tag handler method in this class
                # If that fails, fall back to inline_tag_to_text handler
                method = getattr(self, "_" + name, self.inline_tag_to_text)
                markup = method(e)  # either returns a Widget, or plain text
                first_tag = False

            if not isinstance(markup, urwid.Widget):
                # plaintext, so create a padded text widget
                txt = self.text_to_widget("", markup)
                markup = urwid.Padding(
                    txt,
                    align="left",
                    width=("relative", 100),
                    min_width=None,
                )
            widgets.append(markup)
            # separate top level widgets with a blank line
            widgets.append(urwid.Divider(" "))
        return widgets[:-1]  # but suppress the last blank line

    def inline_tag_to_text(self, tag) -> Tuple:
        """Convert html tag to plain text with tag as attributes recursively"""
        markups = self.process_inline_tag_children(tag)
        if not markups:
            return (tag.name, "")
        return (tag.name, markups)

    def process_inline_tag_children(self, tag) -> List:
        """Recursively retrieve all children
        and convert to a list of markup text"""
        markups = []
        for child in tag.children:
            if isinstance(child, Tag):
                method = getattr(self, "_" + child.name, self.inline_tag_to_text)
                markup = method(child)
                markups.append(markup)
            else:
                markups.append(child)
        return markups

    def text_to_widget(self, attr, markup) -> urwid.Widget:
        if not has_urwidgets:
            return urwid.Text((attr, markup))

        TRANSFORM = {
            # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
            re.compile(r"(^.+)\x03(.+$)"): lambda g: (
                len(g[1]),
                urwid.Filler(Hyperlink(g[2], anchor_attr, g[1])),
            ),
        }
        markup_list = []

        for run in markup:
            if isinstance(run, tuple):
                txt, attr_list = decompose_tagmarkup(run)
                # find anchor titles with an ETX separator followed by href
                m = re.match(r"(^.+)\x03(.+$)", txt)
                if m:
                    anchor_attr = self.get_best_anchor_attr(attr_list)
                    markup_list.append(
                        parse_text(
                            txt,
                            TRANSFORM,
                            lambda pattern, groups, span: TRANSFORM[pattern](groups),
                        )
                    )
                else:
                    markup_list.append(run)
            else:
                markup_list.append(run)

        return TextEmbed(markup_list)

    def process_block_tag_children(self, tag) -> List[urwid.Widget]:
        """Recursively retrieve all children
        and convert to a list of widgets
        any inline tags containing text will be
        converted to Text widgets"""

        pre_widget_markups = []
        post_widget_markups = []
        child_widgets = []
        found_nested_widget = False

        for child in tag.children:
            if isinstance(child, Tag):
                # child is a nested tag; process using custom method
                # or default to inline_tag_to_text
                method = getattr(self, "_" + child.name, self.inline_tag_to_text)
                result = method(child)
                if isinstance(result, urwid.Widget):
                    found_nested_widget = True
                    child_widgets.append(result)
                else:
                    if not found_nested_widget:
                        pre_widget_markups.append(result)
                    else:
                        post_widget_markups.append(result)
            else:
                # child is text; append to the appropriate markup list
                if not found_nested_widget:
                    pre_widget_markups.append(child)
                else:
                    post_widget_markups.append(child)

        widget_list = []
        if len(pre_widget_markups):
            widget_list.append(self.text_to_widget(tag.name, pre_widget_markups))

        if len(child_widgets):
            widget_list += child_widgets

        if len(post_widget_markups):
            widget_list.append(self.text_to_widget(tag.name, post_widget_markups))

        return widget_list

    def get_urwid_attr_name(self, tag) -> str:
        """Get the class name and translate to a
        name suitable for use as an urwid
        text attribute name"""

        if "class" in tag.attrs:
            clss = tag.attrs["class"]
            if len(clss) > 0:
                style_name = "class_" + "_".join(clss)
                # return the class name, only if we
                # find it as a defined palette name
                if style_name in STYLE_NAMES:
                    return style_name

        # fallback to returning the tag name
        return tag.name

    # Tag handlers start here.
    # Tags not explicitly listed are "supported" by
    # rendering as text.
    # Inline tags return a list of marked up text for urwid.Text
    # Block tags return urwid.Widget

    def basic_block_tag_handler(self, tag) -> urwid.Widget:
        """default for block tags that need no special treatment"""
        return urwid.Pile(self.process_block_tag_children(tag))

    def get_best_anchor_attr(self, attrib_list) -> str:
        if not attrib_list:
            return ""
        flat_al = list(flatten(attrib_list))

        for a in flat_al[0]:
            # ref: https://docs.joinmastodon.org/spec/activitypub/
            # these are the class names (translated to attrib names)
            # that we can support for display

            try:
                if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]:
                    return a[0]
            except KeyError:
                continue

        return "a"

    def _a(self, tag) -> Tuple:
        """anchor tag handler"""

        markups = self.process_inline_tag_children(tag)
        if not markups:
            return (tag.name, "")

        href = tag.attrs["href"]
        title, attrib_list = decompose_tagmarkup(markups)
        if not attrib_list:
            attrib_list = [tag]
        if href and has_urwidgets:
            # only if we have urwidgets loaded for OCS 8 hyperlinks:
            # urlencode the path and query portions of the URL
            href = urlencode_url(href)
            # use ASCII ETX (end of record) as a
            # delimiter between the title and the HREF
            title += f"\x03{href}"

        attr = self.get_best_anchor_attr(attrib_list)

        if attr == "a":
            # didn't find an attribute to use
            # in the child markup, so let's
            # try the anchor tag's own attributes

            attr = self.get_urwid_attr_name(tag)

        # hashtag anchors have a class of "mention hashtag"
        # or "hashtag"
        # we'll return style "class_mention_hashtag"
        # or "class_hashtag"
        # in that case; see corresponding palette entry
        # in constants.py controlling hashtag highlighting

        return (attr, title)

    def _blockquote(self, tag) -> urwid.Widget:
        widget_list = self.process_block_tag_children(tag)
        blockquote_widget = urwid.LineBox(
            urwid.Padding(
                urwid.Pile(widget_list),
                align="left",
                width=("relative", 100),
                min_width=None,
                left=1,
                right=1,
            ),
            tlcorner="",
            tline="",
            lline="│",
            trcorner="",
            blcorner="",
            rline="",
            bline="",
            brcorner="",
        )
        return urwid.Pile([urwid.AttrMap(blockquote_widget, "blockquote")])

    def _br(self, tag) -> Tuple:
        return ("br", "\n")

    def _em(self, tag) -> Tuple:
        # to simplify the number of palette entries
        # translate EM to I (italic)
        markups = self.process_inline_tag_children(tag)
        if not markups:
            return ("i", "")

        # special case processing for bold and italic
        for parent in tag.parents:
            if parent.name == "b" or parent.name == "strong":
                return ("bi", markups)

        return ("i", markups)

    def _ol(self, tag) -> urwid.Widget:
        """ordered list tag handler"""

        widgets = []
        list_item_num = 1
        increment = -1 if tag.has_attr("reversed") else 1

        # get ol start= attribute if present
        if tag.has_attr("start") and len(tag.attrs["start"]) > 0:
            try:
                list_item_num = int(tag.attrs["start"])
            except ValueError:
                pass

        for li in tag.find_all("li", recursive=False):
            method = getattr(self, "_li", self.inline_tag_to_text)
            markup = method(li)

            # li value= attribute will change the item number
            # it also overrides any ol start= attribute

            if li.has_attr("value") and len(li.attrs["value"]) > 0:
                try:
                    list_item_num = int(li.attrs["value"])
                except ValueError:
                    pass

            if not isinstance(markup, urwid.Widget):
                txt = self.text_to_widget("li", [str(list_item_num), ". ", markup])
                # 1. foo, 2. bar, etc.
                widgets.append(txt)
            else:
                txt = self.text_to_widget("li", [str(list_item_num), ". "])
                columns = urwid.Columns(
                    [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
                )
                widgets.append(columns)

            list_item_num += increment

        return urwid.Pile(widgets)

    def _pre(self, tag) -> urwid.Widget:
        # <PRE> tag spec says that text should not wrap,
        # but horizontal screen space is at a premium
        # and we have no horizontal scroll bar, so allow
        # wrapping.

        widget_list = [urwid.Divider(" ")]
        widget_list += self.process_block_tag_children(tag)

        pre_widget = urwid.Padding(
            urwid.Pile(widget_list),
            align="left",
            width=("relative", 100),
            min_width=None,
            left=1,
            right=1,
        )
        return urwid.Pile([urwid.AttrMap(pre_widget, "pre")])

    def _span(self, tag) -> Tuple:
        markups = self.process_inline_tag_children(tag)

        if not markups:
            return (tag.name, "")

        # span inherits its parent's class definition
        # unless it has a specific class definition
        # of its own

        if "class" in tag.attrs:
            # uncomment the following code to hide all HTML marked
            # invisible (generally, the http:// prefix of URLs)
            # could be a user preference, it's only advisable if
            # the terminal supports OCS 8 hyperlinks (and that's not
            # automatically detectable)

            # if "invisible" in tag.attrs["class"]:
            #     return (tag.name, "")

            style_name = self.get_urwid_attr_name(tag)

            if style_name != "span":
                # unique class name matches an entry in our palette
                return (style_name, markups)

        if tag.parent:
            return (self.get_urwid_attr_name(tag.parent), markups)
        else:
            # fallback
            return ("span", markups)

    def _strong(self, tag) -> Tuple:
        # to simplify the number of palette entries
        # translate STRONG to B (bold)
        markups = self.process_inline_tag_children(tag)
        if not markups:
            return ("b", "")

        # special case processing for bold and italic
        for parent in tag.parents:
            if parent.name == "i" or parent.name == "em":
                return ("bi", markups)

        return ("b", markups)

    def _ul(self, tag) -> urwid.Widget:
        """unordered list tag handler"""

        widgets = []

        for li in tag.find_all("li", recursive=False):
            method = getattr(self, "_li", self.inline_tag_to_text)
            markup = method(li)

            if not isinstance(markup, urwid.Widget):
                txt = self.text_to_widget("li", ["\N{bullet} ", markup])
                # * foo, * bar, etc.
                widgets.append(txt)
            else:
                txt = self.text_to_widget("li", ["\N{bullet} "])
                columns = urwid.Columns(
                    [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
                )
                widgets.append(columns)

        return urwid.Pile(widgets)

    # These tags are handled identically to others
    # the only difference being the tag name used for
    # urwid attribute mapping

    _b = _strong

    _div = basic_block_tag_handler

    _i = _em

    _li = basic_block_tag_handler

    # Glitch-soc and Pleroma allow <H1>...<H6> in content
    # Mastodon (PR #23913) does not; header tags are converted to <P><STRONG></STRONG></P>

    _h1 = _h2 = _h3 = _h4 = _h5 = _h6 = basic_block_tag_handler

    _p = basic_block_tag_handler


def flatten(data):
    if isinstance(data, tuple):
        for x in data:
            yield from flatten(x)
    else:
        yield data