Use Pandoc to render markdown, fall back to plaintext

The Pandoc renderer is also used for markdown rendering in the console and for copy-to-clipboard
2023-11-23 11:10:02 -05:00 · 2023-11-23 11:10:02 -05:00 · 8cb294f3c8
parent 7443d3e0b5
commit 8cb294f3c8
8 changed files with 85 additions and 247 deletions
--- a/setup.py
+++ b/setup.py
@@ -39,12 +39,15 @@ setup(
        "wcwidth>=0.1.7",
        "urwid>=2.0.0,<3.0",
        "tomlkit>=0.10.0,<1.0",
-        "html2text>=2020.1.16"
    ],
    extras_require={
        # Required to display rich text in the TUI
        "richtext": [
-            "urwidgets>=0.1,<0.2"
+            "urwidgets>=0.1,<0.2",
+        ],
+        "markdown": [
+            "pypandoc>=1.12.0,<2.0",
+            "pypandoc-binary>=1.12.0,<2.0",
        ],
        "dev": [
            "coverage",
--- a/tests/test_console.py
+++ b/tests/test_console.py
@@ -152,210 +152,6 @@ def test_timeline(mock_get, monkeypatch, capsys):
    assert err == ""


-@mock.patch('toot.http.get')
-def test_timeline_html_content(mock_get, monkeypatch, capsys):
-    mock_get.return_value = MockResponse([{
-        'id': '111111111111111111',
-        'account': {
-            'display_name': 'Frank Zappa 🎸',
-            'acct': 'fz'
-        },
-        'created_at': '2017-04-12T15:53:18.174Z',
-        'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
-        'reblog': None,
-        'in_reply_to_id': None,
-        'media_attachments': [],
-    }])
-
-    console.run_command(app, user, 'timeline', ['--once'])
-
-    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
-
-    out, err = capsys.readouterr()
-    lines = out.split("\n")  
-    reference = [
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "Frank Zappa 🎸 @fz                                                              2017-04-12 15:53 UTC",
-        "",
-        "## HTML Render Test",
-        "",
-        " _emphasized_  ",
-        " _underlined_  ",
-        " **bold**  ",
-        " ** _bold and italic_**  ",
-        " ~~strikethrough~~  ",
-        "regular text",
-        "",
-        "Code block:",
-        "",
-        "    ",
-        "    10 PRINT \"HELLO WORLD\"  ",
-        "    20 GOTO 10  ",
-        "    ",
-        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
-        "  1. List item",
-        "    • Nested item",
-        "    • Another nested ",
-        "  2. Another list item. ",
-        "    1. Something else nested",
-        "    2. And a last nested",
-        "",
-        "> Blockquote",
-        ">   1. List in BQ",
-        ">   2. List item 2 in BQ",
-        ">",
-        "",
-        "#hashtag #test  ",
-        "https://a.com text after link",
-        "",
-        "ID 111111111111111111   ",
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "",
-    ]
-
-    assert len(lines) == len(reference)
-    for index, line in enumerate(lines):
-        assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
-
-    assert err == ""
-
-
-@mock.patch('toot.http.get')
-def test_timeline_html_content(mock_get, monkeypatch, capsys):
-    mock_get.return_value = MockResponse([{
-        'id': '111111111111111111',
-        'account': {
-            'display_name': 'Frank Zappa 🎸',
-            'acct': 'fz'
-        },
-        'created_at': '2017-04-12T15:53:18.174Z',
-        'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
-        'reblog': None,
-        'in_reply_to_id': None,
-        'media_attachments': [],
-    }])
-
-    console.run_command(app, user, 'timeline', ['--once'])
-
-    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
-
-    out, err = capsys.readouterr()
-    lines = out.split("\n")  
-    reference = [
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "Frank Zappa 🎸 @fz                                                              2017-04-12 15:53 UTC",
-        "",
-        "## HTML Render Test",
-        "",
-        " _emphasized_  ",
-        " _underlined_  ",
-        " **bold**  ",
-        " ** _bold and italic_**  ",
-        " ~~strikethrough~~  ",
-        "regular text",
-        "",
-        "Code block:",
-        "",
-        "    ",
-        "    10 PRINT \"HELLO WORLD\"  ",
-        "    20 GOTO 10  ",
-        "    ",
-        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
-        "  1. List item",
-        "    • Nested item",
-        "    • Another nested ",
-        "  2. Another list item. ",
-        "    1. Something else nested",
-        "    2. And a last nested",
-        "",
-        "> Blockquote",
-        ">   1. List in BQ",
-        ">   2. List item 2 in BQ",
-        ">",
-        "",
-        "#hashtag #test  ",
-        "https://a.com text after link",
-        "",
-        "ID 111111111111111111   ",
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "",
-    ]
-
-    assert len(lines) == len(reference)
-    for index, line in enumerate(lines):
-        assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
-
-    assert err == ""
-
-
-@mock.patch('toot.http.get')
-def test_timeline_html_content(mock_get, monkeypatch, capsys):
-    mock_get.return_value = MockResponse([{
-        'id': '111111111111111111',
-        'account': {
-            'display_name': 'Frank Zappa 🎸',
-            'acct': 'fz'
-        },
-        'created_at': '2017-04-12T15:53:18.174Z',
-        'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
-        'reblog': None,
-        'in_reply_to_id': None,
-        'media_attachments': [],
-    }])
-
-    console.run_command(app, user, 'timeline', ['--once'])
-
-    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
-
-    out, err = capsys.readouterr()
-    lines = out.split("\n")
-    reference = [
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "Frank Zappa 🎸 @fz                                                              2017-04-12 15:53 UTC",
-        "",
-        "## HTML Render Test",
-        "",
-        " _emphasized_  ",
-        " _underlined_  ",
-        " **bold**  ",
-        " ** _bold and italic_**  ",
-        " ~~strikethrough~~  ",
-        "regular text",
-        "",
-        "Code block:",
-        "",
-        "    ",
-        "    10 PRINT \"HELLO WORLD\"  ",
-        "    20 GOTO 10  ",
-        "    ",
-        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
-        "  1. List item",
-        "    • Nested item",
-        "    • Another nested ",
-        "  2. Another list item. ",
-        "    1. Something else nested",
-        "    2. And a last nested",
-        "",
-        "> Blockquote",
-        ">   1. List in BQ",
-        ">   2. List item 2 in BQ",
-        ">",
-        "",
-        "#hashtag #test  ",
-        "https://a.com text after link",
-        "",
-        "ID 111111111111111111   ",
-        "────────────────────────────────────────────────────────────────────────────────────────────────────",
-        "",
-    ]
-
-    assert len(lines) == len(reference)
-    for index, line in enumerate(lines):
-        assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
-
-    assert err == ""
-
-
@mock.patch('toot.http.get')
 def test_timeline_with_re(mock_get, monkeypatch, capsys):
    mock_get.return_value = MockResponse([{
--- a/toot/output.py
+++ b/toot/output.py
@@ -2,11 +2,11 @@ import os
 import re
 import sys
 import textwrap
-import html2text

 from functools import lru_cache
 from toot import settings
 from toot.utils import get_text
+from toot.richtext import html_to_text
 from toot.entities import Account, Instance, Notification, Poll, Status
 from toot.wcstring import wc_wrap
 from typing import List
@@ -321,20 +321,9 @@ def print_status(status: Status, width: int = 80):


 def print_html(text, width=80):
-    h2t = html2text.HTML2Text()
-
-    h2t.body_width = width
-    h2t.single_line_break = True
-    h2t.ignore_links = True
-    h2t.wrap_links = True
-    h2t.wrap_list_items = True
-    h2t.wrap_tables = True
-    h2t.unicode_snob = True
-    h2t.ul_item_mark = "\N{bullet}"
-    markdown = h2t.handle(text).strip()
-
+    markdown = "\n".join(html_to_text(text, columns=width, highlight_tags=False))
    print_out("")
-    print_out(highlight_hashtags(markdown))
+    print_out(markdown)


 def print_poll(poll: Poll):
--- a/toot/richtext/__init__.py
+++ b/toot/richtext/__init__.py
@@ -0,0 +1,25 @@
+from toot.tui.utils import highlight_hashtags
+from toot.utils import html_to_paragraphs
+from toot.wcstring import wc_wrap
+from typing import List
+
+try:
+    # Preferred: render markup through pypandoc
+    from .markdown import html_to_text
+
+except ImportError:
+    # The markdown renderer could not be imported; degrade to plain text
+    def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
+        """Return `html` as plain-text lines wrapped to `columns`.
+
+        Paragraphs are separated by an empty line; each wrapped line is passed
+        through highlight_hashtags when `highlight_tags` is true.
+        """
+        decorate = highlight_hashtags if highlight_tags else (lambda text: text)
+        lines = []
+        for index, paragraph in enumerate(html_to_paragraphs(html)):
+            if index > 0:
+                lines.append("")
+            for line in paragraph:
+                lines.extend(decorate(part) for part in wc_wrap(line, columns))
+        return lines
--- a/toot/richtext/markdown.py
+++ b/toot/richtext/markdown.py
@@ -0,0 +1,11 @@
+from pypandoc import convert_text
+from typing import List
+
+
+def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
+    """Render HTML as GFM wrapped at `columns`; return the lines of the result."""
+    # NOTE: `highlight_tags` is accepted for call compatibility but not applied.
+    args = ["--wrap=auto", f"--columns={columns}"]
+    markdown = convert_text(html, format="html", to="gfm-raw_html", extra_args=args)
+    # One entry per line, without pandoc's final newline: callers "\n".join() this.
+    return markdown.splitlines()
--- a/toot/tui/app.py
+++ b/toot/tui/app.py
@@ -1,13 +1,13 @@
 import logging
 import subprocess
 import urwid
-import html2text

 from concurrent.futures import ThreadPoolExecutor

 from toot import api, config, __version__, settings
 from toot.console import get_default_visibility
 from toot.exceptions import ApiError
+from toot.richtext import html_to_text
 from toot.utils.datetime import parse_datetime

 from .compose import StatusComposer
@@ -656,12 +656,8 @@ class TUI(urwid.Frame):
        return self.run_in_thread(_delete, done_callback=_done)

    def copy_status(self, status):
-        h2t = html2text.HTML2Text()
-        h2t.body_width = 0  # nowrap
-        h2t.single_line_break = True
-        h2t.ignore_links = True
-        h2t.unicode_snob = True
-        h2t.ul_item_mark = "\N{bullet}"
+
+        markdown = "\n".join(html_to_text(status.original.data["content"], columns=1024, highlight_tags=False))

        time = parse_datetime(status.original.data['created_at'])
        time = time.strftime('%Y-%m-%d %H:%M %Z')
@@ -671,7 +667,7 @@ class TUI(urwid.Frame):
            + "\n"
            + (status.original.author.account or "")
            + "\n\n"
-            + h2t.handle(status.original.data["content"]).strip()
+            + markdown
            + "\n\n"
            + f"Created at: {time}")

--- a/toot/tui/richtext/__init__.py
+++ b/toot/tui/richtext/__init__.py
@@ -1,27 +1,24 @@
 import urwid
-import html2text
-
+from toot.tui.utils import highlight_hashtags
+from toot.utils import format_content
 from typing import List

 try:
+    # our first preference is to render using urwidgets
    from .richtext import html_to_widgets, url_to_widget
+
 except ImportError:
-    # Fallback if urwidgets are not available
-    def html_to_widgets(html: str) -> List[urwid.Widget]:
-        return [
-            urwid.Text(_format_markdown(html))
-        ]
+    try:
+        # second preference, render markup with pypandoc
+        from .markdown import html_to_widgets, url_to_widget

-    def url_to_widget(url: str):
-        return urwid.Text(("link", url))
+    except ImportError:
+        # Fallback to render in plaintext

-    def _format_markdown(html) -> str:
-        h2t = html2text.HTML2Text()
-        h2t.single_line_break = True
-        h2t.ignore_links = True
-        h2t.wrap_links = False
-        h2t.wrap_list_items = False
-        h2t.wrap_tables = False
-        h2t.unicode_snob = True
-        h2t.ul_item_mark = "\N{bullet}"
-        return h2t.handle(html).strip()
+        def url_to_widget(url: str):
+            return urwid.Text(("link", url))
+
+        def html_to_widgets(html: str) -> List[urwid.Widget]:
+            return [
+                urwid.Text(highlight_hashtags(line)) for line in format_content(html)
+            ]
--- a/toot/tui/richtext/markdown.py
+++ b/toot/tui/richtext/markdown.py
@@ -0,0 +1,21 @@
+import urwid
+from pypandoc import convert_text
+
+from typing import List
+
+
+def url_to_widget(url: str):  # wrap `url` in a Text widget using the "link" markup name
+    return urwid.Text(("link", url))
+
+
+def html_to_widgets(html: str) -> List[urwid.Widget]:
+    """Render HTML as unwrapped GitHub-flavoured markdown in a single Text widget."""
+    markdown = convert_text(
+        html,
+        format="html",
+        to="gfm-raw_html",
+        extra_args=["--wrap=none"],
+    )
+    # Trim surrounding whitespace, as the html2text renderer this replaces did;
+    # otherwise pandoc's final newline would become part of the widget text.
+    return [urwid.Text(markdown.strip())]