Use Pandoc to render markdown, fall back to plaintext

Also used for markdown rendering in the console
and for copy-to-clipboard
This commit is contained in:
Daniel Schwarz 2023-11-23 11:10:02 -05:00
parent 7443d3e0b5
commit 8cb294f3c8
8 changed files with 85 additions and 247 deletions

View File

@ -39,12 +39,15 @@ setup(
"wcwidth>=0.1.7",
"urwid>=2.0.0,<3.0",
"tomlkit>=0.10.0,<1.0",
"html2text>=2020.1.16"
],
extras_require={
# Required to display rich text in the TUI
"richtext": [
"urwidgets>=0.1,<0.2"
"urwidgets>=0.1,<0.2",
],
"markdown": [
"pypandoc>=1.12.0,<2.0",
"pypandoc-binary>=1.12.0,<2.0",
],
"dev": [
"coverage",

View File

@ -152,210 +152,6 @@ def test_timeline(mock_get, monkeypatch, capsys):
assert err == ""
@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
mock_get.return_value = MockResponse([{
'id': '111111111111111111',
'account': {
'display_name': 'Frank Zappa 🎸',
'acct': 'fz'
},
'created_at': '2017-04-12T15:53:18.174Z',
'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
'reblog': None,
'in_reply_to_id': None,
'media_attachments': [],
}])
console.run_command(app, user, 'timeline', ['--once'])
mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
out, err = capsys.readouterr()
lines = out.split("\n")
reference = [
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
"",
"## HTML Render Test",
"",
" _emphasized_ ",
" _underlined_ ",
" **bold** ",
" ** _bold and italic_** ",
" ~~strikethrough~~ ",
"regular text",
"",
"Code block:",
"",
" ",
" 10 PRINT \"HELLO WORLD\" ",
" 20 GOTO 10 ",
" ",
"> Something blockquoted here. The indentation is maintained as the text line wraps.",
" 1. List item",
" • Nested item",
" • Another nested ",
" 2. Another list item. ",
" 1. Something else nested",
" 2. And a last nested",
"",
"> Blockquote",
"> 1. List in BQ",
"> 2. List item 2 in BQ",
">",
"",
"#hashtag #test ",
"https://a.com text after link",
"",
"ID 111111111111111111 ",
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"",
]
assert len(lines) == len(reference)
for index, line in enumerate(lines):
assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
assert err == ""
@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
mock_get.return_value = MockResponse([{
'id': '111111111111111111',
'account': {
'display_name': 'Frank Zappa 🎸',
'acct': 'fz'
},
'created_at': '2017-04-12T15:53:18.174Z',
'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
'reblog': None,
'in_reply_to_id': None,
'media_attachments': [],
}])
console.run_command(app, user, 'timeline', ['--once'])
mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
out, err = capsys.readouterr()
lines = out.split("\n")
reference = [
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
"",
"## HTML Render Test",
"",
" _emphasized_ ",
" _underlined_ ",
" **bold** ",
" ** _bold and italic_** ",
" ~~strikethrough~~ ",
"regular text",
"",
"Code block:",
"",
" ",
" 10 PRINT \"HELLO WORLD\" ",
" 20 GOTO 10 ",
" ",
"> Something blockquoted here. The indentation is maintained as the text line wraps.",
" 1. List item",
" • Nested item",
" • Another nested ",
" 2. Another list item. ",
" 1. Something else nested",
" 2. And a last nested",
"",
"> Blockquote",
"> 1. List in BQ",
"> 2. List item 2 in BQ",
">",
"",
"#hashtag #test ",
"https://a.com text after link",
"",
"ID 111111111111111111 ",
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"",
]
assert len(lines) == len(reference)
for index, line in enumerate(lines):
assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
assert err == ""
@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
mock_get.return_value = MockResponse([{
'id': '111111111111111111',
'account': {
'display_name': 'Frank Zappa 🎸',
'acct': 'fz'
},
'created_at': '2017-04-12T15:53:18.174Z',
'content': "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>",
'reblog': None,
'in_reply_to_id': None,
'media_attachments': [],
}])
console.run_command(app, user, 'timeline', ['--once'])
mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})
out, err = capsys.readouterr()
lines = out.split("\n")
reference = [
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
"",
"## HTML Render Test",
"",
" _emphasized_ ",
" _underlined_ ",
" **bold** ",
" ** _bold and italic_** ",
" ~~strikethrough~~ ",
"regular text",
"",
"Code block:",
"",
" ",
" 10 PRINT \"HELLO WORLD\" ",
" 20 GOTO 10 ",
" ",
"> Something blockquoted here. The indentation is maintained as the text line wraps.",
" 1. List item",
" • Nested item",
" • Another nested ",
" 2. Another list item. ",
" 1. Something else nested",
" 2. And a last nested",
"",
"> Blockquote",
"> 1. List in BQ",
"> 2. List item 2 in BQ",
">",
"",
"#hashtag #test ",
"https://a.com text after link",
"",
"ID 111111111111111111 ",
"────────────────────────────────────────────────────────────────────────────────────────────────────",
"",
]
assert len(lines) == len(reference)
for index, line in enumerate(lines):
assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}"
assert err == ""
@mock.patch('toot.http.get')
def test_timeline_with_re(mock_get, monkeypatch, capsys):
mock_get.return_value = MockResponse([{

View File

@ -2,11 +2,11 @@ import os
import re
import sys
import textwrap
import html2text
from functools import lru_cache
from toot import settings
from toot.utils import get_text
from toot.richtext import html_to_text
from toot.entities import Account, Instance, Notification, Poll, Status
from toot.wcstring import wc_wrap
from typing import List
@ -321,20 +321,9 @@ def print_status(status: Status, width: int = 80):
def print_html(text, width=80):
h2t = html2text.HTML2Text()
h2t.body_width = width
h2t.single_line_break = True
h2t.ignore_links = True
h2t.wrap_links = True
h2t.wrap_list_items = True
h2t.wrap_tables = True
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"
markdown = h2t.handle(text).strip()
markdown = "\n".join(html_to_text(text, columns=width, highlight_tags=False))
print_out("")
print_out(highlight_hashtags(markdown))
print_out(markdown)
def print_poll(poll: Poll):

25
toot/richtext/__init__.py Normal file
View File

@ -0,0 +1,25 @@
from toot.tui.utils import highlight_hashtags
from toot.utils import html_to_paragraphs
from toot.wcstring import wc_wrap
from typing import List
try:
    # Preferred renderer: the pypandoc-backed markdown converter
    from .markdown import html_to_text
except ImportError:
    # The markdown renderer could not be imported; render plaintext instead
    def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
        """Render ``html`` as a list of plaintext lines wrapped to ``columns``.

        The paragraphs produced by ``html_to_paragraphs`` are separated by a
        single empty-string entry. Every line of a paragraph is wrapped with
        ``wc_wrap``; when ``highlight_tags`` is true each wrapped line is
        passed through ``highlight_hashtags`` before being collected.
        """
        rendered = []
        for position, paragraph in enumerate(html_to_paragraphs(html)):
            # Blank separator before every paragraph except the first one
            if position > 0:
                rendered.append("")
            for line in paragraph:
                wrapped = wc_wrap(line, columns)
                if highlight_tags:
                    rendered.extend(highlight_hashtags(piece) for piece in wrapped)
                else:
                    rendered.extend(wrapped)
        return rendered

11
toot/richtext/markdown.py Normal file
View File

@ -0,0 +1,11 @@
from pypandoc import convert_text
from typing import List
def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
    """Convert ``html`` to markdown text using pandoc.

    The input is read as HTML and written as ``gfm-raw_html`` with pandoc's
    automatic wrapping limited to ``columns`` characters per line.

    The whole converted document is returned as the only element of a list.
    ``highlight_tags`` is accepted but not used by this implementation.
    """
    pandoc_options = ["--wrap=auto", f"--columns={columns}"]
    markdown = convert_text(
        html,
        format="html",
        to="gfm-raw_html",
        extra_args=pandoc_options,
    )
    return [markdown]

View File

@ -1,13 +1,13 @@
import logging
import subprocess
import urwid
import html2text
from concurrent.futures import ThreadPoolExecutor
from toot import api, config, __version__, settings
from toot.console import get_default_visibility
from toot.exceptions import ApiError
from toot.richtext import html_to_text
from toot.utils.datetime import parse_datetime
from .compose import StatusComposer
@ -656,12 +656,8 @@ class TUI(urwid.Frame):
return self.run_in_thread(_delete, done_callback=_done)
def copy_status(self, status):
h2t = html2text.HTML2Text()
h2t.body_width = 0 # nowrap
h2t.single_line_break = True
h2t.ignore_links = True
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"
markdown = "\n".join(html_to_text(status.original.data["content"], columns=1024, highlight_tags=False))
time = parse_datetime(status.original.data['created_at'])
time = time.strftime('%Y-%m-%d %H:%M %Z')
@ -671,7 +667,7 @@ class TUI(urwid.Frame):
+ "\n"
+ (status.original.author.account or "")
+ "\n\n"
+ h2t.handle(status.original.data["content"]).strip()
+ markdown
+ "\n\n"
+ f"Created at: {time}")

View File

@ -1,27 +1,24 @@
import urwid
import html2text
from toot.tui.utils import highlight_hashtags
from toot.utils import format_content
from typing import List
try:
# our first preference is to render using urwidgets
from .richtext import html_to_widgets, url_to_widget
except ImportError:
# Fallback if urwidgets are not available
def html_to_widgets(html: str) -> List[urwid.Widget]:
return [
urwid.Text(_format_markdown(html))
]
try:
# second preference, render markup with pypandoc
from .markdown import html_to_widgets, url_to_widget
def url_to_widget(url: str):
return urwid.Text(("link", url))
except ImportError:
# Fallback to render in plaintext
def _format_markdown(html) -> str:
h2t = html2text.HTML2Text()
h2t.single_line_break = True
h2t.ignore_links = True
h2t.wrap_links = False
h2t.wrap_list_items = False
h2t.wrap_tables = False
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"
return h2t.handle(html).strip()
def url_to_widget(url: str):
return urwid.Text(("link", url))
def html_to_widgets(html: str) -> List[urwid.Widget]:
return [
urwid.Text(highlight_hashtags(line)) for line in format_content(html)
]

View File

@ -0,0 +1,21 @@
import urwid
from pypandoc import convert_text
from typing import List
def url_to_widget(url: str):
    """Return a ``urwid.Text`` widget showing ``url`` with the ``"link"`` attribute."""
    markup = ("link", url)
    return urwid.Text(markup)
def html_to_widgets(html: str) -> List[urwid.Widget]:
    """Convert ``html`` to markdown with pandoc and wrap it in a text widget.

    The input is read as HTML and written as ``gfm-raw_html`` with
    ``--wrap=none`` passed to pandoc. The converted text is placed in a
    single ``urwid.Text`` widget, returned as a one-element list.
    """
    markdown = convert_text(
        html,
        format="html",
        to="gfm-raw_html",
        extra_args=["--wrap=none"],
    )
    return [urwid.Text(markdown)]