1
0
mirror of https://github.com/ihabunek/toot synced 2024-12-23 23:52:40 +01:00
Toot-Mastodon-CLI-TUI-clien.../toot/utils.py
Denis Laxalde 0f6bd920c3 Replace `&apos;` by "'" before parsing HTML
BeautifulSoup does not parse HTML entities like `&apos;` as we expect
and the previous logic of replacing this *after* HTML parsing occurred
did not produce the expected results.

To illustrate this, we change data in "test_timeline" to include a
literal `&apos;` as it sometimes occurs in data returned by Mastodon API.
New HTML content is:

    <p>The computer can&apos;t tell you the emotional story [...] </p>

BeautifulSoup will parse this as:

    <p>The computer can&amp;apost tell you the emotional story [...] </p>

which is not what we expect.

We fix this by replacing `&apos;` *before* HTML parsing by Beautiful.
Since test data in "test_timeline" got updated we also add an extra
assertion checking that part of the content with a literal "'" is
(still) properly rendered.
2019-01-01 23:14:54 +01:00

71 lines
1.6 KiB
Python

# -*- coding: utf-8 -*-
import re
import socket
import unicodedata
from bs4 import BeautifulSoup
from toot.exceptions import ConsoleError
def get_text(html):
    """Return the plain-text content of an HTML fragment, with all tags removed."""
    # Swap the &apos; entity for a literal apostrophe before the HTML is parsed.
    prepared = html.replace('&apos;', "'")
    soup = BeautifulSoup(prepared, "html.parser")
    # Apply NFKC Unicode normalization to the extracted text.
    return unicodedata.normalize('NFKC', soup.get_text())
def parse_html(html):
    """Best-effort conversion of HTML to plain text that preserves line breaks.

    The result is a list of paragraphs; every paragraph is a list of lines.
    """
    result = []
    # Opening and closing <p> tags delimit the paragraphs.
    for chunk in re.split("</?p[^>]*>", html):
        # Drop the empty pieces the split leaves around the tags.
        if not chunk:
            continue
        # <br> tags separate lines; each line is then reduced to plain text.
        result.append([get_text(piece) for piece in re.split("<br */?>", chunk)])
    return result
def format_content(content):
    """Turn the HTML contents of a Status into lines of plain text.

    This is a generator: it yields the lines of content one at a time, with
    an empty line emitted between consecutive paragraphs.
    """
    for position, paragraph in enumerate(parse_html(content)):
        # Every paragraph except the first is preceded by a blank separator.
        if position > 0:
            yield ""
        yield from paragraph
def domain_exists(name):
    """Check whether a host name can be resolved.

    Returns True if socket.gethostbyname() resolves `name`, False otherwise.
    A name that is not a well-formed host name is reported as non-existent
    instead of raising.
    """
    try:
        socket.gethostbyname(name)
    except (OSError, UnicodeError):
        # OSError: the lookup itself failed.
        # UnicodeError: the name could not be IDNA-encoded (for example an
        # empty or over-long label, as in "foo..bar"). This is raised before
        # any lookup is attempted, and such a name cannot exist either.
        return False
    return True
def assert_domain_exists(domain):
    """Raise ConsoleError unless domain_exists() reports `domain` as found."""
    if domain_exists(domain):
        return

    message = "Domain {} not found".format(domain)
    raise ConsoleError(message)
def trunc(text, length):
    """Trims text to given length, if trimmed appends ellipsis.

    The returned string is never longer than `length` characters: when the
    text has to be cut, its last kept character is replaced by a single
    ellipsis character. Text that already fits is returned unchanged. A
    `length` of zero or less leaves no room for any character, so an empty
    string is returned.
    """
    if len(text) <= length:
        return text

    # Without this guard `text[:length - 1]` would use a negative stop index
    # and return almost the whole text instead of truncating it.
    if length < 1:
        return ""

    # "\u2026" is the one-character horizontal ellipsis.
    return text[:length - 1] + "\u2026"