mirror of
https://github.com/ihabunek/toot
synced 2024-12-23 23:52:40 +01:00
0f6bd920c3
BeautifulSoup does not parse HTML entities like `&apos;` as we expect, and the previous logic of replacing this *after* HTML parsing occurred did not produce the expected results. To illustrate this, we change data in "test_timeline" to include a literal `&apos;` as it sometimes occurs in data returned by the Mastodon API. New HTML content is: <p>The computer can&apos;t tell you the emotional story [...] </p> BeautifulSoup will parse this as: <p>The computer can&apost tell you the emotional story [...] </p> which is not what we expect. We fix this by replacing `&apos;` *before* HTML parsing by BeautifulSoup. Since test data in "test_timeline" got updated we also add an extra assertion checking that part of the content with a literal "'" is (still) properly rendered.
71 lines
1.6 KiB
Python
71 lines
1.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
import socket
|
|
import unicodedata
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from toot.exceptions import ConsoleError
|
|
|
|
|
|
def get_text(html):
    """Convert an HTML fragment to plain text, stripping all tags.

    Every ``&apos;`` entity is replaced with a literal apostrophe before the
    markup is handed to BeautifulSoup, and the extracted text is returned
    NFKC-normalized.
    """
    # The replacement must happen *before* parsing: the "html.parser" backend
    # does not decode `&apos;` the way we expect, so fixing it up afterwards
    # would be too late.
    text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()

    return unicodedata.normalize('NFKC', text)
|
|
|
|
|
|
def parse_html(html):
    """Turn HTML into plain text while preserving its line structure.

    The markup is cut into paragraphs at every opening or closing tag whose
    name starts with "p", and each paragraph is cut into lines at <br> tags.

    Returns a list of paragraphs, each being a list of plain-text lines.
    """
    result = []

    for chunk in re.split("</?p[^>]*>", html):
        # Splitting leaves empty strings between adjacent tags; drop them.
        if not chunk:
            continue

        # Each <br> starts a new line; strip remaining markup from every line.
        pieces = re.split("<br */?>", chunk)
        result.append([get_text(piece) for piece in pieces])

    return result
|
|
|
|
|
|
def format_content(content):
    """Render the HTML contents of a Status as lines of plain text.

    Returns a generator yielding the lines of each paragraph in order, with
    a single empty line emitted between consecutive paragraphs.
    """
    for position, paragraph in enumerate(parse_html(content)):
        # Separate paragraphs with a blank line (none before the first one).
        if position > 0:
            yield ""

        yield from paragraph
|
|
|
|
|
|
def domain_exists(name):
    """Return True if `name` can be resolved to an address, False otherwise.

    Resolution is attempted with `socket.gethostbyname`. Lookup failures
    (`OSError`) and names that cannot be encoded as a host name at all
    (`UnicodeError`) are both reported as False.
    """
    try:
        socket.gethostbyname(name)
    except (OSError, UnicodeError):
        # UnicodeError is raised before any lookup happens when the name is
        # not valid IDNA, e.g. a label longer than 63 characters or an empty
        # label ("foo..bar"). Such a domain cannot exist.
        return False

    return True
|
|
|
|
|
|
def assert_domain_exists(domain):
    """Raise ConsoleError unless `domain` passes the `domain_exists` check."""
    if domain_exists(domain):
        return

    message = "Domain {} not found".format(domain)
    raise ConsoleError(message)
|
|
|
|
|
|
def trunc(text, length):
    """Trim text to at most `length` characters, appending an ellipsis if trimmed.

    Text that already fits is returned unchanged. Otherwise the result is the
    first ``length - 1`` characters followed by a single '…', so it is exactly
    `length` characters long. When `length` is zero or negative there is no
    room even for the ellipsis, so an empty string is returned.
    """
    if len(text) <= length:
        return text

    if length <= 0:
        # Without this guard the slice below would get a negative end index
        # (text[:-1], text[:-2], ...) and return almost the entire text.
        return ''

    return text[:length - 1] + '…'
|