1
0
mirror of https://github.com/ihabunek/toot synced 2024-12-26 00:42:43 +01:00
Toot-Mastodon-CLI-TUI-clien.../toot/utils.py
Ivan Habunek 1b86cdd404
Drop support for Python 2
It's hard to maintain both versions, having unicode issues in py2, etc.
2017-12-29 11:31:21 +01:00

44 lines
1.0 KiB
Python

# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
def get_text(html):
    """Convert an HTML fragment to plain text, stripping all tags.

    The markup is parsed with BeautifulSoup's "html.parser" backend and the
    text of the resulting tree is extracted. Any literal ``&apos;`` sequence
    still present in the extracted text is then replaced with a plain
    apostrophe.

    :param html: HTML markup as a string.
    :returns: The text content of ``html`` with tags removed.
    """
    text = BeautifulSoup(html, "html.parser").get_text()
    # The entity name must be spelled out here: a bare ''' would open a
    # triple-quoted string instead of naming the text to replace.
    return text.replace("&apos;", "'")
def parse_html(html):
    """Turn HTML into plain text while preserving its line structure.

    The markup is cut into paragraphs at every opening or closing tag
    matched by the paragraph pattern, and each paragraph is cut into lines
    at ``<br>`` tags. Every line is then reduced to plain text.

    Returns a list of paragraphs, each paragraph being a list of lines.
    """
    result = []
    for chunk in re.split("</?p[^>]*>", html):
        # Splitting on tags leaves empty strings behind; skip those.
        if not chunk:
            continue
        # Each <br> marks a line break inside the paragraph.
        pieces = re.split("<br */?>", chunk)
        # Strip any remaining markup from each individual line.
        result.append([get_text(piece) for piece in pieces])
    return result
def format_content(content):
    """Render the HTML contents of a Status as lines of plain text.

    The content is parsed into paragraphs of lines; the lines are yielded in
    order, with a single empty string emitted between consecutive paragraphs.

    Returns a generator yielding lines of content.
    """
    for index, paragraph in enumerate(parse_html(content)):
        # Every paragraph except the first is preceded by a blank line.
        if index:
            yield ""
        yield from paragraph