2021-07-21 09:13:32 +02:00
|
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
import markdownify
|
|
|
|
|
|
|
|
|
2022-02-22 11:28:16 +01:00
|
|
|
def get_bottom_paragraphs(soup: BeautifulSoup) -> list[Tag]:
    """
    Collect the innermost ``<p>`` tags of a parsed document.

    A paragraph is kept only when it has no ``<p>`` descendant, so wrapper paragraphs that
    merely contain other paragraphs are dropped and their text is not reported twice.

    :param soup: the parsed document to search
    :return: the ``<p>`` tags without nested ``<p>`` tags, in the order ``find_all`` yields them
    """
    # ``find_all`` is the current spelling of the legacy ``findAll`` alias.
    return [d for d in soup.find_all("p") if not d.find("p")]
|
|
|
|
|
|
|
|
|
2022-02-22 11:28:16 +01:00
|
|
|
def html_to_plaintext(content) -> str:
    """
    Transform an HTML document into a plaintext string that can be more easily processed by the publishers.

    When the document contains ``<p>`` tags, the paragraphs returned by
    ``get_bottom_paragraphs`` are rendered one per line: the stripped text fragments of each
    paragraph are joined with single spaces and the paragraphs are separated by newlines.
    When no paragraph is found, the whole text of the parsed document is returned instead.

    :param content: the HTML markup to convert; it is handed to ``BeautifulSoup`` for parsing
    :return: the plaintext rendition of ``content``
    """
    # TODO: support links and quotes
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to
    # be installed (and warns about guessing), so the same markup could yield different
    # trees on different machines.
    soup = BeautifulSoup(content, features="html.parser")
    p_list = get_bottom_paragraphs(soup)
    if not p_list:
        # No paragraph structure to rely on: fall back to all the text in the document.
        return soup.text
    return "\n".join(" ".join(tag.stripped_strings) for tag in p_list)
|
2021-07-21 09:13:32 +02:00
|
|
|
|
|
|
|
|
2022-02-22 11:28:16 +01:00
|
|
|
def html_to_markdown(content) -> str:
    """
    Convert HTML into Markdown text.

    The markup is converted with ``markdownify``, every ``>`` character in the result is
    prefixed with a backslash, and leading/trailing whitespace is removed.

    :param content: the HTML markup to convert
    :return: the escaped and stripped Markdown text
    """
    # Escape first, strip last: same order of operations as a two-step version.
    return markdownify.markdownify(content).replace(">", "\\>").strip()
|