29 lines
789 B
Python
29 lines
789 B
Python
from bs4 import BeautifulSoup, Tag
|
|
import markdownify
|
|
|
|
|
|
def get_bottom_paragraphs(soup: BeautifulSoup) -> list[Tag]:
    """
    Collect the innermost ``<p>`` tags of a parsed document.

    A paragraph is kept only when it has no ``<p>`` descendant, so wrappers
    around nested paragraphs are dropped in favour of their inner paragraphs.

    :param soup: parsed document to search
    :return: the ``<p>`` tags that do not contain another ``<p>``
    """
    # ``find_all`` is the supported spelling; ``findAll`` is a deprecated alias.
    return [p for p in soup.find_all("p") if not p.find("p")]
|
|
|
|
|
|
def html_to_plaintext(content) -> str:
    """
    Convert HTML into a plaintext string that publishers can process more easily.

    When the markup contains paragraphs, each innermost ``<p>`` becomes one
    output line: its stripped text fragments are joined with single spaces
    and the lines are joined with newlines. Without any ``<p>`` tag the raw
    text of the whole document is returned.

    :param content: HTML markup to convert
    :return: plaintext rendering of ``content``
    """
    # TODO: support links and quotes
    parsed = BeautifulSoup(content, features="html.parser")
    paragraphs = get_bottom_paragraphs(parsed)

    # No paragraphs to split on: fall back to the document's full text.
    if not paragraphs:
        return parsed.text

    lines = [" ".join(paragraph.stripped_strings) for paragraph in paragraphs]
    return "\n".join(lines)
|
|
|
|
|
|
def html_to_markdown(content) -> str:
    """
    Convert HTML into Markdown using ``markdownify``.

    Every ``>`` in the converted text is prefixed with a backslash, and
    leading and trailing whitespace is removed from the result.

    :param content: HTML markup to convert
    :return: escaped, stripped Markdown text
    """
    converted = markdownify.markdownify(content)
    return converted.replace(">", "\\>").strip()
|