Extract parsing html

This commit is contained in:
Ivan Habunek 2023-11-04 07:40:56 +01:00
parent d91c73520e
commit 199a96625b
No known key found for this signature in database
GPG Key ID: F5F0623FF5EBCB3D
2 changed files with 9 additions and 8 deletions

View File

@ -2,11 +2,10 @@ import re
import urwid
import unicodedata
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from toot.tui.constants import PALETTE
from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
from toot.utils import urlencode_url
from toot.utils import parse_html, urlencode_url
from typing import List, Tuple
from urwid.util import decompose_tagmarkup
@ -23,7 +22,7 @@ class ContentParser:
"""Convert html to urwid widgets"""
widgets: List[urwid.Widget] = []
html = unicodedata.normalize("NFKC", html)
soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
soup = parse_html(html)
first_tag = True
for e in soup.body or soup:
if isinstance(e, NavigableString):

View File

@ -23,17 +23,19 @@ def str_bool_nullable(b):
return None if b is None else str_bool(b)
def get_text(html):
"""Converts html to text, strips all tags."""
def parse_html(html: str) -> BeautifulSoup:
# Ignore warnings made by BeautifulSoup, if passed something that looks like
# a file (e.g. a dot, which matches the current directory), it will warn that the file
# should be opened instead of passing a filename.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
return unicodedata.normalize('NFKC', text)
def get_text(html):
"""Converts html to text, strips all tags."""
text = parse_html(html).get_text()
return unicodedata.normalize("NFKC", text)
def html_to_paragraphs(html):