mirror of https://github.com/searx/searx
Merge pull request #2190 from dalf/fix-htmltextextractor
[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception
This commit is contained in:
commit
530fc4bda7
|
@ -77,6 +77,10 @@ def highlight_content(content, query):
|
|||
return content
|
||||
|
||||
|
||||
class HTMLTextExtractorException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
|
|||
return
|
||||
|
||||
if tag != self.tags[-1]:
|
||||
raise Exception("invalid html")
|
||||
raise HTMLTextExtractorException()
|
||||
|
||||
self.tags.pop()
|
||||
|
||||
|
@ -128,7 +132,10 @@ def html_to_text(html):
|
|||
html = html.replace('\n', ' ')
|
||||
html = ' '.join(html.split())
|
||||
s = HTMLTextExtractor()
|
||||
s.feed(html)
|
||||
try:
|
||||
s.feed(html)
|
||||
except HTMLTextExtractorException:
|
||||
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
|
||||
return s.get_text()
|
||||
|
||||
|
||||
|
|
|
@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
|
|||
self.assertIsNotNone(utils.html_to_text(html))
|
||||
self.assertEqual(utils.html_to_text(html), "Test text")
|
||||
|
||||
def test_html_to_text_invalid(self):
|
||||
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
|
||||
|
||||
def test_prettify_url(self):
|
||||
data = (('https://searx.me/', 'https://searx.me/'),
|
||||
('https://searx.me/ű', 'https://searx.me/ű'),
|
||||
|
@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
|
|||
self.html_text_extractor.handle_entityref(entity)
|
||||
self.assertIn(entity, self.html_text_extractor.result)
|
||||
|
||||
def test_invalid_html(self):
|
||||
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||
with self.assertRaises(utils.HTMLTextExtractorException):
|
||||
self.html_text_extractor.feed(text)
|
||||
|
||||
|
||||
class TestUnicodeWriter(SearxTestCase):
|
||||
|
||||
|
|
Loading…
Reference in New Issue