"""Tests of the bs4.element.PageElement class""" import copy import pickle import pytest from soupsieve import SelectorSyntaxError from bs4 import BeautifulSoup from bs4.element import ( Comment, SoupStrainer, ) from . import SoupTest class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" def test_unicode_string_can_be_encoded(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8") def test_tag_containing_unicode_string_can_be_encoded(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert soup.b.encode("utf-8") == html.encode("utf-8") def test_encoding_substitutes_unrecognized_characters_by_default(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert soup.b.encode("ascii") == b"" def test_encoding_can_be_made_strict(self): html = "\N{SNOWMAN}" soup = self.soup(html) with pytest.raises(UnicodeEncodeError): soup.encode("ascii", errors="strict") def test_decode_contents(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert "\N{SNOWMAN}" == soup.b.decode_contents() def test_encode_contents(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents( encoding="utf8" ) def test_deprecated_renderContents(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents() def test_repr(self): html = "\N{SNOWMAN}" soup = self.soup(html) assert html == repr(soup) class TestFormatters(SoupTest): """Test the formatting feature, used by methods like decode() and prettify(), and the formatters themselves. """ def test_default_formatter_is_minimal(self): markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. assert decoded == self.document_for( "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ) def test_formatter_html(self): markup = "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="html") assert decoded == self.document_for( "
<<Sacré bleu!>>" ) def test_formatter_html5(self): markup = "
<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="html5") assert decoded == self.document_for( "
<<Sacré bleu!>>" ) def test_formatter_minimal(self): markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. assert decoded == self.document_for( "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" ) def test_formatter_null(self): markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter=None) # Neither the angle brackets nor the e-with-acute are converted. # This is not valid HTML, but it's what the user wanted. assert decoded == self.document_for( "<>" ) def test_formatter_custom(self): markup = "<foo>bar
" soup = self.soup(markup) decoded = soup.decode(formatter = lambda x: x.upper()) # Instead of normal entity conversion code, the custom # callable is called on every string. assert decoded == self.document_for("BAR
") def test_formatter_is_run_on_attribute_values(self): markup = 'e' soup = self.soup(markup) a = soup.a expect_minimal = 'e' assert expect_minimal == a.decode() assert expect_minimal == a.decode(formatter="minimal") expect_html = 'e' assert expect_html == a.decode(formatter="html") assert markup == a.decode(formatter=None) expect_upper = 'E' assert expect_upper == a.decode(formatter=lambda x: x.upper()) def test_formatter_skips_script_tag_for_html_documents(self): doc = """ """ encoded = BeautifulSoup(doc, 'html.parser').encode() assert b"< < hey > >" in encoded def test_formatter_skips_style_tag_for_html_documents(self): doc = """ """ encoded = BeautifulSoup(doc, 'html.parser').encode() assert b"< < hey > >" in encoded def test_prettify_leaves_preformatted_text_alone(self): soup = self.soup("
foo
  \tbar\n  \n  
baz
") # Everything outside the
 tag is reformatted, but everything
        # inside is left alone.
        assert '
\n foo\n
  \tbar\n  \n  
\n baz\n \n
' == soup.div.prettify() def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("foo", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) assert "FOO" in pretty def test_prettify_outputs_unicode_by_default(self): soup = self.soup("") assert str == type(soup.prettify()) def test_prettify_can_encode_data(self): soup = self.soup("") assert bytes == type(soup.prettify("utf-8")) def test_html_entity_substitution_off_by_default(self): markup = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" soup = self.soup(markup) encoded = soup.b.encode("utf-8") assert encoded == markup.encode('utf-8') def test_encoding_substitution(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') soup = self.soup(meta_tag) # Parse the document, and the charset apprears unchanged. assert soup.meta['content'] == 'text/html; charset=x-sjis' # Encode the document into some encoding, and the encoding is # substituted into the meta tag. utf_8 = soup.encode("utf-8") assert b"charset=utf-8" in utf_8 euc_jp = soup.encode("euc_jp") assert b"charset=euc_jp" in euc_jp shift_jis = soup.encode("shift-jis") assert b"charset=shift-jis" in shift_jis utf_16_u = soup.encode("utf-16").decode("utf-16") assert "charset=utf-16" in utf_16_u def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): markup = ('
foo
') # Beautiful Soup used to try to rewrite the meta tag even if the # meta tag got filtered out by the strainer. This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') soup = self.soup(markup, parse_only=strainer) assert soup.contents[0].name == 'pre' class TestCSSSelectors(SoupTest): """Test basic CSS selector functionality. This functionality is implemented in soupsieve, which has a much more comprehensive test suite, so this is basically an extra check that soupsieve works as expected. """ HTML = """ The title Hello there.

An H1

Some text

Some more text

An H2

Another

Bob

Another H2

me span1a1 span1a2 test span2a1

English

English UK

English US

French

""" def setup_method(self): self.soup = BeautifulSoup(self.HTML, 'html.parser') def assert_selects(self, selector, expected_ids, **kwargs): el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] el_ids.sort() expected_ids.sort() assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % ( selector, ', '.join(expected_ids), ', '.join(el_ids) ) assertSelect = assert_selects def assert_select_multiple(self, *tests): for selector, expected_ids in tests: self.assert_selects(selector, expected_ids) def test_one_tag_one(self): els = self.soup.select('title') assert len(els) == 1 assert els[0].name == 'title' assert els[0].contents == ['The title'] def test_one_tag_many(self): els = self.soup.select('div') assert len(els) == 4 for div in els: assert div.name == 'div' el = self.soup.select_one('div') assert 'main' == el['id'] def test_select_one_returns_none_if_no_match(self): match = self.soup.select_one('nonexistenttag') assert None == match def test_tag_in_tag_one(self): els = self.soup.select('div div') self.assert_selects('div div', ['inner', 'data1']) def test_tag_in_tag_many(self): for selector in ('html div', 'html body div', 'body div'): self.assert_selects(selector, ['data1', 'main', 'inner', 'footer']) def test_limit(self): self.assert_selects('html div', ['main'], limit=1) self.assert_selects('html body div', ['inner', 'main'], limit=2) self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'], limit=10) def test_tag_no_match(self): assert len(self.soup.select('del')) == 0 def test_invalid_tag(self): with pytest.raises(SelectorSyntaxError): self.soup.select('tag%t') def test_select_dashed_tag_ids(self): self.assert_selects('custom-dashed-tag', ['dash1', 'dash2']) def test_select_dashed_by_id(self): dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') assert dashed[0].name == 'custom-dashed-tag' assert dashed[0]['id'] == 'dash2' def test_dashed_tag_text(self): assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.' def test_select_dashed_matches_find_all(self): assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag') def test_header_tags(self): self.assert_select_multiple( ('h1', ['header1']), ('h2', ['header2', 'header3']), ) def test_class_one(self): for selector in ('.onep', 'p.onep', 'html p.onep'): els = self.soup.select(selector) assert len(els) == 1 assert els[0].name == 'p' assert els[0]['class'] == ['onep'] def test_class_mismatched_tag(self): els = self.soup.select('div.onep') assert len(els) == 0 def test_one_id(self): for selector in ('div#inner', '#inner', 'div div#inner'): self.assert_selects(selector, ['inner']) def test_bad_id(self): els = self.soup.select('#doesnotexist') assert len(els) == 0 def test_items_in_id(self): els = self.soup.select('div#inner p') assert len(els) == 3 for el in els: assert el.name == 'p' assert els[1]['class'] == ['onep'] assert not els[0].has_attr('class') def test_a_bunch_of_emptys(self): for selector in ('div#main del', 'div#main div.oops', 'div div#main'): assert len(self.soup.select(selector)) == 0 def test_multi_class_support(self): for selector in ('.class1', 'p.class1', '.class2', 'p.class2', '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): self.assert_selects(selector, ['pmulti']) def test_multi_class_selection(self): for selector in ('.class1.class3', '.class3.class2', '.class1.class2.class3'): self.assert_selects(selector, ['pmulti']) def test_child_selector(self): self.assert_selects('.s1 > a', ['s1a1', 's1a2']) self.assert_selects('.s1 > a span', ['s1a2s1']) def test_child_selector_id(self): self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1']) def test_attribute_equals(self): self.assert_select_multiple( ('p[class="onep"]', ['p1']), ('p[id="p1"]', ['p1']), ('[class="onep"]', ['p1']), ('[id="p1"]', ['p1']), ('link[rel="stylesheet"]', ['l1']), ('link[type="text/css"]', ['l1']), ('link[href="blah.css"]', ['l1']), ('link[href="no-blah.css"]', []), ('[rel="stylesheet"]', ['l1']), ('[type="text/css"]', ['l1']), ('[href="blah.css"]', ['l1']), ('[href="no-blah.css"]', []), ('p[href="no-blah.css"]', []), ('[href="no-blah.css"]', []), ) def test_attribute_tilde(self): self.assert_select_multiple( ('p[class~="class1"]', ['pmulti']), ('p[class~="class2"]', ['pmulti']), ('p[class~="class3"]', ['pmulti']), ('[class~="class1"]', ['pmulti']), ('[class~="class2"]', ['pmulti']), ('[class~="class3"]', ['pmulti']), ('a[rel~="friend"]', ['bob']), ('a[rel~="met"]', ['bob']), ('[rel~="friend"]', ['bob']), ('[rel~="met"]', ['bob']), ) def test_attribute_startswith(self): self.assert_select_multiple( ('[rel^="style"]', ['l1']), ('link[rel^="style"]', ['l1']), ('notlink[rel^="notstyle"]', []), ('[rel^="notstyle"]', []), ('link[rel^="notstyle"]', []), ('link[href^="bla"]', ['l1']), ('a[href^="http://"]', ['bob', 'me']), ('[href^="http://"]', ['bob', 'me']), ('[id^="p"]', ['pmulti', 'p1']), ('[id^="m"]', ['me', 'main']), ('div[id^="m"]', ['main']), ('a[id^="m"]', ['me']), ('div[data-tag^="dashed"]', ['data1']) ) def test_attribute_endswith(self): self.assert_select_multiple( ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), ('div[id$="1"]', ['data1']), ('[id$="noending"]', []), ) def test_attribute_contains(self): self.assert_select_multiple( # From test_attribute_startswith ('[rel*="style"]', ['l1']), ('link[rel*="style"]', ['l1']), ('notlink[rel*="notstyle"]', []), ('[rel*="notstyle"]', []), ('link[rel*="notstyle"]', []), ('link[href*="bla"]', ['l1']), ('[href*="http://"]', ['bob', 'me']), ('[id*="p"]', ['pmulti', 'p1']), ('div[id*="m"]', ['main']), ('a[id*="m"]', ['me']), # From test_attribute_endswith ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), ('div[id*="1"]', ['data1']), ('[id*="noending"]', []), # New for this test ('[href*="."]', ['bob', 'me', 'l1']), ('a[href*="."]', ['bob', 'me']), ('link[href*="."]', ['l1']), ('div[id*="n"]', ['main', 'inner']), ('div[id*="nn"]', ['inner']), ('div[data-tag*="edval"]', ['data1']) ) def test_attribute_exact_or_hypen(self): self.assert_select_multiple( ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), ('p[lang|="fr"]', ['lang-fr']), ('p[lang|="gb"]', []), ) def test_attribute_exists(self): self.assert_select_multiple( ('[rel]', ['l1', 'bob', 'me']), ('link[rel]', ['l1']), ('a[rel]', ['bob', 'me']), ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), ('p[class]', ['p1', 'pmulti']), ('[blah]', []), ('p[blah]', []), ('div[data-tag]', ['data1']) ) def test_quoted_space_in_selector_name(self): html = """
nope
yes
""" soup = BeautifulSoup(html, 'html.parser') [chosen] = soup.select('div[style="display: right"]') assert "yes" == chosen.string def test_unsupported_pseudoclass(self): with pytest.raises(NotImplementedError): self.soup.select("a:no-such-pseudoclass") with pytest.raises(SelectorSyntaxError): self.soup.select("a:nth-of-type(a)") def test_nth_of_type(self): # Try to select first paragraph els = self.soup.select('div#inner p:nth-of-type(1)') assert len(els) == 1 assert els[0].string == 'Some text' # Try to select third paragraph els = self.soup.select('div#inner p:nth-of-type(3)') assert len(els) == 1 assert els[0].string == 'Another' # Try to select (non-existent!) fourth paragraph els = self.soup.select('div#inner p:nth-of-type(4)') assert len(els) == 0 # Zero will select no tags. els = self.soup.select('div p:nth-of-type(0)') assert len(els) == 0 def test_nth_of_type_direct_descendant(self): els = self.soup.select('div#inner > p:nth-of-type(1)') assert len(els) == 1 assert els[0].string == 'Some text' def test_id_child_selector_nth_of_type(self): self.assert_selects('#inner > p:nth-of-type(2)', ['p1']) def test_select_on_element(self): # Other tests operate on the tree; this operates on an element # within the tree. inner = self.soup.find("div", id="main") selected = inner.select("div") # The
tag was selected. The