# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

from pdb import set_trace
import logging
import os
import pickle
import pytest
import sys
import tempfile

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    GuessedAtParserWarning,
    MarkupResemblesLocatorWarning,
    dammit,
)
from bs4.builder import (
    builder_registry,
    TreeBuilder,
    ParserRejectedMarkup,
)
from bs4.element import (
    Comment,
    SoupStrainer,
    Tag,
    NavigableString,
)

from . import (
    default_builder,
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))


class TestConstructor(SoupTest):

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        assert "foo\0bar" == soup.h1.string

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        assert "windows-1252" == soup.original_encoding

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
            def initialize_soup(self, soup):
                pass
            def feed(self, markup):
                self.fed = markup
            def reset(self):
                pass
            def ignore(self, ignore):
                pass
            set_up_substitutions = can_be_empty_element = ignore
            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        assert dict(var="value") == soup.builder.called_with
        assert "prepared markup" == soup.builder.fed

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        assert builder == soup.builder
        assert kwargs == builder.called_with

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False

        with pytest.raises(ParserRejectedMarkup) as exc_info:
            BeautifulSoup('', builder=Mock)
        assert "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help." in str(exc_info.value)

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class', have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        assert " an id " == a['id']
        assert ["a", "class"] == a['class']

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        assert " a class " == soup.a['class']

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            assert ["an", "id"] == a['id']
            assert " a class " == a['class']

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes = {
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )
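
    # Illustrative sketch (not part of the upstream suite): element_classes
    # driven directly through the constructor with the stdlib "html.parser".
    # MyTag is a hypothetical Tag subclass defined only for this example.
    def _example_element_classes(self):
        class MyTag(Tag):
            pass
        soup = BeautifulSoup(
            "<p>hello</p>", "html.parser", element_classes={Tag: MyTag}
        )
        # Every tag in the parsed tree is built as MyTag.
        assert isinstance(soup.p, MyTag)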

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers = {
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing is complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack
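
    # Illustrative sketch (not part of the upstream suite): string_containers
    # passed straight to the constructor with the stdlib "html.parser".
    # EmphasizedText is a hypothetical NavigableString subclass for this example.
    def _example_string_containers(self):
        class EmphasizedText(NavigableString):
            pass
        soup = BeautifulSoup(
            "<p>plain <em>loud</em></p>", "html.parser",
            string_containers={'em': EmphasizedText},
        )
        # Only strings directly inside <em> use the custom container.
        assert isinstance(soup.em.string, EmphasizedText)
        assert not isinstance(soup.p.contents[0], EmphasizedText)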


class TestWarnings(SoupTest):

    def _assert_warning(self, warnings, cls):
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        assert [] == w

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("<a>", no_such_argument=True)

    @pytest.mark.parametrize(
        "extension",
        ['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt',
         'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\\file"]
    )
    def test_resembles_filename_warning(self, extension):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("markup" + extension)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks more like a filename" in str(warning.message)

    @pytest.mark.parametrize(
        "extension",
        ['markuphtml', 'markup.com', '', 'markup.js']
    )
    def test_resembles_filename_no_warning(self, extension):
        # The 'looks more like a filename' warning is not issued if
        # the markup looks like a bare string, a domain name, or a
        # file that's not an HTML file.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("markup" + extension)
        assert [] == w
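
    # Illustrative sketch (not part of the upstream suite): the warning above
    # fires because a filename is not markup. To parse a file, open it and
    # pass the file object (or its contents) to BeautifulSoup instead.
    def _example_parse_from_file(self):
        with tempfile.TemporaryFile("w+") as f:
            f.write("<p>stored markup</p>")
            f.seek(0)
            soup = BeautifulSoup(f, "html.parser")
        assert soup.p.string == "stored markup"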

    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(url)
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message).encode("utf8")

    def test_url_warning_with_unicode_url(self):
        url = "http://www.crummyunicode.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            # Note: this URL must differ from the bytes one; otherwise
            # Python's warnings system swallows the second warning.
            soup = self.soup(url)
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks more like a URL" in str(w.message)
                       for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks more like a URL" in str(w.message)
                       for w in warning_list)


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"
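
    # Illustrative sketch (not part of the upstream suite): SoupStrainer can
    # also filter on attributes when combined with parse_only and an explicit
    # parser. The markup here is an assumption for the example only.
    def _example_strainer_with_attributes(self):
        markup = '<a class="keep">Yes</a><a class="drop">No</a>'
        strainer = SoupStrainer("a", class_="keep")
        soup = BeautifulSoup(markup, "html.parser", parse_only=strainer)
        # Only the matching tag was parsed into the tree at all.
        assert len(soup.find_all("a")) == 1
        assert soup.a.string == "Yes"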


class TestNewTag(SoupTest):
    """Test the BeautifulSoup.new_tag() method."""
    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
        assert isinstance(new_tag, Tag)
        assert "foo" == new_tag.name
        assert dict(bar="baz", name="a name") == new_tag.attrs
        assert new_tag.parent is None
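
    # Illustrative sketch (not part of the upstream suite): a new tag stays
    # detached until it is inserted into a tree, for example with append().
    def _example_new_tag_is_attached_with_append(self):
        soup = BeautifulSoup("<div></div>", "html.parser")
        link = soup.new_tag("a", href="http://example.com/")
        link.string = "a link"
        soup.div.append(link)
        assert link.parent is soup.div
        assert soup.div.a["href"] == "http://example.com/"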

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if LXML_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tags are empty-element, just because
            # they have no contents.
            assert b"<br/>" == xml_br.encode()
            assert b"<p/>" == xml_p.encode()

        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        assert b"<br/>" == html_br.encode()
        assert b"<p></p>" == html_p.encode()


class TestNewString(SoupTest):
    """Test the BeautifulSoup.new_string() method."""
    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        assert "foo" == s
        assert isinstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        assert "foo" == s
        assert isinstance(s, Comment)


class TestPickle(SoupTest):
    # Test our ability to pickle the BeautifulSoup object itself.

    def test_normal_pickle(self):
        soup = self.soup("<a>some markup</a>")
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.a.string

    def test_pickle_with_no_builder(self):
        # We had a bug that prevented pickling from working if
        # the builder wasn't set.
        soup = self.soup("some markup")
        soup.builder = None
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.string


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setup_method(self):
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert self.utf8_data == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            assert isinstance(unicode_output, str)
            assert unicode_output == self.document_for(ascii.decode())
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            logging.disable(logging.NOTSET)
            dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        assert soup_from_unicode.foo.string == 'Sacr\xe9 bleu!'
        assert soup_from_unicode.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        assert soup_from_utf8.decode() == self.unicode_data
        assert soup_from_utf8.foo.string == 'Sacr\xe9 bleu!'

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode('utf-8') == self.utf8_data

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")