tag.
"""
def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None,
is_xml=None, sourceline=None, sourcepos=None,
can_be_empty_element=None, cdata_list_attributes=None,
preserve_whitespace_tags=None,
interesting_string_types=None,
namespaces=None
):
"""Basic constructor.
:param parser: A BeautifulSoup object.
:param builder: A TreeBuilder.
:param name: The name of the tag.
:param namespace: The URI of this Tag's XML namespace, if any.
:param prefix: The prefix for this Tag's XML namespace, if any.
:param attrs: A dictionary of this Tag's attribute values.
:param parent: The PageElement to use as this Tag's parent.
:param previous: The PageElement that was parsed immediately before
this tag.
:param is_xml: If True, this is an XML tag. Otherwise, this is an
HTML tag.
:param sourceline: The line number where this tag was found in its
source document.
:param sourcepos: The character position within `sourceline` where this
tag was found.
:param can_be_empty_element: If True, this tag should be
represented as . If False, this tag should be represented
as .
:param cdata_list_attributes: A list of attributes whose values should
be treated as CDATA if they ever show up on this tag.
:param preserve_whitespace_tags: A list of tag names whose contents
should have their whitespace preserved.
:param interesting_string_types: This is a NavigableString
subclass or a tuple of them. When iterating over this
Tag's strings in methods like Tag.strings or Tag.get_text,
these are the types of strings that are interesting enough
to be considered. The default is to consider
NavigableString and CData the only interesting string
subtypes.
:param namespaces: A dictionary mapping currently active
namespace prefixes to URIs. This can be used later to
construct CSS selectors.
"""
if parser is None:
self.parser_class = None
else:
# We don't actually store the parser object: that lets extracted
# chunks be garbage-collected.
self.parser_class = parser.__class__
if name is None:
raise ValueError("No value provided for new tag's name.")
self.name = name
self.namespace = namespace
self._namespaces = namespaces or {}
self.prefix = prefix
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline
self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
if builder is not None and builder.cdata_list_attributes:
attrs = builder._replace_cdata_list_attribute_values(
self.name, attrs)
else:
attrs = dict(attrs)
else:
attrs = dict(attrs)
# If possible, determine ahead of time whether this tag is an
# XML tag.
if builder:
self.known_xml = builder.is_xml
else:
self.known_xml = is_xml
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
self.hidden = False
if builder is None:
# In the absence of a TreeBuilder, use whatever values were
# passed in here. They're probably None, unless this is a copy of some
# other tag.
self.can_be_empty_element = can_be_empty_element
self.cdata_list_attributes = cdata_list_attributes
self.preserve_whitespace_tags = preserve_whitespace_tags
self.interesting_string_types = interesting_string_types
else:
# Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
# Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name)
# Keep track of the list of attributes of this tag that
# might need to be treated as a list.
#
# For performance reasons, we store the whole data structure
# rather than asking the question of every tag. Asking would
# require building a new data structure every time, and
# (unlike can_be_empty_element), we almost never need
# to check this.
self.cdata_list_attributes = builder.cdata_list_attributes
# Keep track of the names that might cause this tag to be treated as a
# whitespace-preserved tag.
self.preserve_whitespace_tags = builder.preserve_whitespace_tags
if self.name in builder.string_containers:
# This sort of tag uses a special string container
# subclass for most of its strings. When we ask the
self.interesting_string_types = builder.string_containers[self.name]
else:
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3
def __copy__(self):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
clone = type(self)(
None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
clone.append(child.__copy__())
return clone
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
A tag that has contents is never an empty-element tag.
A tag that has no contents may or may not be an empty-element
tag. It depends on the builder used to create the tag. If the
builder has a designated list of empty-element tags, then only
a tag whose name shows up in that list is considered an
empty-element tag.
If the builder has no designated list of empty-element tags,
then any tag with no contents is an empty-element tag.
"""
return len(self.contents) == 0 and self.can_be_empty_element
isSelfClosing = is_empty_element # BS3
@property
def string(self):
"""Convenience property to get the single string within this
PageElement.
TODO It might make sense to have NavigableString.string return
itself.
:return: If this element has a single string child, return
value is that string. If this element has one child tag,
return value is the 'string' attribute of the child tag,
recursively. If this element is itself a string, has no
children, or has more than one child, return value is None.
"""
if len(self.contents) != 1:
return None
child = self.contents[0]
if isinstance(child, NavigableString):
return child
return child.string
@string.setter
def string(self, string):
"""Replace this PageElement's contents with `string`."""
self.clear()
self.append(string.__class__(string))
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
def _all_strings(self, strip=False, types=PageElement.default):
"""Yield all strings of certain classes, possibly stripping them.
:param strip: If True, all strings will be stripped before being
yielded.
:param types: A tuple of NavigableString subclasses. Any strings of
a subclass not found in this list will be ignored. By
default, the subclasses considered are the ones found in
self.interesting_string_types. If that's not specified,
only NavigableString and CData objects will be
considered. That means no comments, processing
instructions, etc.
:yield: A sequence of strings.
"""
if types is self.default:
types = self.interesting_string_types
for descendant in self.descendants:
if (types is None and not isinstance(descendant, NavigableString)):
continue
descendant_type = type(descendant)
if isinstance(types, type):
if descendant_type is not types:
# We're not interested in strings of this type.
continue
elif types is not None and descendant_type not in types:
# We're not interested in strings of this type.
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
continue
yield descendant
strings = property(_all_strings)
def decompose(self):
"""Recursively destroys this PageElement and its children.
This element will be removed from the tree and wiped out; so
will everything beneath it.
The behavior of a decomposed PageElement is undefined and you
should never use one for anything, but if you need to _check_
whether an element has been decomposed, you can use the
`decomposed` property.
"""
self.extract()
i = self
while i is not None:
n = i.next_element
i.__dict__.clear()
i.contents = []
i._decomposed = True
i = n
def clear(self, decompose=False):
"""Wipe out all children of this PageElement by calling extract()
on them.
:param decompose: If this is True, decompose() (a more
destructive method) will be called instead of extract().
"""
if decompose:
for element in self.contents[:]:
if isinstance(element, Tag):
element.decompose()
else:
element.extract()
else:
for element in self.contents[:]:
element.extract()
def smooth(self):
"""Smooth out this element's children by consolidating consecutive
strings.
This makes pretty-printed output look more natural following a
lot of operations that modified the tree.
"""
# Mark the first position of every pair of children that need
# to be consolidated. Do this rather than making a copy of
# self.contents, since in most cases very few strings will be
# affected.
marked = []
for i, a in enumerate(self.contents):
if isinstance(a, Tag):
# Recursively smooth children.
a.smooth()
if i == len(self.contents)-1:
# This is the last item in .contents, and it's not a
# tag. There's no chance it needs any work.
continue
b = self.contents[i+1]
if (isinstance(a, NavigableString)
and isinstance(b, NavigableString)
and not isinstance(a, PreformattedString)
and not isinstance(b, PreformattedString)
):
marked.append(i)
# Go over the marked positions in reverse order, so that
# removing items from .contents won't affect the remaining
# positions.
for i in reversed(marked):
a = self.contents[i]
b = self.contents[i+1]
b.extract()
n = NavigableString(a+b)
a.replace_with(n)
def index(self, element):
"""Find the index of a child by identity, not value.
Avoids issues with tag.contents.index(element) getting the
index of equal elements.
:param element: Look for this PageElement in `self.contents`.
"""
for i, child in enumerate(self.contents):
if child is element:
return i
raise ValueError("Tag.index: element not in tag")
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
attribute."""
return self.attrs.get(key, default)
def get_attribute_list(self, key, default=None):
"""The same as get(), but always returns a list.
:param key: The attribute to look for.
:param default: Use this value if the attribute is not present
on this PageElement.
:return: A list of values, probably containing only a single
value.
"""
value = self.get(key, default)
if not isinstance(value, list):
value = [value]
return value
def has_attr(self, key):
"""Does this PageElement have an attribute with the given name?"""
return key in self.attrs
def __hash__(self):
return str(self).__hash__()
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the Tag,
and throws an exception if it's not there."""
return self.attrs[key]
def __iter__(self):
"Iterating over a Tag iterates over its contents."
return iter(self.contents)
def __len__(self):
"The length of a Tag is the length of its list of contents."
return len(self.contents)
def __contains__(self, x):
return x in self.contents
def __bool__(self):
"A tag is non-None even if it has no contents."
return True
def __setitem__(self, key, value):
"""Setting tag[key] sets the value of the 'key' attribute for the
tag."""
self.attrs[key] = value
def __delitem__(self, key):
"Deleting tag[key] deletes all 'key' attributes for the tag."
self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
"""Calling a Tag like a function is the same as calling its
find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
return self.find_all(*args, **kwargs)
def __getattr__(self, tag):
"""Calling tag.subtag is the same as calling tag.find(name="subtag")"""
#print("Getattr %s.%s" % (self.__class__, tag))
if len(tag) > 3 and tag.endswith('Tag'):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
DeprecationWarning
)
return self.find(tag_name)
# We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag == "contents":
return self.find(tag)
raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this Tag has the same name, the same attributes,
and the same contents (recursively) as `other`."""
if self is other:
return True
if (not hasattr(other, 'name') or
not hasattr(other, 'attrs') or
not hasattr(other, 'contents') or
self.name != other.name or
self.attrs != other.attrs or
len(self) != len(other)):
return False
for i, my_child in enumerate(self.contents):
if my_child != other.contents[i]:
return False
return True
def __ne__(self, other):
"""Returns true iff this Tag is not identical to `other`,
as defined in __eq__."""
return not self == other
def __repr__(self, encoding="unicode-escape"):
"""Renders this PageElement as a string.
:param encoding: The encoding to use (Python 2 only).
TODO: This is now ignored and a warning should be issued
if a value is provided.
:return: A (Unicode) string.
"""
# "The return value must be a string object", i.e. Unicode
return self.decode()
def __unicode__(self):
"""Renders this PageElement as a Unicode string."""
return self.decode()
__str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
indent_level=None, formatter="minimal",
errors="xmlcharrefreplace"):
"""Render a bytestring representation of this PageElement and its
contents.
:param encoding: The destination encoding.
:param indent_level: Each line of the rendering will be
indented this many levels. (The formatter decides what a
'level' means in terms of spaces or other characters
output.) Used internally in recursive calls while
pretty-printing.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
:param errors: An error handling strategy such as
'xmlcharrefreplace'. This value is passed along into
encode() and its value should be one of the constants
defined by Python.
:return: A bytestring.
"""
# Turn the data structure into Unicode, then encode the
# Unicode.
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Render a Unicode representation of this PageElement and its
contents.
:param indent_level: Each line of the rendering will be
indented this many spaces. Used internally in
recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a tag that mentions the document's
encoding.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
"""
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
text = formatter.attribute_value(val)
decoded = (
str(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
prefix = ''
if self.prefix:
prefix = self.prefix + ":"
if self.is_empty_element:
close = formatter.void_element_close_prefix or ''
else:
closeTag = '%s%s>' % (prefix, self.name)
pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (formatter.indent * (indent_level - 1))
if pretty_print:
space = indent_space
indent_contents = indent_level + 1
else:
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter
)
if self.hidden:
# This is the 'document root' object.
s = contents
else:
s = []
attribute_string = ''
if attrs:
attribute_string = ' ' + ' '.join(attrs)
if indent_level is not None:
# Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
if pretty_print and contents and contents[-1] != "\n":
s.append("\n")
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?
Most of them should, but some (such as in HTML
documents) should not.
"""
return (
indent_level is not None
and (
not self.preserve_whitespace_tags
or self.name not in self.preserve_whitespace_tags
)
)
def prettify(self, encoding=None, formatter="minimal"):
"""Pretty-print this PageElement as a string.
:param encoding: The eventual encoding of the string. If this is None,
a Unicode string will be returned.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
:return: A Unicode string (if encoding==None) or a bytestring
(otherwise).
"""
if encoding is None:
return self.decode(True, formatter=formatter)
else:
return self.encode(encoding, True, formatter=formatter)
def decode_contents(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
indented this many levels. (The formatter decides what a
'level' means in terms of spaces or other characters
output.) Used internally in recursive calls while
pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a tag that mentions the document's
encoding.
:param formatter: A Formatter object, or a string naming one of
the standard Formatters.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
for c in self:
text = None
if isinstance(c, NavigableString):
text = c.output_ready(formatter)
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
if pretty_print and not preserve_whitespace:
s.append(formatter.indent * (indent_level - 1))
s.append(text)
if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Renders the contents of this PageElement as a bytestring.
:param indent_level: Each line of the rendering will be
indented this many levels. (The formatter decides what a
'level' means in terms of spaces or other characters
output.) Used internally in recursive calls while
pretty-printing.
:param eventual_encoding: The bytestring will be in this encoding.
:param formatter: A Formatter object, or a string naming one of
the standard Formatters.
:return: A bytestring.
"""
contents = self.decode_contents(indent_level, encoding, formatter)
return contents.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
"""Deprecated method for BS3 compatibility."""
if not prettyPrint:
indentLevel = None
return self.encode_contents(
indent_level=indentLevel, encoding=encoding)
#Soup methods
def find(self, name=None, attrs={}, recursive=True, string=None,
**kwargs):
"""Look in the children of this PageElement and find the first
PageElement that matches the given criteria.
All find_* methods take a common set of arguments. See the online
documentation for detailed explanations.
:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param recursive: If this is True, find() will perform a
recursive search of this PageElement's children. Otherwise,
only the direct children will be considered.
:param limit: Stop looking after finding this many results.
:kwargs: A dictionary of filters on attribute values.
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
if l:
r = l[0]
return r
findChild = find #BS2
def find_all(self, name=None, attrs={}, recursive=True, string=None,
limit=None, **kwargs):
"""Look in the children of this PageElement and find all
PageElements that match the given criteria.
All find_* methods take a common set of arguments. See the online
documentation for detailed explanations.
:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param recursive: If this is True, find_all() will perform a
recursive search of this PageElement's children. Otherwise,
only the direct children will be considered.
:param limit: Stop looking after finding this many results.
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
#Generator methods
@property
def children(self):
"""Iterate over all direct children of this PageElement.
:yield: A sequence of PageElements.
"""
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def descendants(self):
"""Iterate over all children of this PageElement in a
breadth-first sequence.
:yield: A sequence of PageElements.
"""
if not len(self.contents):
return
stopNode = self._last_descendant().next_element
current = self.contents[0]
while current is not stopNode:
yield current
current = current.next_element
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
"""Perform a CSS selection operation on the current element.
:param selector: A CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method.
:return: A Tag.
:rtype: bs4.element.Tag
"""
value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
return None
def select(self, selector, namespaces=None, limit=None, **kwargs):
"""Perform a CSS selection operation on the current element.
This uses the SoupSieve library.
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
used in the CSS selector to namespace URIs. By default,
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param limit: After finding this number of results, stop looking.
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method.
:return: A ResultSet of Tags.
:rtype: bs4.element.ResultSet
"""
if namespaces is None:
namespaces = self._namespaces
if limit is None:
limit = 0
if soupsieve is None:
raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
# We do this because it's more consistent and because
# ResultSet.__getattr__ has a helpful error message.
return ResultSet(None, results)
# Old names for backwards compatibility
def childGenerator(self):
"""Deprecated generator."""
return self.children
def recursiveChildGenerator(self):
"""Deprecated generator."""
return self.descendants
def has_key(self, key):
"""Deprecated method. This was kind of misleading because has_key()
(attributes) was different from __in__ (contents).
has_key() is gone in Python 3, anyway.
"""
warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning
)
return self.has_attr(key)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
"""Encapsulates a number of ways of matching a markup element (tag or
string).
This is primarily used to underpin the find_* methods, but you can
create one yourself and pass it in as `parse_only` to the
`BeautifulSoup` constructor, to parse a subset of a large
document.
"""
def __init__(self, name=None, attrs={}, string=None, **kwargs):
"""Constructor.
The SoupStrainer constructor takes the same arguments passed
into the find_* methods. See the online documentation for
detailed explanations.
:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param string: A filter for a NavigableString with specific text.
:kwargs: A dictionary of filters on attribute values.
"""
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning
)
self.name = self._normalize_search_value(name)
if not isinstance(attrs, dict):
# Treat a non-dict value for attrs as a search for the 'class'
# attribute.
kwargs['class'] = attrs
attrs = None
if 'class_' in kwargs:
# Treat class_="foo" as a search for the 'class'
# attribute, overriding any non-dict value for attrs.
kwargs['class'] = kwargs['class_']
del kwargs['class_']
if kwargs:
if attrs:
attrs = attrs.copy()
attrs.update(kwargs)
else:
attrs = kwargs
normalized_attrs = {}
for key, value in list(attrs.items()):
normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs
self.string = self._normalize_search_value(string)
# DEPRECATED but just in case someone is checking this.
self.text = self.string
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
# If it's a bytestring, convert it to Unicode, treating it as UTF-8.
if isinstance(value, bytes):
return value.decode("utf8")
# If it's listlike, convert it into a list of strings.
if hasattr(value, '__iter__'):
new_value = []
for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
and not isinstance(v, str)):
# This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call.
new_value.append(v)
else:
new_value.append(self._normalize_search_value(v))
return new_value
# Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3.
return str(str(value))
def __str__(self):
"""A human-readable representation of this SoupStrainer."""
if self.string:
return self.string
else:
return "%s|%s" % (self.name, self.attrs)
def search_tag(self, markup_name=None, markup_attrs={}):
"""Check whether a Tag with the given name and attributes would
match this SoupStrainer.
Used prospectively to decide whether to even bother creating a Tag
object.
:param markup_name: A tag name as found in some markup.
:param markup_attrs: A dictionary of attributes as found in some markup.
:return: True if the prospective tag would match this SoupStrainer;
False otherwise.
"""
found = None
markup = None
if isinstance(markup_name, Tag):
markup = markup_name
markup_attrs = markup
if isinstance(self.name, str):
# Optimization for a very common case where the user is
# searching for a tag with one specific name, and we're
# looking at a tag with a different name.
if markup and not markup.prefix and self.name != markup.name:
return False
call_function_with_tag_data = (
isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
or call_function_with_tag_data
or (markup and self._matches(markup, self.name))
or (not markup and self._matches(markup_name, self.name))):
if call_function_with_tag_data:
match = self.name(markup_name, markup_attrs)
else:
match = True
markup_attr_map = None
for attr, match_against in list(self.attrs.items()):
if not markup_attr_map:
if hasattr(markup_attrs, 'get'):
markup_attr_map = markup_attrs
else:
markup_attr_map = {}
for k, v in markup_attrs:
markup_attr_map[k] = v
attr_value = markup_attr_map.get(attr)
if not self._matches(attr_value, match_against):
match = False
break
if match:
if markup:
found = markup
else:
found = markup_name
if found and self.string and not self._matches(found.string, self.string):
found = None
return found
# For BS3 compatibility.
searchTag = search_tag
def search(self, markup):
"""Find all items in `markup` that match this SoupStrainer.
Used by the core _find_all() method, which is ultimately
called by all find_* methods.
:param markup: A PageElement or a list of them.
"""
# print('looking for %s in %s' % (self, markup))
found = None
# If given a list of items, scan it for a text element that
# matches.
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
found = element
break
# If it's a Tag, make sure its name or attributes match.
# Don't bother with Tags if we're searching for text.
elif isinstance(markup, Tag):
if not self.string or self.name or self.attrs:
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
isinstance(markup, str):
if not self.name and not self.attrs and self._matches(markup, self.string):
found = markup
else:
raise Exception(
"I don't know how to match against a %s" % markup.__class__)
return found
def _matches(self, markup, match_against, already_tried=None):
# print(u"Matching %s against %s" % (markup, match_against))
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
# like 'class'.
for item in markup:
if self._matches(item, match_against):
return True
# We didn't match any particular value of the multivalue
# attribute, but maybe we match the attribute value when
# considered as a string.
if self._matches(' '.join(markup), match_against):
return True
return False
if match_against is True:
# True matches any non-None value.
return markup is not None
if isinstance(match_against, Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all
# other ways of matching match the tag name as a string.
original_markup = markup
if isinstance(markup, Tag):
markup = markup.name
# Ensure that `markup` is either a Unicode string, or None.
markup = self._normalize_search_value(markup)
if markup is None:
# None matches None, False, an empty string, an empty list, and so on.
return not match_against
if (hasattr(match_against, '__iter__')
and not isinstance(match_against, str)):
# We're asked to match against an iterable of items.
# The markup must be match at least one item in the
# iterable. We'll try each one in turn.
#
# To avoid infinite recursion we need to keep track of
# items we've already seen.
if not already_tried:
already_tried = set()
for item in match_against:
if item.__hash__:
key = item
else:
key = id(item)
if key in already_tried:
continue
else:
already_tried.add(key)
if self._matches(original_markup, item, already_tried):
return True
else:
return False
# Beyond this point we might need to run the test twice: once against
# the tag's name and once against its prefixed name.
match = False
if not match and isinstance(match_against, str):
# Exact string match
match = markup == match_against
if not match and hasattr(match_against, 'search'):
# Regexp match
return match_against.search(markup)
if (not match
and isinstance(original_markup, Tag)
and original_markup.prefix):
# Try the whole thing again with the prefixed tag name.
return self._matches(
original_markup.prefix + ':' + original_markup.name, match_against
)
return match
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
def __init__(self, source, result=()):
"""Constructor.
:param source: A SoupStrainer.
:param result: A list of PageElements.
"""
super(ResultSet, self).__init__(result)
self.source = source
def __getattr__(self, key):
"""Raise a helpful exception to explain a common code fix."""
raise AttributeError(
"ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
)