• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import collections
2import re
3import sys
4import warnings
5from bs4.dammit import EntitySubstitution
6
# Encoding used when a NavigableString has to be built from a byte string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters. Written as a raw
# string so that "\s" reaches the regex engine without relying on Python
# passing an unrecognised string escape through unchanged.
whitespace_re = re.compile(r"\s+")
11
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that reads from, and writes to, the attribute
    named ``attr`` on the same instance.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; forward it
        # to the real attribute. (The setter's return value is ignored.)
        setattr(self, attr, value)
    return alias
22
23
class NamespacedAttribute(unicode):
    """A unicode attribute name that remembers its namespace parts.

    The string value is ``prefix`` when ``name`` is None, ``name`` when
    ``prefix`` is None, and ``prefix:name`` otherwise. The three
    constructor arguments are stored on the object as ``prefix``,
    ``name`` and ``namespace``.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = unicode.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
38
class AttributeValueWithCharsetSubstitution(unicode):
    """A stand-in object for a character encoding specified in HTML.

    Subclasses in this module override encode() so that the value they
    produce depends on the encoding name passed in.
    """
41
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute. The value given at
    construction time is kept in ``original_value``.
    """

    def __new__(cls, original_value):
        instance = unicode.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        # The "encoded" form of a charset attribute is simply the name
        # of the encoding that was asked for.
        return encoding
56
57
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    If the value contains no ``charset=`` declaration, the constructor
    returns a plain unicode string instead.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name itself. Written as a raw string so that "\s" is
    # handed to the regex engine verbatim.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset name replaced by
        ``encoding``."""
        def rewrite(match):
            # Keep the "...charset=" lead-in, swap in the new name.
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
83
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that respects a few HTML quirks.

    The contents of <script> and <style> tags should not undergo entity
    substitution, so an incoming NavigableString whose direct parent is
    one of those tags is returned untouched.
    """

    cdata_containing_tags = set(("script", "style"))

    preformatted_tags = set(("pre",))

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return ``f(ns)``, or ``ns`` itself when it sits directly
        inside one of ``cdata_containing_tags``."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Leave the string alone.
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        html_substitution = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, html_substitution)

    @classmethod
    def substitute_xml(cls, ns):
        xml_substitution = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, xml_substitution)
118
119class PageElement(object):
120    """Contains the navigational information for some part of the page
121    (either a tag or a piece of text)"""
122
    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
125    #
126    # "html" - All Unicode characters with corresponding HTML entities
127    #   are converted to those entities on output.
128    # "minimal" - Bare ampersands and angle brackets are converted to
129    #   XML entities: &amp; &lt; &gt;
130    # None - The null formatter. Unicode characters are never
131    #   converted to entities.  This is not recommended, but it's
132    #   faster than "minimal".
133    # A function - This function will be called on every string that
134    #  needs to undergo entity substitution.
135    #
136
137    # In an HTML document, the default "html" and "minimal" functions
138    # will leave the contents of <script> and <style> tags alone. For
139    # an XML document, all tags will be given the same treatment.
140
    # Formatter name -> substitution function for HTML trees. The None
    # key maps to None, which format_string() treats as "leave the
    # string unchanged".
    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    # The same mapping for XML trees, using the plain EntitySubstitution
    # functions.
    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }
152
153    def format_string(self, s, formatter='minimal'):
154        """Format the given string using the given formatter."""
155        if not callable(formatter):
156            formatter = self._formatter_for_name(formatter)
157        if formatter is None:
158            output = s
159        else:
160            output = formatter(s)
161        return output
162
163    @property
164    def _is_xml(self):
165        """Is this element part of an XML tree or an HTML tree?
166
167        This is used when mapping a formatter name ("minimal") to an
168        appropriate function (one that performs entity-substitution on
169        the contents of <script> and <style> tags, or not). It's
170        inefficient, but it should be called very rarely.
171        """
172        if self.parent is None:
173            # This is the top-level object. It should have .is_xml set
174            # from tree creation. If not, take a guess--BS is usually
175            # used on HTML markup.
176            return getattr(self, 'is_xml', False)
177        return self.parent._is_xml
178
179    def _formatter_for_name(self, name):
180        "Look up a formatter function based on its name and the tree."
181        if self._is_xml:
182            return self.XML_FORMATTERS.get(
183                name, EntitySubstitution.substitute_xml)
184        else:
185            return self.HTML_FORMATTERS.get(
186                name, HTMLAwareEntitySubstitution.substitute_xml)
187
188    def setup(self, parent=None, previous_element=None):
189        """Sets up the initial relations between this element and
190        other elements."""
191        self.parent = parent
192        self.previous_element = previous_element
193        if previous_element is not None:
194            self.previous_element.next_element = self
195        self.next_element = None
196        self.previous_sibling = None
197        self.next_sibling = None
198        if self.parent is not None and self.parent.contents:
199            self.previous_sibling = self.parent.contents[-1]
200            self.previous_sibling.next_sibling = self
201
    # camelCase names kept for BS3 compatibility; each is a property
    # built by _alias() around the snake_case attribute.
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
204
205    def replace_with(self, replace_with):
206        if replace_with is self:
207            return
208        if replace_with is self.parent:
209            raise ValueError("Cannot replace a Tag with its parent.")
210        old_parent = self.parent
211        my_index = self.parent.index(self)
212        self.extract()
213        old_parent.insert(my_index, replace_with)
214        return self
215    replaceWith = replace_with  # BS3
216
217    def unwrap(self):
218        my_parent = self.parent
219        my_index = self.parent.index(self)
220        self.extract()
221        for child in reversed(self.contents[:]):
222            my_parent.insert(my_index, child)
223        return self
224    replace_with_children = unwrap
225    replaceWithChildren = unwrap  # BS3
226
227    def wrap(self, wrap_inside):
228        me = self.replace_with(wrap_inside)
229        wrap_inside.append(me)
230        return wrap_inside
231
232    def extract(self):
233        """Destructively rips this element out of the tree."""
234        if self.parent is not None:
235            del self.parent.contents[self.parent.index(self)]
236
237        #Find the two elements that would be next to each other if
238        #this element (and any children) hadn't been parsed. Connect
239        #the two.
240        last_child = self._last_descendant()
241        next_element = last_child.next_element
242
243        if self.previous_element is not None:
244            self.previous_element.next_element = next_element
245        if next_element is not None:
246            next_element.previous_element = self.previous_element
247        self.previous_element = None
248        last_child.next_element = None
249
250        self.parent = None
251        if self.previous_sibling is not None:
252            self.previous_sibling.next_sibling = self.next_sibling
253        if self.next_sibling is not None:
254            self.next_sibling.previous_sibling = self.previous_sibling
255        self.previous_sibling = self.next_sibling = None
256        return self
257
258    def _last_descendant(self, is_initialized=True, accept_self=True):
259        "Finds the last element beneath this object to be parsed."
260        if is_initialized and self.next_sibling:
261            last_child = self.next_sibling.previous_element
262        else:
263            last_child = self
264            while isinstance(last_child, Tag) and last_child.contents:
265                last_child = last_child.contents[-1]
266        if not accept_self and last_child == self:
267            last_child = None
268        return last_child
269    # BS3: Not part of the API!
270    _lastRecursiveChild = _last_descendant
271
272    def insert(self, position, new_child):
273        if new_child is self:
274            raise ValueError("Cannot insert a tag into itself.")
275        if (isinstance(new_child, basestring)
276            and not isinstance(new_child, NavigableString)):
277            new_child = NavigableString(new_child)
278
279        position = min(position, len(self.contents))
280        if hasattr(new_child, 'parent') and new_child.parent is not None:
281            # We're 'inserting' an element that's already one
282            # of this object's children.
283            if new_child.parent is self:
284                current_index = self.index(new_child)
285                if current_index < position:
286                    # We're moving this element further down the list
287                    # of this object's children. That means that when
288                    # we extract this element, our target index will
289                    # jump down one.
290                    position -= 1
291            new_child.extract()
292
293        new_child.parent = self
294        previous_child = None
295        if position == 0:
296            new_child.previous_sibling = None
297            new_child.previous_element = self
298        else:
299            previous_child = self.contents[position - 1]
300            new_child.previous_sibling = previous_child
301            new_child.previous_sibling.next_sibling = new_child
302            new_child.previous_element = previous_child._last_descendant(False)
303        if new_child.previous_element is not None:
304            new_child.previous_element.next_element = new_child
305
306        new_childs_last_element = new_child._last_descendant(False)
307
308        if position >= len(self.contents):
309            new_child.next_sibling = None
310
311            parent = self
312            parents_next_sibling = None
313            while parents_next_sibling is None and parent is not None:
314                parents_next_sibling = parent.next_sibling
315                parent = parent.parent
316                if parents_next_sibling is not None:
317                    # We found the element that comes next in the document.
318                    break
319            if parents_next_sibling is not None:
320                new_childs_last_element.next_element = parents_next_sibling
321            else:
322                # The last element of this tag is the last element in
323                # the document.
324                new_childs_last_element.next_element = None
325        else:
326            next_child = self.contents[position]
327            new_child.next_sibling = next_child
328            if new_child.next_sibling is not None:
329                new_child.next_sibling.previous_sibling = new_child
330            new_childs_last_element.next_element = next_child
331
332        if new_childs_last_element.next_element is not None:
333            new_childs_last_element.next_element.previous_element = new_childs_last_element
334        self.contents.insert(position, new_child)
335
336    def append(self, tag):
337        """Appends the given tag to the contents of this tag."""
338        self.insert(len(self.contents), tag)
339
340    def insert_before(self, predecessor):
341        """Makes the given element the immediate predecessor of this one.
342
343        The two elements will have the same parent, and the given element
344        will be immediately before this one.
345        """
346        if self is predecessor:
347            raise ValueError("Can't insert an element before itself.")
348        parent = self.parent
349        if parent is None:
350            raise ValueError(
351                "Element has no parent, so 'before' has no meaning.")
352        # Extract first so that the index won't be screwed up if they
353        # are siblings.
354        if isinstance(predecessor, PageElement):
355            predecessor.extract()
356        index = parent.index(self)
357        parent.insert(index, predecessor)
358
359    def insert_after(self, successor):
360        """Makes the given element the immediate successor of this one.
361
362        The two elements will have the same parent, and the given element
363        will be immediately after this one.
364        """
365        if self is successor:
366            raise ValueError("Can't insert an element after itself.")
367        parent = self.parent
368        if parent is None:
369            raise ValueError(
370                "Element has no parent, so 'after' has no meaning.")
371        # Extract first so that the index won't be screwed up if they
372        # are siblings.
373        if isinstance(successor, PageElement):
374            successor.extract()
375        index = parent.index(self)
376        parent.insert(index+1, successor)
377
378    def find_next(self, name=None, attrs={}, text=None, **kwargs):
379        """Returns the first item that matches the given criteria and
380        appears after this Tag in the document."""
381        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
382    findNext = find_next  # BS3
383
384    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
385                    **kwargs):
386        """Returns all items that match the given criteria and appear
387        after this Tag in the document."""
388        return self._find_all(name, attrs, text, limit, self.next_elements,
389                             **kwargs)
390    findAllNext = find_all_next  # BS3
391
392    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
393        """Returns the closest sibling to this Tag that matches the
394        given criteria and appears after this Tag in the document."""
395        return self._find_one(self.find_next_siblings, name, attrs, text,
396                             **kwargs)
397    findNextSibling = find_next_sibling  # BS3
398
399    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
400                           **kwargs):
401        """Returns the siblings of this Tag that match the given
402        criteria and appear after this Tag in the document."""
403        return self._find_all(name, attrs, text, limit,
404                              self.next_siblings, **kwargs)
405    findNextSiblings = find_next_siblings   # BS3
406    fetchNextSiblings = find_next_siblings  # BS2
407
408    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
409        """Returns the first item that matches the given criteria and
410        appears before this Tag in the document."""
411        return self._find_one(
412            self.find_all_previous, name, attrs, text, **kwargs)
413    findPrevious = find_previous  # BS3
414
415    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
416                        **kwargs):
417        """Returns all items that match the given criteria and appear
418        before this Tag in the document."""
419        return self._find_all(name, attrs, text, limit, self.previous_elements,
420                           **kwargs)
421    findAllPrevious = find_all_previous  # BS3
422    fetchPrevious = find_all_previous    # BS2
423
424    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
425        """Returns the closest sibling to this Tag that matches the
426        given criteria and appears before this Tag in the document."""
427        return self._find_one(self.find_previous_siblings, name, attrs, text,
428                             **kwargs)
429    findPreviousSibling = find_previous_sibling  # BS3
430
431    def find_previous_siblings(self, name=None, attrs={}, text=None,
432                               limit=None, **kwargs):
433        """Returns the siblings of this Tag that match the given
434        criteria and appear before this Tag in the document."""
435        return self._find_all(name, attrs, text, limit,
436                              self.previous_siblings, **kwargs)
437    findPreviousSiblings = find_previous_siblings   # BS3
438    fetchPreviousSiblings = find_previous_siblings  # BS2
439
440    def find_parent(self, name=None, attrs={}, **kwargs):
441        """Returns the closest parent of this Tag that matches the given
442        criteria."""
443        # NOTE: We can't use _find_one because findParents takes a different
444        # set of arguments.
445        r = None
446        l = self.find_parents(name, attrs, 1, **kwargs)
447        if l:
448            r = l[0]
449        return r
450    findParent = find_parent  # BS3
451
452    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
453        """Returns the parents of this Tag that match the given
454        criteria."""
455
456        return self._find_all(name, attrs, None, limit, self.parents,
457                             **kwargs)
458    findParents = find_parents   # BS3
459    fetchParents = find_parents  # BS2
460
461    @property
462    def next(self):
463        return self.next_element
464
465    @property
466    def previous(self):
467        return self.previous_element
468
469    #These methods do the real heavy lifting.
470
471    def _find_one(self, method, name, attrs, text, **kwargs):
472        r = None
473        l = method(name, attrs, text, 1, **kwargs)
474        if l:
475            r = l[0]
476        return r
477
478    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
479        "Iterates over a generator looking for things that match."
480
481        if isinstance(name, SoupStrainer):
482            strainer = name
483        else:
484            strainer = SoupStrainer(name, attrs, text, **kwargs)
485
486        if text is None and not limit and not attrs and not kwargs:
487            if name is True or name is None:
488                # Optimization to find all tags.
489                result = (element for element in generator
490                          if isinstance(element, Tag))
491                return ResultSet(strainer, result)
492            elif isinstance(name, basestring):
493                # Optimization to find all tags with a given name.
494                result = (element for element in generator
495                          if isinstance(element, Tag)
496                            and element.name == name)
497                return ResultSet(strainer, result)
498        results = ResultSet(strainer)
499        while True:
500            try:
501                i = next(generator)
502            except StopIteration:
503                break
504            if i:
505                found = strainer.search(i)
506                if found:
507                    results.append(found)
508                    if limit and len(results) >= limit:
509                        break
510        return results
511
512    #These generators can be used to navigate starting from both
513    #NavigableStrings and Tags.
514    @property
515    def next_elements(self):
516        i = self.next_element
517        while i is not None:
518            yield i
519            i = i.next_element
520
521    @property
522    def next_siblings(self):
523        i = self.next_sibling
524        while i is not None:
525            yield i
526            i = i.next_sibling
527
528    @property
529    def previous_elements(self):
530        i = self.previous_element
531        while i is not None:
532            yield i
533            i = i.previous_element
534
535    @property
536    def previous_siblings(self):
537        i = self.previous_sibling
538        while i is not None:
539            yield i
540            i = i.previous_sibling
541
542    @property
543    def parents(self):
544        i = self.parent
545        while i is not None:
546            yield i
547            i = i.parent
548
549    # Methods for supporting CSS selectors.
550
551    tag_name_re = re.compile('^[a-z0-9]+$')
552
553    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
554    #   \---/  \---/\-------------/    \-------/
555    #     |      |         |               |
556    #     |      |         |           The value
557    #     |      |    ~,|,^,$,* or =
558    #     |   Attribute
559    #    Tag
560    attribselect_re = re.compile(
561        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
562        r'=?"?(?P<value>[^\]"]*)"?\]$'
563        )
564
565    def _attr_value_as_string(self, value, default=None):
566        """Force an attribute value into a string representation.
567
568        A multi-valued attribute will be converted into a
569        space-separated stirng.
570        """
571        value = self.get(value, default)
572        if isinstance(value, list) or isinstance(value, tuple):
573            value =" ".join(value)
574        return value
575
576    def _tag_name_matches_and(self, function, tag_name):
577        if not tag_name:
578            return function
579        else:
580            def _match(tag):
581                return tag.name == tag_name and function(tag)
582            return _match
583
584    def _attribute_checker(self, operator, attribute, value=''):
585        """Create a function that performs a CSS selector operation.
586
587        Takes an operator, attribute and optional value. Returns a
588        function that will return True for elements that match that
589        combination.
590        """
591        if operator == '=':
592            # string representation of `attribute` is equal to `value`
593            return lambda el: el._attr_value_as_string(attribute) == value
594        elif operator == '~':
595            # space-separated list representation of `attribute`
596            # contains `value`
597            def _includes_value(element):
598                attribute_value = element.get(attribute, [])
599                if not isinstance(attribute_value, list):
600                    attribute_value = attribute_value.split()
601                return value in attribute_value
602            return _includes_value
603        elif operator == '^':
604            # string representation of `attribute` starts with `value`
605            return lambda el: el._attr_value_as_string(
606                attribute, '').startswith(value)
607        elif operator == '$':
608            # string represenation of `attribute` ends with `value`
609            return lambda el: el._attr_value_as_string(
610                attribute, '').endswith(value)
611        elif operator == '*':
612            # string representation of `attribute` contains `value`
613            return lambda el: value in el._attr_value_as_string(attribute, '')
614        elif operator == '|':
615            # string representation of `attribute` is either exactly
616            # `value` or starts with `value` and then a dash.
617            def _is_or_starts_with_dash(element):
618                attribute_value = element._attr_value_as_string(attribute, '')
619                return (attribute_value == value or attribute_value.startswith(
620                        value + '-'))
621            return _is_or_starts_with_dash
622        else:
623            return lambda el: el.has_attr(attribute)
624
625    # Old non-property versions of the generators, for backwards
626    # compatibility with BS3.
    def nextGenerator(self):
        """Method form of the ``next_elements`` property."""
        return self.next_elements
629
    def nextSiblingGenerator(self):
        """Method form of the ``next_siblings`` property."""
        return self.next_siblings
632
    def previousGenerator(self):
        """Method form of the ``previous_elements`` property."""
        return self.previous_elements
635
    def previousSiblingGenerator(self):
        """Method form of the ``previous_siblings`` property."""
        return self.previous_siblings
638
    def parentGenerator(self):
        """Method form of the ``parents`` property."""
        return self.parents
641
642
class NavigableString(unicode, PageElement):
    """A unicode string that is also a PageElement.

    PREFIX and SUFFIX are placed around the formatted text by
    output_ready(); both are empty for a plain string.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __copy__(self):
        # Copying hands back the very same object.
        return self

    def __getnewargs__(self):
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        # A string never has a tag name.
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
688
class PreformattedString(NavigableString):
    """A NavigableString that is exempt from the normal formatting rules.

    The string is still handed to the formatter (to trigger any side
    effects), but whatever the formatter returns is discarded.
    """

    def output_ready(self, formatter="minimal"):
        """Return the unformatted string between PREFIX and SUFFIX.

        The formatter is called on the string, but its return value is
        ignored."""
        self.format_string(self, formatter)
        opened = self.PREFIX + self
        return opened + self.SUFFIX
701
class CData(PreformattedString):
    """A preformatted string output between '<![CDATA[' and ']]>'."""

    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'
706
class ProcessingInstruction(PreformattedString):
    """A preformatted string output between '<?' and '?>'."""

    PREFIX = u'<?'
    SUFFIX = u'?>'
711
class Comment(PreformattedString):
    """A preformatted string output between '<!--' and '-->'."""

    PREFIX = u'<!--'
    SUFFIX = u'-->'
716
717
class Declaration(PreformattedString):
    """A preformatted string output between '<!' and '!>'."""
    # NOTE(review): the '!>' suffix looks unusual for a declaration;
    # confirm it is intended.
    PREFIX = u'<!'
    SUFFIX = u'!>'
721
722
class Doctype(PreformattedString):
    """A preformatted string output as '<!DOCTYPE ...>' followed by a
    newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name plus optional public and system
        identifiers.

        A public id adds ' PUBLIC "<pub_id>"', followed by the quoted
        system id when one is given; a system id on its own adds
        ' SYSTEM "<system_id>"'.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'
739
740
741class Tag(PageElement):
742
743    """Represents a found HTML tag with its attributes and contents."""
744
745    def __init__(self, parser=None, builder=None, name=None, namespace=None,
746                 prefix=None, attrs=None, parent=None, previous=None):
747        "Basic constructor."
748
749        if parser is None:
750            self.parser_class = None
751        else:
752            # We don't actually store the parser object: that lets extracted
753            # chunks be garbage-collected.
754            self.parser_class = parser.__class__
755        if name is None:
756            raise ValueError("No value provided for new tag's name.")
757        self.name = name
758        self.namespace = namespace
759        self.prefix = prefix
760        if attrs is None:
761            attrs = {}
762        elif attrs and builder.cdata_list_attributes:
763            attrs = builder._replace_cdata_list_attribute_values(
764                self.name, attrs)
765        else:
766            attrs = dict(attrs)
767        self.attrs = attrs
768        self.contents = []
769        self.setup(parent, previous)
770        self.hidden = False
771
772        # Set up any substitutions, such as the charset in a META tag.
773        if builder is not None:
774            builder.set_up_substitutions(self)
775            self.can_be_empty_element = builder.can_be_empty_element(name)
776        else:
777            self.can_be_empty_element = False
778
779    parserClass = _alias("parser_class")  # BS3
780
781    @property
782    def is_empty_element(self):
783        """Is this tag an empty-element tag? (aka a self-closing tag)
784
785        A tag that has contents is never an empty-element tag.
786
787        A tag that has no contents may or may not be an empty-element
788        tag. It depends on the builder used to create the tag. If the
789        builder has a designated list of empty-element tags, then only
790        a tag whose name shows up in that list is considered an
791        empty-element tag.
792
793        If the builder has no designated list of empty-element tags,
794        then any tag with no contents is an empty-element tag.
795        """
796        return len(self.contents) == 0 and self.can_be_empty_element
797    isSelfClosing = is_empty_element  # BS3
798
799    @property
800    def string(self):
801        """Convenience property to get the single string within this tag.
802
803        :Return: If this tag has a single string child, return value
804         is that string. If this tag has no children, or more than one
805         child, return value is None. If this tag has one child tag,
806         return value is the 'string' attribute of the child tag,
807         recursively.
808        """
809        if len(self.contents) != 1:
810            return None
811        child = self.contents[0]
812        if isinstance(child, NavigableString):
813            return child
814        return child.string
815
816    @string.setter
817    def string(self, string):
818        self.clear()
819        self.append(string.__class__(string))
820
821    def _all_strings(self, strip=False, types=(NavigableString, CData)):
822        """Yield all strings of certain classes, possibly stripping them.
823
824        By default, yields only NavigableString and CData objects. So
825        no comments, processing instructions, etc.
826        """
827        for descendant in self.descendants:
828            if (
829                (types is None and not isinstance(descendant, NavigableString))
830                or
831                (types is not None and type(descendant) not in types)):
832                continue
833            if strip:
834                descendant = descendant.strip()
835                if len(descendant) == 0:
836                    continue
837            yield descendant
838
839    strings = property(_all_strings)
840
841    @property
842    def stripped_strings(self):
843        for string in self._all_strings(True):
844            yield string
845
846    def get_text(self, separator=u"", strip=False,
847                 types=(NavigableString, CData)):
848        """
849        Get all child strings, concatenated using the given separator.
850        """
851        return separator.join([s for s in self._all_strings(
852                    strip, types=types)])
853    getText = get_text
854    text = property(get_text)
855
856    def decompose(self):
857        """Recursively destroys the contents of this tree."""
858        self.extract()
859        i = self
860        while i is not None:
861            next = i.next_element
862            i.__dict__.clear()
863            i.contents = []
864            i = next
865
866    def clear(self, decompose=False):
867        """
868        Extract all children. If decompose is True, decompose instead.
869        """
870        if decompose:
871            for element in self.contents[:]:
872                if isinstance(element, Tag):
873                    element.decompose()
874                else:
875                    element.extract()
876        else:
877            for element in self.contents[:]:
878                element.extract()
879
880    def index(self, element):
881        """
882        Find the index of a child by identity, not value. Avoids issues with
883        tag.contents.index(element) getting the index of equal elements.
884        """
885        for i, child in enumerate(self.contents):
886            if child is element:
887                return i
888        raise ValueError("Tag.index: element not in tag")
889
890    def get(self, key, default=None):
891        """Returns the value of the 'key' attribute for the tag, or
892        the value given for 'default' if it doesn't have that
893        attribute."""
894        return self.attrs.get(key, default)
895
896    def has_attr(self, key):
897        return key in self.attrs
898
899    def __hash__(self):
900        return str(self).__hash__()
901
902    def __getitem__(self, key):
903        """tag[key] returns the value of the 'key' attribute for the tag,
904        and throws an exception if it's not there."""
905        return self.attrs[key]
906
907    def __iter__(self):
908        "Iterating over a tag iterates over its contents."
909        return iter(self.contents)
910
911    def __len__(self):
912        "The length of a tag is the length of its list of contents."
913        return len(self.contents)
914
915    def __contains__(self, x):
916        return x in self.contents
917
    def __nonzero__(self):
        """A tag is always true, even if it has no contents.

        Without this, truth testing would fall back to __len__() and a
        tag with no children would test false.
        """
        return True
921
922    def __setitem__(self, key, value):
923        """Setting tag[key] sets the value of the 'key' attribute for the
924        tag."""
925        self.attrs[key] = value
926
927    def __delitem__(self, key):
928        "Deleting tag[key] deletes all 'key' attributes for the tag."
929        self.attrs.pop(key, None)
930
931    def __call__(self, *args, **kwargs):
932        """Calling a tag like a function is the same as calling its
933        find_all() method. Eg. tag('a') returns a list of all the A tags
934        found within this tag."""
935        return self.find_all(*args, **kwargs)
936
937    def __getattr__(self, tag):
938        #print "Getattr %s.%s" % (self.__class__, tag)
939        if len(tag) > 3 and tag.endswith('Tag'):
940            # BS3: soup.aTag -> "soup.find("a")
941            tag_name = tag[:-3]
942            warnings.warn(
943                '.%sTag is deprecated, use .find("%s") instead.' % (
944                    tag_name, tag_name))
945            return self.find(tag_name)
946        # We special case contents to avoid recursion.
947        elif not tag.startswith("__") and not tag=="contents":
948            return self.find(tag)
949        raise AttributeError(
950            "'%s' object has no attribute '%s'" % (self.__class__, tag))
951
952    def __eq__(self, other):
953        """Returns true iff this tag has the same name, the same attributes,
954        and the same contents (recursively) as the given tag."""
955        if self is other:
956            return True
957        if (not hasattr(other, 'name') or
958            not hasattr(other, 'attrs') or
959            not hasattr(other, 'contents') or
960            self.name != other.name or
961            self.attrs != other.attrs or
962            len(self) != len(other)):
963            return False
964        for i, my_child in enumerate(self.contents):
965            if my_child != other.contents[i]:
966                return False
967        return True
968
969    def __ne__(self, other):
970        """Returns true iff this tag is not identical to the other tag,
971        as defined in __eq__."""
972        return not self == other
973
974    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
975        """Renders this tag as a string."""
976        return self.encode(encoding)
977
978    def __unicode__(self):
979        return self.decode()
980
981    def __str__(self):
982        return self.encode()
983
    # When PY3K is set (Python 3), str() and repr() are both rebound to
    # __unicode__, so they return the decode() result instead of the
    # encode() result.
    if PY3K:
        __str__ = __repr__ = __unicode__
986
987    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
988               indent_level=None, formatter="minimal",
989               errors="xmlcharrefreplace"):
990        # Turn the data structure into Unicode, then encode the
991        # Unicode.
992        u = self.decode(indent_level, encoding, formatter)
993        return u.encode(encoding, errors)
994
995    def _should_pretty_print(self, indent_level):
996        """Should this tag be pretty-printed?"""
997        return (
998            indent_level is not None and
999            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1000             or self._is_xml))
1001
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None for plain output. Otherwise the
           opening tag is preceded by (indent_level - 1) spaces, and
           the tag is pretty-printed if _should_pretty_print() agrees.
        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
        :param formatter: Either a callable, or a value that
           _formatter_for_name() turns into one.
        """

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        # Render each attribute as key="value", in sorted key order. An
        # attribute whose value is None is rendered as the bare key.
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    decoded = key
                else:
                    if isinstance(val, list) or isinstance(val, tuple):
                        # Multi-valued attribute: join with spaces.
                        val = ' '.join(val)
                    elif not isinstance(val, basestring):
                        val = unicode(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
                        # A charset stand-in value gets to substitute
                        # the eventual encoding for itself.
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        unicode(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)
        # `close` is the '/' of an empty-element tag; `closeTag` is the
        # separate closing tag. At most one of the two ends up non-empty.
        close = ''
        closeTag = ''

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        if self.is_empty_element:
            close = '/'
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        space = ''
        indent_space = ''
        if indent_level is not None:
            indent_space = (' ' * (indent_level - 1))
        if pretty_print:
            space = indent_space
            indent_contents = indent_level + 1
        else:
            # Contents of a tag that is not pretty-printed are rendered
            # without any indentation.
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            if indent_level is not None:
                # Even if this particular tag is not pretty-printed,
                # we should indent up to the start of the tag.
                s.append(indent_space)
            s.append('<%s%s%s%s>' % (
                    prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            # Make sure the closing tag starts on a line of its own.
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if indent_level is not None and closeTag and self.next_sibling:
                # Even if this particular tag is not pretty-printed,
                # we're now done with the tag, and we should add a
                # newline if appropriate.
                s.append("\n")
            s = ''.join(s)
        return s
1094
1095    def prettify(self, encoding=None, formatter="minimal"):
1096        if encoding is None:
1097            return self.decode(True, formatter=formatter)
1098        else:
1099            return self.encode(encoding, True, formatter=formatter)
1100
1101    def decode_contents(self, indent_level=None,
1102                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1103                       formatter="minimal"):
1104        """Renders the contents of this tag as a Unicode string.
1105
1106        :param eventual_encoding: The tag is destined to be
1107           encoded into this encoding. This method is _not_
1108           responsible for performing that encoding. This information
1109           is passed in so that it can be substituted in if the
1110           document contains a <META> tag that mentions the document's
1111           encoding.
1112        """
1113        # First off, turn a string formatter into a function. This
1114        # will stop the lookup from happening over and over again.
1115        if not callable(formatter):
1116            formatter = self._formatter_for_name(formatter)
1117
1118        pretty_print = (indent_level is not None)
1119        s = []
1120        for c in self:
1121            text = None
1122            if isinstance(c, NavigableString):
1123                text = c.output_ready(formatter)
1124            elif isinstance(c, Tag):
1125                s.append(c.decode(indent_level, eventual_encoding,
1126                                  formatter))
1127            if text and indent_level and not self.name == 'pre':
1128                text = text.strip()
1129            if text:
1130                if pretty_print and not self.name == 'pre':
1131                    s.append(" " * (indent_level - 1))
1132                s.append(text)
1133                if pretty_print and not self.name == 'pre':
1134                    s.append("\n")
1135        return ''.join(s)
1136
1137    def encode_contents(
1138        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1139        formatter="minimal"):
1140        """Renders the contents of this tag as a bytestring."""
1141        contents = self.decode_contents(indent_level, encoding, formatter)
1142        return contents.encode(encoding)
1143
1144    # Old method for BS3 compatibility
1145    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1146                       prettyPrint=False, indentLevel=0):
1147        if not prettyPrint:
1148            indentLevel = None
1149        return self.encode_contents(
1150            indent_level=indentLevel, encoding=encoding)
1151
1152    #Soup methods
1153
1154    def find(self, name=None, attrs={}, recursive=True, text=None,
1155             **kwargs):
1156        """Return only the first child of this Tag matching the given
1157        criteria."""
1158        r = None
1159        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1160        if l:
1161            r = l[0]
1162        return r
1163    findChild = find
1164
1165    def find_all(self, name=None, attrs={}, recursive=True, text=None,
1166                 limit=None, **kwargs):
1167        """Extracts a list of Tag objects that match the given
1168        criteria.  You can specify the name of the Tag and any
1169        attributes you want the Tag to have.
1170
1171        The value of a key-value pair in the 'attrs' map can be a
1172        string, a list of strings, a regular expression object, or a
1173        callable that takes a string and returns whether or not the
1174        string matches for some custom definition of 'matches'. The
1175        same is true of the tag name."""
1176
1177        generator = self.descendants
1178        if not recursive:
1179            generator = self.children
1180        return self._find_all(name, attrs, text, limit, generator, **kwargs)
1181    findAll = find_all       # BS3
1182    findChildren = find_all  # BS2
1183
1184    #Generator methods
1185    @property
1186    def children(self):
1187        # return iter() to make the purpose of the method clear
1188        return iter(self.contents)  # XXX This seems to be untested.
1189
1190    @property
1191    def descendants(self):
1192        if not len(self.contents):
1193            return
1194        stopNode = self._last_descendant().next_element
1195        current = self.contents[0]
1196        while current is not stopNode:
1197            yield current
1198            current = current.next_element
1199
1200    # CSS selector code
1201
    # Tokens that select() treats as combinators. Each one takes the
    # token that follows it as its argument.
    _selector_combinators = ['>', '+', '~']
    # When true, select() prints a trace of what it is doing.
    _select_debug = False
1204    def select(self, selector, _candidate_generator=None):
1205        """Perform a CSS selection operation on the current element."""
1206        tokens = selector.split()
1207        current_context = [self]
1208
1209        if tokens[-1] in self._selector_combinators:
1210            raise ValueError(
1211                'Final combinator "%s" is missing an argument.' % tokens[-1])
1212        if self._select_debug:
1213            print 'Running CSS selector "%s"' % selector
1214        for index, token in enumerate(tokens):
1215            if self._select_debug:
1216                print ' Considering token "%s"' % token
1217            recursive_candidate_generator = None
1218            tag_name = None
1219            if tokens[index-1] in self._selector_combinators:
1220                # This token was consumed by the previous combinator. Skip it.
1221                if self._select_debug:
1222                    print '  Token was consumed by the previous combinator.'
1223                continue
1224            # Each operation corresponds to a checker function, a rule
1225            # for determining whether a candidate matches the
1226            # selector. Candidates are generated by the active
1227            # iterator.
1228            checker = None
1229
1230            m = self.attribselect_re.match(token)
1231            if m is not None:
1232                # Attribute selector
1233                tag_name, attribute, operator, value = m.groups()
1234                checker = self._attribute_checker(operator, attribute, value)
1235
1236            elif '#' in token:
1237                # ID selector
1238                tag_name, tag_id = token.split('#', 1)
1239                def id_matches(tag):
1240                    return tag.get('id', None) == tag_id
1241                checker = id_matches
1242
1243            elif '.' in token:
1244                # Class selector
1245                tag_name, klass = token.split('.', 1)
1246                classes = set(klass.split('.'))
1247                def classes_match(candidate):
1248                    return classes.issubset(candidate.get('class', []))
1249                checker = classes_match
1250
1251            elif ':' in token:
1252                # Pseudo-class
1253                tag_name, pseudo = token.split(':', 1)
1254                if tag_name == '':
1255                    raise ValueError(
1256                        "A pseudo-class must be prefixed with a tag name.")
1257                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1258                found = []
1259                if pseudo_attributes is not None:
1260                    pseudo_type, pseudo_value = pseudo_attributes.groups()
1261                    if pseudo_type == 'nth-of-type':
1262                        try:
1263                            pseudo_value = int(pseudo_value)
1264                        except:
1265                            raise NotImplementedError(
1266                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1267                        if pseudo_value < 1:
1268                            raise ValueError(
1269                                'nth-of-type pseudo-class value must be at least 1.')
1270                        class Counter(object):
1271                            def __init__(self, destination):
1272                                self.count = 0
1273                                self.destination = destination
1274
1275                            def nth_child_of_type(self, tag):
1276                                self.count += 1
1277                                if self.count == self.destination:
1278                                    return True
1279                                if self.count > self.destination:
1280                                    # Stop the generator that's sending us
1281                                    # these things.
1282                                    raise StopIteration()
1283                                return False
1284                        checker = Counter(pseudo_value).nth_child_of_type
1285                    else:
1286                        raise NotImplementedError(
1287                            'Only the following pseudo-classes are implemented: nth-of-type.')
1288
1289            elif token == '*':
1290                # Star selector -- matches everything
1291                pass
1292            elif token == '>':
1293                # Run the next token as a CSS selector against the
1294                # direct children of each tag in the current context.
1295                recursive_candidate_generator = lambda tag: tag.children
1296            elif token == '~':
1297                # Run the next token as a CSS selector against the
1298                # siblings of each tag in the current context.
1299                recursive_candidate_generator = lambda tag: tag.next_siblings
1300            elif token == '+':
1301                # For each tag in the current context, run the next
1302                # token as a CSS selector against the tag's next
1303                # sibling that's a tag.
1304                def next_tag_sibling(tag):
1305                    yield tag.find_next_sibling(True)
1306                recursive_candidate_generator = next_tag_sibling
1307
1308            elif self.tag_name_re.match(token):
1309                # Just a tag name.
1310                tag_name = token
1311            else:
1312                raise ValueError(
1313                    'Unsupported or invalid CSS selector: "%s"' % token)
1314
1315            if recursive_candidate_generator:
1316                # This happens when the selector looks like  "> foo".
1317                #
1318                # The generator calls select() recursively on every
1319                # member of the current context, passing in a different
1320                # candidate generator and a different selector.
1321                #
1322                # In the case of "> foo", the candidate generator is
1323                # one that yields a tag's direct children (">"), and
1324                # the selector is "foo".
1325                next_token = tokens[index+1]
1326                def recursive_select(tag):
1327                    if self._select_debug:
1328                        print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
1329                        print '-' * 40
1330                    for i in tag.select(next_token, recursive_candidate_generator):
1331                        if self._select_debug:
1332                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
1333                        yield i
1334                    if self._select_debug:
1335                        print '-' * 40
1336                _use_candidate_generator = recursive_select
1337            elif _candidate_generator is None:
1338                # By default, a tag's candidates are all of its
1339                # children. If tag_name is defined, only yield tags
1340                # with that name.
1341                if self._select_debug:
1342                    if tag_name:
1343                        check = "[any]"
1344                    else:
1345                        check = tag_name
1346                    print '   Default candidate generator, tag name="%s"' % check
1347                if self._select_debug:
1348                    # This is redundant with later code, but it stops
1349                    # a bunch of bogus tags from cluttering up the
1350                    # debug log.
1351                    def default_candidate_generator(tag):
1352                        for child in tag.descendants:
1353                            if not isinstance(child, Tag):
1354                                continue
1355                            if tag_name and not child.name == tag_name:
1356                                continue
1357                            yield child
1358                    _use_candidate_generator = default_candidate_generator
1359                else:
1360                    _use_candidate_generator = lambda tag: tag.descendants
1361            else:
1362                _use_candidate_generator = _candidate_generator
1363
1364            new_context = []
1365            new_context_ids = set([])
1366            for tag in current_context:
1367                if self._select_debug:
1368                    print "    Running candidate generator on %s %s" % (
1369                        tag.name, repr(tag.attrs))
1370                for candidate in _use_candidate_generator(tag):
1371                    if not isinstance(candidate, Tag):
1372                        continue
1373                    if tag_name and candidate.name != tag_name:
1374                        continue
1375                    if checker is not None:
1376                        try:
1377                            result = checker(candidate)
1378                        except StopIteration:
1379                            # The checker has decided we should no longer
1380                            # run the generator.
1381                            break
1382                    if checker is None or result:
1383                        if self._select_debug:
1384                            print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
1385                        if id(candidate) not in new_context_ids:
1386                            # If a tag matches a selector more than once,
1387                            # don't include it in the context more than once.
1388                            new_context.append(candidate)
1389                            new_context_ids.add(id(candidate))
1390                    elif self._select_debug:
1391                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
1392
1393            current_context = new_context
1394
1395        if self._select_debug:
1396            print "Final verdict:"
1397            for i in current_context:
1398                print " %s %s" % (i.name, i.attrs)
1399        return current_context
1400
1401    # Old names for backwards compatibility
1402    def childGenerator(self):
1403        return self.children
1404
1405    def recursiveChildGenerator(self):
1406        return self.descendants
1407
1408    def has_key(self, key):
1409        """This was kind of misleading because has_key() (attributes)
1410        was different from __in__ (contents). has_key() is gone in
1411        Python 3, anyway."""
1412        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
1413                key))
1414        return self.has_attr(key)
1415
# Next, a couple of classes to represent queries and their results.
1417class SoupStrainer(object):
1418    """Encapsulates a number of ways of matching a markup element (tag or
1419    text)."""
1420
1421    def __init__(self, name=None, attrs={}, text=None, **kwargs):
1422        self.name = self._normalize_search_value(name)
1423        if not isinstance(attrs, dict):
1424            # Treat a non-dict value for attrs as a search for the 'class'
1425            # attribute.
1426            kwargs['class'] = attrs
1427            attrs = None
1428
1429        if 'class_' in kwargs:
1430            # Treat class_="foo" as a search for the 'class'
1431            # attribute, overriding any non-dict value for attrs.
1432            kwargs['class'] = kwargs['class_']
1433            del kwargs['class_']
1434
1435        if kwargs:
1436            if attrs:
1437                attrs = attrs.copy()
1438                attrs.update(kwargs)
1439            else:
1440                attrs = kwargs
1441        normalized_attrs = {}
1442        for key, value in attrs.items():
1443            normalized_attrs[key] = self._normalize_search_value(value)
1444
1445        self.attrs = normalized_attrs
1446        self.text = self._normalize_search_value(text)
1447
1448    def _normalize_search_value(self, value):
1449        # Leave it alone if it's a Unicode string, a callable, a
1450        # regular expression, a boolean, or None.
1451        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
1452            or isinstance(value, bool) or value is None):
1453            return value
1454
1455        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
1456        if isinstance(value, bytes):
1457            return value.decode("utf8")
1458
1459        # If it's listlike, convert it into a list of strings.
1460        if hasattr(value, '__iter__'):
1461            new_value = []
1462            for v in value:
1463                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
1464                    and not isinstance(v, unicode)):
1465                    # This is almost certainly the user's mistake. In the
1466                    # interests of avoiding infinite loops, we'll let
1467                    # it through as-is rather than doing a recursive call.
1468                    new_value.append(v)
1469                else:
1470                    new_value.append(self._normalize_search_value(v))
1471            return new_value
1472
1473        # Otherwise, convert it into a Unicode string.
1474        # The unicode(str()) thing is so this will do the same thing on Python 2
1475        # and Python 3.
1476        return unicode(str(value))
1477
1478    def __str__(self):
1479        if self.text:
1480            return self.text
1481        else:
1482            return "%s|%s" % (self.name, self.attrs)
1483
1484    def search_tag(self, markup_name=None, markup_attrs={}):
1485        found = None
1486        markup = None
1487        if isinstance(markup_name, Tag):
1488            markup = markup_name
1489            markup_attrs = markup
1490        call_function_with_tag_data = (
1491            isinstance(self.name, collections.Callable)
1492            and not isinstance(markup_name, Tag))
1493
1494        if ((not self.name)
1495            or call_function_with_tag_data
1496            or (markup and self._matches(markup, self.name))
1497            or (not markup and self._matches(markup_name, self.name))):
1498            if call_function_with_tag_data:
1499                match = self.name(markup_name, markup_attrs)
1500            else:
1501                match = True
1502                markup_attr_map = None
1503                for attr, match_against in list(self.attrs.items()):
1504                    if not markup_attr_map:
1505                        if hasattr(markup_attrs, 'get'):
1506                            markup_attr_map = markup_attrs
1507                        else:
1508                            markup_attr_map = {}
1509                            for k, v in markup_attrs:
1510                                markup_attr_map[k] = v
1511                    attr_value = markup_attr_map.get(attr)
1512                    if not self._matches(attr_value, match_against):
1513                        match = False
1514                        break
1515            if match:
1516                if markup:
1517                    found = markup
1518                else:
1519                    found = markup_name
1520        if found and self.text and not self._matches(found.string, self.text):
1521            found = None
1522        return found
1523    searchTag = search_tag
1524
1525    def search(self, markup):
1526        # print 'looking for %s in %s' % (self, markup)
1527        found = None
1528        # If given a list of items, scan it for a text element that
1529        # matches.
1530        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
1531            for element in markup:
1532                if isinstance(element, NavigableString) \
1533                       and self.search(element):
1534                    found = element
1535                    break
1536        # If it's a Tag, make sure its name or attributes match.
1537        # Don't bother with Tags if we're searching for text.
1538        elif isinstance(markup, Tag):
1539            if not self.text or self.name or self.attrs:
1540                found = self.search_tag(markup)
1541        # If it's text, make sure the text matches.
1542        elif isinstance(markup, NavigableString) or \
1543                 isinstance(markup, basestring):
1544            if not self.name and not self.attrs and self._matches(markup, self.text):
1545                found = markup
1546        else:
1547            raise Exception(
1548                "I don't know how to match against a %s" % markup.__class__)
1549        return found
1550
1551    def _matches(self, markup, match_against):
1552        # print u"Matching %s against %s" % (markup, match_against)
1553        result = False
1554        if isinstance(markup, list) or isinstance(markup, tuple):
1555            # This should only happen when searching a multi-valued attribute
1556            # like 'class'.
1557            if (isinstance(match_against, unicode)
1558                and ' ' in match_against):
1559                # A bit of a special case. If they try to match "foo
1560                # bar" on a multivalue attribute's value, only accept
1561                # the literal value "foo bar"
1562                #
1563                # XXX This is going to be pretty slow because we keep
1564                # splitting match_against. But it shouldn't come up
1565                # too often.
1566                return (whitespace_re.split(match_against) == markup)
1567            else:
1568                for item in markup:
1569                    if self._matches(item, match_against):
1570                        return True
1571                return False
1572
1573        if match_against is True:
1574            # True matches any non-None value.
1575            return markup is not None
1576
1577        if isinstance(match_against, collections.Callable):
1578            return match_against(markup)
1579
1580        # Custom callables take the tag as an argument, but all
1581        # other ways of matching match the tag name as a string.
1582        if isinstance(markup, Tag):
1583            markup = markup.name
1584
1585        # Ensure that `markup` is either a Unicode string, or None.
1586        markup = self._normalize_search_value(markup)
1587
1588        if markup is None:
1589            # None matches None, False, an empty string, an empty list, and so on.
1590            return not match_against
1591
1592        if isinstance(match_against, unicode):
1593            # Exact string match
1594            return markup == match_against
1595
1596        if hasattr(match_against, 'match'):
1597            # Regexp match
1598            return match_against.search(markup)
1599
1600        if hasattr(match_against, '__iter__'):
1601            # The markup must be an exact match against something
1602            # in the iterable.
1603            return markup in match_against
1604
1605
class ResultSet(list):
    """A plain list of results that also remembers where it came from.

    The object given as `source` (the SoupStrainer that created the
    results) is stored unchanged on the `source` attribute; `result`
    supplies the initial list contents and defaults to empty.
    """

    def __init__(self, source, result=()):
        # Recording the originator and filling the list are independent
        # steps, so their order does not matter.
        self.source = source
        super(ResultSet, self).__init__(result)
1612