1import collections 2import re 3import sys 4import warnings 5from bs4.dammit import EntitySubstitution 6 7DEFAULT_OUTPUT_ENCODING = "utf-8" 8PY3K = (sys.version_info[0] > 2) 9 10whitespace_re = re.compile("\s+") 11 12def _alias(attr): 13 """Alias one attribute name to another for backward compatibility""" 14 @property 15 def alias(self): 16 return getattr(self, attr) 17 18 @alias.setter 19 def alias(self): 20 return setattr(self, attr) 21 return alias 22 23 24class NamespacedAttribute(unicode): 25 26 def __new__(cls, prefix, name, namespace=None): 27 if name is None: 28 obj = unicode.__new__(cls, prefix) 29 elif prefix is None: 30 # Not really namespaced. 31 obj = unicode.__new__(cls, name) 32 else: 33 obj = unicode.__new__(cls, prefix + ":" + name) 34 obj.prefix = prefix 35 obj.name = name 36 obj.namespace = namespace 37 return obj 38 39class AttributeValueWithCharsetSubstitution(unicode): 40 """A stand-in object for a character encoding specified in HTML.""" 41 42class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 43 """A generic stand-in for the value of a meta tag's 'charset' attribute. 44 45 When Beautiful Soup parses the markup '<meta charset="utf8">', the 46 value of the 'charset' attribute will be one of these objects. 47 """ 48 49 def __new__(cls, original_value): 50 obj = unicode.__new__(cls, original_value) 51 obj.original_value = original_value 52 return obj 53 54 def encode(self, encoding): 55 return encoding 56 57 58class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): 59 """A generic stand-in for the value of a meta tag's 'content' attribute. 60 61 When Beautiful Soup parses the markup: 62 <meta http-equiv="content-type" content="text/html; charset=utf8"> 63 64 The value of the 'content' attribute will be one of these objects. 65 """ 66 67 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 68 69 def __new__(cls, original_value): 70 match = cls.CHARSET_RE.search(original_value) 71 if match is None: 72 # No substitution necessary. 73 return unicode.__new__(unicode, original_value) 74 75 obj = unicode.__new__(cls, original_value) 76 obj.original_value = original_value 77 return obj 78 79 def encode(self, encoding): 80 def rewrite(match): 81 return match.group(1) + encoding 82 return self.CHARSET_RE.sub(rewrite, self.original_value) 83 84class HTMLAwareEntitySubstitution(EntitySubstitution): 85 86 """Entity substitution rules that are aware of some HTML quirks. 87 88 Specifically, the contents of <script> and <style> tags should not 89 undergo entity substitution. 90 91 Incoming NavigableString objects are checked to see if they're the 92 direct children of a <script> or <style> tag. 93 """ 94 95 cdata_containing_tags = set(["script", "style"]) 96 97 preformatted_tags = set(["pre"]) 98 99 @classmethod 100 def _substitute_if_appropriate(cls, ns, f): 101 if (isinstance(ns, NavigableString) 102 and ns.parent is not None 103 and ns.parent.name in cls.cdata_containing_tags): 104 # Do nothing. 105 return ns 106 # Substitute. 107 return f(ns) 108 109 @classmethod 110 def substitute_html(cls, ns): 111 return cls._substitute_if_appropriate( 112 ns, EntitySubstitution.substitute_html) 113 114 @classmethod 115 def substitute_xml(cls, ns): 116 return cls._substitute_if_appropriate( 117 ns, EntitySubstitution.substitute_xml) 118 119class PageElement(object): 120 """Contains the navigational information for some part of the page 121 (either a tag or a piece of text)""" 122 123 # There are five possible values for the "formatter" argument passed in 124 # to methods like encode() and prettify(): 125 # 126 # "html" - All Unicode characters with corresponding HTML entities 127 # are converted to those entities on output. 128 # "minimal" - Bare ampersands and angle brackets are converted to 129 # XML entities: & < > 130 # None - The null formatter. Unicode characters are never 131 # converted to entities. This is not recommended, but it's 132 # faster than "minimal". 133 # A function - This function will be called on every string that 134 # needs to undergo entity substitution. 135 # 136 137 # In an HTML document, the default "html" and "minimal" functions 138 # will leave the contents of <script> and <style> tags alone. For 139 # an XML document, all tags will be given the same treatment. 140 141 HTML_FORMATTERS = { 142 "html" : HTMLAwareEntitySubstitution.substitute_html, 143 "minimal" : HTMLAwareEntitySubstitution.substitute_xml, 144 None : None 145 } 146 147 XML_FORMATTERS = { 148 "html" : EntitySubstitution.substitute_html, 149 "minimal" : EntitySubstitution.substitute_xml, 150 None : None 151 } 152 153 def format_string(self, s, formatter='minimal'): 154 """Format the given string using the given formatter.""" 155 if not callable(formatter): 156 formatter = self._formatter_for_name(formatter) 157 if formatter is None: 158 output = s 159 else: 160 output = formatter(s) 161 return output 162 163 @property 164 def _is_xml(self): 165 """Is this element part of an XML tree or an HTML tree? 166 167 This is used when mapping a formatter name ("minimal") to an 168 appropriate function (one that performs entity-substitution on 169 the contents of <script> and <style> tags, or not). It's 170 inefficient, but it should be called very rarely. 171 """ 172 if self.parent is None: 173 # This is the top-level object. It should have .is_xml set 174 # from tree creation. If not, take a guess--BS is usually 175 # used on HTML markup. 176 return getattr(self, 'is_xml', False) 177 return self.parent._is_xml 178 179 def _formatter_for_name(self, name): 180 "Look up a formatter function based on its name and the tree." 181 if self._is_xml: 182 return self.XML_FORMATTERS.get( 183 name, EntitySubstitution.substitute_xml) 184 else: 185 return self.HTML_FORMATTERS.get( 186 name, HTMLAwareEntitySubstitution.substitute_xml) 187 188 def setup(self, parent=None, previous_element=None): 189 """Sets up the initial relations between this element and 190 other elements.""" 191 self.parent = parent 192 self.previous_element = previous_element 193 if previous_element is not None: 194 self.previous_element.next_element = self 195 self.next_element = None 196 self.previous_sibling = None 197 self.next_sibling = None 198 if self.parent is not None and self.parent.contents: 199 self.previous_sibling = self.parent.contents[-1] 200 self.previous_sibling.next_sibling = self 201 202 nextSibling = _alias("next_sibling") # BS3 203 previousSibling = _alias("previous_sibling") # BS3 204 205 def replace_with(self, replace_with): 206 if replace_with is self: 207 return 208 if replace_with is self.parent: 209 raise ValueError("Cannot replace a Tag with its parent.") 210 old_parent = self.parent 211 my_index = self.parent.index(self) 212 self.extract() 213 old_parent.insert(my_index, replace_with) 214 return self 215 replaceWith = replace_with # BS3 216 217 def unwrap(self): 218 my_parent = self.parent 219 my_index = self.parent.index(self) 220 self.extract() 221 for child in reversed(self.contents[:]): 222 my_parent.insert(my_index, child) 223 return self 224 replace_with_children = unwrap 225 replaceWithChildren = unwrap # BS3 226 227 def wrap(self, wrap_inside): 228 me = self.replace_with(wrap_inside) 229 wrap_inside.append(me) 230 return wrap_inside 231 232 def extract(self): 233 """Destructively rips this element out of the tree.""" 234 if self.parent is not None: 235 del self.parent.contents[self.parent.index(self)] 236 237 #Find the two elements that would be next to each other if 238 #this element (and any children) hadn't been parsed. Connect 239 #the two. 240 last_child = self._last_descendant() 241 next_element = last_child.next_element 242 243 if self.previous_element is not None: 244 self.previous_element.next_element = next_element 245 if next_element is not None: 246 next_element.previous_element = self.previous_element 247 self.previous_element = None 248 last_child.next_element = None 249 250 self.parent = None 251 if self.previous_sibling is not None: 252 self.previous_sibling.next_sibling = self.next_sibling 253 if self.next_sibling is not None: 254 self.next_sibling.previous_sibling = self.previous_sibling 255 self.previous_sibling = self.next_sibling = None 256 return self 257 258 def _last_descendant(self, is_initialized=True, accept_self=True): 259 "Finds the last element beneath this object to be parsed." 260 if is_initialized and self.next_sibling: 261 last_child = self.next_sibling.previous_element 262 else: 263 last_child = self 264 while isinstance(last_child, Tag) and last_child.contents: 265 last_child = last_child.contents[-1] 266 if not accept_self and last_child == self: 267 last_child = None 268 return last_child 269 # BS3: Not part of the API! 270 _lastRecursiveChild = _last_descendant 271 272 def insert(self, position, new_child): 273 if new_child is self: 274 raise ValueError("Cannot insert a tag into itself.") 275 if (isinstance(new_child, basestring) 276 and not isinstance(new_child, NavigableString)): 277 new_child = NavigableString(new_child) 278 279 position = min(position, len(self.contents)) 280 if hasattr(new_child, 'parent') and new_child.parent is not None: 281 # We're 'inserting' an element that's already one 282 # of this object's children. 283 if new_child.parent is self: 284 current_index = self.index(new_child) 285 if current_index < position: 286 # We're moving this element further down the list 287 # of this object's children. That means that when 288 # we extract this element, our target index will 289 # jump down one. 290 position -= 1 291 new_child.extract() 292 293 new_child.parent = self 294 previous_child = None 295 if position == 0: 296 new_child.previous_sibling = None 297 new_child.previous_element = self 298 else: 299 previous_child = self.contents[position - 1] 300 new_child.previous_sibling = previous_child 301 new_child.previous_sibling.next_sibling = new_child 302 new_child.previous_element = previous_child._last_descendant(False) 303 if new_child.previous_element is not None: 304 new_child.previous_element.next_element = new_child 305 306 new_childs_last_element = new_child._last_descendant(False) 307 308 if position >= len(self.contents): 309 new_child.next_sibling = None 310 311 parent = self 312 parents_next_sibling = None 313 while parents_next_sibling is None and parent is not None: 314 parents_next_sibling = parent.next_sibling 315 parent = parent.parent 316 if parents_next_sibling is not None: 317 # We found the element that comes next in the document. 318 break 319 if parents_next_sibling is not None: 320 new_childs_last_element.next_element = parents_next_sibling 321 else: 322 # The last element of this tag is the last element in 323 # the document. 324 new_childs_last_element.next_element = None 325 else: 326 next_child = self.contents[position] 327 new_child.next_sibling = next_child 328 if new_child.next_sibling is not None: 329 new_child.next_sibling.previous_sibling = new_child 330 new_childs_last_element.next_element = next_child 331 332 if new_childs_last_element.next_element is not None: 333 new_childs_last_element.next_element.previous_element = new_childs_last_element 334 self.contents.insert(position, new_child) 335 336 def append(self, tag): 337 """Appends the given tag to the contents of this tag.""" 338 self.insert(len(self.contents), tag) 339 340 def insert_before(self, predecessor): 341 """Makes the given element the immediate predecessor of this one. 342 343 The two elements will have the same parent, and the given element 344 will be immediately before this one. 345 """ 346 if self is predecessor: 347 raise ValueError("Can't insert an element before itself.") 348 parent = self.parent 349 if parent is None: 350 raise ValueError( 351 "Element has no parent, so 'before' has no meaning.") 352 # Extract first so that the index won't be screwed up if they 353 # are siblings. 354 if isinstance(predecessor, PageElement): 355 predecessor.extract() 356 index = parent.index(self) 357 parent.insert(index, predecessor) 358 359 def insert_after(self, successor): 360 """Makes the given element the immediate successor of this one. 361 362 The two elements will have the same parent, and the given element 363 will be immediately after this one. 364 """ 365 if self is successor: 366 raise ValueError("Can't insert an element after itself.") 367 parent = self.parent 368 if parent is None: 369 raise ValueError( 370 "Element has no parent, so 'after' has no meaning.") 371 # Extract first so that the index won't be screwed up if they 372 # are siblings. 373 if isinstance(successor, PageElement): 374 successor.extract() 375 index = parent.index(self) 376 parent.insert(index+1, successor) 377 378 def find_next(self, name=None, attrs={}, text=None, **kwargs): 379 """Returns the first item that matches the given criteria and 380 appears after this Tag in the document.""" 381 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 382 findNext = find_next # BS3 383 384 def find_all_next(self, name=None, attrs={}, text=None, limit=None, 385 **kwargs): 386 """Returns all items that match the given criteria and appear 387 after this Tag in the document.""" 388 return self._find_all(name, attrs, text, limit, self.next_elements, 389 **kwargs) 390 findAllNext = find_all_next # BS3 391 392 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): 393 """Returns the closest sibling to this Tag that matches the 394 given criteria and appears after this Tag in the document.""" 395 return self._find_one(self.find_next_siblings, name, attrs, text, 396 **kwargs) 397 findNextSibling = find_next_sibling # BS3 398 399 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, 400 **kwargs): 401 """Returns the siblings of this Tag that match the given 402 criteria and appear after this Tag in the document.""" 403 return self._find_all(name, attrs, text, limit, 404 self.next_siblings, **kwargs) 405 findNextSiblings = find_next_siblings # BS3 406 fetchNextSiblings = find_next_siblings # BS2 407 408 def find_previous(self, name=None, attrs={}, text=None, **kwargs): 409 """Returns the first item that matches the given criteria and 410 appears before this Tag in the document.""" 411 return self._find_one( 412 self.find_all_previous, name, attrs, text, **kwargs) 413 findPrevious = find_previous # BS3 414 415 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, 416 **kwargs): 417 """Returns all items that match the given criteria and appear 418 before this Tag in the document.""" 419 return self._find_all(name, attrs, text, limit, self.previous_elements, 420 **kwargs) 421 findAllPrevious = find_all_previous # BS3 422 fetchPrevious = find_all_previous # BS2 423 424 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): 425 """Returns the closest sibling to this Tag that matches the 426 given criteria and appears before this Tag in the document.""" 427 return self._find_one(self.find_previous_siblings, name, attrs, text, 428 **kwargs) 429 findPreviousSibling = find_previous_sibling # BS3 430 431 def find_previous_siblings(self, name=None, attrs={}, text=None, 432 limit=None, **kwargs): 433 """Returns the siblings of this Tag that match the given 434 criteria and appear before this Tag in the document.""" 435 return self._find_all(name, attrs, text, limit, 436 self.previous_siblings, **kwargs) 437 findPreviousSiblings = find_previous_siblings # BS3 438 fetchPreviousSiblings = find_previous_siblings # BS2 439 440 def find_parent(self, name=None, attrs={}, **kwargs): 441 """Returns the closest parent of this Tag that matches the given 442 criteria.""" 443 # NOTE: We can't use _find_one because findParents takes a different 444 # set of arguments. 445 r = None 446 l = self.find_parents(name, attrs, 1, **kwargs) 447 if l: 448 r = l[0] 449 return r 450 findParent = find_parent # BS3 451 452 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 453 """Returns the parents of this Tag that match the given 454 criteria.""" 455 456 return self._find_all(name, attrs, None, limit, self.parents, 457 **kwargs) 458 findParents = find_parents # BS3 459 fetchParents = find_parents # BS2 460 461 @property 462 def next(self): 463 return self.next_element 464 465 @property 466 def previous(self): 467 return self.previous_element 468 469 #These methods do the real heavy lifting. 470 471 def _find_one(self, method, name, attrs, text, **kwargs): 472 r = None 473 l = method(name, attrs, text, 1, **kwargs) 474 if l: 475 r = l[0] 476 return r 477 478 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 479 "Iterates over a generator looking for things that match." 480 481 if isinstance(name, SoupStrainer): 482 strainer = name 483 else: 484 strainer = SoupStrainer(name, attrs, text, **kwargs) 485 486 if text is None and not limit and not attrs and not kwargs: 487 if name is True or name is None: 488 # Optimization to find all tags. 489 result = (element for element in generator 490 if isinstance(element, Tag)) 491 return ResultSet(strainer, result) 492 elif isinstance(name, basestring): 493 # Optimization to find all tags with a given name. 494 result = (element for element in generator 495 if isinstance(element, Tag) 496 and element.name == name) 497 return ResultSet(strainer, result) 498 results = ResultSet(strainer) 499 while True: 500 try: 501 i = next(generator) 502 except StopIteration: 503 break 504 if i: 505 found = strainer.search(i) 506 if found: 507 results.append(found) 508 if limit and len(results) >= limit: 509 break 510 return results 511 512 #These generators can be used to navigate starting from both 513 #NavigableStrings and Tags. 514 @property 515 def next_elements(self): 516 i = self.next_element 517 while i is not None: 518 yield i 519 i = i.next_element 520 521 @property 522 def next_siblings(self): 523 i = self.next_sibling 524 while i is not None: 525 yield i 526 i = i.next_sibling 527 528 @property 529 def previous_elements(self): 530 i = self.previous_element 531 while i is not None: 532 yield i 533 i = i.previous_element 534 535 @property 536 def previous_siblings(self): 537 i = self.previous_sibling 538 while i is not None: 539 yield i 540 i = i.previous_sibling 541 542 @property 543 def parents(self): 544 i = self.parent 545 while i is not None: 546 yield i 547 i = i.parent 548 549 # Methods for supporting CSS selectors. 550 551 tag_name_re = re.compile('^[a-z0-9]+$') 552 553 # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 554 # \---/ \---/\-------------/ \-------/ 555 # | | | | 556 # | | | The value 557 # | | ~,|,^,$,* or = 558 # | Attribute 559 # Tag 560 attribselect_re = re.compile( 561 r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + 562 r'=?"?(?P<value>[^\]"]*)"?\]$' 563 ) 564 565 def _attr_value_as_string(self, value, default=None): 566 """Force an attribute value into a string representation. 567 568 A multi-valued attribute will be converted into a 569 space-separated stirng. 570 """ 571 value = self.get(value, default) 572 if isinstance(value, list) or isinstance(value, tuple): 573 value =" ".join(value) 574 return value 575 576 def _tag_name_matches_and(self, function, tag_name): 577 if not tag_name: 578 return function 579 else: 580 def _match(tag): 581 return tag.name == tag_name and function(tag) 582 return _match 583 584 def _attribute_checker(self, operator, attribute, value=''): 585 """Create a function that performs a CSS selector operation. 586 587 Takes an operator, attribute and optional value. Returns a 588 function that will return True for elements that match that 589 combination. 590 """ 591 if operator == '=': 592 # string representation of `attribute` is equal to `value` 593 return lambda el: el._attr_value_as_string(attribute) == value 594 elif operator == '~': 595 # space-separated list representation of `attribute` 596 # contains `value` 597 def _includes_value(element): 598 attribute_value = element.get(attribute, []) 599 if not isinstance(attribute_value, list): 600 attribute_value = attribute_value.split() 601 return value in attribute_value 602 return _includes_value 603 elif operator == '^': 604 # string representation of `attribute` starts with `value` 605 return lambda el: el._attr_value_as_string( 606 attribute, '').startswith(value) 607 elif operator == '$': 608 # string represenation of `attribute` ends with `value` 609 return lambda el: el._attr_value_as_string( 610 attribute, '').endswith(value) 611 elif operator == '*': 612 # string representation of `attribute` contains `value` 613 return lambda el: value in el._attr_value_as_string(attribute, '') 614 elif operator == '|': 615 # string representation of `attribute` is either exactly 616 # `value` or starts with `value` and then a dash. 617 def _is_or_starts_with_dash(element): 618 attribute_value = element._attr_value_as_string(attribute, '') 619 return (attribute_value == value or attribute_value.startswith( 620 value + '-')) 621 return _is_or_starts_with_dash 622 else: 623 return lambda el: el.has_attr(attribute) 624 625 # Old non-property versions of the generators, for backwards 626 # compatibility with BS3. 627 def nextGenerator(self): 628 return self.next_elements 629 630 def nextSiblingGenerator(self): 631 return self.next_siblings 632 633 def previousGenerator(self): 634 return self.previous_elements 635 636 def previousSiblingGenerator(self): 637 return self.previous_siblings 638 639 def parentGenerator(self): 640 return self.parents 641 642 643class NavigableString(unicode, PageElement): 644 645 PREFIX = '' 646 SUFFIX = '' 647 648 def __new__(cls, value): 649 """Create a new NavigableString. 650 651 When unpickling a NavigableString, this method is called with 652 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 653 passed in to the superclass's __new__ or the superclass won't know 654 how to handle non-ASCII characters. 655 """ 656 if isinstance(value, unicode): 657 return unicode.__new__(cls, value) 658 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 659 660 def __copy__(self): 661 return self 662 663 def __getnewargs__(self): 664 return (unicode(self),) 665 666 def __getattr__(self, attr): 667 """text.string gives you text. This is for backwards 668 compatibility for Navigable*String, but for CData* it lets you 669 get the string without the CData wrapper.""" 670 if attr == 'string': 671 return self 672 else: 673 raise AttributeError( 674 "'%s' object has no attribute '%s'" % ( 675 self.__class__.__name__, attr)) 676 677 def output_ready(self, formatter="minimal"): 678 output = self.format_string(self, formatter) 679 return self.PREFIX + output + self.SUFFIX 680 681 @property 682 def name(self): 683 return None 684 685 @name.setter 686 def name(self, name): 687 raise AttributeError("A NavigableString cannot be given a name.") 688 689class PreformattedString(NavigableString): 690 """A NavigableString not subject to the normal formatting rules. 691 692 The string will be passed into the formatter (to trigger side effects), 693 but the return value will be ignored. 694 """ 695 696 def output_ready(self, formatter="minimal"): 697 """CData strings are passed into the formatter. 698 But the return value is ignored.""" 699 self.format_string(self, formatter) 700 return self.PREFIX + self + self.SUFFIX 701 702class CData(PreformattedString): 703 704 PREFIX = u'<![CDATA[' 705 SUFFIX = u']]>' 706 707class ProcessingInstruction(PreformattedString): 708 709 PREFIX = u'<?' 710 SUFFIX = u'?>' 711 712class Comment(PreformattedString): 713 714 PREFIX = u'<!--' 715 SUFFIX = u'-->' 716 717 718class Declaration(PreformattedString): 719 PREFIX = u'<!' 720 SUFFIX = u'!>' 721 722 723class Doctype(PreformattedString): 724 725 @classmethod 726 def for_name_and_ids(cls, name, pub_id, system_id): 727 value = name or '' 728 if pub_id is not None: 729 value += ' PUBLIC "%s"' % pub_id 730 if system_id is not None: 731 value += ' "%s"' % system_id 732 elif system_id is not None: 733 value += ' SYSTEM "%s"' % system_id 734 735 return Doctype(value) 736 737 PREFIX = u'<!DOCTYPE ' 738 SUFFIX = u'>\n' 739 740 741class Tag(PageElement): 742 743 """Represents a found HTML tag with its attributes and contents.""" 744 745 def __init__(self, parser=None, builder=None, name=None, namespace=None, 746 prefix=None, attrs=None, parent=None, previous=None): 747 "Basic constructor." 748 749 if parser is None: 750 self.parser_class = None 751 else: 752 # We don't actually store the parser object: that lets extracted 753 # chunks be garbage-collected. 754 self.parser_class = parser.__class__ 755 if name is None: 756 raise ValueError("No value provided for new tag's name.") 757 self.name = name 758 self.namespace = namespace 759 self.prefix = prefix 760 if attrs is None: 761 attrs = {} 762 elif attrs and builder.cdata_list_attributes: 763 attrs = builder._replace_cdata_list_attribute_values( 764 self.name, attrs) 765 else: 766 attrs = dict(attrs) 767 self.attrs = attrs 768 self.contents = [] 769 self.setup(parent, previous) 770 self.hidden = False 771 772 # Set up any substitutions, such as the charset in a META tag. 773 if builder is not None: 774 builder.set_up_substitutions(self) 775 self.can_be_empty_element = builder.can_be_empty_element(name) 776 else: 777 self.can_be_empty_element = False 778 779 parserClass = _alias("parser_class") # BS3 780 781 @property 782 def is_empty_element(self): 783 """Is this tag an empty-element tag? (aka a self-closing tag) 784 785 A tag that has contents is never an empty-element tag. 786 787 A tag that has no contents may or may not be an empty-element 788 tag. It depends on the builder used to create the tag. If the 789 builder has a designated list of empty-element tags, then only 790 a tag whose name shows up in that list is considered an 791 empty-element tag. 792 793 If the builder has no designated list of empty-element tags, 794 then any tag with no contents is an empty-element tag. 795 """ 796 return len(self.contents) == 0 and self.can_be_empty_element 797 isSelfClosing = is_empty_element # BS3 798 799 @property 800 def string(self): 801 """Convenience property to get the single string within this tag. 802 803 :Return: If this tag has a single string child, return value 804 is that string. If this tag has no children, or more than one 805 child, return value is None. If this tag has one child tag, 806 return value is the 'string' attribute of the child tag, 807 recursively. 808 """ 809 if len(self.contents) != 1: 810 return None 811 child = self.contents[0] 812 if isinstance(child, NavigableString): 813 return child 814 return child.string 815 816 @string.setter 817 def string(self, string): 818 self.clear() 819 self.append(string.__class__(string)) 820 821 def _all_strings(self, strip=False, types=(NavigableString, CData)): 822 """Yield all strings of certain classes, possibly stripping them. 823 824 By default, yields only NavigableString and CData objects. So 825 no comments, processing instructions, etc. 826 """ 827 for descendant in self.descendants: 828 if ( 829 (types is None and not isinstance(descendant, NavigableString)) 830 or 831 (types is not None and type(descendant) not in types)): 832 continue 833 if strip: 834 descendant = descendant.strip() 835 if len(descendant) == 0: 836 continue 837 yield descendant 838 839 strings = property(_all_strings) 840 841 @property 842 def stripped_strings(self): 843 for string in self._all_strings(True): 844 yield string 845 846 def get_text(self, separator=u"", strip=False, 847 types=(NavigableString, CData)): 848 """ 849 Get all child strings, concatenated using the given separator. 850 """ 851 return separator.join([s for s in self._all_strings( 852 strip, types=types)]) 853 getText = get_text 854 text = property(get_text) 855 856 def decompose(self): 857 """Recursively destroys the contents of this tree.""" 858 self.extract() 859 i = self 860 while i is not None: 861 next = i.next_element 862 i.__dict__.clear() 863 i.contents = [] 864 i = next 865 866 def clear(self, decompose=False): 867 """ 868 Extract all children. If decompose is True, decompose instead. 869 """ 870 if decompose: 871 for element in self.contents[:]: 872 if isinstance(element, Tag): 873 element.decompose() 874 else: 875 element.extract() 876 else: 877 for element in self.contents[:]: 878 element.extract() 879 880 def index(self, element): 881 """ 882 Find the index of a child by identity, not value. Avoids issues with 883 tag.contents.index(element) getting the index of equal elements. 884 """ 885 for i, child in enumerate(self.contents): 886 if child is element: 887 return i 888 raise ValueError("Tag.index: element not in tag") 889 890 def get(self, key, default=None): 891 """Returns the value of the 'key' attribute for the tag, or 892 the value given for 'default' if it doesn't have that 893 attribute.""" 894 return self.attrs.get(key, default) 895 896 def has_attr(self, key): 897 return key in self.attrs 898 899 def __hash__(self): 900 return str(self).__hash__() 901 902 def __getitem__(self, key): 903 """tag[key] returns the value of the 'key' attribute for the tag, 904 and throws an exception if it's not there.""" 905 return self.attrs[key] 906 907 def __iter__(self): 908 "Iterating over a tag iterates over its contents." 909 return iter(self.contents) 910 911 def __len__(self): 912 "The length of a tag is the length of its list of contents." 913 return len(self.contents) 914 915 def __contains__(self, x): 916 return x in self.contents 917 918 def __nonzero__(self): 919 "A tag is non-None even if it has no contents." 920 return True 921 922 def __setitem__(self, key, value): 923 """Setting tag[key] sets the value of the 'key' attribute for the 924 tag.""" 925 self.attrs[key] = value 926 927 def __delitem__(self, key): 928 "Deleting tag[key] deletes all 'key' attributes for the tag." 929 self.attrs.pop(key, None) 930 931 def __call__(self, *args, **kwargs): 932 """Calling a tag like a function is the same as calling its 933 find_all() method. Eg. tag('a') returns a list of all the A tags 934 found within this tag.""" 935 return self.find_all(*args, **kwargs) 936 937 def __getattr__(self, tag): 938 #print "Getattr %s.%s" % (self.__class__, tag) 939 if len(tag) > 3 and tag.endswith('Tag'): 940 # BS3: soup.aTag -> "soup.find("a") 941 tag_name = tag[:-3] 942 warnings.warn( 943 '.%sTag is deprecated, use .find("%s") instead.' % ( 944 tag_name, tag_name)) 945 return self.find(tag_name) 946 # We special case contents to avoid recursion. 947 elif not tag.startswith("__") and not tag=="contents": 948 return self.find(tag) 949 raise AttributeError( 950 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 951 952 def __eq__(self, other): 953 """Returns true iff this tag has the same name, the same attributes, 954 and the same contents (recursively) as the given tag.""" 955 if self is other: 956 return True 957 if (not hasattr(other, 'name') or 958 not hasattr(other, 'attrs') or 959 not hasattr(other, 'contents') or 960 self.name != other.name or 961 self.attrs != other.attrs or 962 len(self) != len(other)): 963 return False 964 for i, my_child in enumerate(self.contents): 965 if my_child != other.contents[i]: 966 return False 967 return True 968 969 def __ne__(self, other): 970 """Returns true iff this tag is not identical to the other tag, 971 as defined in __eq__.""" 972 return not self == other 973 974 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 975 """Renders this tag as a string.""" 976 return self.encode(encoding) 977 978 def __unicode__(self): 979 return self.decode() 980 981 def __str__(self): 982 return self.encode() 983 984 if PY3K: 985 __str__ = __repr__ = __unicode__ 986 987 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 988 indent_level=None, formatter="minimal", 989 errors="xmlcharrefreplace"): 990 # Turn the data structure into Unicode, then encode the 991 # Unicode. 992 u = self.decode(indent_level, encoding, formatter) 993 return u.encode(encoding, errors) 994 995 def _should_pretty_print(self, indent_level): 996 """Should this tag be pretty-printed?""" 997 return ( 998 indent_level is not None and 999 (self.name not in HTMLAwareEntitySubstitution.preformatted_tags 1000 or self._is_xml)) 1001 1002 def decode(self, indent_level=None, 1003 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1004 formatter="minimal"): 1005 """Returns a Unicode representation of this tag and its contents. 1006 1007 :param eventual_encoding: The tag is destined to be 1008 encoded into this encoding. This method is _not_ 1009 responsible for performing that encoding. This information 1010 is passed in so that it can be substituted in if the 1011 document contains a <META> tag that mentions the document's 1012 encoding. 1013 """ 1014 1015 # First off, turn a string formatter into a function. This 1016 # will stop the lookup from happening over and over again. 1017 if not callable(formatter): 1018 formatter = self._formatter_for_name(formatter) 1019 1020 attrs = [] 1021 if self.attrs: 1022 for key, val in sorted(self.attrs.items()): 1023 if val is None: 1024 decoded = key 1025 else: 1026 if isinstance(val, list) or isinstance(val, tuple): 1027 val = ' '.join(val) 1028 elif not isinstance(val, basestring): 1029 val = unicode(val) 1030 elif ( 1031 isinstance(val, AttributeValueWithCharsetSubstitution) 1032 and eventual_encoding is not None): 1033 val = val.encode(eventual_encoding) 1034 1035 text = self.format_string(val, formatter) 1036 decoded = ( 1037 unicode(key) + '=' 1038 + EntitySubstitution.quoted_attribute_value(text)) 1039 attrs.append(decoded) 1040 close = '' 1041 closeTag = '' 1042 1043 prefix = '' 1044 if self.prefix: 1045 prefix = self.prefix + ":" 1046 1047 if self.is_empty_element: 1048 close = '/' 1049 else: 1050 closeTag = '</%s%s>' % (prefix, self.name) 1051 1052 pretty_print = self._should_pretty_print(indent_level) 1053 space = '' 1054 indent_space = '' 1055 if indent_level is not None: 1056 indent_space = (' ' * (indent_level - 1)) 1057 if pretty_print: 1058 space = indent_space 1059 indent_contents = indent_level + 1 1060 else: 1061 indent_contents = None 1062 contents = self.decode_contents( 1063 indent_contents, eventual_encoding, formatter) 1064 1065 if self.hidden: 1066 # This is the 'document root' object. 1067 s = contents 1068 else: 1069 s = [] 1070 attribute_string = '' 1071 if attrs: 1072 attribute_string = ' ' + ' '.join(attrs) 1073 if indent_level is not None: 1074 # Even if this particular tag is not pretty-printed, 1075 # we should indent up to the start of the tag. 1076 s.append(indent_space) 1077 s.append('<%s%s%s%s>' % ( 1078 prefix, self.name, attribute_string, close)) 1079 if pretty_print: 1080 s.append("\n") 1081 s.append(contents) 1082 if pretty_print and contents and contents[-1] != "\n": 1083 s.append("\n") 1084 if pretty_print and closeTag: 1085 s.append(space) 1086 s.append(closeTag) 1087 if indent_level is not None and closeTag and self.next_sibling: 1088 # Even if this particular tag is not pretty-printed, 1089 # we're now done with the tag, and we should add a 1090 # newline if appropriate. 1091 s.append("\n") 1092 s = ''.join(s) 1093 return s 1094 1095 def prettify(self, encoding=None, formatter="minimal"): 1096 if encoding is None: 1097 return self.decode(True, formatter=formatter) 1098 else: 1099 return self.encode(encoding, True, formatter=formatter) 1100 1101 def decode_contents(self, indent_level=None, 1102 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1103 formatter="minimal"): 1104 """Renders the contents of this tag as a Unicode string. 1105 1106 :param eventual_encoding: The tag is destined to be 1107 encoded into this encoding. This method is _not_ 1108 responsible for performing that encoding. This information 1109 is passed in so that it can be substituted in if the 1110 document contains a <META> tag that mentions the document's 1111 encoding. 1112 """ 1113 # First off, turn a string formatter into a function. This 1114 # will stop the lookup from happening over and over again. 1115 if not callable(formatter): 1116 formatter = self._formatter_for_name(formatter) 1117 1118 pretty_print = (indent_level is not None) 1119 s = [] 1120 for c in self: 1121 text = None 1122 if isinstance(c, NavigableString): 1123 text = c.output_ready(formatter) 1124 elif isinstance(c, Tag): 1125 s.append(c.decode(indent_level, eventual_encoding, 1126 formatter)) 1127 if text and indent_level and not self.name == 'pre': 1128 text = text.strip() 1129 if text: 1130 if pretty_print and not self.name == 'pre': 1131 s.append(" " * (indent_level - 1)) 1132 s.append(text) 1133 if pretty_print and not self.name == 'pre': 1134 s.append("\n") 1135 return ''.join(s) 1136 1137 def encode_contents( 1138 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1139 formatter="minimal"): 1140 """Renders the contents of this tag as a bytestring.""" 1141 contents = self.decode_contents(indent_level, encoding, formatter) 1142 return contents.encode(encoding) 1143 1144 # Old method for BS3 compatibility 1145 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1146 prettyPrint=False, indentLevel=0): 1147 if not prettyPrint: 1148 indentLevel = None 1149 return self.encode_contents( 1150 indent_level=indentLevel, encoding=encoding) 1151 1152 #Soup methods 1153 1154 def find(self, name=None, attrs={}, recursive=True, text=None, 1155 **kwargs): 1156 """Return only the first child of this Tag matching the given 1157 criteria.""" 1158 r = None 1159 l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 1160 if l: 1161 r = l[0] 1162 return r 1163 findChild = find 1164 1165 def find_all(self, name=None, attrs={}, recursive=True, text=None, 1166 limit=None, **kwargs): 1167 """Extracts a list of Tag objects that match the given 1168 criteria. You can specify the name of the Tag and any 1169 attributes you want the Tag to have. 1170 1171 The value of a key-value pair in the 'attrs' map can be a 1172 string, a list of strings, a regular expression object, or a 1173 callable that takes a string and returns whether or not the 1174 string matches for some custom definition of 'matches'. The 1175 same is true of the tag name.""" 1176 1177 generator = self.descendants 1178 if not recursive: 1179 generator = self.children 1180 return self._find_all(name, attrs, text, limit, generator, **kwargs) 1181 findAll = find_all # BS3 1182 findChildren = find_all # BS2 1183 1184 #Generator methods 1185 @property 1186 def children(self): 1187 # return iter() to make the purpose of the method clear 1188 return iter(self.contents) # XXX This seems to be untested. 1189 1190 @property 1191 def descendants(self): 1192 if not len(self.contents): 1193 return 1194 stopNode = self._last_descendant().next_element 1195 current = self.contents[0] 1196 while current is not stopNode: 1197 yield current 1198 current = current.next_element 1199 1200 # CSS selector code 1201 1202 _selector_combinators = ['>', '+', '~'] 1203 _select_debug = False 1204 def select(self, selector, _candidate_generator=None): 1205 """Perform a CSS selection operation on the current element.""" 1206 tokens = selector.split() 1207 current_context = [self] 1208 1209 if tokens[-1] in self._selector_combinators: 1210 raise ValueError( 1211 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1212 if self._select_debug: 1213 print 'Running CSS selector "%s"' % selector 1214 for index, token in enumerate(tokens): 1215 if self._select_debug: 1216 print ' Considering token "%s"' % token 1217 recursive_candidate_generator = None 1218 tag_name = None 1219 if tokens[index-1] in self._selector_combinators: 1220 # This token was consumed by the previous combinator. Skip it. 1221 if self._select_debug: 1222 print ' Token was consumed by the previous combinator.' 1223 continue 1224 # Each operation corresponds to a checker function, a rule 1225 # for determining whether a candidate matches the 1226 # selector. Candidates are generated by the active 1227 # iterator. 1228 checker = None 1229 1230 m = self.attribselect_re.match(token) 1231 if m is not None: 1232 # Attribute selector 1233 tag_name, attribute, operator, value = m.groups() 1234 checker = self._attribute_checker(operator, attribute, value) 1235 1236 elif '#' in token: 1237 # ID selector 1238 tag_name, tag_id = token.split('#', 1) 1239 def id_matches(tag): 1240 return tag.get('id', None) == tag_id 1241 checker = id_matches 1242 1243 elif '.' in token: 1244 # Class selector 1245 tag_name, klass = token.split('.', 1) 1246 classes = set(klass.split('.')) 1247 def classes_match(candidate): 1248 return classes.issubset(candidate.get('class', [])) 1249 checker = classes_match 1250 1251 elif ':' in token: 1252 # Pseudo-class 1253 tag_name, pseudo = token.split(':', 1) 1254 if tag_name == '': 1255 raise ValueError( 1256 "A pseudo-class must be prefixed with a tag name.") 1257 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1258 found = [] 1259 if pseudo_attributes is not None: 1260 pseudo_type, pseudo_value = pseudo_attributes.groups() 1261 if pseudo_type == 'nth-of-type': 1262 try: 1263 pseudo_value = int(pseudo_value) 1264 except: 1265 raise NotImplementedError( 1266 'Only numeric values are currently supported for the nth-of-type pseudo-class.') 1267 if pseudo_value < 1: 1268 raise ValueError( 1269 'nth-of-type pseudo-class value must be at least 1.') 1270 class Counter(object): 1271 def __init__(self, destination): 1272 self.count = 0 1273 self.destination = destination 1274 1275 def nth_child_of_type(self, tag): 1276 self.count += 1 1277 if self.count == self.destination: 1278 return True 1279 if self.count > self.destination: 1280 # Stop the generator that's sending us 1281 # these things. 1282 raise StopIteration() 1283 return False 1284 checker = Counter(pseudo_value).nth_child_of_type 1285 else: 1286 raise NotImplementedError( 1287 'Only the following pseudo-classes are implemented: nth-of-type.') 1288 1289 elif token == '*': 1290 # Star selector -- matches everything 1291 pass 1292 elif token == '>': 1293 # Run the next token as a CSS selector against the 1294 # direct children of each tag in the current context. 1295 recursive_candidate_generator = lambda tag: tag.children 1296 elif token == '~': 1297 # Run the next token as a CSS selector against the 1298 # siblings of each tag in the current context. 1299 recursive_candidate_generator = lambda tag: tag.next_siblings 1300 elif token == '+': 1301 # For each tag in the current context, run the next 1302 # token as a CSS selector against the tag's next 1303 # sibling that's a tag. 1304 def next_tag_sibling(tag): 1305 yield tag.find_next_sibling(True) 1306 recursive_candidate_generator = next_tag_sibling 1307 1308 elif self.tag_name_re.match(token): 1309 # Just a tag name. 1310 tag_name = token 1311 else: 1312 raise ValueError( 1313 'Unsupported or invalid CSS selector: "%s"' % token) 1314 1315 if recursive_candidate_generator: 1316 # This happens when the selector looks like "> foo". 1317 # 1318 # The generator calls select() recursively on every 1319 # member of the current context, passing in a different 1320 # candidate generator and a different selector. 1321 # 1322 # In the case of "> foo", the candidate generator is 1323 # one that yields a tag's direct children (">"), and 1324 # the selector is "foo". 1325 next_token = tokens[index+1] 1326 def recursive_select(tag): 1327 if self._select_debug: 1328 print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) 1329 print '-' * 40 1330 for i in tag.select(next_token, recursive_candidate_generator): 1331 if self._select_debug: 1332 print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) 1333 yield i 1334 if self._select_debug: 1335 print '-' * 40 1336 _use_candidate_generator = recursive_select 1337 elif _candidate_generator is None: 1338 # By default, a tag's candidates are all of its 1339 # children. If tag_name is defined, only yield tags 1340 # with that name. 1341 if self._select_debug: 1342 if tag_name: 1343 check = "[any]" 1344 else: 1345 check = tag_name 1346 print ' Default candidate generator, tag name="%s"' % check 1347 if self._select_debug: 1348 # This is redundant with later code, but it stops 1349 # a bunch of bogus tags from cluttering up the 1350 # debug log. 1351 def default_candidate_generator(tag): 1352 for child in tag.descendants: 1353 if not isinstance(child, Tag): 1354 continue 1355 if tag_name and not child.name == tag_name: 1356 continue 1357 yield child 1358 _use_candidate_generator = default_candidate_generator 1359 else: 1360 _use_candidate_generator = lambda tag: tag.descendants 1361 else: 1362 _use_candidate_generator = _candidate_generator 1363 1364 new_context = [] 1365 new_context_ids = set([]) 1366 for tag in current_context: 1367 if self._select_debug: 1368 print " Running candidate generator on %s %s" % ( 1369 tag.name, repr(tag.attrs)) 1370 for candidate in _use_candidate_generator(tag): 1371 if not isinstance(candidate, Tag): 1372 continue 1373 if tag_name and candidate.name != tag_name: 1374 continue 1375 if checker is not None: 1376 try: 1377 result = checker(candidate) 1378 except StopIteration: 1379 # The checker has decided we should no longer 1380 # run the generator. 1381 break 1382 if checker is None or result: 1383 if self._select_debug: 1384 print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) 1385 if id(candidate) not in new_context_ids: 1386 # If a tag matches a selector more than once, 1387 # don't include it in the context more than once. 1388 new_context.append(candidate) 1389 new_context_ids.add(id(candidate)) 1390 elif self._select_debug: 1391 print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) 1392 1393 current_context = new_context 1394 1395 if self._select_debug: 1396 print "Final verdict:" 1397 for i in current_context: 1398 print " %s %s" % (i.name, i.attrs) 1399 return current_context 1400 1401 # Old names for backwards compatibility 1402 def childGenerator(self): 1403 return self.children 1404 1405 def recursiveChildGenerator(self): 1406 return self.descendants 1407 1408 def has_key(self, key): 1409 """This was kind of misleading because has_key() (attributes) 1410 was different from __in__ (contents). has_key() is gone in 1411 Python 3, anyway.""" 1412 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 1413 key)) 1414 return self.has_attr(key) 1415 1416# Next, a couple classes to represent queries and their results. 1417class SoupStrainer(object): 1418 """Encapsulates a number of ways of matching a markup element (tag or 1419 text).""" 1420 1421 def __init__(self, name=None, attrs={}, text=None, **kwargs): 1422 self.name = self._normalize_search_value(name) 1423 if not isinstance(attrs, dict): 1424 # Treat a non-dict value for attrs as a search for the 'class' 1425 # attribute. 1426 kwargs['class'] = attrs 1427 attrs = None 1428 1429 if 'class_' in kwargs: 1430 # Treat class_="foo" as a search for the 'class' 1431 # attribute, overriding any non-dict value for attrs. 1432 kwargs['class'] = kwargs['class_'] 1433 del kwargs['class_'] 1434 1435 if kwargs: 1436 if attrs: 1437 attrs = attrs.copy() 1438 attrs.update(kwargs) 1439 else: 1440 attrs = kwargs 1441 normalized_attrs = {} 1442 for key, value in attrs.items(): 1443 normalized_attrs[key] = self._normalize_search_value(value) 1444 1445 self.attrs = normalized_attrs 1446 self.text = self._normalize_search_value(text) 1447 1448 def _normalize_search_value(self, value): 1449 # Leave it alone if it's a Unicode string, a callable, a 1450 # regular expression, a boolean, or None. 1451 if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') 1452 or isinstance(value, bool) or value is None): 1453 return value 1454 1455 # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 1456 if isinstance(value, bytes): 1457 return value.decode("utf8") 1458 1459 # If it's listlike, convert it into a list of strings. 1460 if hasattr(value, '__iter__'): 1461 new_value = [] 1462 for v in value: 1463 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1464 and not isinstance(v, unicode)): 1465 # This is almost certainly the user's mistake. In the 1466 # interests of avoiding infinite loops, we'll let 1467 # it through as-is rather than doing a recursive call. 1468 new_value.append(v) 1469 else: 1470 new_value.append(self._normalize_search_value(v)) 1471 return new_value 1472 1473 # Otherwise, convert it into a Unicode string. 1474 # The unicode(str()) thing is so this will do the same thing on Python 2 1475 # and Python 3. 1476 return unicode(str(value)) 1477 1478 def __str__(self): 1479 if self.text: 1480 return self.text 1481 else: 1482 return "%s|%s" % (self.name, self.attrs) 1483 1484 def search_tag(self, markup_name=None, markup_attrs={}): 1485 found = None 1486 markup = None 1487 if isinstance(markup_name, Tag): 1488 markup = markup_name 1489 markup_attrs = markup 1490 call_function_with_tag_data = ( 1491 isinstance(self.name, collections.Callable) 1492 and not isinstance(markup_name, Tag)) 1493 1494 if ((not self.name) 1495 or call_function_with_tag_data 1496 or (markup and self._matches(markup, self.name)) 1497 or (not markup and self._matches(markup_name, self.name))): 1498 if call_function_with_tag_data: 1499 match = self.name(markup_name, markup_attrs) 1500 else: 1501 match = True 1502 markup_attr_map = None 1503 for attr, match_against in list(self.attrs.items()): 1504 if not markup_attr_map: 1505 if hasattr(markup_attrs, 'get'): 1506 markup_attr_map = markup_attrs 1507 else: 1508 markup_attr_map = {} 1509 for k, v in markup_attrs: 1510 markup_attr_map[k] = v 1511 attr_value = markup_attr_map.get(attr) 1512 if not self._matches(attr_value, match_against): 1513 match = False 1514 break 1515 if match: 1516 if markup: 1517 found = markup 1518 else: 1519 found = markup_name 1520 if found and self.text and not self._matches(found.string, self.text): 1521 found = None 1522 return found 1523 searchTag = search_tag 1524 1525 def search(self, markup): 1526 # print 'looking for %s in %s' % (self, markup) 1527 found = None 1528 # If given a list of items, scan it for a text element that 1529 # matches. 1530 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): 1531 for element in markup: 1532 if isinstance(element, NavigableString) \ 1533 and self.search(element): 1534 found = element 1535 break 1536 # If it's a Tag, make sure its name or attributes match. 1537 # Don't bother with Tags if we're searching for text. 1538 elif isinstance(markup, Tag): 1539 if not self.text or self.name or self.attrs: 1540 found = self.search_tag(markup) 1541 # If it's text, make sure the text matches. 1542 elif isinstance(markup, NavigableString) or \ 1543 isinstance(markup, basestring): 1544 if not self.name and not self.attrs and self._matches(markup, self.text): 1545 found = markup 1546 else: 1547 raise Exception( 1548 "I don't know how to match against a %s" % markup.__class__) 1549 return found 1550 1551 def _matches(self, markup, match_against): 1552 # print u"Matching %s against %s" % (markup, match_against) 1553 result = False 1554 if isinstance(markup, list) or isinstance(markup, tuple): 1555 # This should only happen when searching a multi-valued attribute 1556 # like 'class'. 1557 if (isinstance(match_against, unicode) 1558 and ' ' in match_against): 1559 # A bit of a special case. If they try to match "foo 1560 # bar" on a multivalue attribute's value, only accept 1561 # the literal value "foo bar" 1562 # 1563 # XXX This is going to be pretty slow because we keep 1564 # splitting match_against. But it shouldn't come up 1565 # too often. 1566 return (whitespace_re.split(match_against) == markup) 1567 else: 1568 for item in markup: 1569 if self._matches(item, match_against): 1570 return True 1571 return False 1572 1573 if match_against is True: 1574 # True matches any non-None value. 1575 return markup is not None 1576 1577 if isinstance(match_against, collections.Callable): 1578 return match_against(markup) 1579 1580 # Custom callables take the tag as an argument, but all 1581 # other ways of matching match the tag name as a string. 1582 if isinstance(markup, Tag): 1583 markup = markup.name 1584 1585 # Ensure that `markup` is either a Unicode string, or None. 1586 markup = self._normalize_search_value(markup) 1587 1588 if markup is None: 1589 # None matches None, False, an empty string, an empty list, and so on. 1590 return not match_against 1591 1592 if isinstance(match_against, unicode): 1593 # Exact string match 1594 return markup == match_against 1595 1596 if hasattr(match_against, 'match'): 1597 # Regexp match 1598 return match_against.search(markup) 1599 1600 if hasattr(match_against, '__iter__'): 1601 # The markup must be an exact match against something 1602 # in the iterable. 1603 return markup in match_against 1604 1605 1606class ResultSet(list): 1607 """A ResultSet is just a list that keeps track of the SoupStrainer 1608 that created it.""" 1609 def __init__(self, source, result=()): 1610 super(ResultSet, self).__init__(result) 1611 self.source = source 1612