1"""Beautiful Soup 2Elixir and Tonic 3"The Screen-Scraper's Friend" 4http://www.crummy.com/software/BeautifulSoup/ 5 6Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7tree representation. It provides methods and Pythonic idioms that make 8it easy to navigate, search, and modify the tree. 9 10A well-formed XML/HTML document yields a well-formed data 11structure. An ill-formed XML/HTML document yields a correspondingly 12ill-formed data structure. If your document is only locally 13well-formed, you can use this library to find and process the 14well-formed part of it. 15 16Beautiful Soup works with Python 2.2 and up. It has no external 17dependencies, but you'll have more success at converting data to UTF-8 18if you also install these three packages: 19 20* chardet, for auto-detecting character encodings 21 http://chardet.feedparser.org/ 22* cjkcodecs and iconv_codec, which add more encodings to the ones supported 23 by stock Python. 24 http://cjkpython.i18n.org/ 25 26Beautiful Soup defines classes for two main parsing strategies: 27 28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 language that kind of looks like XML. 30 31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 or invalid. This class has web browser-like heuristics for 33 obtaining a sensible parse tree in the face of common HTML errors. 34 35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36the encoding of an HTML or XML document, and converting it to 37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 38 39For more than you ever wanted to know about Beautiful Soup, see the 40documentation: 41http://www.crummy.com/software/BeautifulSoup/documentation.html 42 43Here, have some legalese: 44 45Copyright (c) 2004-2009, Leonard Richardson 46 47All rights reserved. 48 49Redistribution and use in source and binary forms, with or without 50modification, are permitted provided that the following conditions are 51met: 52 53 * Redistributions of source code must retain the above copyright 54 notice, this list of conditions and the following disclaimer. 55 56 * Redistributions in binary form must reproduce the above 57 copyright notice, this list of conditions and the following 58 disclaimer in the documentation and/or other materials provided 59 with the distribution. 60 61 * Neither the name of the the Beautiful Soup Consortium and All 62 Night Kosher Bakery nor the names of its contributors may be 63 used to endorse or promote products derived from this software 64 without specific prior written permission. 65 66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 77 78""" 79from __future__ import generators 80 81__author__ = "Leonard Richardson (leonardr@segfault.org)" 82__version__ = "3.1.0.1" 83__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" 84__license__ = "New-style BSD" 85 86import codecs 87import markupbase 88import types 89import re 90from HTMLParser import HTMLParser, HTMLParseError 91try: 92 from htmlentitydefs import name2codepoint 93except ImportError: 94 name2codepoint = {} 95try: 96 set 97except NameError: 98 from sets import Set as set 99 100#These hacks make Beautiful Soup able to parse XML with namespaces 101markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 102 103DEFAULT_OUTPUT_ENCODING = "utf-8" 104 105# First, the classes that represent markup elements. 106 107def sob(unicode, encoding): 108 """Returns either the given Unicode string or its encoding.""" 109 if encoding is None: 110 return unicode 111 else: 112 return unicode.encode(encoding) 113 114class PageElement: 115 """Contains the navigational information for some part of the page 116 (either a tag or a piece of text)""" 117 118 def setup(self, parent=None, previous=None): 119 """Sets up the initial relations between this element and 120 other elements.""" 121 self.parent = parent 122 self.previous = previous 123 self.next = None 124 self.previousSibling = None 125 self.nextSibling = None 126 if self.parent and self.parent.contents: 127 self.previousSibling = self.parent.contents[-1] 128 self.previousSibling.nextSibling = self 129 130 def replaceWith(self, replaceWith): 131 oldParent = self.parent 132 myIndex = self.parent.contents.index(self) 133 if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: 134 # We're replacing this element with one of its siblings. 135 index = self.parent.contents.index(replaceWith) 136 if index and index < myIndex: 137 # Furthermore, it comes before this element. That 138 # means that when we extract it, the index of this 139 # element will change. 140 myIndex = myIndex - 1 141 self.extract() 142 oldParent.insert(myIndex, replaceWith) 143 144 def extract(self): 145 """Destructively rips this element out of the tree.""" 146 if self.parent: 147 try: 148 self.parent.contents.remove(self) 149 except ValueError: 150 pass 151 152 #Find the two elements that would be next to each other if 153 #this element (and any children) hadn't been parsed. Connect 154 #the two. 155 lastChild = self._lastRecursiveChild() 156 nextElement = lastChild.next 157 158 if self.previous: 159 self.previous.next = nextElement 160 if nextElement: 161 nextElement.previous = self.previous 162 self.previous = None 163 lastChild.next = None 164 165 self.parent = None 166 if self.previousSibling: 167 self.previousSibling.nextSibling = self.nextSibling 168 if self.nextSibling: 169 self.nextSibling.previousSibling = self.previousSibling 170 self.previousSibling = self.nextSibling = None 171 return self 172 173 def _lastRecursiveChild(self): 174 "Finds the last element beneath this object to be parsed." 175 lastChild = self 176 while hasattr(lastChild, 'contents') and lastChild.contents: 177 lastChild = lastChild.contents[-1] 178 return lastChild 179 180 def insert(self, position, newChild): 181 if (isinstance(newChild, basestring) 182 or isinstance(newChild, unicode)) \ 183 and not isinstance(newChild, NavigableString): 184 newChild = NavigableString(newChild) 185 186 position = min(position, len(self.contents)) 187 if hasattr(newChild, 'parent') and newChild.parent != None: 188 # We're 'inserting' an element that's already one 189 # of this object's children. 190 if newChild.parent == self: 191 index = self.find(newChild) 192 if index and index < position: 193 # Furthermore we're moving it further down the 194 # list of this object's children. That means that 195 # when we extract this element, our target index 196 # will jump down one. 197 position = position - 1 198 newChild.extract() 199 200 newChild.parent = self 201 previousChild = None 202 if position == 0: 203 newChild.previousSibling = None 204 newChild.previous = self 205 else: 206 previousChild = self.contents[position-1] 207 newChild.previousSibling = previousChild 208 newChild.previousSibling.nextSibling = newChild 209 newChild.previous = previousChild._lastRecursiveChild() 210 if newChild.previous: 211 newChild.previous.next = newChild 212 213 newChildsLastElement = newChild._lastRecursiveChild() 214 215 if position >= len(self.contents): 216 newChild.nextSibling = None 217 218 parent = self 219 parentsNextSibling = None 220 while not parentsNextSibling: 221 parentsNextSibling = parent.nextSibling 222 parent = parent.parent 223 if not parent: # This is the last element in the document. 224 break 225 if parentsNextSibling: 226 newChildsLastElement.next = parentsNextSibling 227 else: 228 newChildsLastElement.next = None 229 else: 230 nextChild = self.contents[position] 231 newChild.nextSibling = nextChild 232 if newChild.nextSibling: 233 newChild.nextSibling.previousSibling = newChild 234 newChildsLastElement.next = nextChild 235 236 if newChildsLastElement.next: 237 newChildsLastElement.next.previous = newChildsLastElement 238 self.contents.insert(position, newChild) 239 240 def append(self, tag): 241 """Appends the given tag to the contents of this tag.""" 242 self.insert(len(self.contents), tag) 243 244 def findNext(self, name=None, attrs={}, text=None, **kwargs): 245 """Returns the first item that matches the given criteria and 246 appears after this Tag in the document.""" 247 return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 248 249 def findAllNext(self, name=None, attrs={}, text=None, limit=None, 250 **kwargs): 251 """Returns all items that match the given criteria and appear 252 after this Tag in the document.""" 253 return self._findAll(name, attrs, text, limit, self.nextGenerator, 254 **kwargs) 255 256 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 257 """Returns the closest sibling to this Tag that matches the 258 given criteria and appears after this Tag in the document.""" 259 return self._findOne(self.findNextSiblings, name, attrs, text, 260 **kwargs) 261 262 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 263 **kwargs): 264 """Returns the siblings of this Tag that match the given 265 criteria and appear after this Tag in the document.""" 266 return self._findAll(name, attrs, text, limit, 267 self.nextSiblingGenerator, **kwargs) 268 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 269 270 def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 271 """Returns the first item that matches the given criteria and 272 appears before this Tag in the document.""" 273 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 274 275 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 276 **kwargs): 277 """Returns all items that match the given criteria and appear 278 before this Tag in the document.""" 279 return self._findAll(name, attrs, text, limit, self.previousGenerator, 280 **kwargs) 281 fetchPrevious = findAllPrevious # Compatibility with pre-3.x 282 283 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 284 """Returns the closest sibling to this Tag that matches the 285 given criteria and appears before this Tag in the document.""" 286 return self._findOne(self.findPreviousSiblings, name, attrs, text, 287 **kwargs) 288 289 def findPreviousSiblings(self, name=None, attrs={}, text=None, 290 limit=None, **kwargs): 291 """Returns the siblings of this Tag that match the given 292 criteria and appear before this Tag in the document.""" 293 return self._findAll(name, attrs, text, limit, 294 self.previousSiblingGenerator, **kwargs) 295 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 296 297 def findParent(self, name=None, attrs={}, **kwargs): 298 """Returns the closest parent of this Tag that matches the given 299 criteria.""" 300 # NOTE: We can't use _findOne because findParents takes a different 301 # set of arguments. 302 r = None 303 l = self.findParents(name, attrs, 1) 304 if l: 305 r = l[0] 306 return r 307 308 def findParents(self, name=None, attrs={}, limit=None, **kwargs): 309 """Returns the parents of this Tag that match the given 310 criteria.""" 311 312 return self._findAll(name, attrs, None, limit, self.parentGenerator, 313 **kwargs) 314 fetchParents = findParents # Compatibility with pre-3.x 315 316 #These methods do the real heavy lifting. 317 318 def _findOne(self, method, name, attrs, text, **kwargs): 319 r = None 320 l = method(name, attrs, text, 1, **kwargs) 321 if l: 322 r = l[0] 323 return r 324 325 def _findAll(self, name, attrs, text, limit, generator, **kwargs): 326 "Iterates over a generator looking for things that match." 327 328 if isinstance(name, SoupStrainer): 329 strainer = name 330 else: 331 # Build a SoupStrainer 332 strainer = SoupStrainer(name, attrs, text, **kwargs) 333 results = ResultSet(strainer) 334 g = generator() 335 while True: 336 try: 337 i = g.next() 338 except StopIteration: 339 break 340 if i: 341 found = strainer.search(i) 342 if found: 343 results.append(found) 344 if limit and len(results) >= limit: 345 break 346 return results 347 348 #These Generators can be used to navigate starting from both 349 #NavigableStrings and Tags. 350 def nextGenerator(self): 351 i = self 352 while i: 353 i = i.next 354 yield i 355 356 def nextSiblingGenerator(self): 357 i = self 358 while i: 359 i = i.nextSibling 360 yield i 361 362 def previousGenerator(self): 363 i = self 364 while i: 365 i = i.previous 366 yield i 367 368 def previousSiblingGenerator(self): 369 i = self 370 while i: 371 i = i.previousSibling 372 yield i 373 374 def parentGenerator(self): 375 i = self 376 while i: 377 i = i.parent 378 yield i 379 380 # Utility methods 381 def substituteEncoding(self, str, encoding=None): 382 encoding = encoding or "utf-8" 383 return str.replace("%SOUP-ENCODING%", encoding) 384 385 def toEncoding(self, s, encoding=None): 386 """Encodes an object to a string in some encoding, or to Unicode. 387 .""" 388 if isinstance(s, unicode): 389 if encoding: 390 s = s.encode(encoding) 391 elif isinstance(s, str): 392 if encoding: 393 s = s.encode(encoding) 394 else: 395 s = unicode(s) 396 else: 397 if encoding: 398 s = self.toEncoding(str(s), encoding) 399 else: 400 s = unicode(s) 401 return s 402 403class NavigableString(unicode, PageElement): 404 405 def __new__(cls, value): 406 """Create a new NavigableString. 407 408 When unpickling a NavigableString, this method is called with 409 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 410 passed in to the superclass's __new__ or the superclass won't know 411 how to handle non-ASCII characters. 412 """ 413 if isinstance(value, unicode): 414 return unicode.__new__(cls, value) 415 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 416 417 def __getnewargs__(self): 418 return (unicode(self),) 419 420 def __getattr__(self, attr): 421 """text.string gives you text. This is for backwards 422 compatibility for Navigable*String, but for CData* it lets you 423 get the string without the CData wrapper.""" 424 if attr == 'string': 425 return self 426 else: 427 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 428 429 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): 430 return self.decode().encode(encoding) 431 432 def decodeGivenEventualEncoding(self, eventualEncoding): 433 return self 434 435class CData(NavigableString): 436 437 def decodeGivenEventualEncoding(self, eventualEncoding): 438 return u'<![CDATA[' + self + u']]>' 439 440class ProcessingInstruction(NavigableString): 441 442 def decodeGivenEventualEncoding(self, eventualEncoding): 443 output = self 444 if u'%SOUP-ENCODING%' in output: 445 output = self.substituteEncoding(output, eventualEncoding) 446 return u'<?' + output + u'?>' 447 448class Comment(NavigableString): 449 def decodeGivenEventualEncoding(self, eventualEncoding): 450 return u'<!--' + self + u'-->' 451 452class Declaration(NavigableString): 453 def decodeGivenEventualEncoding(self, eventualEncoding): 454 return u'<!' + self + u'>' 455 456class Tag(PageElement): 457 458 """Represents a found HTML tag with its attributes and contents.""" 459 460 def _invert(h): 461 "Cheap function to invert a hash." 462 i = {} 463 for k,v in h.items(): 464 i[v] = k 465 return i 466 467 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 468 "quot" : '"', 469 "amp" : "&", 470 "lt" : "<", 471 "gt" : ">" } 472 473 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 474 475 def _convertEntities(self, match): 476 """Used in a call to re.sub to replace HTML, XML, and numeric 477 entities with the appropriate Unicode characters. If HTML 478 entities are being converted, any unrecognized entities are 479 escaped.""" 480 x = match.group(1) 481 if self.convertHTMLEntities and x in name2codepoint: 482 return unichr(name2codepoint[x]) 483 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 484 if self.convertXMLEntities: 485 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 486 else: 487 return u'&%s;' % x 488 elif len(x) > 0 and x[0] == '#': 489 # Handle numeric entities 490 if len(x) > 1 and x[1] == 'x': 491 return unichr(int(x[2:], 16)) 492 else: 493 return unichr(int(x[1:])) 494 495 elif self.escapeUnrecognizedEntities: 496 return u'&%s;' % x 497 else: 498 return u'&%s;' % x 499 500 def __init__(self, parser, name, attrs=None, parent=None, 501 previous=None): 502 "Basic constructor." 503 504 # We don't actually store the parser object: that lets extracted 505 # chunks be garbage-collected 506 self.parserClass = parser.__class__ 507 self.isSelfClosing = parser.isSelfClosingTag(name) 508 self.name = name 509 if attrs == None: 510 attrs = [] 511 self.attrs = attrs 512 self.contents = [] 513 self.setup(parent, previous) 514 self.hidden = False 515 self.containsSubstitutions = False 516 self.convertHTMLEntities = parser.convertHTMLEntities 517 self.convertXMLEntities = parser.convertXMLEntities 518 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 519 520 def convert(kval): 521 "Converts HTML, XML and numeric entities in the attribute value." 522 k, val = kval 523 if val is None: 524 return kval 525 return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 526 self._convertEntities, val)) 527 self.attrs = map(convert, self.attrs) 528 529 def get(self, key, default=None): 530 """Returns the value of the 'key' attribute for the tag, or 531 the value given for 'default' if it doesn't have that 532 attribute.""" 533 return self._getAttrMap().get(key, default) 534 535 def has_key(self, key): 536 return self._getAttrMap().has_key(key) 537 538 def __getitem__(self, key): 539 """tag[key] returns the value of the 'key' attribute for the tag, 540 and throws an exception if it's not there.""" 541 return self._getAttrMap()[key] 542 543 def __iter__(self): 544 "Iterating over a tag iterates over its contents." 545 return iter(self.contents) 546 547 def __len__(self): 548 "The length of a tag is the length of its list of contents." 549 return len(self.contents) 550 551 def __contains__(self, x): 552 return x in self.contents 553 554 def __nonzero__(self): 555 "A tag is non-None even if it has no contents." 556 return True 557 558 def __setitem__(self, key, value): 559 """Setting tag[key] sets the value of the 'key' attribute for the 560 tag.""" 561 self._getAttrMap() 562 self.attrMap[key] = value 563 found = False 564 for i in range(0, len(self.attrs)): 565 if self.attrs[i][0] == key: 566 self.attrs[i] = (key, value) 567 found = True 568 if not found: 569 self.attrs.append((key, value)) 570 self._getAttrMap()[key] = value 571 572 def __delitem__(self, key): 573 "Deleting tag[key] deletes all 'key' attributes for the tag." 574 for item in self.attrs: 575 if item[0] == key: 576 self.attrs.remove(item) 577 #We don't break because bad HTML can define the same 578 #attribute multiple times. 579 self._getAttrMap() 580 if self.attrMap.has_key(key): 581 del self.attrMap[key] 582 583 def __call__(self, *args, **kwargs): 584 """Calling a tag like a function is the same as calling its 585 findAll() method. Eg. tag('a') returns a list of all the A tags 586 found within this tag.""" 587 return apply(self.findAll, args, kwargs) 588 589 def __getattr__(self, tag): 590 #print "Getattr %s.%s" % (self.__class__, tag) 591 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 592 return self.find(tag[:-3]) 593 elif tag.find('__') != 0: 594 return self.find(tag) 595 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) 596 597 def __eq__(self, other): 598 """Returns true iff this tag has the same name, the same attributes, 599 and the same contents (recursively) as the given tag. 600 601 NOTE: right now this will return false if two tags have the 602 same attributes in a different order. Should this be fixed?""" 603 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 604 return False 605 for i in range(0, len(self.contents)): 606 if self.contents[i] != other.contents[i]: 607 return False 608 return True 609 610 def __ne__(self, other): 611 """Returns true iff this tag is not identical to the other tag, 612 as defined in __eq__.""" 613 return not self == other 614 615 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 616 """Renders this tag as a string.""" 617 return self.decode(eventualEncoding=encoding) 618 619 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 620 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 621 + ")") 622 623 def _sub_entity(self, x): 624 """Used with a regular expression to substitute the 625 appropriate XML entity for an XML special character.""" 626 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 627 628 def __unicode__(self): 629 return self.decode() 630 631 def __str__(self): 632 return self.encode() 633 634 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 635 prettyPrint=False, indentLevel=0): 636 return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) 637 638 def decode(self, prettyPrint=False, indentLevel=0, 639 eventualEncoding=DEFAULT_OUTPUT_ENCODING): 640 """Returns a string or Unicode representation of this tag and 641 its contents. To get Unicode, pass None for encoding.""" 642 643 attrs = [] 644 if self.attrs: 645 for key, val in self.attrs: 646 fmt = '%s="%s"' 647 if isString(val): 648 if (self.containsSubstitutions 649 and eventualEncoding is not None 650 and '%SOUP-ENCODING%' in val): 651 val = self.substituteEncoding(val, eventualEncoding) 652 653 # The attribute value either: 654 # 655 # * Contains no embedded double quotes or single quotes. 656 # No problem: we enclose it in double quotes. 657 # * Contains embedded single quotes. No problem: 658 # double quotes work here too. 659 # * Contains embedded double quotes. No problem: 660 # we enclose it in single quotes. 661 # * Embeds both single _and_ double quotes. This 662 # can't happen naturally, but it can happen if 663 # you modify an attribute value after parsing 664 # the document. Now we have a bit of a 665 # problem. We solve it by enclosing the 666 # attribute in single quotes, and escaping any 667 # embedded single quotes to XML entities. 668 if '"' in val: 669 fmt = "%s='%s'" 670 if "'" in val: 671 # TODO: replace with apos when 672 # appropriate. 673 val = val.replace("'", "&squot;") 674 675 # Now we're okay w/r/t quotes. But the attribute 676 # value might also contain angle brackets, or 677 # ampersands that aren't part of entities. We need 678 # to escape those to XML entities too. 679 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 680 if val is None: 681 # Handle boolean attributes. 682 decoded = key 683 else: 684 decoded = fmt % (key, val) 685 attrs.append(decoded) 686 close = '' 687 closeTag = '' 688 if self.isSelfClosing: 689 close = ' /' 690 else: 691 closeTag = '</%s>' % self.name 692 693 indentTag, indentContents = 0, 0 694 if prettyPrint: 695 indentTag = indentLevel 696 space = (' ' * (indentTag-1)) 697 indentContents = indentTag + 1 698 contents = self.decodeContents(prettyPrint, indentContents, 699 eventualEncoding) 700 if self.hidden: 701 s = contents 702 else: 703 s = [] 704 attributeString = '' 705 if attrs: 706 attributeString = ' ' + ' '.join(attrs) 707 if prettyPrint: 708 s.append(space) 709 s.append('<%s%s%s>' % (self.name, attributeString, close)) 710 if prettyPrint: 711 s.append("\n") 712 s.append(contents) 713 if prettyPrint and contents and contents[-1] != "\n": 714 s.append("\n") 715 if prettyPrint and closeTag: 716 s.append(space) 717 s.append(closeTag) 718 if prettyPrint and closeTag and self.nextSibling: 719 s.append("\n") 720 s = ''.join(s) 721 return s 722 723 def decompose(self): 724 """Recursively destroys the contents of this tree.""" 725 contents = [i for i in self.contents] 726 for i in contents: 727 if isinstance(i, Tag): 728 i.decompose() 729 else: 730 i.extract() 731 self.extract() 732 733 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 734 return self.encode(encoding, True) 735 736 def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 737 prettyPrint=False, indentLevel=0): 738 return self.decodeContents(prettyPrint, indentLevel).encode(encoding) 739 740 def decodeContents(self, prettyPrint=False, indentLevel=0, 741 eventualEncoding=DEFAULT_OUTPUT_ENCODING): 742 """Renders the contents of this tag as a string in the given 743 encoding. If encoding is None, returns a Unicode string..""" 744 s=[] 745 for c in self: 746 text = None 747 if isinstance(c, NavigableString): 748 text = c.decodeGivenEventualEncoding(eventualEncoding) 749 elif isinstance(c, Tag): 750 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) 751 if text and prettyPrint: 752 text = text.strip() 753 if text: 754 if prettyPrint: 755 s.append(" " * (indentLevel-1)) 756 s.append(text) 757 if prettyPrint: 758 s.append("\n") 759 return ''.join(s) 760 761 #Soup methods 762 763 def find(self, name=None, attrs={}, recursive=True, text=None, 764 **kwargs): 765 """Return only the first child of this Tag matching the given 766 criteria.""" 767 r = None 768 l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 769 if l: 770 r = l[0] 771 return r 772 findChild = find 773 774 def findAll(self, name=None, attrs={}, recursive=True, text=None, 775 limit=None, **kwargs): 776 """Extracts a list of Tag objects that match the given 777 criteria. You can specify the name of the Tag and any 778 attributes you want the Tag to have. 779 780 The value of a key-value pair in the 'attrs' map can be a 781 string, a list of strings, a regular expression object, or a 782 callable that takes a string and returns whether or not the 783 string matches for some custom definition of 'matches'. The 784 same is true of the tag name.""" 785 generator = self.recursiveChildGenerator 786 if not recursive: 787 generator = self.childGenerator 788 return self._findAll(name, attrs, text, limit, generator, **kwargs) 789 findChildren = findAll 790 791 # Pre-3.x compatibility methods. Will go away in 4.0. 792 first = find 793 fetch = findAll 794 795 def fetchText(self, text=None, recursive=True, limit=None): 796 return self.findAll(text=text, recursive=recursive, limit=limit) 797 798 def firstText(self, text=None, recursive=True): 799 return self.find(text=text, recursive=recursive) 800 801 # 3.x compatibility methods. Will go away in 4.0. 802 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 803 prettyPrint=False, indentLevel=0): 804 if encoding is None: 805 return self.decodeContents(prettyPrint, indentLevel, encoding) 806 else: 807 return self.encodeContents(encoding, prettyPrint, indentLevel) 808 809 810 #Private methods 811 812 def _getAttrMap(self): 813 """Initializes a map representation of this tag's attributes, 814 if not already initialized.""" 815 if not getattr(self, 'attrMap'): 816 self.attrMap = {} 817 for (key, value) in self.attrs: 818 self.attrMap[key] = value 819 return self.attrMap 820 821 #Generator methods 822 def recursiveChildGenerator(self): 823 if not len(self.contents): 824 raise StopIteration 825 stopNode = self._lastRecursiveChild().next 826 current = self.contents[0] 827 while current is not stopNode: 828 yield current 829 current = current.next 830 831 def childGenerator(self): 832 if not len(self.contents): 833 raise StopIteration 834 current = self.contents[0] 835 while current: 836 yield current 837 current = current.nextSibling 838 raise StopIteration 839 840# Next, a couple classes to represent queries and their results. 841class SoupStrainer: 842 """Encapsulates a number of ways of matching a markup element (tag or 843 text).""" 844 845 def __init__(self, name=None, attrs={}, text=None, **kwargs): 846 self.name = name 847 if isString(attrs): 848 kwargs['class'] = attrs 849 attrs = None 850 if kwargs: 851 if attrs: 852 attrs = attrs.copy() 853 attrs.update(kwargs) 854 else: 855 attrs = kwargs 856 self.attrs = attrs 857 self.text = text 858 859 def __str__(self): 860 if self.text: 861 return self.text 862 else: 863 return "%s|%s" % (self.name, self.attrs) 864 865 def searchTag(self, markupName=None, markupAttrs={}): 866 found = None 867 markup = None 868 if isinstance(markupName, Tag): 869 markup = markupName 870 markupAttrs = markup 871 callFunctionWithTagData = callable(self.name) \ 872 and not isinstance(markupName, Tag) 873 874 if (not self.name) \ 875 or callFunctionWithTagData \ 876 or (markup and self._matches(markup, self.name)) \ 877 or (not markup and self._matches(markupName, self.name)): 878 if callFunctionWithTagData: 879 match = self.name(markupName, markupAttrs) 880 else: 881 match = True 882 markupAttrMap = None 883 for attr, matchAgainst in self.attrs.items(): 884 if not markupAttrMap: 885 if hasattr(markupAttrs, 'get'): 886 markupAttrMap = markupAttrs 887 else: 888 markupAttrMap = {} 889 for k,v in markupAttrs: 890 markupAttrMap[k] = v 891 attrValue = markupAttrMap.get(attr) 892 if not self._matches(attrValue, matchAgainst): 893 match = False 894 break 895 if match: 896 if markup: 897 found = markup 898 else: 899 found = markupName 900 return found 901 902 def search(self, markup): 903 #print 'looking for %s in %s' % (self, markup) 904 found = None 905 # If given a list of items, scan it for a text element that 906 # matches. 907 if isList(markup) and not isinstance(markup, Tag): 908 for element in markup: 909 if isinstance(element, NavigableString) \ 910 and self.search(element): 911 found = element 912 break 913 # If it's a Tag, make sure its name or attributes match. 914 # Don't bother with Tags if we're searching for text. 915 elif isinstance(markup, Tag): 916 if not self.text: 917 found = self.searchTag(markup) 918 # If it's text, make sure the text matches. 919 elif isinstance(markup, NavigableString) or \ 920 isString(markup): 921 if self._matches(markup, self.text): 922 found = markup 923 else: 924 raise Exception, "I don't know how to match against a %s" \ 925 % markup.__class__ 926 return found 927 928 def _matches(self, markup, matchAgainst): 929 #print "Matching %s against %s" % (markup, matchAgainst) 930 result = False 931 if matchAgainst == True and type(matchAgainst) == types.BooleanType: 932 result = markup != None 933 elif callable(matchAgainst): 934 result = matchAgainst(markup) 935 else: 936 #Custom match methods take the tag as an argument, but all 937 #other ways of matching match the tag name as a string. 938 if isinstance(markup, Tag): 939 markup = markup.name 940 if markup is not None and not isString(markup): 941 markup = unicode(markup) 942 #Now we know that chunk is either a string, or None. 943 if hasattr(matchAgainst, 'match'): 944 # It's a regexp object. 945 result = markup and matchAgainst.search(markup) 946 elif (isList(matchAgainst) 947 and (markup is not None or not isString(matchAgainst))): 948 result = markup in matchAgainst 949 elif hasattr(matchAgainst, 'items'): 950 result = markup.has_key(matchAgainst) 951 elif matchAgainst and isString(markup): 952 if isinstance(markup, unicode): 953 matchAgainst = unicode(matchAgainst) 954 else: 955 matchAgainst = str(matchAgainst) 956 957 if not result: 958 result = matchAgainst == markup 959 return result 960 961class ResultSet(list): 962 """A ResultSet is just a list that keeps track of the SoupStrainer 963 that created it.""" 964 def __init__(self, source): 965 list.__init__([]) 966 self.source = source 967 968# Now, some helper functions. 969 970def isList(l): 971 """Convenience method that works with all 2.x versions of Python 972 to determine whether or not something is listlike.""" 973 return ((hasattr(l, '__iter__') and not isString(l)) 974 or (type(l) in (types.ListType, types.TupleType))) 975 976def isString(s): 977 """Convenience method that works with all 2.x versions of Python 978 to determine whether or not something is stringlike.""" 979 try: 980 return isinstance(s, unicode) or isinstance(s, basestring) 981 except NameError: 982 return isinstance(s, str) 983 984def buildTagMap(default, *args): 985 """Turns a list of maps, lists, or scalars into a single map. 986 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and 987 NESTING_RESET_TAGS maps out of lists and partial maps.""" 988 built = {} 989 for portion in args: 990 if hasattr(portion, 'items'): 991 #It's a map. Merge it. 992 for k,v in portion.items(): 993 built[k] = v 994 elif isList(portion) and not isString(portion): 995 #It's a list. Map each item to the default. 996 for k in portion: 997 built[k] = default 998 else: 999 #It's a scalar. Map it to the default. 1000 built[portion] = default 1001 return built 1002 1003# Now, the parser classes. 1004 1005class HTMLParserBuilder(HTMLParser): 1006 1007 def __init__(self, soup): 1008 HTMLParser.__init__(self) 1009 self.soup = soup 1010 1011 # We inherit feed() and reset(). 1012 1013 def handle_starttag(self, name, attrs): 1014 if name == 'meta': 1015 self.soup.extractCharsetFromMeta(attrs) 1016 else: 1017 self.soup.unknown_starttag(name, attrs) 1018 1019 def handle_endtag(self, name): 1020 self.soup.unknown_endtag(name) 1021 1022 def handle_data(self, content): 1023 self.soup.handle_data(content) 1024 1025 def _toStringSubclass(self, text, subclass): 1026 """Adds a certain piece of text to the tree as a NavigableString 1027 subclass.""" 1028 self.soup.endData() 1029 self.handle_data(text) 1030 self.soup.endData(subclass) 1031 1032 def handle_pi(self, text): 1033 """Handle a processing instruction as a ProcessingInstruction 1034 object, possibly one with a %SOUP-ENCODING% slot into which an 1035 encoding will be plugged later.""" 1036 if text[:3] == "xml": 1037 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1038 self._toStringSubclass(text, ProcessingInstruction) 1039 1040 def handle_comment(self, text): 1041 "Handle comments as Comment objects." 1042 self._toStringSubclass(text, Comment) 1043 1044 def handle_charref(self, ref): 1045 "Handle character references as data." 1046 if self.soup.convertEntities: 1047 data = unichr(int(ref)) 1048 else: 1049 data = '&#%s;' % ref 1050 self.handle_data(data) 1051 1052 def handle_entityref(self, ref): 1053 """Handle entity references as data, possibly converting known 1054 HTML and/or XML entity references to the corresponding Unicode 1055 characters.""" 1056 data = None 1057 if self.soup.convertHTMLEntities: 1058 try: 1059 data = unichr(name2codepoint[ref]) 1060 except KeyError: 1061 pass 1062 1063 if not data and self.soup.convertXMLEntities: 1064 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) 1065 1066 if not data and self.soup.convertHTMLEntities and \ 1067 not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): 1068 # TODO: We've got a problem here. We're told this is 1069 # an entity reference, but it's not an XML entity 1070 # reference or an HTML entity reference. Nonetheless, 1071 # the logical thing to do is to pass it through as an 1072 # unrecognized entity reference. 1073 # 1074 # Except: when the input is "&carol;" this function 1075 # will be called with input "carol". When the input is 1076 # "AT&T", this function will be called with input 1077 # "T". We have no way of knowing whether a semicolon 1078 # was present originally, so we don't know whether 1079 # this is an unknown entity or just a misplaced 1080 # ampersand. 1081 # 1082 # The more common case is a misplaced ampersand, so I 1083 # escape the ampersand and omit the trailing semicolon. 1084 data = "&%s" % ref 1085 if not data: 1086 # This case is different from the one above, because we 1087 # haven't already gone through a supposedly comprehensive 1088 # mapping of entities to Unicode characters. We might not 1089 # have gone through any mapping at all. So the chances are 1090 # very high that this is a real entity, and not a 1091 # misplaced ampersand. 1092 data = "&%s;" % ref 1093 self.handle_data(data) 1094 1095 def handle_decl(self, data): 1096 "Handle DOCTYPEs and the like as Declaration objects." 1097 self._toStringSubclass(data, Declaration) 1098 1099 def parse_declaration(self, i): 1100 """Treat a bogus SGML declaration as raw data. Treat a CDATA 1101 declaration as a CData object.""" 1102 j = None 1103 if self.rawdata[i:i+9] == '<![CDATA[': 1104 k = self.rawdata.find(']]>', i) 1105 if k == -1: 1106 k = len(self.rawdata) 1107 data = self.rawdata[i+9:k] 1108 j = k+3 1109 self._toStringSubclass(data, CData) 1110 else: 1111 try: 1112 j = HTMLParser.parse_declaration(self, i) 1113 except HTMLParseError: 1114 toHandle = self.rawdata[i:] 1115 self.handle_data(toHandle) 1116 j = i + len(toHandle) 1117 return j 1118 1119 1120class BeautifulStoneSoup(Tag): 1121 1122 """This class contains the basic parser and search code. It defines 1123 a parser that knows nothing about tag behavior except for the 1124 following: 1125 1126 You can't close a tag without closing all the tags it encloses. 1127 That is, "<foo><bar></foo>" actually means 1128 "<foo><bar></bar></foo>". 1129 1130 [Another possible explanation is "<foo><bar /></foo>", but since 1131 this class defines no SELF_CLOSING_TAGS, it will never use that 1132 explanation.] 1133 1134 This class is useful for parsing XML or made-up markup languages, 1135 or when BeautifulSoup makes an assumption counter to what you were 1136 expecting.""" 1137 1138 SELF_CLOSING_TAGS = {} 1139 NESTABLE_TAGS = {} 1140 RESET_NESTING_TAGS = {} 1141 QUOTE_TAGS = {} 1142 PRESERVE_WHITESPACE_TAGS = [] 1143 1144 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), 1145 lambda x: x.group(1) + ' />'), 1146 (re.compile('<!\s+([^<>]*)>'), 1147 lambda x: '<!' + x.group(1) + '>') 1148 ] 1149 1150 ROOT_TAG_NAME = u'[document]' 1151 1152 HTML_ENTITIES = "html" 1153 XML_ENTITIES = "xml" 1154 XHTML_ENTITIES = "xhtml" 1155 # TODO: This only exists for backwards-compatibility 1156 ALL_ENTITIES = XHTML_ENTITIES 1157 1158 # Used when determining whether a text node is all whitespace and 1159 # can be replaced with a single space. A text node that contains 1160 # fancy Unicode spaces (usually non-breaking) should be left 1161 # alone. 1162 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } 1163 1164 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, 1165 markupMassage=True, smartQuotesTo=XML_ENTITIES, 1166 convertEntities=None, selfClosingTags=None, isHTML=False, 1167 builder=HTMLParserBuilder): 1168 """The Soup object is initialized as the 'root tag', and the 1169 provided markup (which can be a string or a file-like object) 1170 is fed into the underlying parser. 1171 1172 HTMLParser will process most bad HTML, and the BeautifulSoup 1173 class has some tricks for dealing with some HTML that kills 1174 HTMLParser, but Beautiful Soup can nonetheless choke or lose data 1175 if your data uses self-closing tags or declarations 1176 incorrectly. 1177 1178 By default, Beautiful Soup uses regexes to sanitize input, 1179 avoiding the vast majority of these problems. If the problems 1180 don't apply to you, pass in False for markupMassage, and 1181 you'll get better performance. 1182 1183 The default parser massage techniques fix the two most common 1184 instances of invalid HTML that choke HTMLParser: 1185 1186 <br/> (No space between name of closing tag and tag close) 1187 <! --Comment--> (Extraneous whitespace in declaration) 1188 1189 You can pass in a custom list of (RE object, replace method) 1190 tuples to get Beautiful Soup to scrub your input the way you 1191 want.""" 1192 1193 self.parseOnlyThese = parseOnlyThese 1194 self.fromEncoding = fromEncoding 1195 self.smartQuotesTo = smartQuotesTo 1196 self.convertEntities = convertEntities 1197 # Set the rules for how we'll deal with the entities we 1198 # encounter 1199 if self.convertEntities: 1200 # It doesn't make sense to convert encoded characters to 1201 # entities even while you're converting entities to Unicode. 1202 # Just convert it all to Unicode. 1203 self.smartQuotesTo = None 1204 if convertEntities == self.HTML_ENTITIES: 1205 self.convertXMLEntities = False 1206 self.convertHTMLEntities = True 1207 self.escapeUnrecognizedEntities = True 1208 elif convertEntities == self.XHTML_ENTITIES: 1209 self.convertXMLEntities = True 1210 self.convertHTMLEntities = True 1211 self.escapeUnrecognizedEntities = False 1212 elif convertEntities == self.XML_ENTITIES: 1213 self.convertXMLEntities = True 1214 self.convertHTMLEntities = False 1215 self.escapeUnrecognizedEntities = False 1216 else: 1217 self.convertXMLEntities = False 1218 self.convertHTMLEntities = False 1219 self.escapeUnrecognizedEntities = False 1220 1221 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) 1222 self.builder = builder(self) 1223 self.reset() 1224 1225 if hasattr(markup, 'read'): # It's a file-type object. 1226 markup = markup.read() 1227 self.markup = markup 1228 self.markupMassage = markupMassage 1229 try: 1230 self._feed(isHTML=isHTML) 1231 except StopParsing: 1232 pass 1233 self.markup = None # The markup can now be GCed. 1234 self.builder = None # So can the builder. 1235 1236 def _feed(self, inDocumentEncoding=None, isHTML=False): 1237 # Convert the document to Unicode. 1238 markup = self.markup 1239 if isinstance(markup, unicode): 1240 if not hasattr(self, 'originalEncoding'): 1241 self.originalEncoding = None 1242 else: 1243 dammit = UnicodeDammit\ 1244 (markup, [self.fromEncoding, inDocumentEncoding], 1245 smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) 1246 markup = dammit.unicode 1247 self.originalEncoding = dammit.originalEncoding 1248 self.declaredHTMLEncoding = dammit.declaredHTMLEncoding 1249 if markup: 1250 if self.markupMassage: 1251 if not isList(self.markupMassage): 1252 self.markupMassage = self.MARKUP_MASSAGE 1253 for fix, m in self.markupMassage: 1254 markup = fix.sub(m, markup) 1255 # TODO: We get rid of markupMassage so that the 1256 # soup object can be deepcopied later on. Some 1257 # Python installations can't copy regexes. If anyone 1258 # was relying on the existence of markupMassage, this 1259 # might cause problems. 1260 del(self.markupMassage) 1261 self.builder.reset() 1262 1263 self.builder.feed(markup) 1264 # Close out any unfinished strings and close all the open tags. 1265 self.endData() 1266 while self.currentTag.name != self.ROOT_TAG_NAME: 1267 self.popTag() 1268 1269 def isSelfClosingTag(self, name): 1270 """Returns true iff the given string is the name of a 1271 self-closing tag according to this parser.""" 1272 return self.SELF_CLOSING_TAGS.has_key(name) \ 1273 or self.instanceSelfClosingTags.has_key(name) 1274 1275 def reset(self): 1276 Tag.__init__(self, self, self.ROOT_TAG_NAME) 1277 self.hidden = 1 1278 self.builder.reset() 1279 self.currentData = [] 1280 self.currentTag = None 1281 self.tagStack = [] 1282 self.quoteStack = [] 1283 self.pushTag(self) 1284 1285 def popTag(self): 1286 tag = self.tagStack.pop() 1287 # Tags with just one string-owning child get the child as a 1288 # 'string' property, so that soup.tag.string is shorthand for 1289 # soup.tag.contents[0] 1290 if len(self.currentTag.contents) == 1 and \ 1291 isinstance(self.currentTag.contents[0], NavigableString): 1292 self.currentTag.string = self.currentTag.contents[0] 1293 1294 #print "Pop", tag.name 1295 if self.tagStack: 1296 self.currentTag = self.tagStack[-1] 1297 return self.currentTag 1298 1299 def pushTag(self, tag): 1300 #print "Push", tag.name 1301 if self.currentTag: 1302 self.currentTag.contents.append(tag) 1303 self.tagStack.append(tag) 1304 self.currentTag = self.tagStack[-1] 1305 1306 def endData(self, containerClass=NavigableString): 1307 if self.currentData: 1308 currentData = u''.join(self.currentData) 1309 if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 1310 not set([tag.name for tag in self.tagStack]).intersection( 1311 self.PRESERVE_WHITESPACE_TAGS)): 1312 if '\n' in currentData: 1313 currentData = '\n' 1314 else: 1315 currentData = ' ' 1316 self.currentData = [] 1317 if self.parseOnlyThese and len(self.tagStack) <= 1 and \ 1318 (not self.parseOnlyThese.text or \ 1319 not self.parseOnlyThese.search(currentData)): 1320 return 1321 o = containerClass(currentData) 1322 o.setup(self.currentTag, self.previous) 1323 if self.previous: 1324 self.previous.next = o 1325 self.previous = o 1326 self.currentTag.contents.append(o) 1327 1328 1329 def _popToTag(self, name, inclusivePop=True): 1330 """Pops the tag stack up to and including the most recent 1331 instance of the given tag. If inclusivePop is false, pops the tag 1332 stack up to but *not* including the most recent instqance of 1333 the given tag.""" 1334 #print "Popping to %s" % name 1335 if name == self.ROOT_TAG_NAME: 1336 return 1337 1338 numPops = 0 1339 mostRecentTag = None 1340 for i in range(len(self.tagStack)-1, 0, -1): 1341 if name == self.tagStack[i].name: 1342 numPops = len(self.tagStack)-i 1343 break 1344 if not inclusivePop: 1345 numPops = numPops - 1 1346 1347 for i in range(0, numPops): 1348 mostRecentTag = self.popTag() 1349 return mostRecentTag 1350 1351 def _smartPop(self, name): 1352 1353 """We need to pop up to the previous tag of this type, unless 1354 one of this tag's nesting reset triggers comes between this 1355 tag and the previous tag of this type, OR unless this tag is a 1356 generic nesting trigger and another generic nesting trigger 1357 comes between this tag and the previous tag of this type. 1358 1359 Examples: 1360 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. 1361 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. 1362 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. 1363 1364 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 1365 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' 1366 <td><tr><td> *<td>* should pop to 'tr', not the first 'td' 1367 """ 1368 1369 nestingResetTriggers = self.NESTABLE_TAGS.get(name) 1370 isNestable = nestingResetTriggers != None 1371 isResetNesting = self.RESET_NESTING_TAGS.has_key(name) 1372 popTo = None 1373 inclusive = True 1374 for i in range(len(self.tagStack)-1, 0, -1): 1375 p = self.tagStack[i] 1376 if (not p or p.name == name) and not isNestable: 1377 #Non-nestable tags get popped to the top or to their 1378 #last occurance. 1379 popTo = name 1380 break 1381 if (nestingResetTriggers != None 1382 and p.name in nestingResetTriggers) \ 1383 or (nestingResetTriggers == None and isResetNesting 1384 and self.RESET_NESTING_TAGS.has_key(p.name)): 1385 1386 #If we encounter one of the nesting reset triggers 1387 #peculiar to this tag, or we encounter another tag 1388 #that causes nesting to reset, pop up to but not 1389 #including that tag. 1390 popTo = p.name 1391 inclusive = False 1392 break 1393 p = p.parent 1394 if popTo: 1395 self._popToTag(popTo, inclusive) 1396 1397 def unknown_starttag(self, name, attrs, selfClosing=0): 1398 #print "Start tag %s: %s" % (name, attrs) 1399 if self.quoteStack: 1400 #This is not a real tag. 1401 #print "<%s> is not real!" % name 1402 attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) 1403 self.handle_data('<%s%s>' % (name, attrs)) 1404 return 1405 self.endData() 1406 1407 if not self.isSelfClosingTag(name) and not selfClosing: 1408 self._smartPop(name) 1409 1410 if self.parseOnlyThese and len(self.tagStack) <= 1 \ 1411 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): 1412 return 1413 1414 tag = Tag(self, name, attrs, self.currentTag, self.previous) 1415 if self.previous: 1416 self.previous.next = tag 1417 self.previous = tag 1418 self.pushTag(tag) 1419 if selfClosing or self.isSelfClosingTag(name): 1420 self.popTag() 1421 if name in self.QUOTE_TAGS: 1422 #print "Beginning quote (%s)" % name 1423 self.quoteStack.append(name) 1424 self.literal = 1 1425 return tag 1426 1427 def unknown_endtag(self, name): 1428 #print "End tag %s" % name 1429 if self.quoteStack and self.quoteStack[-1] != name: 1430 #This is not a real end tag. 1431 #print "</%s> is not real!" % name 1432 self.handle_data('</%s>' % name) 1433 return 1434 self.endData() 1435 self._popToTag(name) 1436 if self.quoteStack and self.quoteStack[-1] == name: 1437 self.quoteStack.pop() 1438 self.literal = (len(self.quoteStack) > 0) 1439 1440 def handle_data(self, data): 1441 self.currentData.append(data) 1442 1443 def extractCharsetFromMeta(self, attrs): 1444 self.unknown_starttag('meta', attrs) 1445 1446 1447class BeautifulSoup(BeautifulStoneSoup): 1448 1449 """This parser knows the following facts about HTML: 1450 1451 * Some tags have no closing tag and should be interpreted as being 1452 closed as soon as they are encountered. 1453 1454 * The text inside some tags (ie. 'script') may contain tags which 1455 are not really part of the document and which should be parsed 1456 as text, not tags. If you want to parse the text as tags, you can 1457 always fetch it and parse it explicitly. 1458 1459 * Tag nesting rules: 1460 1461 Most tags can't be nested at all. For instance, the occurance of 1462 a <p> tag should implicitly close the previous <p> tag. 1463 1464 <p>Para1<p>Para2 1465 should be transformed into: 1466 <p>Para1</p><p>Para2 1467 1468 Some tags can be nested arbitrarily. For instance, the occurance 1469 of a <blockquote> tag should _not_ implicitly close the previous 1470 <blockquote> tag. 1471 1472 Alice said: <blockquote>Bob said: <blockquote>Blah 1473 should NOT be transformed into: 1474 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah 1475 1476 Some tags can be nested, but the nesting is reset by the 1477 interposition of other tags. For instance, a <tr> tag should 1478 implicitly close the previous <tr> tag within the same <table>, 1479 but not close a <tr> tag in another table. 1480 1481 <table><tr>Blah<tr>Blah 1482 should be transformed into: 1483 <table><tr>Blah</tr><tr>Blah 1484 but, 1485 <tr>Blah<table><tr>Blah 1486 should NOT be transformed into 1487 <tr>Blah<table></tr><tr>Blah 1488 1489 Differing assumptions about tag nesting rules are a major source 1490 of problems with the BeautifulSoup class. If BeautifulSoup is not 1491 treating as nestable a tag your page author treats as nestable, 1492 try ICantBelieveItsBeautifulSoup, MinimalSoup, or 1493 BeautifulStoneSoup before writing your own subclass.""" 1494 1495 def __init__(self, *args, **kwargs): 1496 if not kwargs.has_key('smartQuotesTo'): 1497 kwargs['smartQuotesTo'] = self.HTML_ENTITIES 1498 kwargs['isHTML'] = True 1499 BeautifulStoneSoup.__init__(self, *args, **kwargs) 1500 1501 SELF_CLOSING_TAGS = buildTagMap(None, 1502 ['br' , 'hr', 'input', 'img', 'meta', 1503 'spacer', 'link', 'frame', 'base']) 1504 1505 PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) 1506 1507 QUOTE_TAGS = {'script' : None, 'textarea' : None} 1508 1509 #According to the HTML standard, each of these inline tags can 1510 #contain another tag of the same type. Furthermore, it's common 1511 #to actually use these tags this way. 1512 NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 1513 'center'] 1514 1515 #According to the HTML standard, these block tags can contain 1516 #another tag of the same type. Furthermore, it's common 1517 #to actually use these tags this way. 1518 NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] 1519 1520 #Lists can contain other lists, but there are restrictions. 1521 NESTABLE_LIST_TAGS = { 'ol' : [], 1522 'ul' : [], 1523 'li' : ['ul', 'ol'], 1524 'dl' : [], 1525 'dd' : ['dl'], 1526 'dt' : ['dl'] } 1527 1528 #Tables can contain other tables, but there are restrictions. 1529 NESTABLE_TABLE_TAGS = {'table' : [], 1530 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 1531 'td' : ['tr'], 1532 'th' : ['tr'], 1533 'thead' : ['table'], 1534 'tbody' : ['table'], 1535 'tfoot' : ['table'], 1536 } 1537 1538 NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] 1539 1540 #If one of these tags is encountered, all tags up to the next tag of 1541 #this type are popped. 1542 RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', 1543 NON_NESTABLE_BLOCK_TAGS, 1544 NESTABLE_LIST_TAGS, 1545 NESTABLE_TABLE_TAGS) 1546 1547 NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, 1548 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) 1549 1550 # Used to detect the charset in a META tag; see start_meta 1551 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 1552 1553 def extractCharsetFromMeta(self, attrs): 1554 """Beautiful Soup can detect a charset included in a META tag, 1555 try to convert the document to that charset, and re-parse the 1556 document from the beginning.""" 1557 httpEquiv = None 1558 contentType = None 1559 contentTypeIndex = None 1560 tagNeedsEncodingSubstitution = False 1561 1562 for i in range(0, len(attrs)): 1563 key, value = attrs[i] 1564 key = key.lower() 1565 if key == 'http-equiv': 1566 httpEquiv = value 1567 elif key == 'content': 1568 contentType = value 1569 contentTypeIndex = i 1570 1571 if httpEquiv and contentType: # It's an interesting meta tag. 1572 match = self.CHARSET_RE.search(contentType) 1573 if match: 1574 if (self.declaredHTMLEncoding is not None or 1575 self.originalEncoding == self.fromEncoding): 1576 # An HTML encoding was sniffed while converting 1577 # the document to Unicode, or an HTML encoding was 1578 # sniffed during a previous pass through the 1579 # document, or an encoding was specified 1580 # explicitly and it worked. Rewrite the meta tag. 1581 def rewrite(match): 1582 return match.group(1) + "%SOUP-ENCODING%" 1583 newAttr = self.CHARSET_RE.sub(rewrite, contentType) 1584 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], 1585 newAttr) 1586 tagNeedsEncodingSubstitution = True 1587 else: 1588 # This is our first pass through the document. 1589 # Go through it again with the encoding information. 1590 newCharset = match.group(3) 1591 if newCharset and newCharset != self.originalEncoding: 1592 self.declaredHTMLEncoding = newCharset 1593 self._feed(self.declaredHTMLEncoding) 1594 raise StopParsing 1595 pass 1596 tag = self.unknown_starttag("meta", attrs) 1597 if tag and tagNeedsEncodingSubstitution: 1598 tag.containsSubstitutions = True 1599 1600 1601class StopParsing(Exception): 1602 pass 1603 1604class ICantBelieveItsBeautifulSoup(BeautifulSoup): 1605 1606 """The BeautifulSoup class is oriented towards skipping over 1607 common HTML errors like unclosed tags. However, sometimes it makes 1608 errors of its own. For instance, consider this fragment: 1609 1610 <b>Foo<b>Bar</b></b> 1611 1612 This is perfectly valid (if bizarre) HTML. However, the 1613 BeautifulSoup class will implicitly close the first b tag when it 1614 encounters the second 'b'. It will think the author wrote 1615 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because 1616 there's no real-world reason to bold something that's already 1617 bold. When it encounters '</b></b>' it will close two more 'b' 1618 tags, for a grand total of three tags closed instead of two. This 1619 can throw off the rest of your document structure. The same is 1620 true of a number of other tags, listed below. 1621 1622 It's much more common for someone to forget to close a 'b' tag 1623 than to actually use nested 'b' tags, and the BeautifulSoup class 1624 handles the common case. This class handles the not-co-common 1625 case: where you can't believe someone wrote what they did, but 1626 it's valid HTML and BeautifulSoup screwed up by assuming it 1627 wouldn't be.""" 1628 1629 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ 1630 ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 1631 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 1632 'big'] 1633 1634 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] 1635 1636 NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, 1637 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, 1638 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) 1639 1640class MinimalSoup(BeautifulSoup): 1641 """The MinimalSoup class is for parsing HTML that contains 1642 pathologically bad markup. It makes no assumptions about tag 1643 nesting, but it does know which tags are self-closing, that 1644 <script> tags contain Javascript and should not be parsed, that 1645 META tags may contain encoding information, and so on. 1646 1647 This also makes it better for subclassing than BeautifulStoneSoup 1648 or BeautifulSoup.""" 1649 1650 RESET_NESTING_TAGS = buildTagMap('noscript') 1651 NESTABLE_TAGS = {} 1652 1653class BeautifulSOAP(BeautifulStoneSoup): 1654 """This class will push a tag with only a single string child into 1655 the tag's parent as an attribute. The attribute's name is the tag 1656 name, and the value is the string child. An example should give 1657 the flavor of the change: 1658 1659 <foo><bar>baz</bar></foo> 1660 => 1661 <foo bar="baz"><bar>baz</bar></foo> 1662 1663 You can then access fooTag['bar'] instead of fooTag.barTag.string. 1664 1665 This is, of course, useful for scraping structures that tend to 1666 use subelements instead of attributes, such as SOAP messages. Note 1667 that it modifies its input, so don't print the modified version 1668 out. 1669 1670 I'm not sure how many people really want to use this class; let me 1671 know if you do. Mainly I like the name.""" 1672 1673 def popTag(self): 1674 if len(self.tagStack) > 1: 1675 tag = self.tagStack[-1] 1676 parent = self.tagStack[-2] 1677 parent._getAttrMap() 1678 if (isinstance(tag, Tag) and len(tag.contents) == 1 and 1679 isinstance(tag.contents[0], NavigableString) and 1680 not parent.attrMap.has_key(tag.name)): 1681 parent[tag.name] = tag.contents[0] 1682 BeautifulStoneSoup.popTag(self) 1683 1684#Enterprise class names! It has come to our attention that some people 1685#think the names of the Beautiful Soup parser classes are too silly 1686#and "unprofessional" for use in enterprise screen-scraping. We feel 1687#your pain! For such-minded folk, the Beautiful Soup Consortium And 1688#All-Night Kosher Bakery recommends renaming this file to 1689#"RobustParser.py" (or, in cases of extreme enterprisiness, 1690#"RobustParserBeanInterface.class") and using the following 1691#enterprise-friendly class aliases: 1692class RobustXMLParser(BeautifulStoneSoup): 1693 pass 1694class RobustHTMLParser(BeautifulSoup): 1695 pass 1696class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): 1697 pass 1698class RobustInsanelyWackAssHTMLParser(MinimalSoup): 1699 pass 1700class SimplifyingSOAPParser(BeautifulSOAP): 1701 pass 1702 1703###################################################### 1704# 1705# Bonus library: Unicode, Dammit 1706# 1707# This class forces XML data into a standard format (usually to UTF-8 1708# or Unicode). It is heavily based on code from Mark Pilgrim's 1709# Universal Feed Parser. It does not rewrite the XML or HTML to 1710# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi 1711# (XML) and BeautifulSoup.start_meta (HTML). 1712 1713# Autodetects character encodings. 1714# Download from http://chardet.feedparser.org/ 1715try: 1716 import chardet 1717# import chardet.constants 1718# chardet.constants._debug = 1 1719except ImportError: 1720 chardet = None 1721 1722# cjkcodecs and iconv_codec make Python know about more character encodings. 1723# Both are available from http://cjkpython.i18n.org/ 1724# They're built in if you use Python 2.4. 1725try: 1726 import cjkcodecs.aliases 1727except ImportError: 1728 pass 1729try: 1730 import iconv_codec 1731except ImportError: 1732 pass 1733 1734class UnicodeDammit: 1735 """A class for detecting the encoding of a *ML document and 1736 converting it to a Unicode string. If the source encoding is 1737 windows-1252, can replace MS smart quotes with their HTML or XML 1738 equivalents.""" 1739 1740 # This dictionary maps commonly seen values for "charset" in HTML 1741 # meta tags to the corresponding Python codec names. It only covers 1742 # values that aren't in Python's aliases and can't be determined 1743 # by the heuristics in find_codec. 1744 CHARSET_ALIASES = { "macintosh" : "mac-roman", 1745 "x-sjis" : "shift-jis" } 1746 1747 def __init__(self, markup, overrideEncodings=[], 1748 smartQuotesTo='xml', isHTML=False): 1749 self.declaredHTMLEncoding = None 1750 self.markup, documentEncoding, sniffedEncoding = \ 1751 self._detectEncoding(markup, isHTML) 1752 self.smartQuotesTo = smartQuotesTo 1753 self.triedEncodings = [] 1754 if markup == '' or isinstance(markup, unicode): 1755 self.originalEncoding = None 1756 self.unicode = unicode(markup) 1757 return 1758 1759 u = None 1760 for proposedEncoding in overrideEncodings: 1761 u = self._convertFrom(proposedEncoding) 1762 if u: break 1763 if not u: 1764 for proposedEncoding in (documentEncoding, sniffedEncoding): 1765 u = self._convertFrom(proposedEncoding) 1766 if u: break 1767 1768 # If no luck and we have auto-detection library, try that: 1769 if not u and chardet and not isinstance(self.markup, unicode): 1770 u = self._convertFrom(chardet.detect(self.markup)['encoding']) 1771 1772 # As a last resort, try utf-8 and windows-1252: 1773 if not u: 1774 for proposed_encoding in ("utf-8", "windows-1252"): 1775 u = self._convertFrom(proposed_encoding) 1776 if u: break 1777 1778 self.unicode = u 1779 if not u: self.originalEncoding = None 1780 1781 def _subMSChar(self, match): 1782 """Changes a MS smart quote character to an XML or HTML 1783 entity.""" 1784 orig = match.group(1) 1785 sub = self.MS_CHARS.get(orig) 1786 if type(sub) == types.TupleType: 1787 if self.smartQuotesTo == 'xml': 1788 sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 1789 else: 1790 sub = '&'.encode() + sub[0].encode() + ';'.encode() 1791 else: 1792 sub = sub.encode() 1793 return sub 1794 1795 def _convertFrom(self, proposed): 1796 proposed = self.find_codec(proposed) 1797 if not proposed or proposed in self.triedEncodings: 1798 return None 1799 self.triedEncodings.append(proposed) 1800 markup = self.markup 1801 1802 # Convert smart quotes to HTML if coming from an encoding 1803 # that might have them. 1804 if self.smartQuotesTo and proposed.lower() in("windows-1252", 1805 "iso-8859-1", 1806 "iso-8859-2"): 1807 smart_quotes_re = "([\x80-\x9f])" 1808 smart_quotes_compiled = re.compile(smart_quotes_re) 1809 markup = smart_quotes_compiled.sub(self._subMSChar, markup) 1810 1811 try: 1812 # print "Trying to convert document to %s" % proposed 1813 u = self._toUnicode(markup, proposed) 1814 self.markup = u 1815 self.originalEncoding = proposed 1816 except Exception, e: 1817 # print "That didn't work!" 1818 # print e 1819 return None 1820 #print "Correct encoding: %s" % proposed 1821 return self.markup 1822 1823 def _toUnicode(self, data, encoding): 1824 '''Given a string and its encoding, decodes the string into Unicode. 1825 %encoding is a string recognized by encodings.aliases''' 1826 1827 # strip Byte Order Mark (if present) 1828 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ 1829 and (data[2:4] != '\x00\x00'): 1830 encoding = 'utf-16be' 1831 data = data[2:] 1832 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ 1833 and (data[2:4] != '\x00\x00'): 1834 encoding = 'utf-16le' 1835 data = data[2:] 1836 elif data[:3] == '\xef\xbb\xbf': 1837 encoding = 'utf-8' 1838 data = data[3:] 1839 elif data[:4] == '\x00\x00\xfe\xff': 1840 encoding = 'utf-32be' 1841 data = data[4:] 1842 elif data[:4] == '\xff\xfe\x00\x00': 1843 encoding = 'utf-32le' 1844 data = data[4:] 1845 newdata = unicode(data, encoding) 1846 return newdata 1847 1848 def _detectEncoding(self, xml_data, isHTML=False): 1849 """Given a document, tries to detect its XML encoding.""" 1850 xml_encoding = sniffed_xml_encoding = None 1851 try: 1852 if xml_data[:4] == '\x4c\x6f\xa7\x94': 1853 # EBCDIC 1854 xml_data = self._ebcdic_to_ascii(xml_data) 1855 elif xml_data[:4] == '\x00\x3c\x00\x3f': 1856 # UTF-16BE 1857 sniffed_xml_encoding = 'utf-16be' 1858 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') 1859 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ 1860 and (xml_data[2:4] != '\x00\x00'): 1861 # UTF-16BE with BOM 1862 sniffed_xml_encoding = 'utf-16be' 1863 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') 1864 elif xml_data[:4] == '\x3c\x00\x3f\x00': 1865 # UTF-16LE 1866 sniffed_xml_encoding = 'utf-16le' 1867 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') 1868 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ 1869 (xml_data[2:4] != '\x00\x00'): 1870 # UTF-16LE with BOM 1871 sniffed_xml_encoding = 'utf-16le' 1872 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') 1873 elif xml_data[:4] == '\x00\x00\x00\x3c': 1874 # UTF-32BE 1875 sniffed_xml_encoding = 'utf-32be' 1876 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') 1877 elif xml_data[:4] == '\x3c\x00\x00\x00': 1878 # UTF-32LE 1879 sniffed_xml_encoding = 'utf-32le' 1880 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') 1881 elif xml_data[:4] == '\x00\x00\xfe\xff': 1882 # UTF-32BE with BOM 1883 sniffed_xml_encoding = 'utf-32be' 1884 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') 1885 elif xml_data[:4] == '\xff\xfe\x00\x00': 1886 # UTF-32LE with BOM 1887 sniffed_xml_encoding = 'utf-32le' 1888 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') 1889 elif xml_data[:3] == '\xef\xbb\xbf': 1890 # UTF-8 with BOM 1891 sniffed_xml_encoding = 'utf-8' 1892 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') 1893 else: 1894 sniffed_xml_encoding = 'ascii' 1895 pass 1896 except: 1897 xml_encoding_match = None 1898 xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() 1899 xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) 1900 if not xml_encoding_match and isHTML: 1901 meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() 1902 regexp = re.compile(meta_re, re.I) 1903 xml_encoding_match = regexp.search(xml_data) 1904 if xml_encoding_match is not None: 1905 xml_encoding = xml_encoding_match.groups()[0].decode( 1906 'ascii').lower() 1907 if isHTML: 1908 self.declaredHTMLEncoding = xml_encoding 1909 if sniffed_xml_encoding and \ 1910 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 1911 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 1912 'utf-16', 'utf-32', 'utf_16', 'utf_32', 1913 'utf16', 'u16')): 1914 xml_encoding = sniffed_xml_encoding 1915 return xml_data, xml_encoding, sniffed_xml_encoding 1916 1917 1918 def find_codec(self, charset): 1919 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ 1920 or (charset and self._codec(charset.replace("-", ""))) \ 1921 or (charset and self._codec(charset.replace("-", "_"))) \ 1922 or charset 1923 1924 def _codec(self, charset): 1925 if not charset: return charset 1926 codec = None 1927 try: 1928 codecs.lookup(charset) 1929 codec = charset 1930 except (LookupError, ValueError): 1931 pass 1932 return codec 1933 1934 EBCDIC_TO_ASCII_MAP = None 1935 def _ebcdic_to_ascii(self, s): 1936 c = self.__class__ 1937 if not c.EBCDIC_TO_ASCII_MAP: 1938 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 1939 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 1940 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 1941 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 1942 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 1943 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 1944 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 1945 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 1946 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 1947 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 1948 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 1949 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 1950 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 1951 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 1952 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 1953 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 1954 250,251,252,253,254,255) 1955 import string 1956 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ 1957 ''.join(map(chr, range(256))), ''.join(map(chr, emap))) 1958 return s.translate(c.EBCDIC_TO_ASCII_MAP) 1959 1960 MS_CHARS = { '\x80' : ('euro', '20AC'), 1961 '\x81' : ' ', 1962 '\x82' : ('sbquo', '201A'), 1963 '\x83' : ('fnof', '192'), 1964 '\x84' : ('bdquo', '201E'), 1965 '\x85' : ('hellip', '2026'), 1966 '\x86' : ('dagger', '2020'), 1967 '\x87' : ('Dagger', '2021'), 1968 '\x88' : ('circ', '2C6'), 1969 '\x89' : ('permil', '2030'), 1970 '\x8A' : ('Scaron', '160'), 1971 '\x8B' : ('lsaquo', '2039'), 1972 '\x8C' : ('OElig', '152'), 1973 '\x8D' : '?', 1974 '\x8E' : ('#x17D', '17D'), 1975 '\x8F' : '?', 1976 '\x90' : '?', 1977 '\x91' : ('lsquo', '2018'), 1978 '\x92' : ('rsquo', '2019'), 1979 '\x93' : ('ldquo', '201C'), 1980 '\x94' : ('rdquo', '201D'), 1981 '\x95' : ('bull', '2022'), 1982 '\x96' : ('ndash', '2013'), 1983 '\x97' : ('mdash', '2014'), 1984 '\x98' : ('tilde', '2DC'), 1985 '\x99' : ('trade', '2122'), 1986 '\x9a' : ('scaron', '161'), 1987 '\x9b' : ('rsaquo', '203A'), 1988 '\x9c' : ('oelig', '153'), 1989 '\x9d' : '?', 1990 '\x9e' : ('#x17E', '17E'), 1991 '\x9f' : ('Yuml', ''),} 1992 1993####################################################################### 1994 1995 1996#By default, act as an HTML pretty-printer. 1997if __name__ == '__main__': 1998 import sys 1999 soup = BeautifulSoup(sys.stdin) 2000 print soup.prettify() 2001