1"""Beautiful Soup 2Elixir and Tonic 3"The Screen-Scraper's Friend" 4http://www.crummy.com/software/BeautifulSoup/ 5 6Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7tree representation. It provides methods and Pythonic idioms that make 8it easy to navigate, search, and modify the tree. 9 10A well-formed XML/HTML document yields a well-formed data 11structure. An ill-formed XML/HTML document yields a correspondingly 12ill-formed data structure. If your document is only locally 13well-formed, you can use this library to find and process the 14well-formed part of it. 15 16Beautiful Soup works with Python 2.2 and up. It has no external 17dependencies, but you'll have more success at converting data to UTF-8 18if you also install these three packages: 19 20* chardet, for auto-detecting character encodings 21 http://chardet.feedparser.org/ 22* cjkcodecs and iconv_codec, which add more encodings to the ones supported 23 by stock Python. 24 http://cjkpython.i18n.org/ 25 26Beautiful Soup defines classes for two main parsing strategies: 27 28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 language that kind of looks like XML. 30 31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 or invalid. This class has web browser-like heuristics for 33 obtaining a sensible parse tree in the face of common HTML errors. 34 35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36the encoding of an HTML or XML document, and converting it to 37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 38 39For more than you ever wanted to know about Beautiful Soup, see the 40documentation: 41http://www.crummy.com/software/BeautifulSoup/documentation.html 42 43Here, have some legalese: 44 45Copyright (c) 2004-2010, Leonard Richardson 46 47All rights reserved. 
48 49Redistribution and use in source and binary forms, with or without 50modification, are permitted provided that the following conditions are 51met: 52 53 * Redistributions of source code must retain the above copyright 54 notice, this list of conditions and the following disclaimer. 55 56 * Redistributions in binary form must reproduce the above 57 copyright notice, this list of conditions and the following 58 disclaimer in the documentation and/or other materials provided 59 with the distribution. 60 61 * Neither the name of the the Beautiful Soup Consortium and All 62 Night Kosher Bakery nor the names of its contributors may be 63 used to endorse or promote products derived from this software 64 without specific prior written permission. 65 66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
77 78""" 79from __future__ import generators 80 81__author__ = "Leonard Richardson (leonardr@segfault.org)" 82__version__ = "3.2.1" 83__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" 84__license__ = "New-style BSD" 85 86from sgmllib import SGMLParser, SGMLParseError 87import codecs 88import markupbase 89import types 90import re 91import sgmllib 92try: 93 from htmlentitydefs import name2codepoint 94except ImportError: 95 name2codepoint = {} 96try: 97 set 98except NameError: 99 from sets import Set as set 100 101#These hacks make Beautiful Soup able to parse XML with namespaces 102sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 103markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 104 105DEFAULT_OUTPUT_ENCODING = "utf-8" 106 107def _match_css_class(str): 108 """Build a RE to match the given CSS class.""" 109 return re.compile(r"(^|.*\s)%s($|\s)" % str) 110 111# First, the classes that represent markup elements. 112 113class PageElement(object): 114 """Contains the navigational information for some part of the page 115 (either a tag or a piece of text)""" 116 117 def _invert(h): 118 "Cheap function to invert a hash." 
119 i = {} 120 for k,v in h.items(): 121 i[v] = k 122 return i 123 124 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 125 "quot" : '"', 126 "amp" : "&", 127 "lt" : "<", 128 "gt" : ">" } 129 130 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 131 132 def setup(self, parent=None, previous=None): 133 """Sets up the initial relations between this element and 134 other elements.""" 135 self.parent = parent 136 self.previous = previous 137 self.next = None 138 self.previousSibling = None 139 self.nextSibling = None 140 if self.parent and self.parent.contents: 141 self.previousSibling = self.parent.contents[-1] 142 self.previousSibling.nextSibling = self 143 144 def replaceWith(self, replaceWith): 145 oldParent = self.parent 146 myIndex = self.parent.index(self) 147 if hasattr(replaceWith, "parent")\ 148 and replaceWith.parent is self.parent: 149 # We're replacing this element with one of its siblings. 150 index = replaceWith.parent.index(replaceWith) 151 if index and index < myIndex: 152 # Furthermore, it comes before this element. That 153 # means that when we extract it, the index of this 154 # element will change. 155 myIndex = myIndex - 1 156 self.extract() 157 oldParent.insert(myIndex, replaceWith) 158 159 def replaceWithChildren(self): 160 myParent = self.parent 161 myIndex = self.parent.index(self) 162 self.extract() 163 reversedChildren = list(self.contents) 164 reversedChildren.reverse() 165 for child in reversedChildren: 166 myParent.insert(myIndex, child) 167 168 def extract(self): 169 """Destructively rips this element out of the tree.""" 170 if self.parent: 171 try: 172 del self.parent.contents[self.parent.index(self)] 173 except ValueError: 174 pass 175 176 #Find the two elements that would be next to each other if 177 #this element (and any children) hadn't been parsed. Connect 178 #the two. 
179 lastChild = self._lastRecursiveChild() 180 nextElement = lastChild.next 181 182 if self.previous: 183 self.previous.next = nextElement 184 if nextElement: 185 nextElement.previous = self.previous 186 self.previous = None 187 lastChild.next = None 188 189 self.parent = None 190 if self.previousSibling: 191 self.previousSibling.nextSibling = self.nextSibling 192 if self.nextSibling: 193 self.nextSibling.previousSibling = self.previousSibling 194 self.previousSibling = self.nextSibling = None 195 return self 196 197 def _lastRecursiveChild(self): 198 "Finds the last element beneath this object to be parsed." 199 lastChild = self 200 while hasattr(lastChild, 'contents') and lastChild.contents: 201 lastChild = lastChild.contents[-1] 202 return lastChild 203 204 def insert(self, position, newChild): 205 if isinstance(newChild, basestring) \ 206 and not isinstance(newChild, NavigableString): 207 newChild = NavigableString(newChild) 208 209 position = min(position, len(self.contents)) 210 if hasattr(newChild, 'parent') and newChild.parent is not None: 211 # We're 'inserting' an element that's already one 212 # of this object's children. 213 if newChild.parent is self: 214 index = self.index(newChild) 215 if index > position: 216 # Furthermore we're moving it further down the 217 # list of this object's children. That means that 218 # when we extract this element, our target index 219 # will jump down one. 
220 position = position - 1 221 newChild.extract() 222 223 newChild.parent = self 224 previousChild = None 225 if position == 0: 226 newChild.previousSibling = None 227 newChild.previous = self 228 else: 229 previousChild = self.contents[position-1] 230 newChild.previousSibling = previousChild 231 newChild.previousSibling.nextSibling = newChild 232 newChild.previous = previousChild._lastRecursiveChild() 233 if newChild.previous: 234 newChild.previous.next = newChild 235 236 newChildsLastElement = newChild._lastRecursiveChild() 237 238 if position >= len(self.contents): 239 newChild.nextSibling = None 240 241 parent = self 242 parentsNextSibling = None 243 while not parentsNextSibling: 244 parentsNextSibling = parent.nextSibling 245 parent = parent.parent 246 if not parent: # This is the last element in the document. 247 break 248 if parentsNextSibling: 249 newChildsLastElement.next = parentsNextSibling 250 else: 251 newChildsLastElement.next = None 252 else: 253 nextChild = self.contents[position] 254 newChild.nextSibling = nextChild 255 if newChild.nextSibling: 256 newChild.nextSibling.previousSibling = newChild 257 newChildsLastElement.next = nextChild 258 259 if newChildsLastElement.next: 260 newChildsLastElement.next.previous = newChildsLastElement 261 self.contents.insert(position, newChild) 262 263 def append(self, tag): 264 """Appends the given tag to the contents of this tag.""" 265 self.insert(len(self.contents), tag) 266 267 def findNext(self, name=None, attrs={}, text=None, **kwargs): 268 """Returns the first item that matches the given criteria and 269 appears after this Tag in the document.""" 270 return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 271 272 def findAllNext(self, name=None, attrs={}, text=None, limit=None, 273 **kwargs): 274 """Returns all items that match the given criteria and appear 275 after this Tag in the document.""" 276 return self._findAll(name, attrs, text, limit, self.nextGenerator, 277 **kwargs) 278 279 def 
findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 280 """Returns the closest sibling to this Tag that matches the 281 given criteria and appears after this Tag in the document.""" 282 return self._findOne(self.findNextSiblings, name, attrs, text, 283 **kwargs) 284 285 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 286 **kwargs): 287 """Returns the siblings of this Tag that match the given 288 criteria and appear after this Tag in the document.""" 289 return self._findAll(name, attrs, text, limit, 290 self.nextSiblingGenerator, **kwargs) 291 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 292 293 def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 294 """Returns the first item that matches the given criteria and 295 appears before this Tag in the document.""" 296 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 297 298 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 299 **kwargs): 300 """Returns all items that match the given criteria and appear 301 before this Tag in the document.""" 302 return self._findAll(name, attrs, text, limit, self.previousGenerator, 303 **kwargs) 304 fetchPrevious = findAllPrevious # Compatibility with pre-3.x 305 306 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 307 """Returns the closest sibling to this Tag that matches the 308 given criteria and appears before this Tag in the document.""" 309 return self._findOne(self.findPreviousSiblings, name, attrs, text, 310 **kwargs) 311 312 def findPreviousSiblings(self, name=None, attrs={}, text=None, 313 limit=None, **kwargs): 314 """Returns the siblings of this Tag that match the given 315 criteria and appear before this Tag in the document.""" 316 return self._findAll(name, attrs, text, limit, 317 self.previousSiblingGenerator, **kwargs) 318 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 319 320 def findParent(self, name=None, 
attrs={}, **kwargs): 321 """Returns the closest parent of this Tag that matches the given 322 criteria.""" 323 # NOTE: We can't use _findOne because findParents takes a different 324 # set of arguments. 325 r = None 326 l = self.findParents(name, attrs, 1) 327 if l: 328 r = l[0] 329 return r 330 331 def findParents(self, name=None, attrs={}, limit=None, **kwargs): 332 """Returns the parents of this Tag that match the given 333 criteria.""" 334 335 return self._findAll(name, attrs, None, limit, self.parentGenerator, 336 **kwargs) 337 fetchParents = findParents # Compatibility with pre-3.x 338 339 #These methods do the real heavy lifting. 340 341 def _findOne(self, method, name, attrs, text, **kwargs): 342 r = None 343 l = method(name, attrs, text, 1, **kwargs) 344 if l: 345 r = l[0] 346 return r 347 348 def _findAll(self, name, attrs, text, limit, generator, **kwargs): 349 "Iterates over a generator looking for things that match." 350 351 if isinstance(name, SoupStrainer): 352 strainer = name 353 # (Possibly) special case some findAll*(...) searches 354 elif text is None and not limit and not attrs and not kwargs: 355 # findAll*(True) 356 if name is True: 357 return [element for element in generator() 358 if isinstance(element, Tag)] 359 # findAll*('tag-name') 360 elif isinstance(name, basestring): 361 return [element for element in generator() 362 if isinstance(element, Tag) and 363 element.name == name] 364 else: 365 strainer = SoupStrainer(name, attrs, text, **kwargs) 366 # Build a SoupStrainer 367 else: 368 strainer = SoupStrainer(name, attrs, text, **kwargs) 369 results = ResultSet(strainer) 370 g = generator() 371 while True: 372 try: 373 i = g.next() 374 except StopIteration: 375 break 376 if i: 377 found = strainer.search(i) 378 if found: 379 results.append(found) 380 if limit and len(results) >= limit: 381 break 382 return results 383 384 #These Generators can be used to navigate starting from both 385 #NavigableStrings and Tags. 
386 def nextGenerator(self): 387 i = self 388 while i is not None: 389 i = i.next 390 yield i 391 392 def nextSiblingGenerator(self): 393 i = self 394 while i is not None: 395 i = i.nextSibling 396 yield i 397 398 def previousGenerator(self): 399 i = self 400 while i is not None: 401 i = i.previous 402 yield i 403 404 def previousSiblingGenerator(self): 405 i = self 406 while i is not None: 407 i = i.previousSibling 408 yield i 409 410 def parentGenerator(self): 411 i = self 412 while i is not None: 413 i = i.parent 414 yield i 415 416 # Utility methods 417 def substituteEncoding(self, str, encoding=None): 418 encoding = encoding or "utf-8" 419 return str.replace("%SOUP-ENCODING%", encoding) 420 421 def toEncoding(self, s, encoding=None): 422 """Encodes an object to a string in some encoding, or to Unicode. 423 .""" 424 if isinstance(s, unicode): 425 if encoding: 426 s = s.encode(encoding) 427 elif isinstance(s, str): 428 if encoding: 429 s = s.encode(encoding) 430 else: 431 s = unicode(s) 432 else: 433 if encoding: 434 s = self.toEncoding(str(s), encoding) 435 else: 436 s = unicode(s) 437 return s 438 439 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 440 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 441 + ")") 442 443 def _sub_entity(self, x): 444 """Used with a regular expression to substitute the 445 appropriate XML entity for an XML special character.""" 446 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 447 448 449class NavigableString(unicode, PageElement): 450 451 def __new__(cls, value): 452 """Create a new NavigableString. 453 454 When unpickling a NavigableString, this method is called with 455 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 456 passed in to the superclass's __new__ or the superclass won't know 457 how to handle non-ASCII characters. 
458 """ 459 if isinstance(value, unicode): 460 return unicode.__new__(cls, value) 461 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 462 463 def __getnewargs__(self): 464 return (NavigableString.__str__(self),) 465 466 def __getattr__(self, attr): 467 """text.string gives you text. This is for backwards 468 compatibility for Navigable*String, but for CData* it lets you 469 get the string without the CData wrapper.""" 470 if attr == 'string': 471 return self 472 else: 473 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 474 475 def __unicode__(self): 476 return str(self).decode(DEFAULT_OUTPUT_ENCODING) 477 478 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 479 # Substitute outgoing XML entities. 480 data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) 481 if encoding: 482 return data.encode(encoding) 483 else: 484 return data 485 486class CData(NavigableString): 487 488 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 489 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) 490 491class ProcessingInstruction(NavigableString): 492 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 493 output = self 494 if "%SOUP-ENCODING%" in output: 495 output = self.substituteEncoding(output, encoding) 496 return "<?%s?>" % self.toEncoding(output, encoding) 497 498class Comment(NavigableString): 499 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 500 return "<!--%s-->" % NavigableString.__str__(self, encoding) 501 502class Declaration(NavigableString): 503 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 504 return "<!%s>" % NavigableString.__str__(self, encoding) 505 506class Tag(PageElement): 507 508 """Represents a found HTML tag with its attributes and contents.""" 509 510 def _convertEntities(self, match): 511 """Used in a call to re.sub to replace HTML, XML, and numeric 512 entities with the appropriate Unicode characters. 
If HTML 513 entities are being converted, any unrecognized entities are 514 escaped.""" 515 x = match.group(1) 516 if self.convertHTMLEntities and x in name2codepoint: 517 return unichr(name2codepoint[x]) 518 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 519 if self.convertXMLEntities: 520 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 521 else: 522 return u'&%s;' % x 523 elif len(x) > 0 and x[0] == '#': 524 # Handle numeric entities 525 if len(x) > 1 and x[1] == 'x': 526 return unichr(int(x[2:], 16)) 527 else: 528 return unichr(int(x[1:])) 529 530 elif self.escapeUnrecognizedEntities: 531 return u'&%s;' % x 532 else: 533 return u'&%s;' % x 534 535 def __init__(self, parser, name, attrs=None, parent=None, 536 previous=None): 537 "Basic constructor." 538 539 # We don't actually store the parser object: that lets extracted 540 # chunks be garbage-collected 541 self.parserClass = parser.__class__ 542 self.isSelfClosing = parser.isSelfClosingTag(name) 543 self.name = name 544 if attrs is None: 545 attrs = [] 546 elif isinstance(attrs, dict): 547 attrs = attrs.items() 548 self.attrs = attrs 549 self.contents = [] 550 self.setup(parent, previous) 551 self.hidden = False 552 self.containsSubstitutions = False 553 self.convertHTMLEntities = parser.convertHTMLEntities 554 self.convertXMLEntities = parser.convertXMLEntities 555 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 556 557 # Convert any HTML, XML, or numeric entities in the attribute values. 
558 convert = lambda(k, val): (k, 559 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 560 self._convertEntities, 561 val)) 562 self.attrs = map(convert, self.attrs) 563 564 def getString(self): 565 if (len(self.contents) == 1 566 and isinstance(self.contents[0], NavigableString)): 567 return self.contents[0] 568 569 def setString(self, string): 570 """Replace the contents of the tag with a string""" 571 self.clear() 572 self.append(string) 573 574 string = property(getString, setString) 575 576 def getText(self, separator=u""): 577 if not len(self.contents): 578 return u"" 579 stopNode = self._lastRecursiveChild().next 580 strings = [] 581 current = self.contents[0] 582 while current is not stopNode: 583 if isinstance(current, NavigableString): 584 strings.append(current.strip()) 585 current = current.next 586 return separator.join(strings) 587 588 text = property(getText) 589 590 def get(self, key, default=None): 591 """Returns the value of the 'key' attribute for the tag, or 592 the value given for 'default' if it doesn't have that 593 attribute.""" 594 return self._getAttrMap().get(key, default) 595 596 def clear(self): 597 """Extract all children.""" 598 for child in self.contents[:]: 599 child.extract() 600 601 def index(self, element): 602 for i, child in enumerate(self.contents): 603 if child is element: 604 return i 605 raise ValueError("Tag.index: element not in tag") 606 607 def has_key(self, key): 608 return self._getAttrMap().has_key(key) 609 610 def __getitem__(self, key): 611 """tag[key] returns the value of the 'key' attribute for the tag, 612 and throws an exception if it's not there.""" 613 return self._getAttrMap()[key] 614 615 def __iter__(self): 616 "Iterating over a tag iterates over its contents." 617 return iter(self.contents) 618 619 def __len__(self): 620 "The length of a tag is the length of its list of contents." 
621 return len(self.contents) 622 623 def __contains__(self, x): 624 return x in self.contents 625 626 def __nonzero__(self): 627 "A tag is non-None even if it has no contents." 628 return True 629 630 def __setitem__(self, key, value): 631 """Setting tag[key] sets the value of the 'key' attribute for the 632 tag.""" 633 self._getAttrMap() 634 self.attrMap[key] = value 635 found = False 636 for i in range(0, len(self.attrs)): 637 if self.attrs[i][0] == key: 638 self.attrs[i] = (key, value) 639 found = True 640 if not found: 641 self.attrs.append((key, value)) 642 self._getAttrMap()[key] = value 643 644 def __delitem__(self, key): 645 "Deleting tag[key] deletes all 'key' attributes for the tag." 646 for item in self.attrs: 647 if item[0] == key: 648 self.attrs.remove(item) 649 #We don't break because bad HTML can define the same 650 #attribute multiple times. 651 self._getAttrMap() 652 if self.attrMap.has_key(key): 653 del self.attrMap[key] 654 655 def __call__(self, *args, **kwargs): 656 """Calling a tag like a function is the same as calling its 657 findAll() method. Eg. tag('a') returns a list of all the A tags 658 found within this tag.""" 659 return apply(self.findAll, args, kwargs) 660 661 def __getattr__(self, tag): 662 #print "Getattr %s.%s" % (self.__class__, tag) 663 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 664 return self.find(tag[:-3]) 665 elif tag.find('__') != 0: 666 return self.find(tag) 667 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) 668 669 def __eq__(self, other): 670 """Returns true iff this tag has the same name, the same attributes, 671 and the same contents (recursively) as the given tag. 672 673 NOTE: right now this will return false if two tags have the 674 same attributes in a different order. 
Should this be fixed?""" 675 if other is self: 676 return True 677 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 678 return False 679 for i in range(0, len(self.contents)): 680 if self.contents[i] != other.contents[i]: 681 return False 682 return True 683 684 def __ne__(self, other): 685 """Returns true iff this tag is not identical to the other tag, 686 as defined in __eq__.""" 687 return not self == other 688 689 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 690 """Renders this tag as a string.""" 691 return self.__str__(encoding) 692 693 def __unicode__(self): 694 return self.__str__(None) 695 696 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 697 prettyPrint=False, indentLevel=0): 698 """Returns a string or Unicode representation of this tag and 699 its contents. To get Unicode, pass None for encoding. 700 701 NOTE: since Python's HTML parser consumes whitespace, this 702 method is not certain to reproduce the whitespace present in 703 the original string.""" 704 705 encodedName = self.toEncoding(self.name, encoding) 706 707 attrs = [] 708 if self.attrs: 709 for key, val in self.attrs: 710 fmt = '%s="%s"' 711 if isinstance(val, basestring): 712 if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 713 val = self.substituteEncoding(val, encoding) 714 715 # The attribute value either: 716 # 717 # * Contains no embedded double quotes or single quotes. 718 # No problem: we enclose it in double quotes. 719 # * Contains embedded single quotes. No problem: 720 # double quotes work here too. 721 # * Contains embedded double quotes. No problem: 722 # we enclose it in single quotes. 723 # * Embeds both single _and_ double quotes. This 724 # can't happen naturally, but it can happen if 725 # you modify an attribute value after parsing 726 # the document. Now we have a bit of a 727 # problem. 
We solve it by enclosing the 728 # attribute in single quotes, and escaping any 729 # embedded single quotes to XML entities. 730 if '"' in val: 731 fmt = "%s='%s'" 732 if "'" in val: 733 # TODO: replace with apos when 734 # appropriate. 735 val = val.replace("'", "&squot;") 736 737 # Now we're okay w/r/t quotes. But the attribute 738 # value might also contain angle brackets, or 739 # ampersands that aren't part of entities. We need 740 # to escape those to XML entities too. 741 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 742 743 attrs.append(fmt % (self.toEncoding(key, encoding), 744 self.toEncoding(val, encoding))) 745 close = '' 746 closeTag = '' 747 if self.isSelfClosing: 748 close = ' /' 749 else: 750 closeTag = '</%s>' % encodedName 751 752 indentTag, indentContents = 0, 0 753 if prettyPrint: 754 indentTag = indentLevel 755 space = (' ' * (indentTag-1)) 756 indentContents = indentTag + 1 757 contents = self.renderContents(encoding, prettyPrint, indentContents) 758 if self.hidden: 759 s = contents 760 else: 761 s = [] 762 attributeString = '' 763 if attrs: 764 attributeString = ' ' + ' '.join(attrs) 765 if prettyPrint: 766 s.append(space) 767 s.append('<%s%s%s>' % (encodedName, attributeString, close)) 768 if prettyPrint: 769 s.append("\n") 770 s.append(contents) 771 if prettyPrint and contents and contents[-1] != "\n": 772 s.append("\n") 773 if prettyPrint and closeTag: 774 s.append(space) 775 s.append(closeTag) 776 if prettyPrint and closeTag and self.nextSibling: 777 s.append("\n") 778 s = ''.join(s) 779 return s 780 781 def decompose(self): 782 """Recursively destroys the contents of this tree.""" 783 self.extract() 784 if len(self.contents) == 0: 785 return 786 current = self.contents[0] 787 while current is not None: 788 next = current.next 789 if isinstance(current, Tag): 790 del current.contents[:] 791 current.parent = None 792 current.previous = None 793 current.previousSibling = None 794 current.next = None 795 
current.nextSibling = None 796 current = next 797 798 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 799 return self.__str__(encoding, True) 800 801 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 802 prettyPrint=False, indentLevel=0): 803 """Renders the contents of this tag as a string in the given 804 encoding. If encoding is None, returns a Unicode string..""" 805 s=[] 806 for c in self: 807 text = None 808 if isinstance(c, NavigableString): 809 text = c.__str__(encoding) 810 elif isinstance(c, Tag): 811 s.append(c.__str__(encoding, prettyPrint, indentLevel)) 812 if text and prettyPrint: 813 text = text.strip() 814 if text: 815 if prettyPrint: 816 s.append(" " * (indentLevel-1)) 817 s.append(text) 818 if prettyPrint: 819 s.append("\n") 820 return ''.join(s) 821 822 #Soup methods 823 824 def find(self, name=None, attrs={}, recursive=True, text=None, 825 **kwargs): 826 """Return only the first child of this Tag matching the given 827 criteria.""" 828 r = None 829 l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 830 if l: 831 r = l[0] 832 return r 833 findChild = find 834 835 def findAll(self, name=None, attrs={}, recursive=True, text=None, 836 limit=None, **kwargs): 837 """Extracts a list of Tag objects that match the given 838 criteria. You can specify the name of the Tag and any 839 attributes you want the Tag to have. 840 841 The value of a key-value pair in the 'attrs' map can be a 842 string, a list of strings, a regular expression object, or a 843 callable that takes a string and returns whether or not the 844 string matches for some custom definition of 'matches'. 
The 845 same is true of the tag name.""" 846 generator = self.recursiveChildGenerator 847 if not recursive: 848 generator = self.childGenerator 849 return self._findAll(name, attrs, text, limit, generator, **kwargs) 850 findChildren = findAll 851 852 # Pre-3.x compatibility methods 853 first = find 854 fetch = findAll 855 856 def fetchText(self, text=None, recursive=True, limit=None): 857 return self.findAll(text=text, recursive=recursive, limit=limit) 858 859 def firstText(self, text=None, recursive=True): 860 return self.find(text=text, recursive=recursive) 861 862 #Private methods 863 864 def _getAttrMap(self): 865 """Initializes a map representation of this tag's attributes, 866 if not already initialized.""" 867 if not getattr(self, 'attrMap'): 868 self.attrMap = {} 869 for (key, value) in self.attrs: 870 self.attrMap[key] = value 871 return self.attrMap 872 873 #Generator methods 874 def childGenerator(self): 875 # Just use the iterator from the contents 876 return iter(self.contents) 877 878 def recursiveChildGenerator(self): 879 if not len(self.contents): 880 raise StopIteration 881 stopNode = self._lastRecursiveChild().next 882 current = self.contents[0] 883 while current is not stopNode: 884 yield current 885 current = current.next 886 887 888# Next, a couple classes to represent queries and their results. 
889class SoupStrainer: 890 """Encapsulates a number of ways of matching a markup element (tag or 891 text).""" 892 893 def __init__(self, name=None, attrs={}, text=None, **kwargs): 894 self.name = name 895 if isinstance(attrs, basestring): 896 kwargs['class'] = _match_css_class(attrs) 897 attrs = None 898 if kwargs: 899 if attrs: 900 attrs = attrs.copy() 901 attrs.update(kwargs) 902 else: 903 attrs = kwargs 904 self.attrs = attrs 905 self.text = text 906 907 def __str__(self): 908 if self.text: 909 return self.text 910 else: 911 return "%s|%s" % (self.name, self.attrs) 912 913 def searchTag(self, markupName=None, markupAttrs={}): 914 found = None 915 markup = None 916 if isinstance(markupName, Tag): 917 markup = markupName 918 markupAttrs = markup 919 callFunctionWithTagData = callable(self.name) \ 920 and not isinstance(markupName, Tag) 921 922 if (not self.name) \ 923 or callFunctionWithTagData \ 924 or (markup and self._matches(markup, self.name)) \ 925 or (not markup and self._matches(markupName, self.name)): 926 if callFunctionWithTagData: 927 match = self.name(markupName, markupAttrs) 928 else: 929 match = True 930 markupAttrMap = None 931 for attr, matchAgainst in self.attrs.items(): 932 if not markupAttrMap: 933 if hasattr(markupAttrs, 'get'): 934 markupAttrMap = markupAttrs 935 else: 936 markupAttrMap = {} 937 for k,v in markupAttrs: 938 markupAttrMap[k] = v 939 attrValue = markupAttrMap.get(attr) 940 if not self._matches(attrValue, matchAgainst): 941 match = False 942 break 943 if match: 944 if markup: 945 found = markup 946 else: 947 found = markupName 948 return found 949 950 def search(self, markup): 951 #print 'looking for %s in %s' % (self, markup) 952 found = None 953 # If given a list of items, scan it for a text element that 954 # matches. 
955 if hasattr(markup, "__iter__") \ 956 and not isinstance(markup, Tag): 957 for element in markup: 958 if isinstance(element, NavigableString) \ 959 and self.search(element): 960 found = element 961 break 962 # If it's a Tag, make sure its name or attributes match. 963 # Don't bother with Tags if we're searching for text. 964 elif isinstance(markup, Tag): 965 if not self.text: 966 found = self.searchTag(markup) 967 # If it's text, make sure the text matches. 968 elif isinstance(markup, NavigableString) or \ 969 isinstance(markup, basestring): 970 if self._matches(markup, self.text): 971 found = markup 972 else: 973 raise Exception, "I don't know how to match against a %s" \ 974 % markup.__class__ 975 return found 976 977 def _matches(self, markup, matchAgainst): 978 #print "Matching %s against %s" % (markup, matchAgainst) 979 result = False 980 if matchAgainst is True: 981 result = markup is not None 982 elif callable(matchAgainst): 983 result = matchAgainst(markup) 984 else: 985 #Custom match methods take the tag as an argument, but all 986 #other ways of matching match the tag name as a string. 987 if isinstance(markup, Tag): 988 markup = markup.name 989 if markup and not isinstance(markup, basestring): 990 markup = unicode(markup) 991 #Now we know that chunk is either a string, or None. 992 if hasattr(matchAgainst, 'match'): 993 # It's a regexp object. 
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): 'markup' here is a string or None, which
                # has no has_key(); this branch looks like it would raise
                # AttributeError if a dict criterion is ever used -- TODO
                # confirm against upstream before relying on it.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Compare strings of the same type (unicode vs str) so
                # the equality test below is meaningful.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

        if not result:
            result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # NOTE(review): list.__init__ is called on a throwaway list
        # rather than self; the net effect (self starts empty) is the
        # same as list.__init__(self).
        list.__init__([])
        self.source = source

# Now, some helper functions.

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k,v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__'): # is a list
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

# Now, the parser classes.

class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Parsing behavior knobs; subclasses (e.g. BeautifulSoup) override
    # these with real HTML knowledge.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the raw markup before sgmllib sees it
    # (see the __init__ docstring for the two cases they fix).
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised by start_meta when it re-feeds the document with a
            # newly discovered encoding; the re-feed already finished.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts the stored markup to Unicode (via UnicodeDammit),
        optionally massages it, and runs it through SGMLParser to build
        the tree. May be called a second time by start_meta with a
        declared document encoding."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Resets both parser and tree state; the soup object itself is
        the root tag of the tree."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the open-tag stack and makes its parent
        the current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Pushes a tag onto the open-tag stack, attaching it to the
        current tag's contents first."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes accumulated character data into the tree as a
        containerClass node (NavigableString by default; Comment,
        CData, etc. for special constructs)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse pure-ASCII-whitespace text to one space (or one
            # newline), except inside whitespace-preserving tags.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level
            # text that the strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instqance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # Never pop the document root.
            return

        numPops = 0
        mostRecentTag = None
        # Walk the stack top-down looking for the nearest match.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack top-down looking for where to pop to.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback: builds a Tag for an opening tag, after
        smart-popping any implicitly closed tags. Inside a QUOTE_TAGS
        section the tag text is treated as literal data instead."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of the document, skip top-level tags
        # the strainer doesn't want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback: closes the named tag, popping everything
        above it; ends a QUOTE_TAGS section when appropriate."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """sgmllib callback: accumulates character data until endData()
        flushes it into the tree."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Pass the reference through untouched.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                # Not a named HTML entity; fall through to the other
                # conversion strategies.
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: take everything to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: treat the rest of the input
                # as plain data rather than dying.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML documents default to HTML entities for smart quotes.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out http-equiv and content from the attribute list.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        # Leave a %SOUP-ENCODING% slot to be filled in
                        # when the tree is rendered.
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

class StopParsing(Exception):
    # Raised by start_meta to abort a parse that is being restarted
    # with a newly discovered encoding.
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' each appear twice in this tuple;
    # harmless for buildTagMap, but presumably unintentional.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): 'noscript' is passed as buildTagMap's *default*
    # with no tag arguments, so this evaluates to {} -- possibly
    # buildTagMap(None, 'noscript') was intended; TODO confirm against
    # upstream before changing.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, mirrors a single-string child tag into the
        parent as an attribute (unless the parent already has one by
        that name)."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode).  It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
1736# Download from http://chardet.feedparser.org/ 1737try: 1738 import chardet 1739# import chardet.constants 1740# chardet.constants._debug = 1 1741except ImportError: 1742 chardet = None 1743 1744# cjkcodecs and iconv_codec make Python know about more character encodings. 1745# Both are available from http://cjkpython.i18n.org/ 1746# They're built in if you use Python 2.4. 1747try: 1748 import cjkcodecs.aliases 1749except ImportError: 1750 pass 1751try: 1752 import iconv_codec 1753except ImportError: 1754 pass 1755 1756class UnicodeDammit: 1757 """A class for detecting the encoding of a *ML document and 1758 converting it to a Unicode string. If the source encoding is 1759 windows-1252, can replace MS smart quotes with their HTML or XML 1760 equivalents.""" 1761 1762 # This dictionary maps commonly seen values for "charset" in HTML 1763 # meta tags to the corresponding Python codec names. It only covers 1764 # values that aren't in Python's aliases and can't be determined 1765 # by the heuristics in find_codec. 
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Converts markup to Unicode, trying (in order) the caller's
        overrideEncodings, the encoding declared in the document, the
        sniffed encoding, chardet's guess, then utf-8/windows-1252.
        The result is left in self.unicode (None on total failure)."""
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # MS_CHARS values are (html-entity-name, hex-codepoint)
            # pairs; pick the representation the caller asked for.
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Tries to decode self.markup with one proposed encoding.
        Returns the Unicode result (also stored in self.markup) or
        None on failure; each codec is only ever tried once."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            # Bytes 0x80-0x9f are the Windows smart-quote range;
            # replace them with entities before decoding.
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            # Decoding failed; the caller will try the next candidate.
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (possibly transcoded data, declared encoding or None,
        BOM/pattern-sniffed encoding or None). Also records a declared
        HTML meta charset in self.declaredHTMLEncoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Any decode failure means the sniff was wrong; fall back
            # to looking only at declared encodings.
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # A generic multi-byte declaration can't override the
                # specific byte order we sniffed.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Maps a declared charset name to a Python codec name, trying
        the alias table and common dash/underscore variants."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Returns charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built str.translate table; shared by all instances.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translates an EBCDIC-encoded byte string to ASCII."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro',
'20AC'), 1978 '\x81' : ' ', 1979 '\x82' : ('sbquo', '201A'), 1980 '\x83' : ('fnof', '192'), 1981 '\x84' : ('bdquo', '201E'), 1982 '\x85' : ('hellip', '2026'), 1983 '\x86' : ('dagger', '2020'), 1984 '\x87' : ('Dagger', '2021'), 1985 '\x88' : ('circ', '2C6'), 1986 '\x89' : ('permil', '2030'), 1987 '\x8A' : ('Scaron', '160'), 1988 '\x8B' : ('lsaquo', '2039'), 1989 '\x8C' : ('OElig', '152'), 1990 '\x8D' : '?', 1991 '\x8E' : ('#x17D', '17D'), 1992 '\x8F' : '?', 1993 '\x90' : '?', 1994 '\x91' : ('lsquo', '2018'), 1995 '\x92' : ('rsquo', '2019'), 1996 '\x93' : ('ldquo', '201C'), 1997 '\x94' : ('rdquo', '201D'), 1998 '\x95' : ('bull', '2022'), 1999 '\x96' : ('ndash', '2013'), 2000 '\x97' : ('mdash', '2014'), 2001 '\x98' : ('tilde', '2DC'), 2002 '\x99' : ('trade', '2122'), 2003 '\x9a' : ('scaron', '161'), 2004 '\x9b' : ('rsaquo', '203A'), 2005 '\x9c' : ('oelig', '153'), 2006 '\x9d' : '?', 2007 '\x9e' : ('#x17E', '17E'), 2008 '\x9f' : ('Yuml', ''),} 2009 2010####################################################################### 2011 2012 2013#By default, act as an HTML pretty-printer. 2014if __name__ == '__main__': 2015 import sys 2016 soup = BeautifulSoup(sys.stdin) 2017 print soup.prettify() 2018