"""Demonstrate how different parsers parse the same markup.

Beautiful Soup can use any of a number of different parsers. Every
parser should behave more or less the same on valid markup, and
Beautiful Soup's unit tests make sure this is the case. But every
parser handles invalid markup differently. Even different versions of
the same parser handle invalid markup differently. So instead of unit
tests I've created this educational demonstration script.

The file demonstration_markup.txt contains many lines of HTML. This
script tests each line of markup against every parser you have
installed, and prints out how each parser sees that markup. This may
help you choose a parser, or understand why Beautiful Soup presents
your document the way it does.
"""

import io
import os
import sys

from bs4 import BeautifulSoup

# Text type of the running interpreter: ``unicode`` on Python 2, ``str`` on
# Python 3. Lets the helpers below work unchanged on both.
_TEXT_TYPE = type(u"")

# Names of the parsers to compare. 'html.parser' is always included; 'lxml'
# and 'html5lib' are added only when the matching bs4 builder module imports.
parsers = ['html.parser']

try:
    from bs4.builder import _lxml
    parsers.append('lxml')
except ImportError:
    # Optional parser not available; leave it out of the comparison.
    pass

try:
    from bs4.builder import _html5lib
    parsers.append('html5lib')
except ImportError:
    # Optional parser not available; leave it out of the comparison.
    pass


def _to_text(value):
    """Return ``value`` as text.

    Text is returned unchanged and bytes are decoded as UTF-8. Any other
    object that has an ``encode`` method (the parsed documents stored by
    ``Demonstration.run_against``) is serialised with ``encode("utf8")``,
    the same call the script has always used, and decoded back to text.
    Anything else (for example ``None``) is converted with the text type's
    constructor.
    """
    if isinstance(value, _TEXT_TYPE):
        return value
    if isinstance(value, bytes):
        return value.decode("utf8")
    if hasattr(value, "encode"):
        return value.encode("utf8").decode("utf8")
    return _TEXT_TYPE(value)


def _print_row(label, value):
    """Print ``label`` right-justified to 13 columns, then ``: value``."""
    row = u"%s: %s" % (_to_text(label).rjust(13), _to_text(value))
    if sys.version_info[0] < 3:
        # Python 2 cannot reliably print non-ASCII unicode to a pipe, so
        # emit UTF-8 bytes there.
        row = row.encode("utf8")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(row)


class Demonstration(object):
    """One piece of markup together with each parser's view of it.

    Attributes:
        markup: The markup under test.
        results: Maps a parser name to what ``run_against`` recorded for it:
            the parsed result, or an ``"[EXCEPTION] ..."`` string when
            parsing raised.
    """

    def __init__(self, markup):
        """Remember ``markup`` and start with no results."""
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with each named parser and record the results.

        Args:
            *parser_names: Parser names accepted by ``BeautifulSoup``.

        Returns:
            True if every parser's output compared equal to the first
            parser's output (or no parsers were given), False otherwise.
        """
        # A private sentinel marks "no output yet", because None is itself a
        # possible output (``soup.div`` when there is no <div>).
        unset = object()
        first_output = unset
        uniform = True
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                # Use this instance's markup, not a global that merely
                # happens to share the name.
                if self.markup.startswith("<div>"):
                    # Extract the interesting part
                    output = soup.div
                else:
                    output = soup
            except Exception as e:
                # Deliberately broad: a parser blowing up on bad markup is
                # itself a result worth showing.
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            if first_output is unset:
                first_output = output
            elif first_output != output:
                uniform = False
        return uniform

    def dump(self):
        """Print the markup, then one line per parser with its output."""
        _print_row("Markup", self.markup)
        for parser, output in self.results.items():
            _print_row(parser, output)


different_results = []
uniform_results = []

print("= Testing the following parsers: %s =" % ", ".join(parsers))
print("")

# Read markup from stdin when it is piped in; otherwise fall back to the
# first demonstration file that exists.
input_file = sys.stdin
if sys.stdin.isatty():
    for filename in [
            "demonstration_markup.txt",
            os.path.join("scripts", "demonstration_markup.txt")]:
        if os.path.exists(filename):
            input_file = io.open(filename, encoding="utf8")
            # Stop at the first match so only one file is ever opened.
            break

try:
    for markup in input_file:
        # A literal backslash-n in the input stands for a real newline.
        demo = Demonstration(
            _to_text(markup).strip().replace("\\n", "\n"))
        is_uniform = demo.run_against(*parsers)
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)
finally:
    # Close the file we opened, but never stdin.
    if input_file is not sys.stdin:
        input_file.close()

print("== Markup that's handled the same in every parser ==")
print("")
for demo in uniform_results:
    demo.dump()
    print("")
print("== Markup that's not handled the same in every parser ==")
print("")
for demo in different_results:
    demo.dump()
    print("")