• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
"""Demonstrate how different parsers parse the same markup.

Beautiful Soup can use any of a number of different parsers. Every
parser should behave more or less the same on valid markup, and
Beautiful Soup's unit tests make sure this is the case. But every
parser handles invalid markup differently. Even different versions of
the same parser handle invalid markup differently. So instead of unit
tests I've created this educational demonstration script.

The file demonstration_markup.txt contains many lines of HTML. This
script tests each line of markup against every parser you have
installed, and prints out how each parser sees that markup. This may
help you choose a parser, or understand why Beautiful Soup presents
your document the way it does.
"""
16
17import os
18import sys
19from bs4 import BeautifulSoup
# Names of the parsers to demonstrate. 'html.parser' is listed
# unconditionally; the others are added only when the corresponding
# bs4 builder module can be imported.
parsers = ['html.parser']

try:
    from bs4.builder import _lxml
    parsers.append('lxml')
except ImportError:
    # Builder not importable: leave 'lxml' out of the demonstration.
    pass

try:
    from bs4.builder import _html5lib
    parsers.append('html5lib')
except ImportError:
    # Builder not importable: leave 'html5lib' out of the demonstration.
    pass
33
class Demonstration(object):
    """Run one piece of markup through several parsers and record the output.

    ``markup`` is the string handed to each parser.  ``results`` maps every
    parser name given to ``run_against`` to what that parser produced, or
    to an ``"[EXCEPTION] ..."`` string if parsing raised.
    """

    def __init__(self, markup):
        """Store ``markup`` and start with an empty ``results`` mapping."""
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with each named parser and compare the outputs.

        Every output is stored in ``self.results`` under the parser's name.
        When the markup starts with ``<div>`` only the parsed ``div`` is
        kept; otherwise the whole soup is kept.  Any exception raised while
        parsing is recorded as an ``"[EXCEPTION] ..."`` string rather than
        propagated.

        Returns True if no output differed from the first non-None output
        (this includes the case where no parser names are given), and
        False otherwise.
        """
        uniform_results = True
        previous_output = None
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                # Test this demonstration's own markup, not whatever
                # module-level variable named ``markup`` happens to exist.
                if self.markup.startswith("<div>"):
                    # Extract the interesting part
                    output = soup.div
                else:
                    output = soup
            except Exception as e:
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            # NOTE(review): None doubles as the "no baseline yet" marker, so
            # an output of None is never used as the baseline nor compared.
            # Presumably soup.div can be None if a parser drops the <div> --
            # confirm whether that case matters.
            if previous_output is None:
                previous_output = output
            elif previous_output != output:
                uniform_results = False
        return uniform_results

    def dump(self):
        """Print the markup and each recorded output, encoded as UTF-8.

        Labels are right-justified to 13 columns.  The parenthesised
        single-argument ``print`` behaves exactly like the print statement
        on Python 2.
        """
        print("%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")))
        for parser, output in self.results.items():
            print("%s: %s" % (parser.rjust(13), output.encode("utf8")))
63
# Demonstrations are sorted into these two lists depending on whether
# every parser produced the same output for their markup.
uniform_results = []
different_results = []

# Banner naming the parsers under test, followed by a blank line.
parser_list = ", ".join(parsers)
print("= Testing the following parsers: %s =" % parser_list)
print("")
69
# Markup is read from standard input by default.
input_file = sys.stdin
if sys.stdin.isatty():
    # Nothing was piped in, so look for the markup file in the current
    # directory and under scripts/.
    candidates = [
        "demonstration_markup.txt",
        os.path.join("scripts", "demonstration_markup.txt"),
    ]
    existing = [filename for filename in candidates
                if os.path.exists(filename)]
    if existing:
        # The last existing candidate wins, as before, but only that one
        # file is opened, so no handle is leaked when both exist.
        input_file = open(existing[-1])
77
# Run every line of input through all available parsers and sort the
# resulting demonstrations by whether the parsers agreed.
try:
    for markup in input_file:
        # Lines are decoded from UTF-8, stripped of surrounding whitespace,
        # and a literal backslash-n in the text becomes a real newline.
        demo = Demonstration(
            markup.decode("utf8").strip().replace("\\n", "\n"))
        is_uniform = demo.run_against(*parsers)
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)
finally:
    # Close a markup file this script opened; leave standard input alone.
    if input_file is not sys.stdin:
        input_file.close()
85
# Report: first the markup every parser agreed on, then the markup on
# which at least one parser differed.  Each heading and each dumped
# demonstration is followed by a blank line.
for heading, demos in (
        ("== Markup that's handled the same in every parser ==",
         uniform_results),
        ("== Markup that's not handled the same in every parser ==",
         different_results)):
    print(heading)
    print("")
    for demo in demos:
        demo.dump()
        print("")
96