• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Diagnostic functions, mainly for use when doing tech support."""
2import cProfile
3from StringIO import StringIO
4from HTMLParser import HTMLParser
5import bs4
6from bs4 import BeautifulSoup, __version__
7from bs4.builder import builder_registry
8
9import os
10import pstats
11import random
12import tempfile
13import time
14import traceback
15import sys
16import cProfile
17
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: A string containing markup, a filename, or a
        file-like object to read markup from. The markup is parsed
        with every installed parser and the results are printed.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Figure out which of the common tree builders are installed.
    # Iterate over a copy: removing an item from the list being
    # iterated would silently skip the next candidate parser.
    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # Also exercise lxml's XML mode, and report the installed version.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    # Normalize `data` to a markup string.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file promptly instead of leaking the handle.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    # Parse the markup with every available parser and show each
    # result, so differences between parsers are easy to spot.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
68
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: Markup string to parse.
    :param html: If true, parse as HTML; otherwise as XML.
    :param kwargs: Passed through to ``etree.iterparse``.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
78
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single funnel for output, so it's easy to redirect.
        print(s)

    def _announce(self, payload, event):
        # Every event is reported as "<payload> <EVENT-NAME>".
        self._p("%s %s" % (payload, event))

    def handle_starttag(self, name, attrs):
        self._announce(name, "START")

    def handle_endtag(self, name):
        self._announce(name, "END")

    def handle_data(self, data):
        self._announce(data, "DATA")

    def handle_charref(self, name):
        self._announce(name, "CHARREF")

    def handle_entityref(self, name):
        self._announce(name, "ENTITYREF")

    def handle_comment(self, data):
        self._announce(data, "COMMENT")

    def handle_decl(self, data):
        self._announce(data, "DECL")

    def unknown_decl(self, data):
        self._announce(data, "UNKNOWN-DECL")

    def handle_pi(self, data):
        self._announce(data, "PI")
111
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Markup string to feed to the parser.
    """
    AnnouncingParser().feed(data)
120
# Letter pools used to generate pronounceable nonsense words.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string.

    Letters alternate between consonants (even positions) and vowels
    (odd positions), which keeps the result pronounceable.

    :param length: Number of letters in the word.
    """
    # Join once instead of repeated string concatenation, which is
    # quadratic in the worst case. The random.choice call sequence is
    # identical to the old loop, so seeded output is unchanged.
    pools = (_consonants, _vowels)
    return "".join(random.choice(pools[i % 2]) for i in range(length))
134
def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: Number of words in the sentence.
    """
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
138
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Number of generation rounds; each round emits
        at most one tag or sentence.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # Open a random tag.
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            # Emit some random text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a random tag -- not necessarily one that is open,
            # which is what makes the document invalid.
            elements.append("</%s>" % random.choice(tag_names))
        # choice == 3: emit nothing this round.
    return "<html>" + "\n".join(elements) + "</html>"
156
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates a large invalid document, parses it with every parser
    Beautiful Soup supports, then with raw lxml and raw html5lib, and
    reports how long each pass took.

    :param num_elements: Approximate size of the generated document.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            # The exception isn't needed; the traceback says it all.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    # Time raw lxml, without Beautiful Soup in the way.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    # Time raw html5lib, without Beautiful Soup in the way.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))
188
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup as it parses a large generated document.

    Prints the 50 most expensive bs4/html5lib calls, sorted by
    cumulative time.

    :param num_elements: Approximate size of the generated document.
    :param parser: Name of the parser to profile.
    """
    # cProfile writes its stats to a scratch file that pstats reads
    # back; the with-block guarantees the file is cleaned up.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name

        data = rdoc(num_elements)
        # `namespace` rather than `vars`, to avoid shadowing the builtin.
        namespace = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
202
# When run as a script, read markup from stdin and diagnose it.
if __name__ == '__main__':
    diagnose(sys.stdin.read())
205