"""Diagnostic functions, mainly for use when doing tech support."""

import cProfile
from io import BytesIO
from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

import os
import pstats
import random
import tempfile
import time
import traceback
import sys


def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: A string of markup, an open filehandle, or a filename.
        URLs are rejected with an explanatory message, since Beautiful
        Soup is not an HTTP client.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Check which of the basic tree builders are installed.  Build a new
    # list rather than removing from the list being iterated over, which
    # would silently skip the element following each removal.
    available_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        for builder in builder_registry.builders:
            if name in builder.features:
                available_parsers.append(name)
                break
        else:
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)
    basic_parsers = available_parsers

    if 'lxml' in basic_parsers:
        # Also try lxml's XML mode; a list of features selects the builder.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Use a context manager so the file handle is not leaked.
        with open(data) as f:
            data = f.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    # Parse the markup with every available parser and show each result,
    # so differences between parsers are easy to spot.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)


def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: Markup, as a string or bytes.
    :param html: If True, parse the markup as HTML; otherwise as XML.
    :param kwargs: Extra keyword arguments for ``etree.iterparse``.
    """
    from lxml import etree
    # lxml's iterparse wants a byte stream; encode text input for it.
    if isinstance(data, str):
        data = data.encode("utf8")
    for event, element in etree.iterparse(BytesIO(data), html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))


class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)


def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Markup, as a string.
    """
    parser = AnnouncingParser()
    parser.feed(data)


_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string.

    Alternates consonants and vowels so the result is pronounceable.
    """
    s = ''
    for i in range(length):
        if i % 2 == 0:
            t = _consonants
        else:
            t = _vowels
        s += random.choice(t)
    return s


def rsentence(length=4):
    """Generate a random sentence-like string."""
    return " ".join(rword(random.randint(4, 9)) for i in range(length))


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Roughly how many markup events (open tags,
        close tags, text runs) to generate; a quarter of the draws
        deliberately produce nothing, and tags are opened and closed
        at random, so the document is intentionally malformed.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: deliberately emit nothing.
    return "<html>" + "\n".join(elements) + "</html>"


def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    # Baseline: raw lxml with no Beautiful Soup overhead.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    # Baseline: raw html5lib with no Beautiful Soup overhead.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))


def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a large random document.

    Prints the top 50 cumulative-time entries whose names match
    html5lib or bs4 code.
    """
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name

    data = rdoc(num_elements)
    # 'variables' serves as both globals and locals for the profiled call.
    variables = dict(bs4=bs4, data=data, parser=parser)
    cProfile.runctx('bs4.BeautifulSoup(data, parser)', variables, variables,
                    filename)

    stats = pstats.Stats(filename)
    # stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)


if __name__ == '__main__':
    diagnose(sys.stdin.read())