"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# A character reference is known to be complete once whitespace or ';'
# follows the '&'.  Compiled once here rather than on every pass of the
# goahead() loop.
_charref_terminator = re.compile(r'[\s;]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')



class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is passed through verbatim until the
    # matching end tag is seen (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Switch to CDATA mode: only the end tag of *elem* is markup."""
        self.cdata_elem = elem.lower()
        # Escape the name so that an element added to
        # CDATA_CONTENT_ELEMENTS by a subclass is always matched literally.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        Whatever could not be processed stays in self.rawdata for the
        next call.  When *end* is true nothing is held back: any
        remaining text is reported through handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not _charref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # end tag of the CDATA element not seen yet
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # the construct is not terminated
                    if not end:
                        break
                    # at EOF: report the unterminated construct as data
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the terminator is only consumed when it is ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # the terminator is only consumed when it is ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # Incomplete reference: wait for more input.  At EOF
                    # the final flush below reports it, '&' included, as
                    # plain data.
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly (not via assert) so that running with
                # -O cannot turn this into an endless loop.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n:
            # Flush whatever is left.  This includes the content of a
            # CDATA element (script/style) whose end tag never arrived.
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index *i*.

        Returns the index just past the construct, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse the bogus comment ('<!...>' or '</...>') at index *i*.

        The text between the two-character opener and the next '>' is
        passed to handle_comment() unless *report* is false.  Returns the
        index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction ('<?...>') at index *i*.

        Returns the index just past the closing '>', or -1 if the
        instruction is not terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and dispatch to a handler.

        Returns the index just past the tag, or a negative value if the
        tag is not complete yet.  A tag that does not end in '>' or '/>'
        is reported as plain data.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk where the tag should end: pass the whole tag on as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means that the buffer ends before the tag does.
        """
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i* and call handle_endtag().

        Returns the index just past the tag, or -1 if no '>' has been
        seen yet.  In CDATA mode any end tag other than the one that
        closes the current element is reported as plain data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle a declaration that is not understood
    def unknown_decl(self, data):
        pass