"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

# Bound ``match`` methods: each matches one token *plus* any whitespace that
# follows it, so ``m.end()`` is already positioned on the next token.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input from ``self.rawdata`` and report what
    they find through ``self.handle_decl()``, ``self.handle_comment()`` and
    ``self.unknown_decl()``.  Only ``unknown_decl()`` is defined in this
    class; ``rawdata``, ``handle_decl`` and ``handle_comment`` must be
    supplied by the subclass.

    Convention used by every ``parse_*``/``_parse_*`` method: the return
    value is the index just past the construct that was scanned, or -1 when
    the construct could not be scanned to its end (typically because the
    buffer ends first).  The same -1 is returned after ``error()`` has been
    called, in case an override of ``error()`` returns instead of raising.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over ``rawdata[i:j]`` and return ``j``."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters that parse_declaration() skips inside a declaration
    # body.  Reset to '' on the instance whenever a DOCTYPE is parsed.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` construct starting at ``rawdata[i]``.

        Return the index just past the closing ``>``, or -1 if the
        declaration is incomplete.  A DOCTYPE declaration is passed to
        ``handle_decl()``; any other named declaration is passed to
        ``unknown_decl()``.  Comments and marked sections are delegated to
        ``parse_comment()`` and ``parse_marked_section()``.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] == '--':
            # Comment: locate --.*-- as the body of the comment.  This test
            # must come before the single "-" test below, which would
            # otherwise swallow every comment.
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':
            # Marked section: locate [statusWord [...arbitrary SGML...]] as
            # the body of the marked section, where statusWord is one of
            # TEMP, CDATA, IGNORE, INCLUDE, RCDATA.
            # Note that this is extended by Microsoft Office "Save as Web"
            # function to include [if...] and [endif].
            return self.parse_marked_section(i)

        # All other declaration elements.
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an
                    # element declaration; also in data attribute
                    # specifications of attlist declaration; also link type
                    # declaration subsets in linktype declarations; also
                    # link attribute specification lists in link declarations
                    self.error(
                        "unsupported '[' char in %s declaration" % decltype)
                    return -1  # only reached if error() returns
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1  # only reached if error() returns
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1  # only reached if error() returns
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the ``<![...`` marked section starting at ``rawdata[i]``.

        Return the index just past the terminator, or -1 if it has not been
        seen yet.  When ``report`` is true, the text between ``<![`` and the
        terminator is passed to ``unknown_decl()``.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sect_name, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sect_name in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sect_name in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # Only reached if error() returns; without this, ``match`` below
            # would be unbound.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the ``<!--`` comment starting at ``rawdata[i]``.

        Return the index just past the closing ``-->``, or -1 if the comment
        is not terminated.  When ``report`` is true, the comment body is
        passed to ``handle_comment()``.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
            return -1  # only reached if error() returns
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset that starts at ``rawdata[i]``.

        ``i`` is the index just after the opening ``[``; ``declstartpos`` is
        the start of the enclosing declaration and is only used for position
        and error reporting.  Return the index of the ``>`` that follows the
        closing ``]`` (and optional whitespace), or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1  # only reached if error() returns
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four characters
                    # left, so "<!--" cannot be told apart from "<!name")
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    return -1  # only reached if error() returns
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1  # only reached if error() returns
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                return -1  # only reached if error() returns
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an ELEMENT declaration whose name starts at ``rawdata[i]``.

        Return the index just past the closing ``>``, or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an ATTLIST declaration whose element name starts at ``i``.

        Return the index just past the closing ``>``, or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a NOTATION declaration whose name starts at ``rawdata[i]``.

        Return the index just past the closing ``>``, or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an ENTITY declaration whose body starts at ``rawdata[i]``.

        A leading ``%`` (parameter entity) and the whitespace after it are
        skipped first.  Return the index just past the closing ``>``, or -1
        if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at ``rawdata[i]``.

        Return ``(name, j)`` where ``name`` is the lower-cased token and
        ``j`` is the index just past the token and its trailing whitespace.
        Return ``(None, -1)`` when the token runs up to the end of the
        buffer, since more input could still extend it.  If no name starts
        at ``i``, ``error()`` is called.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Only reached if error() returns; callers unpack a 2-tuple.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration that has no dedicated handler; no-op here."""
        pass