# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented character is written as an escape so that the source stays
# pure ASCII; the resulting value is the same on Python 2 and Python 3.
__author__ = "St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]
    def _d(s):
        """Decode a UTF-8 byte string to unicode; None is passed through."""
        if s is None:
            return s
        else:
            return _decoder(s)[0]
else:
    StringTypes = str
    # s is Unicode `str` already
    def _d(s):
        """Return s unchanged (strings are already unicode on Python 3)."""
        return s

from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import \
    feature_namespaces, \
    feature_namespace_prefixes, \
    feature_string_interning, \
    feature_validation, \
    feature_external_ges, \
    feature_external_pes, \
    property_lexical_handler, \
    property_declaration_handler, \
    property_dom_node, \
    property_xml_string

try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # no column information is taken from the wrapped locator
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader implemented on top of the libxml2 text reader.

    The whole document is processed by a single call to parse(); node
    types reported by the libxml2 reader are translated to ContentHandler
    and (when one is registered) lexical handler callbacks.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback registered on the libxml2 reader.

        Errors are only recorded here, as (severity, SAXParseException)
        pairs; they are forwarded to the SAX ErrorHandler by
        _reportErrors() after the current Read() call returns.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the accumulated errors to the SAX ErrorHandler.

        Warnings go to warning().  Other entries go to error(), except
        that when `fatal` is true the last accumulated entry is reported
        through fatalError().
        """
        # Detach the list before dispatching: the error handler is allowed
        # to raise (the default one does), and the pending errors must not
        # survive to be reported again by a later parse.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        # when fatal is set, the parse will stop;
        # we consider that the last error reported
        # is the fatal one.
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse an XML document and emit SAX events for it.

        `source` is either a string (passed to libxml2 as a file name) or
        anything accepted by saxutils.prepare_input_source().  Features
        cannot be changed while this method is running.  The libxml2
        reader is closed when the method exits, including when a handler
        raises.
        """
        self.__parsing = 1
        # discard anything left over from a previously aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # A namespace declaration is exactly "xmlns"
                            # or "xmlns:<prefix>"; other names that merely
                            # begin with "xmlns" are ordinary attributes.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if qname == "xmlns":
                                    newPrefix = None
                                else:
                                    newPrefix = qname[6:]
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it (and its mappings) right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # Entity events belong to the lexical handler, not to
                    # the reader itself.  The callback is optional: it is
                    # only invoked when the handler implements it.
                    if self.__lex_handler is not None:
                        startEntity = getattr(self.__lex_handler,
                                              "startEntity", None)
                        if startEntity is not None:
                            startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says this is still
                    # "waiting for XmlReader.ResolveEntity" -- confirm the
                    # binding in use actually provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        endEntity = getattr(self.__lex_handler,
                                            "endEntity", None)
                        if endEntity is not None:
                            endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the libxml2 reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature `name`.

        Raises SAXNotRecognizedException for features this driver does
        not know about.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to turn feature_external_ges off, and
        SAXNotRecognizedException for unknown features.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property `name`.

        Raises SAXNotRecognizedException for unknown properties.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler is
        rejected with SAXNotSupportedException, and any other property
        with SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store the handler
            # in self.__decl_handler instead of raising
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)


def create_parser():
    """Entry point used by xml.sax.make_parser(): return a new reader."""
    return LibXml2Reader()