"""
SAX driver for the pyexpat C module. This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref


class _ClosedParser:
    """Placeholder stored in ExpatParser._parser after a failed close().

    ExpatParser.close() copies ErrorColumnNumber and ErrorLineNumber onto
    an instance of this class; it has no other expat parser attributes.
    """
    pass


# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the expat ErrorColumnNumber, or None if there is no parser."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat ErrorLineNumber, or 1 if there is no parser."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the referenced parser's input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the referenced parser's input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a reader.

        namespaceHandling is the initial state of feature_namespaces;
        bufsize is forwarded to xmlreader.IncrementalParser.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        # None until reset() creates an expat parser; close() sets it back
        # to None on success or to a _ClosedParser placeholder on failure.
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = False
        # (parser, source) pairs saved while an external entity is parsed.
        self._entity_stack = []
        self._external_ges = 0
        # Dict handed to expat as ``intern``; None means interning is off.
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # bpo-30264: Close the source on error to not leak resources:
            # xml.sax.parse() doesn't give access to the underlying parser
            # to the caller
            self._close_source()
            raise

    def prepareParser(self, source):
        """Set the expat base to the source's system id, when it has one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install *handler* and, if parsing, rebind the expat callbacks."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes; raises SAXNotRecognizedException for an unknown feature.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dict; only create one if absent.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotSupportedException for the XML string when there is
        no parser, and SAXNotRecognizedException for an unknown property
        or when the parser object lacks GetInputContext.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for an unknown property.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=False):
        """Pass *data* to expat, starting a new document if necessary.

        expat errors are wrapped in SAXParseException and handed to the
        error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = True
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def flush(self):
        """Make expat process buffered input with reparse deferral disabled.

        Does nothing when there is no live expat parser: either none was
        created yet, or close() replaced it with a _ClosedParser
        placeholder, which has none of the methods used below.
        """
        if self._parser is None or isinstance(self._parser, _ClosedParser):
            return

        was_enabled = self._parser.GetReparseDeferralEnabled()
        try:
            self._parser.SetReparseDeferralEnabled(False)
            self._parser.Parse(b"", False)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            self._err_handler.fatalError(exc)
        finally:
            # Restore the caller's setting even if fatalError() raised.
            self._parser.SetReparseDeferralEnabled(was_enabled)

    def _close_source(self):
        """Close the character stream and the byte stream of the source."""
        source = self._source
        try:
            file = source.getCharacterStream()
            if file is not None:
                file.close()
        finally:
            # Runs even if closing the character stream raised.
            file = source.getByteStream()
            if file is not None:
                file.close()

    def close(self):
        """Finish the document and release the expat parser.

        On success the parser is dropped (set to None).  If the final
        feed() or endDocument() raises, the parser is replaced by a
        _ClosedParser holding the last error position.  The input source
        is closed in both cases.
        """
        if (self._entity_stack or self._parser is None or
                isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed(b"", isFinal=True)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = False
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Bind the content handler's PI and character callbacks to expat."""
        self._parser.ProcessingInstructionHandler = \
            self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or clear) the lexical-handler callbacks on the parser."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # Routed through start_doctype_decl to adapt the arguments.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and install all callbacks on it."""
        if self._namespaces:
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = False
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the expat ErrorColumnNumber, or None if there is no parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat ErrorLineNumber, or 1 if there is no parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Translate an expat namespace-mode start tag into startElementNS.

        Names arrive as whitespace-separated strings of one, two or three
        parts; the first two parts of a multi-part name form the
        (uri, localname) pair, and a third part is used as the prefix when
        building attribute qnames.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Translate an expat namespace-mode end tag into endElementNS."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # expat passes sysid before pubid; startDTD is called with
        # pubid first.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external general entity with a child expat parser.

        Returns 1 when the entity was skipped (feature_external_ges off)
        or parsed, and 0 when parsing it raised.  In the failure case the
        saved (parser, source) pair is left on the entity stack.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Save the current parser/source so they can be restored afterwards.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")