"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        # Fallback when weak references are unavailable: hand back the
        # object itself (a strong reference).
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref

class _ClosedParser:
    """Stand-in stored in ExpatParser._parser by close() when closing raised.

    close() copies ErrorColumnNumber and ErrorLineNumber onto an instance
    of this class so the position stays readable after the real expat
    parser object has been dropped.
    """
    pass

# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        """Remember *parser* (an ExpatParser) through _mkproxy()."""
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the parser's ErrorColumnNumber, or None if it has no
        underlying expat parser."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the parser's ErrorLineNumber, or 1 if it has no
        underlying expat parser."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create an idle parser.

        namespaceHandling -- initial value of the namespaces feature;
            when true, reset() builds a namespace-aware expat parser.
        bufsize -- forwarded to xmlreader.IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs saved while an external entity is parsed
        self._entity_stack = []
        self._external_ges = 1
        # None means string interning is off; otherwise the dict handed
        # to expat.ParserCreate(intern=...)
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Set the expat base URI from the system id of *source*, if any."""
        if source.getSystemId() is not None:
            base = source.getSystemId()
            # NOTE: `unicode` is the Python 2 builtin; this module targets
            # Python 2, where unicode system ids are passed on as UTF-8 bytes.
            if isinstance(base, unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install *handler*; if a parse is in progress, rebind the expat
        callbacks that point directly at content-handler methods."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the current state of the SAX feature *name*.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            # These can never be switched on (see setFeature).
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the SAX feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes; raises SAXNotRecognizedException for an unknown name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an already-installed interning dict untouched.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the SAX property *name*.

        The XML-string property is only available while an expat parser
        exists (SAXNotSupportedException otherwise) and only if that
        parser offers GetInputContext.  Unknown names raise
        SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the SAX property *name* to *value*.

        The XML-string property is read-only (SAXNotSupportedException);
        unknown names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                # Rebind the expat lexical callbacks right away.
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of *data* to expat.

        The first call after construction, reset() or close() creates a
        fresh expat parser and fires startDocument().  An expat.error is
        wrapped in a SAXParseException and given to the error handler's
        fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument(), drop the parser.

        Does nothing while an external entity is being parsed, when no
        parser exists, or when the parser was already closed.
        """
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
            self._parsing = 0
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            # Still set only if the try block above raised.
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks directly at the
        current content handler's methods."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or clear, if no lexical handler is set) expat's comment,
        CDATA and doctype callbacks."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # Routed through self: expat's argument order differs from
            # startDTD's (see start_doctype_decl).
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a new expat parser and wire all of its callbacks."""
        if self._namespaces:
            # " " is the separator expat puts between URI, local name and
            # prefix; the *_ns handlers split names on it.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        """expat callback (non-namespace mode): forward to startElement."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """expat callback (non-namespace mode): forward to endElement."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """expat callback (namespace mode): forward to startElementNS.

        Names arrive whitespace-separated: one part means no namespace,
        two parts are (uri, localname), three parts carry a trailing
        prefix.  The element qname is always reported as None.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                # parts are (uri, localname, prefix)
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """expat callback (namespace mode): forward to endElementNS using
        the same name splitting as start_element_ns."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """expat callback: forward to startPrefixMapping."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """expat callback: forward to endPrefixMapping."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """expat callback: forward to the lexical handler's startDTD,
        swapping sysid/pubid into (name, pubid, sysid) order."""
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """expat callback: forward to the DTD handler (base is dropped)."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """expat callback: forward to the DTD handler (base is dropped)."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """expat callback: resolve and parse an external general entity.

        Returns 1 on success, or immediately when the external-ges
        feature is off; returns 0 if parsing the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Save the current state and switch to a child parser for the entity.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE: the saved (parser, source) pair stays on the entity
            # stack and is not restored on this path.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """expat callback: forward to the content handler's skippedEntity."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")