• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2SAX driver for the pyexpat C module.  This driver works with
3pyexpat.__version__ == '2.22'.
4"""
5
6version = "0.20"
7
8from xml.sax._exceptions import *
9from xml.sax.handler import feature_validation, feature_namespaces
10from xml.sax.handler import feature_namespace_prefixes
11from xml.sax.handler import feature_external_ges, feature_external_pes
12from xml.sax.handler import feature_string_interning
13from xml.sax.handler import property_xml_string, property_interning_dict
14
15try:
16    from xml.parsers import expat
17except ImportError:
18    raise SAXReaderNotAvailable("expat not supported", None)
19else:
20    if not hasattr(expat, "ParserCreate"):
21        raise SAXReaderNotAvailable("expat not supported", None)
22from xml.sax import xmlreader, saxutils, handler
23
# Module-level shorthands for the attribute containers that the element
# callbacks below wrap expat's attribute mappings in.
AttributesNSImpl = xmlreader.AttributesNSImpl
AttributesImpl = xmlreader.AttributesImpl
26
# When the interpreter provides weak references, hand out weak proxies so
# that the parser and the content handler do not keep each other alive
# through a reference cycle.  Without weak references, fall back to
# returning the object itself.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    from weakref import proxy as _mkproxy
    # Only _mkproxy is meant to stay in the module namespace.
    del _weakref
39
class _ClosedParser:
    """Placeholder stored in place of a closed expat parser.

    ExpatParser.close() copies ErrorColumnNumber and ErrorLineNumber onto
    an instance of this class so the position stays readable after the
    real parser has been dropped.
    """
42
43# --- ExpatLocator
44
class ExpatLocator(xmlreader.Locator):
    """Locator handed to content handlers by ExpatParser.

    The owning parser is held through _mkproxy() (a weak proxy when weak
    references are available) so that the locator does not add a
    parser <-> content handler reference cycle.
    """

    def __init__(self, parser):
        """Remember *parser* (an ExpatParser) through a proxy."""
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's current column, or None when no parser exists."""
        owner = self._ref
        expat_parser = owner._parser
        return None if expat_parser is None else expat_parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 when no parser exists."""
        owner = self._ref
        expat_parser = owner._parser
        return 1 if expat_parser is None else expat_parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the owner's current source."""
        owner = self._ref
        return None if owner is None else owner._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the owner's current source."""
        owner = self._ref
        return None if owner is None else owner._source.getSystemId()
77
78
79# --- ExpatParser
80
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create an idle driver; the expat parser itself is built by reset().

        namespaceHandling -- initial state of the namespaces feature.
        bufsize -- passed through to IncrementalParser.__init__().
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = False
        self._entity_stack = []
        self._external_ges = 0
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # bpo-30264: Close the source on error to not leak resources:
            # xml.sax.parse() doesn't give access to the underlying parser
            # to the caller
            self._close_source()
            raise

    def prepareParser(self, source):
        """Give expat the source's system id as base, when it has one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install *handler*; rebind expat's callbacks if already parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes; raises SAXNotRecognizedException for unknown names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotSupportedException when the XML string is requested
        while no parser exists, and SAXNotRecognizedException for unknown
        names or when the parser object lacks GetInputContext.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        if name == property_interning_dict:
            return self._interning
        if name == property_xml_string:
            if not self._parser:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
            if hasattr(self._parser, "GetInputContext"):
                return self._parser.GetInputContext()
            raise SAXNotRecognizedException(
                "This version of expat does not support getting"
                " the XML string")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for unknown names.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=False):
        """Pass a chunk of *data* to expat, starting a document if needed.

        expat errors are wrapped in SAXParseException and reported through
        the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = True
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def flush(self):
        """Make expat process the input it has been given so far.

        Reparse deferral is switched off around an empty, non-final
        Parse() call and then restored to its previous setting.  Does
        nothing when there is no live expat parser.
        """
        # After a failed close() the parser is a _ClosedParser placeholder
        # that carries only the error position, so there is nothing to flush.
        if self._parser is None or isinstance(self._parser, _ClosedParser):
            return

        was_enabled = self._parser.GetReparseDeferralEnabled()
        try:
            self._parser.SetReparseDeferralEnabled(False)
            self._parser.Parse(b"", False)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            self._err_handler.fatalError(exc)
        finally:
            self._parser.SetReparseDeferralEnabled(was_enabled)

    def _close_source(self):
        """Close the character and byte streams of the current source."""
        source = self._source
        try:
            file = source.getCharacterStream()
            if file is not None:
                file.close()
        finally:
            # The byte stream is closed even if closing the character
            # stream raised.
            file = source.getByteStream()
            if file is not None:
                file.close()

    def close(self):
        """Finish the document: final feed, endDocument(), close the source.

        Does nothing while an external entity is being processed, or when
        there is no parser or it has already been closed.
        """
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed(b"", isFinal=True)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = False
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks at the handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Attach the lexical handler's callbacks to expat, or clear them."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD takes its arguments in a different order than expat
            # supplies them, so it goes through start_doctype_decl.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire this driver's callbacks to it."""
        if self._namespaces:
            # " " is the separator expat puts between the parts of a name;
            # the *_ns callbacks split on it.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = False
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's current column, or None when no parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 when no parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current source."""
        return self._source.getSystemId()

    # event handlers

    @staticmethod
    def _split_ns_name(name):
        """Turn an expat element name into a (uri, localname) tuple."""
        parts = name.split()
        if len(parts) == 1:
            # no namespace
            return (None, name)
        if len(parts) == 3:
            # uri, localname and prefix: the prefix is dropped
            return parts[0], parts[1]
        # default namespace
        return tuple(parts)

    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a namespace-aware start tag through startElementNS()."""
        pair = self._split_ns_name(name)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                # uri, localname and prefix: rebuild "prefix:localname"
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report a namespace-aware end tag through endElementNS()."""
        self._cont_handler.endElementNS(self._split_ns_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # startDTD takes the public id before the system id.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external general entity with a child expat parser.

        Returns 1 when the entity was skipped (external general entities
        disabled) or parsed, and 0 when parsing it raised.  The enclosing
        parser and source are reinstated in every case.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Create the child parser before pushing, so that a failure here
        # cannot leave a stray entry on the entity stack.
        entity_parser = self._parser.ExternalEntityParserCreate(context)
        self._entity_stack.append((self._parser, self._source))
        self._parser = entity_parser
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # Deliberate catch-all, as before.
            return 0  # FIXME: save error info here?
        finally:
            # Restore the enclosing parser and source on failure as well;
            # otherwise close() stays a no-op (non-empty entity stack) and
            # error cleanup would act on the entity source instead of the
            # document source.
            (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
441
442# ---
443
def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    parser = ExpatParser(*args, **kwargs)
    return parser
446
447# ---
448
if __name__ == "__main__":
    # Manual smoke test: fetch a sample document over HTTP and echo it back
    # as XML through XMLGenerator.
    import xml.sax.saxutils
    p = create_parser()
    generator = xml.sax.saxutils.XMLGenerator()
    p.setContentHandler(generator)
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
455