• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2SAX driver for the pyexpat C module.  This driver works with
3pyexpat.__version__ == '2.22'.
4"""
5
6version = "0.20"
7
8from xml.sax._exceptions import *
9from xml.sax.handler import feature_validation, feature_namespaces
10from xml.sax.handler import feature_namespace_prefixes
11from xml.sax.handler import feature_external_ges, feature_external_pes
12from xml.sax.handler import feature_string_interning
13from xml.sax.handler import property_xml_string, property_interning_dict
14
15# xml.parsers.expat does not raise ImportError in Jython
16import sys
17if sys.platform[:4] == "java":
18    raise SAXReaderNotAvailable("expat not available in Java", None)
19del sys
20
21try:
22    from xml.parsers import expat
23except ImportError:
24    raise SAXReaderNotAvailable("expat not supported", None)
25else:
26    if not hasattr(expat, "ParserCreate"):
27        raise SAXReaderNotAvailable("expat not supported", None)
28from xml.sax import xmlreader, saxutils, handler
29
# Module-level shortcuts for the attribute containers handed to handlers.
from xml.sax.xmlreader import AttributesImpl, AttributesNSImpl

# The locator needs to point back at the parser.  Where the interpreter
# provides weak references we use a weak proxy for that, so the parser and
# the content handler do not keep each other alive; where it does not, the
# "proxy" is simply the object itself.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    from weakref import proxy as _mkproxy
    del _weakref
45
class _ClosedParser:
    # Attribute holder that stands in for the expat parser object after
    # ExpatParser.close() when the real parser was not released: close()
    # copies ErrorColumnNumber and ErrorLineNumber onto an instance so the
    # locator methods keep returning the last known position.
    pass
48
49# --- ExpatLocator
50
class ExpatLocator(xmlreader.Locator):
    """Locator bound to an ExpatParser instance.

    The parser is held through _mkproxy() (a weak proxy where available)
    so that the locator does not create a circular reference between the
    parser and the content handler that receives the locator.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the current expat column, or None without a parser."""
        expat_parser = self._ref._parser
        if expat_parser is not None:
            return expat_parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return the current expat line, or 1 without a parser."""
        expat_parser = self._ref._parser
        if expat_parser is not None:
            return expat_parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public identifier of the parser's input source."""
        reader = self._ref
        return None if reader is None else reader._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the parser's input source."""
        reader = self._ref
        return None if reader is None else reader._source.getSystemId()
83
84
85# --- ExpatParser
86
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    The reader owns an expat parser object (created in reset()) and routes
    expat callbacks to the registered SAX content, DTD, entity-resolver,
    error and lexical handlers.  It also acts as its own Locator by
    reading the position attributes of the current expat parser.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Initialise reader state; no expat parser exists until reset().

        namespaceHandling -- initial value of the namespaces feature; a
            true value makes reset() install the namespace-aware element
            callbacks.
        bufsize -- forwarded unchanged to IncrementalParser.__init__().
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # Stack of (parser, source) pairs suspended while an external
        # entity is being parsed by a sub-parser.
        self._entity_stack = []
        self._external_ges = 1
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        """Parse an XML document from a URL or an InputSource."""
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Give expat the source's system id (if any) as its base URI."""
        if source.getSystemId() is not None:
            base = source.getSystemId()
            # Unicode system ids are handed to expat as UTF-8 bytes.
            if isinstance(base, unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Register a content handler, rebinding expat callbacks if a
        parse is already in progress."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of the SAX feature called name.

        Validation, external parameter entities and namespace prefixes
        always report 0.  Raises SAXNotRecognizedException for any
        feature name not handled here.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the SAX feature called name to state.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes; raises SAXNotRecognizedException for unknown names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the SAX property called name.

        For property_xml_string the result is whatever the expat parser's
        GetInputContext() returns.  Raises SAXNotSupportedException when
        there is no live expat parser to ask, and
        SAXNotRecognizedException when the parser lacks GetInputContext
        or the property name is unknown.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            parser = self._parser
            # After a failed close() self._parser is a _ClosedParser
            # placeholder: it is truthy but is not an expat parser, so it
            # must be treated the same as "no parser".
            if not parser or isinstance(parser, _ClosedParser):
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
            if hasattr(parser, "GetInputContext"):
                return parser.GetInputContext()
            raise SAXNotRecognizedException(
                "This version of expat does not support getting"
                " the XML string")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the SAX property called name to value.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for unknown names.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of document text to expat.

        The first call after a reset starts the document (reset(), then
        startDocument() on the content handler).  An expat error is
        wrapped in a SAXParseException located at this reader and handed
        to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document and release the expat parser.

        Does nothing while an external entity is being completed, when no
        parser exists, or when the reader is already closed.  If
        finishing fails, the parser is replaced by a _ClosedParser that
        remembers the last error position.
        """
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser

    def _reset_cont_handler(self):
        """Bind the expat callbacks that go straight to the content
        handler (processing instructions and character data)."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind or clear the expat callbacks served by the lexical
        handler property."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD needs its arguments reordered, so it goes through
            # start_doctype_decl instead of being bound directly.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and install every callback."""
        if self._namespaces:
            # A single space separates the parts of expanded names; the
            # *_ns callbacks split on it again.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the current expat column, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the current expat line, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        """Report a non-namespace start tag to the content handler."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """Report a non-namespace end tag to the content handler."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a namespace-aware start tag to the content handler.

        Element and attribute names arrive as whitespace-separated
        strings.  One part is a name without a namespace; three parts are
        read as (uri, localname, prefix); anything else is used as the
        (uri, localname) tuple directly.  The element's qname is always
        passed as None.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report a namespace-aware end tag; the name is split the same
        way as in start_element_ns()."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """Forward the start of a prefix mapping to the content handler."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """Forward the end of a prefix mapping to the content handler."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """Forward a DOCTYPE start to the lexical handler.

        The callback supplies sysid before pubid; startDTD() is called
        with (name, pubid, sysid).
        """
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """Forward an unparsed entity declaration to the DTD handler."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """Forward a notation declaration to the DTD handler."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Resolve and parse an external entity with a sub-parser.

        Returns 1 when the entity was skipped (external general entities
        disabled) or parsed successfully, and 0 when parsing it raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Suspend the current parser/source while the entity is parsed.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): deliberately broad and deliberately leaves the
            # sub-parser current (the entity stack is not popped), so the
            # error that expat raises next is located inside the entity.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """Report a skipped entity to the content handler."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
417
418# ---
419
def create_parser(*args, **kwargs):
    """Module entry point: build an ExpatParser, forwarding every
    positional and keyword argument to its constructor unchanged."""
    reader = ExpatParser(*args, **kwargs)
    return reader
422
423# ---
424
if __name__ == "__main__":
    # Manual smoke test: fetch a sample document and echo it back out
    # through XMLGenerator, with the default error handler installed.
    import xml.sax.saxutils
    reader = create_parser()
    reader.setContentHandler(xml.sax.saxutils.XMLGenerator())
    reader.setErrorHandler(xml.sax.ErrorHandler())
    reader.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
431