# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented letter is written as an escape so the literal does not
# depend on how the source file's bytes are decoded.
__author__  = "St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    # Python 2: turn the byte-string author name into a unicode object.
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    # Types that parse() treats as "a file name" rather than an input source.
    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]
    def _d(s):
        """Decode a UTF-8 byte string coming from libxml2; None passes through."""
        if s is None:
            return s
        return _decoder(s)[0]
else:
    # Types that parse() treats as "a file name" rather than an input source.
    StringTypes = str
    # s is Unicode `str` already
    def _d(s):
        """Return *s* unchanged (nothing to decode on Python 3)."""
        return s

from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import \
     feature_namespaces, \
     feature_namespace_prefixes, \
     feature_string_interning, \
     feature_validation, \
     feature_external_ges, \
     feature_external_pes, \
     property_lexical_handler, \
     property_declaration_handler, \
     property_dom_node, \
     property_xml_string

try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])

class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        """Wrap *locator*, the object libxml2 hands to the error callback."""
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # Only line number and base URI are read from the wrapped locator,
        # so the column is always reported as unknown (-1).
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        # No public identifier is read from the wrapped locator.
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()

class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with reader.Read() and translates each
    node into calls on the content handler, the error handler and, when
    one was installed through setProperty(), the lexical handler.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the reader: queue one diagnostic.

        Diagnostics are only stored here; parse() forwards them to the
        SAX error handler after the Read() call that produced them.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward queued diagnostics to the SAX error handler.

        fatal -- true when the parse is about to stop; the last queued
                 non-warning is then reported through fatalError().
        """
        errors = self.__errors
        # Empty the queue before dispatching so that an error handler which
        # raises cannot leave stale entries behind for a later parse.
        self.__errors = None
        if not errors:
            return
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        source -- a string (handed to libxml2 as a file name) or anything
                  accepted by saxutils.prepare_input_source().

        Raises SAXException when the reader yields a node type this
        driver does not know; other problems go to the error handler.
        """
        self.__parsing = 1
        # Forget diagnostics left over from a parse that was aborted.
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; a name that merely starts with
                            # those letters is an ordinary attribute.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if qname == "xmlns":
                                    newPrefix = None
                                else:
                                    newPrefix = qname[6:]
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it and its mappings right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # Entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods.
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): confirm the libxml2 binding in use
                    # actually provides ResolveEntity on the reader.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # Close the reader even when a handler raised, so it is not
            # left open behind an aborted parse.
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException (no DTD events yet)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException (no resolver hook yet)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running or when
        asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set property *name*; only the lexical handler can be set.

        Raises SAXNotSupportedException for the declaration handler and
        SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

def create_parser():
    """Return a new LibXml2Reader.

    This is the factory that xml.sax.make_parser() calls on a driver
    module named in its parser list.
    """
    return LibXml2Reader()
