• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API
3
4USAGE
5    # put this file (drv_libxml2.py) in PYTHONPATH
6    import xml.sax
7    reader = xml.sax.make_parser(["drv_libxml2"])
8    # ...and the rest is standard python sax.
9
10CAVEATS
11    - Lexical handlers are supported, except for start/endEntity
12      (waiting for XmlReader.ResolveEntity) and start/endDTD
13    - Error callbacks are not exactly synchronous, they tend
14      to be invoked before the corresponding content callback,
15      because the underlying reader interface parses
16      data by chunks of 512 bytes
17
18TODO
19    - search for TODO
20    - some ErrorHandler events (warning)
21    - some ContentHandler events (setDocumentLocator, skippedEntity)
22    - EntityResolver (using libxml2.?)
23    - DTDHandler (if/when libxml2 exposes such node types)
24    - DeclHandler (if/when libxml2 exposes such node types)
25    - property_xml_string?
26    - feature_string_interning?
27    - Incremental parser
28    - additional performance tuning:
29      - one might cache callbacks to avoid some name lookups
30      - one might implement a smarter way to pass attributes to startElement
31        (some kind of lazy evaluation?)
32      - there might be room for improvement in start/endPrefixMapping
33      - other?
34
35"""
36
# The accented character is spelled as an escape so the value does not
# depend on the encoding in which this file is stored or served.
__author__  = "St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    # Python 2: the literal above is a byte string; convert it to a unicode
    # object (unicode_escape maps plain bytes to the same code points).
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    # Types that parse() treats as a file name / URL rather than a source.
    StringTypes = (str, unicode)
else:
    # Python 3: str is the only text type.
    StringTypes = str
49
50from xml.sax._exceptions import *
51from xml.sax import xmlreader, saxutils
52from xml.sax.handler import \
53     feature_namespaces, \
54     feature_namespace_prefixes, \
55     feature_string_interning, \
56     feature_validation, \
57     feature_external_ges, \
58     feature_external_pes, \
59     property_lexical_handler, \
60     property_declaration_handler, \
61     property_dom_node, \
62     property_xml_string
63
# libxml2 returns strings as UTF8 byte strings (at least on Python 2);
# text that is already decoded must be passed through untouched.
_decoder = codecs.lookup("utf8")[1]
def _d(s):
    """Return *s* as text.

    None is returned unchanged (the reader uses it for "no value").
    Byte strings are decoded as UTF-8.  Anything else is assumed to be
    text already and is returned as is; feeding such a value to the UTF-8
    decoder raises TypeError on Python 3.
    """
    if s is None:
        return s
    if isinstance(s, bytes):
        return _decoder(s)[0]
    return s
71
try:
    import libxml2
except ImportError:
    # Translate the failure into the SAX-level "driver unavailable"
    # exception, keeping the original import error text in the message.
    raise SAXReaderNotAvailable(
        "libxml2 not available: import error was: %s" % sys.exc_info()[1])
77
class Locator(xmlreader.Locator):
    """Present a libxml2 text-reader locator through the SAX Locator API.

    Line number and system id are read from the wrapped object; column
    number and public id are not taken from it and always have the fixed
    values -1 and None.
    """

    def __init__(self,locator):
        # wrapped object; it must offer LineNumber() and BaseURI()
        self.__locator = locator

    def getSystemId(self):
        "Return the system identifier for the current event."
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        "Return the public identifier for the current event (always None)."
        return None

    def getLineNumber(self):
        "Return the line number where the current event ends."
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        "Return the column number where the current event ends (always -1)."
        return -1
99
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document node by node and translates each node type
    into the matching ContentHandler / lexical handler callback.  Errors
    reported by libxml2 are collected by _errorHandler() and forwarded to
    the SAX ErrorHandler after each Read() call.

    Supported features: namespaces, namespace-prefixes, validation,
    external-general-entities (read-only, always on) and
    external-parameter-entities.  The lexical-handler property is
    supported; the declaration-handler property can be read but not set.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (non-zero while parse() is running)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) pairs not yet reported
        self.__errors = None

    def _errorHandler(self,arg,msg,severity,locator):
        """Callback registered on the reader: queue one error for later.

        arg is the user data given to SetErrorHandler (unused here).
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg,None,
                                                Locator(locator))))

    def _reportErrors(self,fatal):
        """Forward all queued errors to the SAX error handler.

        Warnings go to warning(), everything else to error(); when fatal
        is true the last queued exception goes to fatalError() instead.
        """
        # Detach the queue first: error handlers commonly raise, and the
        # queued entries must not survive into a later parse() call.
        errors, self.__errors = self.__errors, None
        last_exception = errors[-1][1]
        for severity,exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is last_exception:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def _lexicalEvent(self,event,name):
        """Call self.__lex_handler.<event>(name) if the handler has it.

        Used for startEntity/endEntity, which not every lexical handler
        implementation provides.
        """
        callback = getattr(self.__lex_handler,event,None)
        if callback is not None:
            callback(name)

    def parse(self, source):
        """Parse source and fire SAX events on the registered handlers.

        source is either a string (handed to libxml2 as a file name/URL)
        or anything accepted by saxutils.prepare_input_source().
        endDocument() is only sent when the reader reached the end of the
        input; the reader is closed in every case.
        Raises SAXException on a node type this driver does not know.
        """
        self.__parsing = 1
        # forget errors left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list per open
            # element, pushed on start and popped on the matching end
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; "xmlnsfoo" is a plain attribute
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix,value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close now
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close now
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # the event belongs to the lexical handler; this
                        # reader itself has no startEntity method
                        self._lexicalEvent("startEntity",_d(reader.Name()))
                    # NOTE(review): confirm the binding exposes
                    # ResolveEntity on the text reader
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self._lexicalEvent("endEntity",_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            try:
                # release the reader even when a handler or the parse
                # itself raised
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        "Not supported: always raises SAXNotSupportedException."
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        "Not supported: always raises SAXNotSupportedException."
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature name.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature name to state.

        Raises SAXNotSupportedException while a parse is running or when
        asked to switch external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property name.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property name to value.

        Only the lexical handler can be set.  Raises
        SAXNotSupportedException for the declaration handler and
        SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
374
def create_parser():
    """Driver factory: build and return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
377
378