"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """

from . import handler

from ._exceptions import SAXNotSupportedException, SAXNotRecognizedException


# ===== XMLREADER =====

class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event."""

    def __init__(self):
        # Start with the default handler objects from the handler module
        # so that the getters always return something usable.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        "Parse an XML document from a system identifier or an InputSource."
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        "Returns the current ContentHandler."
        return self._cont_handler

    def setContentHandler(self, handler):
        "Registers a new object to receive document content events."
        self._cont_handler = handler

    def getDTDHandler(self):
        "Returns the current DTD handler."
        return self._dtd_handler

    def setDTDHandler(self, handler):
        "Register an object to receive basic DTD-related events."
        self._dtd_handler = handler

    def getEntityResolver(self):
        "Returns the current EntityResolver."
        return self._ent_handler

    def setEntityResolver(self, resolver):
        "Register an object to resolve external entities."
        self._ent_handler = resolver

    def getErrorHandler(self):
        "Returns the current ErrorHandler."
        return self._err_handler

    def setErrorHandler(self, handler):
        "Register an object to receive error-message events."
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must raise a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This base implementation always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Looks up and returns the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Sets the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Looks up and returns the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Sets the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)


class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        # bufsize is the size argument passed to each read() call made
        # by parse().
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse source by feeding it to the parser chunk by chunk.

        source is first passed through saxutils.prepare_input_source and
        then handed to prepareParser. Data is read from the character
        stream of the resulting input source, or from its byte stream
        when no character stream is set, using read(bufsize) until an
        empty result is returned. Every chunk is passed to feed, and
        close is called once the stream is exhausted."""
        # NOTE(review): imported at call time rather than at module
        # level, presumably to avoid a circular import - confirm before
        # moving it.
        from . import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        # A character stream takes precedence over a byte stream.
        stream = source.getCharacterStream()
        if stream is None:
            stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined."""
        raise NotImplementedError("This method must be implemented!")


# ===== LOCATOR =====

class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to ContentHandler methods; at any other time, the
    results are unpredictable.

    The methods of this base class return placeholder values: -1 for
    the column and line numbers, None for the identifiers."""

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return -1

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return None


# ===== INPUTSOURCE =====

class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those."""

    def __init__(self, system_id = None):
        # Only the system identifier can be supplied at construction
        # time; everything else starts out unset (None).
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    def setPublicId(self, public_id):
        "Sets the public identifier of this InputSource."
        self.__public_id = public_id

    def getPublicId(self):
        "Returns the public identifier of this InputSource."
        return self.__public_id

    def setSystemId(self, system_id):
        "Sets the system identifier of this InputSource."
        self.__system_id = system_id

    def getSystemId(self):
        "Returns the system identifier of this InputSource."
        return self.__system_id

    def setEncoding(self, encoding):
        """Sets the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        "Get the character encoding of this InputSource."
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Get the byte stream for this input source.

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source. (The stream
        must be a file-like object that performs the conversion to
        Unicode strings itself, so that reading from it yields decoded
        text rather than bytes.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        "Get the character stream for this input source."
        return self.__charfile


# ===== ATTRIBUTESIMPL =====

class AttributesImpl:
    """Attribute collection for elements reported without namespace
    processing; names and qualified names are the same strings."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        "Return the number of attributes."
        return len(self._attrs)

    def getType(self, name):
        """Return the type of the attribute name.

        This implementation reports "CDATA" for every name and does not
        check that the attribute exists."""
        return "CDATA"

    def getValue(self, name):
        "Return the value of attribute name; raise KeyError if absent."
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for the qualified name; raise KeyError if
        absent. Without namespaces the qualified name is the name."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the name for the qualified name, which here is the
        same string; raise KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the qualified name for name, which here is the same
        string; raise KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def getQNames(self):
        "Return a list of the qualified names (identical to the names)."
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def __contains__(self, name):
        return name in self._attrs

    def get(self, name, alternative=None):
        "Return the value of attribute name, or alternative if absent."
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new object of the same class with the same attributes.

        The underlying mapping is shallow-copied, so that the copy does
        not share its storage with this object: later changes to the
        mapping behind one of them are not visible through the other."""
        return self.__class__(dict(self._attrs))

    def items(self):
        "Return a list of (name, value) pairs."
        return list(self._attrs.items())

    def values(self):
        "Return a list of the attribute values."
        return list(self._attrs.values())


# ===== ATTRIBUTESNSIMPL =====

class AttributesNSImpl(AttributesImpl):
    """Attribute collection for elements reported with namespace
    processing; names are (ns_uri, lname) pairs that are mapped to
    qualified names by a second dictionary."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is
        name; raise KeyError if no attribute has that qualified name."""
        # _qnames maps names to qualified names, so the reverse lookup
        # has to scan the entries.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) pair whose qualified name is
        name; raise KeyError if no attribute has that qualified name."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name for the (ns_uri, lname) pair name;
        raise KeyError if absent."""
        return self._qnames[name]

    def getQNames(self):
        "Return a list of the qualified names."
        return list(self._qnames.values())

    def copy(self):
        """Return a new object of the same class with the same attributes.

        Both underlying mappings are shallow-copied, so that the copy
        does not share its storage with this object."""
        return self.__class__(dict(self._attrs), dict(self._qnames))


def _test():
    # Smoke test: the interface classes can be instantiated with their
    # default arguments.
    XMLReader()
    IncrementalParser()
    Locator()

if __name__ == "__main__":
    _test()