1"""A collection of modules for iterating through different kinds of 2tree, generating tokens identical to those produced by the tokenizer 3module. 4 5To create a tree walker for a new type of tree, you need to do 6implement a tree walker object (called TreeWalker by convention) that 7implements a 'serialize' method taking a tree as sole argument and 8returning an iterator generating tokens. 9""" 10 11from __future__ import absolute_import, division, unicode_literals 12 13__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree", 14 "pulldom"] 15 16import sys 17 18from .. import constants 19from ..utils import default_etree 20 21treeWalkerCache = {} 22 23 24def getTreeWalker(treeType, implementation=None, **kwargs): 25 """Get a TreeWalker class for various types of tree with built-in support 26 27 treeType - the name of the tree type required (case-insensitive). Supported 28 values are: 29 30 "dom" - The xml.dom.minidom DOM implementation 31 "pulldom" - The xml.dom.pulldom event stream 32 "etree" - A generic walker for tree implementations exposing an 33 elementtree-like interface (known to work with 34 ElementTree, cElementTree and lxml.etree). 35 "lxml" - Optimized walker for lxml.etree 36 "genshi" - a Genshi stream 37 38 implementation - (Currently applies to the "etree" tree type only). A module 39 implementing the tree type e.g. xml.etree.ElementTree or 40 cElementTree.""" 41 42 treeType = treeType.lower() 43 if treeType not in treeWalkerCache: 44 if treeType in ("dom", "pulldom"): 45 name = "%s.%s" % (__name__, treeType) 46 __import__(name) 47 mod = sys.modules[name] 48 treeWalkerCache[treeType] = mod.TreeWalker 49 elif treeType == "genshi": 50 from . import genshistream 51 treeWalkerCache[treeType] = genshistream.TreeWalker 52 elif treeType == "lxml": 53 from . import lxmletree 54 treeWalkerCache[treeType] = lxmletree.TreeWalker 55 elif treeType == "etree": 56 from . import etree 57 if implementation is None: 58 implementation = default_etree 59 # XXX: NEVER cache here, caching is done in the etree submodule 60 return etree.getETreeModule(implementation, **kwargs).TreeWalker 61 return treeWalkerCache.get(treeType) 62 63 64def concatenateCharacterTokens(tokens): 65 pendingCharacters = [] 66 for token in tokens: 67 type = token["type"] 68 if type in ("Characters", "SpaceCharacters"): 69 pendingCharacters.append(token["data"]) 70 else: 71 if pendingCharacters: 72 yield {"type": "Characters", "data": "".join(pendingCharacters)} 73 pendingCharacters = [] 74 yield token 75 if pendingCharacters: 76 yield {"type": "Characters", "data": "".join(pendingCharacters)} 77 78 79def pprint(walker): 80 """Pretty printer for tree walkers""" 81 output = [] 82 indent = 0 83 for token in concatenateCharacterTokens(walker): 84 type = token["type"] 85 if type in ("StartTag", "EmptyTag"): 86 # tag name 87 if token["namespace"] and token["namespace"] != constants.namespaces["html"]: 88 if token["namespace"] in constants.prefixes: 89 ns = constants.prefixes[token["namespace"]] 90 else: 91 ns = token["namespace"] 92 name = "%s %s" % (ns, token["name"]) 93 else: 94 name = token["name"] 95 output.append("%s<%s>" % (" " * indent, name)) 96 indent += 2 97 # attributes (sorted for consistent ordering) 98 attrs = token["data"] 99 for (namespace, localname), value in sorted(attrs.items()): 100 if namespace: 101 if namespace in constants.prefixes: 102 ns = constants.prefixes[namespace] 103 else: 104 ns = namespace 105 name = "%s %s" % (ns, localname) 106 else: 107 name = localname 108 output.append("%s%s=\"%s\"" % (" " * indent, name, value)) 109 # self-closing 110 if type == "EmptyTag": 111 indent -= 2 112 113 elif type == "EndTag": 114 indent -= 2 115 116 elif type == "Comment": 117 output.append("%s<!-- %s -->" % (" " * indent, token["data"])) 118 119 elif type == "Doctype": 120 if token["name"]: 121 if token["publicId"]: 122 output.append("""%s<!DOCTYPE %s "%s" "%s">""" % 123 (" " * indent, 124 token["name"], 125 token["publicId"], 126 token["systemId"] if token["systemId"] else "")) 127 elif token["systemId"]: 128 output.append("""%s<!DOCTYPE %s "" "%s">""" % 129 (" " * indent, 130 token["name"], 131 token["systemId"])) 132 else: 133 output.append("%s<!DOCTYPE %s>" % (" " * indent, 134 token["name"])) 135 else: 136 output.append("%s<!DOCTYPE >" % (" " * indent,)) 137 138 elif type == "Characters": 139 output.append("%s\"%s\"" % (" " * indent, token["data"])) 140 141 elif type == "SpaceCharacters": 142 assert False, "concatenateCharacterTokens should have got rid of all Space tokens" 143 144 else: 145 raise ValueError("Unknown token type, %s" % type) 146 147 return "\n".join(output) 148