1from __future__ import absolute_import, division, unicode_literals 2from six import text_type 3 4from ..constants import scopingElements, tableInsertModeElements, namespaces 5 6# The scope markers are inserted when entering object elements, 7# marquees, table cells, and table captions, and are used to prevent formatting 8# from "leaking" into tables, object elements, and marquees. 9Marker = None 10 11listElementsMap = { 12 None: (frozenset(scopingElements), False), 13 "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False), 14 "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"), 15 (namespaces["html"], "ul")])), False), 16 "table": (frozenset([(namespaces["html"], "html"), 17 (namespaces["html"], "table")]), False), 18 "select": (frozenset([(namespaces["html"], "optgroup"), 19 (namespaces["html"], "option")]), True) 20} 21 22 23class Node(object): 24 def __init__(self, name): 25 """Node representing an item in the tree. 26 name - The tag name associated with the node 27 parent - The parent of the current node (or None for the document node) 28 value - The value of the current node (applies to text nodes and 29 comments 30 attributes - a dict holding name, value pairs for attributes of the node 31 childNodes - a list of child nodes of the current node. This must 32 include all elements but not necessarily other node types 33 _flags - A list of miscellaneous flags that can be set on the node 34 """ 35 self.name = name 36 self.parent = None 37 self.value = None 38 self.attributes = {} 39 self.childNodes = [] 40 self._flags = [] 41 42 def __str__(self): 43 attributesStr = " ".join(["%s=\"%s\"" % (name, value) 44 for name, value in 45 self.attributes.items()]) 46 if attributesStr: 47 return "<%s %s>" % (self.name, attributesStr) 48 else: 49 return "<%s>" % (self.name) 50 51 def __repr__(self): 52 return "<%s>" % (self.name) 53 54 def appendChild(self, node): 55 """Insert node as a child of the current node 56 """ 57 raise NotImplementedError 58 59 def insertText(self, data, insertBefore=None): 60 """Insert data as text in the current node, positioned before the 61 start of node insertBefore or to the end of the node's text. 62 """ 63 raise NotImplementedError 64 65 def insertBefore(self, node, refNode): 66 """Insert node as a child of the current node, before refNode in the 67 list of child nodes. Raises ValueError if refNode is not a child of 68 the current node""" 69 raise NotImplementedError 70 71 def removeChild(self, node): 72 """Remove node from the children of the current node 73 """ 74 raise NotImplementedError 75 76 def reparentChildren(self, newParent): 77 """Move all the children of the current node to newParent. 78 This is needed so that trees that don't store text as nodes move the 79 text in the correct way 80 """ 81 # XXX - should this method be made more general? 82 for child in self.childNodes: 83 newParent.appendChild(child) 84 self.childNodes = [] 85 86 def cloneNode(self): 87 """Return a shallow copy of the current node i.e. a node with the same 88 name and attributes but with no parent or child nodes 89 """ 90 raise NotImplementedError 91 92 def hasContent(self): 93 """Return true if the node has children or text, false otherwise 94 """ 95 raise NotImplementedError 96 97 98class ActiveFormattingElements(list): 99 def append(self, node): 100 equalCount = 0 101 if node != Marker: 102 for element in self[::-1]: 103 if element == Marker: 104 break 105 if self.nodesEqual(element, node): 106 equalCount += 1 107 if equalCount == 3: 108 self.remove(element) 109 break 110 list.append(self, node) 111 112 def nodesEqual(self, node1, node2): 113 if not node1.nameTuple == node2.nameTuple: 114 return False 115 116 if not node1.attributes == node2.attributes: 117 return False 118 119 return True 120 121 122class TreeBuilder(object): 123 """Base treebuilder implementation 124 documentClass - the class to use for the bottommost node of a document 125 elementClass - the class to use for HTML Elements 126 commentClass - the class to use for comments 127 doctypeClass - the class to use for doctypes 128 """ 129 130 # Document class 131 documentClass = None 132 133 # The class to use for creating a node 134 elementClass = None 135 136 # The class to use for creating comments 137 commentClass = None 138 139 # The class to use for creating doctypes 140 doctypeClass = None 141 142 # Fragment class 143 fragmentClass = None 144 145 def __init__(self, namespaceHTMLElements): 146 if namespaceHTMLElements: 147 self.defaultNamespace = "http://www.w3.org/1999/xhtml" 148 else: 149 self.defaultNamespace = None 150 self.reset() 151 152 def reset(self): 153 self.openElements = [] 154 self.activeFormattingElements = ActiveFormattingElements() 155 156 # XXX - rename these to headElement, formElement 157 self.headPointer = None 158 self.formPointer = None 159 160 self.insertFromTable = False 161 162 self.document = self.documentClass() 163 164 def elementInScope(self, target, variant=None): 165 166 # If we pass a node in we match that. if we pass a string 167 # match any node with that name 168 exactNode = hasattr(target, "nameTuple") 169 170 listElements, invert = listElementsMap[variant] 171 172 for node in reversed(self.openElements): 173 if (node.name == target and not exactNode or 174 node == target and exactNode): 175 return True 176 elif (invert ^ (node.nameTuple in listElements)): 177 return False 178 179 assert False # We should never reach this point 180 181 def reconstructActiveFormattingElements(self): 182 # Within this algorithm the order of steps described in the 183 # specification is not quite the same as the order of steps in the 184 # code. It should still do the same though. 185 186 # Step 1: stop the algorithm when there's nothing to do. 187 if not self.activeFormattingElements: 188 return 189 190 # Step 2 and step 3: we start with the last element. So i is -1. 191 i = len(self.activeFormattingElements) - 1 192 entry = self.activeFormattingElements[i] 193 if entry == Marker or entry in self.openElements: 194 return 195 196 # Step 6 197 while entry != Marker and entry not in self.openElements: 198 if i == 0: 199 # This will be reset to 0 below 200 i = -1 201 break 202 i -= 1 203 # Step 5: let entry be one earlier in the list. 204 entry = self.activeFormattingElements[i] 205 206 while True: 207 # Step 7 208 i += 1 209 210 # Step 8 211 entry = self.activeFormattingElements[i] 212 clone = entry.cloneNode() # Mainly to get a new copy of the attributes 213 214 # Step 9 215 element = self.insertElement({"type": "StartTag", 216 "name": clone.name, 217 "namespace": clone.namespace, 218 "data": clone.attributes}) 219 220 # Step 10 221 self.activeFormattingElements[i] = element 222 223 # Step 11 224 if element == self.activeFormattingElements[-1]: 225 break 226 227 def clearActiveFormattingElements(self): 228 entry = self.activeFormattingElements.pop() 229 while self.activeFormattingElements and entry != Marker: 230 entry = self.activeFormattingElements.pop() 231 232 def elementInActiveFormattingElements(self, name): 233 """Check if an element exists between the end of the active 234 formatting elements and the last marker. If it does, return it, else 235 return false""" 236 237 for item in self.activeFormattingElements[::-1]: 238 # Check for Marker first because if it's a Marker it doesn't have a 239 # name attribute. 240 if item == Marker: 241 break 242 elif item.name == name: 243 return item 244 return False 245 246 def insertRoot(self, token): 247 element = self.createElement(token) 248 self.openElements.append(element) 249 self.document.appendChild(element) 250 251 def insertDoctype(self, token): 252 name = token["name"] 253 publicId = token["publicId"] 254 systemId = token["systemId"] 255 256 doctype = self.doctypeClass(name, publicId, systemId) 257 self.document.appendChild(doctype) 258 259 def insertComment(self, token, parent=None): 260 if parent is None: 261 parent = self.openElements[-1] 262 parent.appendChild(self.commentClass(token["data"])) 263 264 def createElement(self, token): 265 """Create an element but don't insert it anywhere""" 266 name = token["name"] 267 namespace = token.get("namespace", self.defaultNamespace) 268 element = self.elementClass(name, namespace) 269 element.attributes = token["data"] 270 return element 271 272 def _getInsertFromTable(self): 273 return self._insertFromTable 274 275 def _setInsertFromTable(self, value): 276 """Switch the function used to insert an element from the 277 normal one to the misnested table one and back again""" 278 self._insertFromTable = value 279 if value: 280 self.insertElement = self.insertElementTable 281 else: 282 self.insertElement = self.insertElementNormal 283 284 insertFromTable = property(_getInsertFromTable, _setInsertFromTable) 285 286 def insertElementNormal(self, token): 287 name = token["name"] 288 assert isinstance(name, text_type), "Element %s not unicode" % name 289 namespace = token.get("namespace", self.defaultNamespace) 290 element = self.elementClass(name, namespace) 291 element.attributes = token["data"] 292 self.openElements[-1].appendChild(element) 293 self.openElements.append(element) 294 return element 295 296 def insertElementTable(self, token): 297 """Create an element and insert it into the tree""" 298 element = self.createElement(token) 299 if self.openElements[-1].name not in tableInsertModeElements: 300 return self.insertElementNormal(token) 301 else: 302 # We should be in the InTable mode. This means we want to do 303 # special magic element rearranging 304 parent, insertBefore = self.getTableMisnestedNodePosition() 305 if insertBefore is None: 306 parent.appendChild(element) 307 else: 308 parent.insertBefore(element, insertBefore) 309 self.openElements.append(element) 310 return element 311 312 def insertText(self, data, parent=None): 313 """Insert text data.""" 314 if parent is None: 315 parent = self.openElements[-1] 316 317 if (not self.insertFromTable or (self.insertFromTable and 318 self.openElements[-1].name 319 not in tableInsertModeElements)): 320 parent.insertText(data) 321 else: 322 # We should be in the InTable mode. This means we want to do 323 # special magic element rearranging 324 parent, insertBefore = self.getTableMisnestedNodePosition() 325 parent.insertText(data, insertBefore) 326 327 def getTableMisnestedNodePosition(self): 328 """Get the foster parent element, and sibling to insert before 329 (or None) when inserting a misnested table node""" 330 # The foster parent element is the one which comes before the most 331 # recently opened table element 332 # XXX - this is really inelegant 333 lastTable = None 334 fosterParent = None 335 insertBefore = None 336 for elm in self.openElements[::-1]: 337 if elm.name == "table": 338 lastTable = elm 339 break 340 if lastTable: 341 # XXX - we should really check that this parent is actually a 342 # node here 343 if lastTable.parent: 344 fosterParent = lastTable.parent 345 insertBefore = lastTable 346 else: 347 fosterParent = self.openElements[ 348 self.openElements.index(lastTable) - 1] 349 else: 350 fosterParent = self.openElements[0] 351 return fosterParent, insertBefore 352 353 def generateImpliedEndTags(self, exclude=None): 354 name = self.openElements[-1].name 355 # XXX td, th and tr are not actually needed 356 if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) 357 and name != exclude): 358 self.openElements.pop() 359 # XXX This is not entirely what the specification says. We should 360 # investigate it more closely. 361 self.generateImpliedEndTags(exclude) 362 363 def getDocument(self): 364 "Return the final tree" 365 return self.document 366 367 def getFragment(self): 368 "Return the final fragment" 369 # assert self.innerHTML 370 fragment = self.fragmentClass() 371 self.openElements[0].reparentChildren(fragment) 372 return fragment 373 374 def testSerializer(self, node): 375 """Serialize the subtree of node in the format required by unit tests 376 node - the node from which to start serializing""" 377 raise NotImplementedError 378