#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.
# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See http://www.python.org/psf/license for licensing details.

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

import re

# Group 1 captures operators and quoted string literals, group 2 captures
# (optionally "{uri}"-qualified) names.  Whitespace matches neither group,
# so findall() reports it as the pair ('', '').
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, name)`` token pairs for the path *pattern*.

    Names written as ``prefix:local`` are rewritten to ``{uri}local`` using
    the *namespaces* mapping.  Unprefixed names receive the default
    namespace (the mapping entry for ``''``) unless they directly follow an
    ``@`` token.  Names that already start with ``{`` pass through as-is.

    Raises SyntaxError when a prefix is not present in *namespaces*.
    """
    default_namespace = namespaces.get('') if namespaces else None
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, local_name = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    yield ttype, "{%s}%s" % (namespaces[prefix], local_name)
                except KeyError:
                    raise SyntaxError(
                        "prefix %r not found in prefix map" % prefix
                    ) from None
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # Remember an '@' so the name that follows it is not given the
            # default namespace.
            parsing_attribute = ttype == '@'


def get_parent_map(context):
    """Return the child -> parent mapping for ``context.root``.

    The mapping is built on first use by walking ``context.root.iter()``
    and is stored on ``context.parent_map`` for later calls.
    """
    parent_map = context.parent_map
    if parent_map is None:
        context.parent_map = parent_map = {}
        for p in context.root.iter():
            for e in p:
                parent_map[e] = p
    return parent_map


def _is_wildcard_tag(tag):
    """Return True for tags of the form ``{*}name`` or ``{ns}*``."""
    return tag[:3] == '{*}' or tag[-2:] == '}*'


def _prepare_tag(tag):
    """Return a ``select(context, result)`` filter for a wildcard *tag*.

    Handles ``{*}*``, ``{}*``, ``{*}name`` and ``{ns}*``.  Any other value
    raises RuntimeError.
    """
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Slice rather than index so an empty tag string does not
                # raise IndexError.
                if _isinstance(el_tag, _str) and el_tag[:1] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]
        ns_only = slice(None, len(ns))
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select


def prepare_child(next, token):
    """Return a selector yielding the direct children named by *token*.

    ``token[1]`` is the tag; wildcard tags are delegated to _prepare_tag().
    """
    tag = token[1]
    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    yield from elem
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem:
                    if e.tag == tag:
                        yield e
    return select


def prepare_star(next, token):
    """Return a selector yielding every direct child of each element."""
    def select(context, result):
        for elem in result:
            yield from elem
    return select


def prepare_self(next, token):
    """Return a selector yielding the incoming elements unchanged."""
    def select(context, result):
        yield from result
    return select
def prepare_descendant(next, token):
    """Return a selector for the ``//`` step.

    The token after ``//`` is read with *next*; it must be ``*`` or a name.
    The selector yields every element below each incoming element (the
    element itself is excluded) that matches that tag.

    Raises SyntaxError when no usable token follows ``//``.
    """
    try:
        token = next()
    except StopIteration:
        # Returning None here would put a non-callable entry in the
        # compiled selector list.
        raise SyntaxError("invalid descendant") from None
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")

    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    for e in elem.iter():
                        if e is not elem:
                            yield e
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem.iter(tag):
                    if e is not elem:
                        yield e
    return select


def prepare_parent(next, token):
    """Return a selector for ``..`` yielding each distinct parent once."""
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parent_map = get_parent_map(context)
        result_map = {}  # parents already yielded
        for elem in result:
            if elem in parent_map:
                parent = parent_map[elem]
                if parent not in result_map:
                    result_map[parent] = None
                    yield parent
    return select


def prepare_predicate(next, token):
    """Return a selector for a ``[...]`` predicate.

    Tokens are read with *next* up to the closing ``]`` and reduced to a
    signature string, which selects one of the supported forms:
    ``[@attr]``, ``[@attr='value']``, ``[tag]``, ``[.='value']``,
    ``[tag='value']``, ``[index]``, ``[last()]`` and ``[last()-index]``.

    Raises SyntaxError for an unterminated or unsupported predicate.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while 1:
        try:
            token = next()
        except StopIteration:
            # The path ended before ']'.  Returning None here would put a
            # non-callable entry in the compiled selector list.
            raise SyntaxError("invalid predicate") from None
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # Quoted literal: record it as "'" and strip the quotes.
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])):
        # [.='value'] or [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        if tag:
            def select(context, result):
                for elem in result:
                    for e in elem.findall(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
        else:
            def select(context, result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
        return select
    if signature in ("-", "-()", "-()-"):
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                raise SyntaxError("XPath position >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression") from None
                if index > -2:
                    raise SyntaxError("XPath offset from last() must be negative")
            else:
                index = -1
        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    pass
        return select
    raise SyntaxError("invalid predicate")


# Maps the operator part of a token to the function that builds its selector.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

# Compiled selector lists, keyed by (path, *sorted namespace items).
_cache = {}


class _SelectorContext:
    """Per-query state: the root element and a lazily built parent map."""

    # Filled in by get_parent_map() on first use.
    parent_map = None

    def __init__(self, root):
        self.root = root

# --------------------------------------------------------------------

##
# Generate all matching objects.

def iterfind(elem, path, namespaces=None):
    """Return an iterator over the elements below *elem* matching *path*.

    The path is compiled into a list of selector functions, which is cached
    together with the namespace mapping.  A path that contains no tokens
    (the empty string) matches nothing.

    Raises SyntaxError for absolute or malformed paths.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*"  # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        cache_key += tuple(sorted(namespaces.items()))

    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        next_token = iter(xpath_tokenizer(path, namespaces)).__next__
        try:
            token = next_token()
        except StopIteration:
            # No tokens at all.  Return an empty iterator rather than None
            # so that find(), findall() and findtext() keep working.
            return iter(())
        selector = []
        while 1:
            try:
                selector.append(ops[token[0]](next_token, token))
            except StopIteration:
                raise SyntaxError("invalid path") from None
            try:
                token = next_token()
                if token[0] == "/":
                    token = next_token()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result

##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first element matching *path*, or None."""
    return next(iterfind(elem, path, namespaces), None)

##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path*."""
    return list(iterfind(elem, path, namespaces))

##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching *path*.

    Returns *default* when nothing matches, and "" when the matching
    element's text is None.
    """
    try:
        elem = next(iterfind(elem, path, namespaces))
    except StopIteration:
        return default
    # Only a missing text (None) maps to ""; other values are returned as-is.
    if elem.text is None:
        return ""
    return elem.text