#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.
# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See https://www.python.org/psf/license for licensing details.

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

import re

# Each match is a 2-tuple (operator, tag).  Group 1 captures quoted
# strings and operator tokens, group 2 captures names (optionally in
# '{uri}name' form); whitespace matches neither group and therefore
# shows up as ('', '').
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"!=|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@!=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, tag)`` token pairs for the path *pattern*.

    *namespaces* is an optional mapping from prefix to namespace URI.
    A ``prefix:name`` tag is rewritten to ``{uri}name``.  If the mapping
    has an entry for the empty prefix ``''``, that URI is applied to
    unprefixed tags, except for a name that directly follows ``@``.

    Raises SyntaxError if a tag uses a prefix that is not in *namespaces*.
    """
    default_namespace = namespaces.get('') if namespaces else None
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, local = tag.split(":", 1)
                # Only the prefix lookup is guarded, so a KeyError coming
                # from anywhere else is never mistaken for a bad prefix.
                try:
                    if not namespaces:
                        raise KeyError
                    uri = namespaces[prefix]
                except KeyError:
                    raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
                yield ttype, "{%s}%s" % (uri, local)
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # Remember '@' so that the name which follows is not put
            # into the default namespace.
            parsing_attribute = ttype == '@'


def get_parent_map(context):
    """Return the child -> parent mapping for *context*, building it once.

    The mapping covers every element reachable from ``context.root`` and
    is stored on ``context.parent_map`` for reuse by later calls.
    """
    parent_map = context.parent_map
    if parent_map is None:
        context.parent_map = parent_map = {}
        for p in context.root.iter():
            for e in p:
                parent_map[e] = p
    return parent_map


def _is_wildcard_tag(tag):
    """Return True for tags of the form '{*}name' or '{ns}*'."""
    return tag[:3] == '{*}' or tag[-2:] == '}*'


def _prepare_tag(tag):
    """Return a selector that filters elements by the wildcard *tag*.

    The returned ``select(context, result)`` yields the elements of
    *result* that match.  Supported forms are '{*}*', '{}*', '{*}name'
    and '{ns}*'; anything else raises RuntimeError.
    """
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Slice rather than index: an empty-string tag must not
                # raise IndexError (it is simply not in a namespace).
                if _isinstance(el_tag, _str) and el_tag[:1] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]
        ns_only = slice(None, len(ns))
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select


def prepare_child(next, token):
    """Compile a ``tag`` step: select direct children whose tag matches.

    *token* is the ``(operator, tag)`` pair for the step; *next* is not
    used by this step.
    """
    tag = token[1]
    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    yield from elem
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem:
                    if e.tag == tag:
                        yield e
    return select


def prepare_star(next, token):
    """Compile a ``*`` step: select every direct child."""
    def select(context, result):
        for elem in result:
            yield from elem
    return select


def prepare_self(next, token):
    """Compile a ``.`` step: pass the current elements through unchanged."""
    def select(context, result):
        yield from result
    return select
182 183def prepare_descendant(next, token): 184 try: 185 token = next() 186 except StopIteration: 187 return 188 if token[0] == "*": 189 tag = "*" 190 elif not token[0]: 191 tag = token[1] 192 else: 193 raise SyntaxError("invalid descendant") 194 195 if _is_wildcard_tag(tag): 196 select_tag = _prepare_tag(tag) 197 def select(context, result): 198 def select_child(result): 199 for elem in result: 200 for e in elem.iter(): 201 if e is not elem: 202 yield e 203 return select_tag(context, select_child(result)) 204 else: 205 if tag[:2] == '{}': 206 tag = tag[2:] # '{}tag' == 'tag' 207 def select(context, result): 208 for elem in result: 209 for e in elem.iter(tag): 210 if e is not elem: 211 yield e 212 return select 213 214def prepare_parent(next, token): 215 def select(context, result): 216 # FIXME: raise error if .. is applied at toplevel? 217 parent_map = get_parent_map(context) 218 result_map = {} 219 for elem in result: 220 if elem in parent_map: 221 parent = parent_map[elem] 222 if parent not in result_map: 223 result_map[parent] = None 224 yield parent 225 return select 226 227def prepare_predicate(next, token): 228 # FIXME: replace with real parser!!! 
refs: 229 # http://effbot.org/zone/simple-iterator-parser.htm 230 # http://javascript.crockford.com/tdop/tdop.html 231 signature = [] 232 predicate = [] 233 while 1: 234 try: 235 token = next() 236 except StopIteration: 237 return 238 if token[0] == "]": 239 break 240 if token == ('', ''): 241 # ignore whitespace 242 continue 243 if token[0] and token[0][:1] in "'\"": 244 token = "'", token[0][1:-1] 245 signature.append(token[0] or "-") 246 predicate.append(token[1]) 247 signature = "".join(signature) 248 # use signature to determine predicate type 249 if signature == "@-": 250 # [@attribute] predicate 251 key = predicate[1] 252 def select(context, result): 253 for elem in result: 254 if elem.get(key) is not None: 255 yield elem 256 return select 257 if signature == "@-='" or signature == "@-!='": 258 # [@attribute='value'] or [@attribute!='value'] 259 key = predicate[1] 260 value = predicate[-1] 261 def select(context, result): 262 for elem in result: 263 if elem.get(key) == value: 264 yield elem 265 def select_negated(context, result): 266 for elem in result: 267 if (attr_value := elem.get(key)) is not None and attr_value != value: 268 yield elem 269 return select_negated if '!=' in signature else select 270 if signature == "-" and not re.match(r"\-?\d+$", predicate[0]): 271 # [tag] 272 tag = predicate[0] 273 def select(context, result): 274 for elem in result: 275 if elem.find(tag) is not None: 276 yield elem 277 return select 278 if signature == ".='" or signature == ".!='" or ( 279 (signature == "-='" or signature == "-!='") 280 and not re.match(r"\-?\d+$", predicate[0])): 281 # [.='value'] or [tag='value'] or [.!='value'] or [tag!='value'] 282 tag = predicate[0] 283 value = predicate[-1] 284 if tag: 285 def select(context, result): 286 for elem in result: 287 for e in elem.findall(tag): 288 if "".join(e.itertext()) == value: 289 yield elem 290 break 291 def select_negated(context, result): 292 for elem in result: 293 for e in elem.iterfind(tag): 294 if 
"".join(e.itertext()) != value: 295 yield elem 296 break 297 else: 298 def select(context, result): 299 for elem in result: 300 if "".join(elem.itertext()) == value: 301 yield elem 302 def select_negated(context, result): 303 for elem in result: 304 if "".join(elem.itertext()) != value: 305 yield elem 306 return select_negated if '!=' in signature else select 307 if signature == "-" or signature == "-()" or signature == "-()-": 308 # [index] or [last()] or [last()-index] 309 if signature == "-": 310 # [index] 311 index = int(predicate[0]) - 1 312 if index < 0: 313 raise SyntaxError("XPath position >= 1 expected") 314 else: 315 if predicate[0] != "last": 316 raise SyntaxError("unsupported function") 317 if signature == "-()-": 318 try: 319 index = int(predicate[2]) - 1 320 except ValueError: 321 raise SyntaxError("unsupported expression") 322 if index > -2: 323 raise SyntaxError("XPath offset from last() must be negative") 324 else: 325 index = -1 326 def select(context, result): 327 parent_map = get_parent_map(context) 328 for elem in result: 329 try: 330 parent = parent_map[elem] 331 # FIXME: what if the selector is "*" ? 332 elems = list(parent.findall(elem.tag)) 333 if elems[index] is elem: 334 yield elem 335 except (IndexError, KeyError): 336 pass 337 return select 338 raise SyntaxError("invalid predicate") 339 340ops = { 341 "": prepare_child, 342 "*": prepare_star, 343 ".": prepare_self, 344 "..": prepare_parent, 345 "//": prepare_descendant, 346 "[": prepare_predicate, 347 } 348 349_cache = {} 350 351class _SelectorContext: 352 parent_map = None 353 def __init__(self, root): 354 self.root = root 355 356# -------------------------------------------------------------------- 357 358## 359# Generate all matching objects. 360 361def iterfind(elem, path, namespaces=None): 362 # compile selector pattern 363 if path[-1:] == "/": 364 path = path + "*" # implicit all (FIXME: keep this?) 
365 366 cache_key = (path,) 367 if namespaces: 368 cache_key += tuple(sorted(namespaces.items())) 369 370 try: 371 selector = _cache[cache_key] 372 except KeyError: 373 if len(_cache) > 100: 374 _cache.clear() 375 if path[:1] == "/": 376 raise SyntaxError("cannot use absolute path on element") 377 next = iter(xpath_tokenizer(path, namespaces)).__next__ 378 try: 379 token = next() 380 except StopIteration: 381 return 382 selector = [] 383 while 1: 384 try: 385 selector.append(ops[token[0]](next, token)) 386 except StopIteration: 387 raise SyntaxError("invalid path") from None 388 try: 389 token = next() 390 if token[0] == "/": 391 token = next() 392 except StopIteration: 393 break 394 _cache[cache_key] = selector 395 # execute selector pattern 396 result = [elem] 397 context = _SelectorContext(elem) 398 for select in selector: 399 result = select(context, result) 400 return result 401 402## 403# Find first matching object. 404 405def find(elem, path, namespaces=None): 406 return next(iterfind(elem, path, namespaces), None) 407 408## 409# Find all matching objects. 410 411def findall(elem, path, namespaces=None): 412 return list(iterfind(elem, path, namespaces)) 413 414## 415# Find text for first matching object. 416 417def findtext(elem, path, default=None, namespaces=None): 418 try: 419 elem = next(iterfind(elem, path, namespaces)) 420 return elem.text or "" 421 except StopIteration: 422 return default 423