"""
Iterator based sre token scanner
"""
import re
import sre_compile
import sre_constants
import sre_parse
import sys
from re import DOTALL, MULTILINE, VERBOSE
from sre_constants import BRANCH, SUBPATTERN

__all__ = ['Scanner', 'pattern']

FLAGS = (VERBOSE | MULTILINE | DOTALL)

# The parser's shared-state class is exposed as ``State`` by newer
# interpreters and as ``Pattern`` by older ones; use whichever exists.
_PARSER_STATE = getattr(sre_parse, 'State', None) or sre_parse.Pattern

# NOTE(review): newer parsers describe a SUBPATTERN node as
# (group, add_flags, del_flags, subpattern), older ones as
# (group, subpattern).  The 3.6 cut-off is from memory of CPython's
# history -- confirm if very old interpreters still matter.
_SUBPATTERN_HAS_FLAGS = sys.version_info >= (3, 6)


class Scanner(object):
    """Match a lexicon of tokens against a string, one token at a time.

    Every lexicon entry is a callable carrying a ``pattern`` attribute
    (see the ``pattern`` decorator in this module).  All patterns are
    merged into one compiled alternation in which each entry is wrapped
    in its own numbered group, so ``match.lastindex`` identifies the
    entry that matched.

    NOTE(review): each phrase is parsed with its own parser state, so
    capturing groups written inside a phrase are numbered independently
    of the per-token groups and may collide with them.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile ``lexicon`` into a single scanner pattern.

        ``lexicon`` is an iterable of callables with a ``pattern``
        attribute holding regex source text; ``flags`` are the ``re``
        flags used to parse every phrase.  Invalid regex source makes
        the parser's error propagate to the caller.
        """
        # Index 0 is a placeholder: group numbers start at 1, so
        # ``self.actions[match.lastindex]`` maps straight to a token.
        self.actions = [None]
        flags = int(flags)
        state = _PARSER_STATE()
        state.flags = flags
        branches = []
        for token in lexicon:
            # opengroup() hands out 1, 2, 3, ... and keeps the state's
            # group count in step, which the compiler reads later.
            # closegroup() is deliberately not called: its signature
            # differs between interpreter versions and, as far as this
            # module is concerned, only the group count is consumed.
            gid = state.opengroup()
            parsed = sre_parse.parse(token.pattern, flags)
            if _SUBPATTERN_HAS_FLAGS:
                group_args = (gid, 0, 0, parsed)
            else:
                group_args = (gid, parsed)
            branches.append(
                sre_parse.SubPattern(state, [(SUBPATTERN, group_args)]))
            self.actions.append(token)

        combined = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        self.scanner = sre_compile.compile(combined)

    def iterscan(self, string, idx=0, context=None):
        """
        Yield match, end_idx for each match

        Scanning starts at ``idx``.  For every match the owning token is
        called as ``token(match, context)`` and must return
        ``(rval, next_pos)``; ``(rval, end_idx)`` is then yielded.  When
        ``next_pos`` is not ``None`` and differs from the end of the
        match, scanning resumes at ``next_pos`` and that position is
        reported as ``end_idx``.  Iteration stops at the first position
        where nothing matches, or when a match ends exactly where the
        previous one did (no progress).
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # No progress was made; stop instead of spinning.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend


def pattern(pattern, flags=FLAGS):
    """Return a decorator that attaches a regex to a token callable.

    The decorated function is returned unchanged apart from two new
    attributes: ``pattern`` (the source text, read by ``Scanner``) and
    ``regex`` (the same source compiled with ``flags``).
    """
    def decorator(fn):
        fn.pattern = pattern
        fn.regex = re.compile(pattern, flags)
        return fn
    return decorator