import itertools
import os
import re
import sys


def get_characters():
    """Find every Unicode character that is valid in a Python `identifier`_ but
    is not matched by the regex ``\\w`` group.

    ``\\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so we only test for
    continue characters.

    Yields one-character strings in ascending code point order, because the
    scan walks ``range(sys.maxunicode + 1)`` from low to high.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once: the loop below runs for every code point, so the pattern
    # lookup is hoisted out of it.
    word_char = re.compile(r"\w")

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        # Prefix with "a" so ``s`` is tested as a *continue* character rather
        # than as the first character of an identifier.
        if ("a" + s).isidentifier() and not word_char.match(s):
            yield s


def collapse_ranges(data):
    """Given a sorted iterable of unique characters, generate ranges
    representing sequential code points.

    Each yielded item is a ``(first, last)`` tuple of characters; a run of
    length one yields the same character twice. Any iterable is accepted
    (``main`` passes a generator), and an empty input yields nothing.

    Source: https://stackoverflow.com/a/4629241/400617
    """
    # Within a run of consecutive code points, ``ord(char) - index`` is
    # constant, so grouping by that difference splits the input into runs.
    for _, group in itertools.groupby(enumerate(data), lambda x: ord(x[1]) - x[0]):
        run = list(group)
        yield run[0][1], run[-1][1]


def build_pattern(ranges):
    """Output the regex pattern for ranges of characters.

    ``ranges`` is an iterable of ``(first, last)`` character pairs, as
    produced by :func:`collapse_ranges`. The result is the body of a regex
    character class (without the surrounding brackets).

    One and two character ranges output the individual characters.
    """
    out = []

    for a, b in ranges:
        if a == b:  # single char
            out.append(a)
        elif ord(b) - ord(a) == 1:  # two chars, range is redundant
            out.append(a)
            out.append(b)
        else:
            out.append(f"{a}-{b}")

    return "".join(out)


def main():
    """Build the regex pattern and write it to
    ``src/jinja2/_identifier.py``, resolved relative to the parent of the
    directory containing this script.
    """
    pattern = build_pattern(collapse_ranges(get_characters()))
    filename = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "src", "jinja2", "_identifier.py")
    )

    with open(filename, "w", encoding="utf8") as f:
        f.write("import re\n\n")
        f.write("# generated by scripts/generate_identifier_pattern.py\n")
        f.write("pattern = re.compile(\n")
        # NOTE(review): the whitespace inside this emitted line is kept exactly
        # as found; confirm it matches the formatting expected of the
        # generated file.
        f.write(f' r"[\\w{pattern}]+" # noqa: B950\n')
        f.write(")\n")


if __name__ == "__main__":
    main()