• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import itertools
2import os
3import re
4import sys
5
6
def get_characters():
    """Yield each Unicode character that may appear in a Python
    `identifier`_ yet is not matched by the regex ``\\w`` class.

    The opposite mismatch (``\\w`` accepting characters that identifiers
    reject) is not handled here; :meth:`str.isidentifier` is expected to
    catch those later, during lexing.

    Only the "continue" position is probed, by appending the candidate to
    ``"a"``: every valid start character is also a valid continue character.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Bind the matcher once; it is called for every identifier character.
    is_word_char = re.compile(r"\w").match

    for char in map(chr, range(sys.maxunicode + 1)):
        if not f"a{char}".isidentifier():
            continue

        if is_word_char(char) is None:
            yield char
24
25
def collapse_ranges(data):
    """Yield ``(first, last)`` pairs, one per run of consecutive code points.

    ``data`` is expected to be an iterable of unique characters in sorted
    order. A run continues for as long as ``ord(char) - index`` stays
    constant, which is exactly when each character is one code point above
    its predecessor.

    The offset trick comes from https://stackoverflow.com/a/4629241/400617
    """
    run_key = run_first = run_last = None

    for index, char in enumerate(data):
        key = ord(char) - index

        if run_first is None:
            # Very first character: open the initial run.
            run_key, run_first, run_last = key, char, char
        elif key == run_key:
            # Still consecutive: extend the current run.
            run_last = char
        else:
            # Gap found: emit the finished run and open a new one.
            yield run_first, run_last
            run_key, run_first, run_last = key, char, char

    if run_first is not None:
        yield run_first, run_last
35
36
def build_pattern(ranges):
    """Return the regex character-class body for ``ranges``.

    ``ranges`` is an iterable of ``(first, last)`` character pairs. A pair
    spanning one or two characters is written out as the characters
    themselves; anything longer is written as ``first-last``.
    """

    def render(first, last):
        if first == last:  # lone character
            return first

        if ord(last) - ord(first) == 1:  # adjacent pair, a dash adds nothing
            return first + last

        return f"{first}-{last}"

    return "".join(render(first, last) for first, last in ranges)
54
55
def main():
    """Generate the identifier regex and save it as a Python module.

    The character class is computed first; the result is then written to
    ``../src/jinja2/_identifier.py`` (relative to this script's directory),
    replacing any existing file.
    """
    # Compute the pattern before opening the file for writing.
    pattern = build_pattern(collapse_ranges(get_characters()))

    script_dir = os.path.dirname(__file__)
    target = os.path.abspath(
        os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
    )
    module_lines = (
        "import re\n\n",
        "# generated by scripts/generate_identifier_pattern.py\n",
        "pattern = re.compile(\n",
        f'    r"[\\w{pattern}]+"  # noqa: B950\n',
        ")\n",
    )

    with open(target, "w", encoding="utf8") as out_file:
        out_file.writelines(module_lines)
71
72
# Regenerate the pattern module only when this file is run as a script.
if __name__ == "__main__":
    main()
75