#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
#   Doc/library/token-list.inc
#   Include/internal/pycore_token.h
#   Parser/token.c
#   Lib/token.py


SCRIPT_NAME = 'Tools/build/generate_token.py'
AUTO_GENERATED_BY_SCRIPT = f'Auto-generated by {SCRIPT_NAME}'
NT_OFFSET = 256

# Parse Grammar/Tokens: one token name per line, optionally followed by the
# quoted string form of the token.  Token values are assigned in file order.
def load_tokens(path):
    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            # strip comments
            i = line.find('#')
            if i >= 0:
                line = line[:i].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            string = fields[1] if len(fields) > 1 else None
            if string:
                string = eval(string)
                string_to_tok[string] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok


# Write `content` to `file` only if it differs from what is already there;
# return True if the file was rewritten.
def update_file(file, content):
    try:
        with open(file, 'r') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        pass
    with open(file, 'w') as fobj:
        fobj.write(content)
    return True


token_h_template = f"""\
// {AUTO_GENERATED_BY_SCRIPT}
"""
token_h_template += """\

/* Token types */
#ifndef Py_INTERNAL_TOKEN_H
#define Py_INTERNAL_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
#  error "this header requires Py_BUILD_CORE define"
#endif

#undef TILDE   /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */

%s\
#define N_TOKENS        %d
#define NT_OFFSET       %d

/* Special definitions for cooperation with parser */

#define ISTERMINAL(x)           ((x) < NT_OFFSET)
#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
#define ISEOF(x)                ((x) == ENDMARKER)
#define ISWHITESPACE(x)         ((x) == ENDMARKER || \\
                                 (x) == NEWLINE   || \\
                                 (x) == INDENT    || \\
                                 (x) == DEDENT)
#define ISSTRINGLIT(x)          ((x) == STRING || \\
                                 (x) == FSTRING_MIDDLE)


// Export these 4 symbols for 'test_peg_generator'
PyAPI_DATA(const char * const) _PyParser_TokenNames[];  /* Token names */
PyAPI_FUNC(int) _PyToken_OneChar(int);
PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);

#ifdef __cplusplus
}
#endif
#endif  // !Py_INTERNAL_TOKEN_H
"""

def make_h(infile, outfile='Include/internal/pycore_token.h'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    defines = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        defines.append("#define %-15s %d\n" % (name, value))

    if update_file(outfile, token_h_template % (
            ''.join(defines),
            len(tok_names),
            NT_OFFSET
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_c_template = f"""\
/* {AUTO_GENERATED_BY_SCRIPT} */
"""
token_c_template += """\

#include "Python.h"
#include "pycore_token.h"

/* Token names */

const char * const _PyParser_TokenNames[] = {
%s\
};

/* Return the token corresponding to a single character */

int
_PyToken_OneChar(int c1)
{
%s\
    return OP;
}

int
_PyToken_TwoChars(int c1, int c2)
{
%s\
    return OP;
}

int
_PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
    return OP;
}
"""

# Emit a nested C switch statement that maps the n-th character of an
# operator string to the corresponding token name.
def generate_chars_to_token(mapping, n=1):
    result = []
    write = result.append
    indent = '    ' * n
    write(indent)
    write('switch (c%d) {\n' % (n,))
    for c in sorted(mapping):
        write(indent)
        value = mapping[c]
        if isinstance(value, dict):
            write("case '%s':\n" % (c,))
            write(generate_chars_to_token(value, n + 1))
            write(indent)
            write('    break;\n')
        else:
            write("case '%s': return %s;\n" % (c, value))
    write(indent)
    write('}\n')
    return ''.join(result)

def make_c(infile, outfile='Parser/token.c'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    # '<>' is an alternate spelling of '!=' and maps to the same token.
    string_to_tok['<>'] = string_to_tok['!=']
    chars_to_token = {}
    for string, value in string_to_tok.items():
        assert 1 <= len(string) <= 3
        name = tok_names[value]
        m = chars_to_token.setdefault(len(string), {})
        for c in string[:-1]:
            m = m.setdefault(c, {})
        m[string[-1]] = name

    names = []
    for value, name in enumerate(tok_names):
        if value >= ERRORTOKEN:
            name = '<%s>' % name
        names.append('    "%s",\n' % name)
    names.append('    "<N_TOKENS>",\n')

    if update_file(outfile, token_c_template % (
            ''.join(names),
            generate_chars_to_token(chars_to_token[1]),
            generate_chars_to_token(chars_to_token[2]),
            generate_chars_to_token(chars_to_token[3])
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_inc_template = f"""\
.. {AUTO_GENERATED_BY_SCRIPT}
%s
.. data:: N_TOKENS

.. data:: NT_OFFSET
"""

def make_rst(infile, outfile='Doc/library/token-list.inc'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    names = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        names.append('.. data:: %s' % (name,))
        if value in tok_to_string:
            names.append('')
            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
        names.append('')

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))


token_py_template = f'''\
"""Token constants."""
# {AUTO_GENERATED_BY_SCRIPT}
'''
token_py_template += '''
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF',
           'EXACT_TOKEN_TYPES']

%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d

tok_name = {value: name
            for name, value in globals().items()
            if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())

EXACT_TOKEN_TYPES = {
%s
}

def ISTERMINAL(x):
    return x < NT_OFFSET

def ISNONTERMINAL(x):
    return x >= NT_OFFSET

def ISEOF(x):
    return x == ENDMARKER
'''

def make_py(infile, outfile='Lib/token.py'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    constants = []
    for value, name in enumerate(tok_names):
        constants.append('%s = %d' % (name, value))
    constants.insert(ERRORTOKEN,
        "# These aren't used by the C tokenizer but are needed for tokenize.py")

    token_types = []
    for s, value in sorted(string_to_tok.items()):
        token_types.append('    %r: %s,' % (s, tok_names[value]))

    if update_file(outfile, token_py_template % (
            '\n'.join(constants),
            len(tok_names),
            NT_OFFSET,
            '\n'.join(token_types),
        )):
        print("%s regenerated from %s" % (outfile, infile))


def main(op, infile='Grammar/Tokens', *args):
    make = globals()['make_' + op]
    make(infile, *args)


if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])
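# Example invocations (a sketch based on main() above; the build rules that
# actually run this script live elsewhere and are not reproduced here).  The
# first argument names the generator ('h', 'c', 'rst' or 'py', dispatched to
# the matching make_* function), optionally followed by the input Tokens file
# and the output path:
#
#   python3 Tools/build/generate_token.py h  Grammar/Tokens Include/internal/pycore_token.h
#   python3 Tools/build/generate_token.py py Grammar/Tokens Lib/token.py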