1#!/usr/bin/env python 2# Copyright 2018 the V8 project authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6import os 7import sys 8import subprocess 9import re 10import math 11 12INPUT_PATH = "src/parsing/keywords.txt" 13OUTPUT_PATH = "src/parsing/keywords-gen.h" 14 15# TODO(leszeks): Trimming seems to regress performance, investigate. 16TRIM_CHAR_TABLE = False 17 18 19def next_power_of_2(x): 20 return 1 if x == 0 else 2**int(math.ceil(math.log(x, 2))) 21 22 23def call_with_input(cmd, input_string=""): 24 p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) 25 stdout, _ = p.communicate(input_string) 26 retcode = p.wait() 27 if retcode != 0: 28 raise subprocess.CalledProcessError(retcode, cmd) 29 return stdout 30 31 32def checked_sub(pattern, sub, out, count=1, flags=0): 33 out, n = re.subn(pattern, sub, out, flags=flags) 34 if n != count: 35 raise Exception("Didn't get exactly %d replacement(s) for pattern: %s" % 36 (count, pattern)) 37 return out 38 39 40def change_sizet_to_int(out): 41 # Literal buffer lengths are given as ints, not size_t 42 return checked_sub(r'\bsize_t\b', 'int', out, count=4) 43 44 45def drop_line_directives(out): 46 # #line causes gcov issue, so drop it 47 return re.sub(r'^#\s*line .*$\n', '', out, flags=re.MULTILINE) 48 49 50def trim_and_dcheck_char_table(out): 51 # Potential keyword strings are known to be lowercase ascii, so chop off the 52 # rest of the table and mask out the char 53 54 reads_re = re.compile( 55 r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]') 56 57 dchecks = [] 58 for str_read in reads_re.finditer(out): 59 dchecks.append("DCHECK_LT(str[%d], 128);" % int(str_read.group(1))) 60 61 if TRIM_CHAR_TABLE: 62 out = checked_sub( 63 r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}', 64 "".join(dchecks) + r'static const unsigned char asso_values[32] = {', 65 out, 66 flags=re.MULTILINE) 67 out = checked_sub( 68 reads_re.pattern, 69 r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]', 70 out, 71 count=len(dchecks), 72 flags=re.MULTILINE) 73 else: 74 out = checked_sub( 75 r'static const unsigned char asso_values\[\]\s*=\s*\{', 76 "".join(dchecks) + r'static const unsigned char asso_values[128] = {', 77 out, 78 flags=re.MULTILINE) 79 80 return out 81 82 83def use_isinrange(out): 84 # Our IsInRange method is more efficient than checking for min/max length 85 return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)', 86 r'if (base::IsInRange(len, MIN_WORD_LENGTH, ' 87 + r'MAX_WORD_LENGTH))', 88 out) 89 90 91def pad_tables(out): 92 # We don't want to compare against the max hash value, so pad the tables up 93 # to a power of two and mask the hash. 94 95 # First get the new size 96 max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1)) 97 old_table_length = max_hash_value + 1 98 new_table_length = next_power_of_2(old_table_length) 99 table_padding_len = new_table_length - old_table_length 100 101 # Pad the length table. 102 single_lengthtable_entry = r'\d+' 103 out = checked_sub( 104 r""" 105 static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{ 106 ( 107 \s*%(single_lengthtable_entry)s\s* 108 (?:,\s*%(single_lengthtable_entry)s\s*)* 109 ) 110 \} 111 """ % {'single_lengthtable_entry': single_lengthtable_entry}, 112 r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }' 113 % (new_table_length, "".join([',0'] * table_padding_len)), 114 out, 115 flags=re.MULTILINE | re.VERBOSE) 116 117 # Pad the word list. 118 single_wordlist_entry = r""" 119 (?:\#line\ \d+\ ".*"$\s*)? 120 \{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\} 121 """ 122 out = checked_sub( 123 r""" 124 static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{ 125 ( 126 \s*%(single_wordlist_entry)s\s* 127 (?:,\s*%(single_wordlist_entry)s\s*)* 128 ) 129 \} 130 """ % {'single_wordlist_entry': single_wordlist_entry}, 131 r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }' 132 % (new_table_length, "".join( 133 [',{"",Token::IDENTIFIER}'] * table_padding_len)), 134 out, 135 flags=re.MULTILINE | re.VERBOSE) 136 137 # Mask the hash and replace the range check with DCHECKs. 138 out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)', 139 r'Hash(str, len)&0x%x' % (new_table_length - 1), out) 140 out = checked_sub( 141 r'if \(key <= MAX_HASH_VALUE\)', 142 r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));', 143 out) 144 145 return out 146 147 148def return_token(out): 149 # We want to return the actual token rather than the table entry. 150 151 # Change the return type of the function. Make it inline too. 152 out = checked_sub( 153 r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)', 154 r'inline Token::Value \1', 155 out, 156 count=2) 157 158 # Change the return value when the keyword is found 159 out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];', 160 r'return kPerfectKeywordHashTable[key].value;', out) 161 162 # Change the return value when the keyword is not found 163 out = checked_sub(r'return 0;', r'return Token::IDENTIFIER;', out) 164 165 return out 166 167 168def memcmp_to_while(out): 169 # It's faster to loop over the keyword with a while loop than calling memcmp. 170 # Careful, this replacement is quite flaky, because otherwise the regex is 171 # unreadable. 172 return checked_sub( 173 re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))") + r"\s*" 174 + re.escape("return kPerfectKeywordHashTable[key].value;"), 175 """ 176 while(*s!=0) { 177 if (*s++ != *str++) return Token::IDENTIFIER; 178 } 179 return kPerfectKeywordHashTable[key].value; 180 """, 181 out, 182 flags=re.MULTILINE) 183 184 185def wrap_namespace(out): 186 return """// Copyright 2018 the V8 project authors. All rights reserved. 187// Use of this source code is governed by a BSD-style license that can be 188// found in the LICENSE file. 189 190// This file is automatically generated by gen-keywords-gen-h.py and should not 191// be modified manually. 192 193#ifndef V8_PARSING_KEYWORDS_GEN_H_ 194#define V8_PARSING_KEYWORDS_GEN_H_ 195 196#include "src/parsing/token.h" 197 198namespace v8 { 199namespace internal { 200 201%s 202 203} // namespace internal 204} // namespace v8 205 206#endif // V8_PARSING_KEYWORDS_GEN_H_ 207""" % (out) 208 209 210def trim_character_set_warning(out): 211 # gperf generates an error message that is too large, trim it 212 213 return out.replace( 214 '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."', 215 '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.' 216 ) 217 218 219def main(): 220 try: 221 script_dir = os.path.dirname(sys.argv[0]) 222 root_dir = os.path.join(script_dir, '..') 223 224 out = subprocess.check_output(["gperf", "-m100", INPUT_PATH], cwd=root_dir) 225 226 # And now some munging of the generated file. 227 out = change_sizet_to_int(out) 228 out = drop_line_directives(out) 229 out = trim_and_dcheck_char_table(out) 230 out = use_isinrange(out) 231 out = pad_tables(out) 232 out = return_token(out) 233 out = memcmp_to_while(out) 234 out = wrap_namespace(out) 235 out = trim_character_set_warning(out) 236 237 # Final formatting. 238 clang_format_path = os.path.join(root_dir, 239 'third_party/depot_tools/clang-format') 240 out = call_with_input([clang_format_path], out) 241 242 with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as f: 243 f.write(out) 244 245 return 0 246 247 except subprocess.CalledProcessError as e: 248 sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd))) 249 return e.returncode 250 251 252if __name__ == '__main__': 253 sys.exit(main()) 254