#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates unicode property tests and their expected output.
# It is recommended to re-run this generator after the unicode files are
# updated. The names of the generated files are `testinput26` and `testoutput26`

import re
import sys

from GenerateCommon import \
    script_names, \
    script_abbrevs


def write_both(text):
    """Write *text* to both the generated input file and the output file."""
    input_file.write(text)
    output_file.write(text)


def to_string_char(ch_idx):
    r"""Return the text used in the test files for code point *ch_idx*.

    Code points 32..127 are written as the literal character.  Everything
    else is written as a \x{...} hexadecimal escape; values below 16 get a
    leading zero so that the escape always has at least two hex digits.
    """
    if ch_idx < 128:
        if ch_idx < 16:
            return "\\x{0%x}" % ch_idx
        if ch_idx >= 32:
            return chr(ch_idx)
    return "\\x{%x}" % ch_idx


def _write_test(prop, ch_idx, matched=True):
    r"""Emit one test: the pattern /^\p{prop}/utf applied to one character.

    The pattern, the subject line and the trailing blank line go to both
    files; the expected result (the matched text, or "No match" when
    *matched* is false) goes to the output file only.
    """
    subject = to_string_char(ch_idx)
    write_both("/^\\p{%s}/utf\n" % prop)
    write_both(" %s\n" % subject)
    if matched:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


output_directory = ""

if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    if not output_directory.endswith("/"):
        output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")


def gen_script_tests():
    """Generate the script / script-extension tests.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    (paths relative to the current directory) and writes a group of tests
    for every script in script_names except "Unknown".

    For each script a five-slot record is collected:
      [0] low end of the first range listed for the script in Scripts.txt
      [1] high end of the last range listed for the script in Scripts.txt
      [2] lowest code point of any ScriptExtensions.txt range naming it
      [3] highest code point of any ScriptExtensions.txt range naming it
      [4] first code point found in such a range whose Scripts.txt script
          is a different one (i.e. it belongs via the extension only)
    Slots 2..4 stay None when no matching data is found.
    """
    script_data = [None] * len(script_names)
    # Script name of every code point, as given by Scripts.txt.
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            # Consecutive lines usually share a script, so the list lookup
            # is only repeated when the name changes.
            name = match_obj.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for ch in range(low, high + 1):
                char_data[ch] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Cache: script abbreviation -> index in script_abbrevs / script_names.
    extended_script_indicies = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            # The third field is a space separated list of abbreviations.
            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indicies:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indicies[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    idx = extended_script_indicies[abbrev]
                    rec = script_data[idx]
                    rec[2] = min(rec[2], low)
                    rec[3] = max(rec[3], high)

                if rec[4] is None:
                    # Look for a character that is in the script only
                    # through its extension, not through its base script.
                    name = script_names[idx]
                    for ch in range(low, high + 1):
                        if char_data[ch] != name:
                            rec[4] = ch
                            break

    # Alternates between the short and long extension property names so
    # that both spellings are exercised across the generated tests.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        _write_test("sc=%s" % script_name, rec[0])
        _write_test("Script=%s" % script_abbrev, rec[1])

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            _write_test(script_name, rec[2])
            _write_test("%s=%s" % (property_name, script_abbrev), rec[3])

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                _write_test(script_name, rec[4])
                # The same character must not match the base script.
                _write_test("sc=%s" % script_name, rec[4], matched=False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just above the highest one recorded for the script
        # is used as the non-member. NOTE(review): this assumes that code
        # point is not part of the script - confirm when tables are updated.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        _write_test(script_name, high + 1, matched=False)


gen_script_tests()

write_both("# End of testinput26\n")

# Close explicitly so the generated files are flushed to disk without
# relying on interpreter shutdown.
input_file.close()
output_file.close()