#!/usr/bin/env python3
# Copyright (C) 1998, 1999 Tom Tromey
# Copyright (C) 2001 Red Hat Software
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html

Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

The generated test cases are written to standard output.

I consider the output of this program to be unrestricted.
Use it as you will.
"""

import sys
import argparse


# Disable line length warnings as wrapping the test templates would be hard
# flake8: noqa: E501


def main(argv):
    """Read the Unicode data files named in *argv* and print the test cases.

    Args:
        argv: Argument vector with the program name at index 0, followed by
            UNICODE-VERSION, the path to UnicodeData.txt and the path to
            SpecialCasing.txt.

    Raises:
        SystemExit: If a non-blank data line in either file does not have
            the expected number of ``;``-separated fields.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data"
    )
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The argument names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    # Names of fields in Unicode data table.
    (
        CODE,
        NAME,
        CATEGORY,
        COMBINING_CLASSES,
        BIDI_CATEGORY,
        DECOMPOSITION,
        DECIMAL_VALUE,
        DIGIT_VALUE,
        NUMERIC_VALUE,
        MIRRORED,
        OLD_NAME,
        COMMENT,
        UPPER,
        LOWER,
        TITLE,
    ) = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Code point (int) -> mapped string, one table per case form.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        """
        return "".join([chr(int(c, 16)) for c in codes.split()])

    def process_one(code, fields):
        """Record the simple case mappings of *code* from a UnicodeData row.

        Only cased letters (categories Ll, Lu and Lt) are recorded; rows of
        any other category are ignored.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            # NOTE(review): the title mapping is taken from the LOWER field,
            # not TITLE.  Kept as-is so the generated data does not change;
            # confirm whether this is intentional.
            title[code] = make_hex(fields[LOWER])

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing one) instead of
                # failing with a confusing field-count error.
                continue

            fields = [f.strip() for f in line.split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CODE], len(fields))
                )

            code = int(fields[CODE], 16)

            if code > last_code + 1 and fields[NAME].endswith("Last>"):
                # The gap is a range specified in the char database: every
                # code point in it shares the properties of this row.
                # Any other gap consists of undefined characters, which have
                # no case mappings, so there is nothing to record for them.
                for gap_code in range(last_code + 1, code):
                    process_one(gap_code, fields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CASE_CODE], len(fields))
                )

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # Unconditional special casings override the simple mappings.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)


def print_tests(version, upper, title, lower):
    """Print the hand-crafted tests followed by the generated ones.

    Each data row has the tab-separated form
    ``locale<TAB>input<TAB>lower<TAB>title<TAB>upper<TAB>[# comment]``.

    Args:
        version: Unicode version string, inserted into the header comment.
        upper: Mapping of code point (int) to its uppercase string.
        title: Mapping of code point (int) to its titlecase string.
        lower: Mapping of code point (int) to its lowercase string.
    """
    # Column separators are always written as explicit "\t" escapes so that
    # they cannot be lost to editors or whitespace normalisation.
    print(
        """\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ \tμάιος \tΜάιος \tΜΆΙΟΣ \t
\tΜΆΙΟΣ\tμάιος\tΜάιος\tΜΆΙΟΣ\t
\tΣΙΓΜΑ\tσιγμα\tΣιγμα\tΣΙΓΜΑ\t
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(
            version
        )
    )

    # Visit only code points that have an entry in at least one table, in
    # ascending order, rather than probing every code point.
    for i in sorted(set(upper) | set(lower) | set(title)):
        if not 0 <= i <= 0x10FFFF:
            # Not a valid code point; chr() could not represent it.
            continue

        if i == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")

        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))


if __name__ == "__main__":
    sys.exit(main(sys.argv))