1#!/usr/bin/env python3 2# Copyright (C) 1998, 1999 Tom Tromey 3# Copyright (C) 2001 Red Hat Software 4# 5# This program is free software; you can redistribute it and/or modify 6# it under the terms of the GNU General Public License as published by 7# the Free Software Foundation; either version 2, or (at your option) 8# any later version. 9# 10# This program is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14# 15# You should have received a copy of the GNU General Public License 16# along with this program; if not, see <http://www.gnu.org/licenses/>. 17 18""" 19gen-casemap-txt.py - Generate test cases for case mapping from Unicode data. 20See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html 21Usage: 22 I consider the output of this program to be unrestricted. 23 Use it as you will. 24""" 25 26import sys 27import argparse 28 29 30def main(argv): 31 parser = argparse.ArgumentParser( 32 description="Generate test cases for case mapping from Unicode data") 33 parser.add_argument("UNICODE-VERSION") 34 parser.add_argument("UnicodeData.txt") 35 parser.add_argument("SpecialCasing.txt") 36 args = parser.parse_args(argv[1:]) 37 version = getattr(args, "UNICODE-VERSION") 38 filename_udata = getattr(args, "UnicodeData.txt") 39 filename_casing = getattr(args, "SpecialCasing.txt") 40 41 # Names of fields in Unicode data table. 42 CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \ 43 DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \ 44 COMMENT, UPPER, LOWER, TITLE = range(15) 45 46 # Names of fields in the SpecialCasing table 47 CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5) 48 49 upper = {} 50 title = {} 51 lower = {} 52 53 def make_hex(codes): 54 """Converts a string of white space separated code points encoded as 55 hex values to a Unicode string. Any extra white space is ignored. 56 """ 57 return "".join([chr(int(c, 16)) for c in codes.split()]) 58 59 def process_one(code, fields): 60 type_ = fields[CATEGORY] 61 if type_ == "Ll": 62 upper[code] = make_hex(fields[UPPER]) 63 lower[code] = chr(code) 64 title[code] = make_hex(fields[TITLE]) 65 elif type_ == "Lu": 66 lower[code] = make_hex(fields[LOWER]) 67 upper[code] = chr(code) 68 title[code] = make_hex(fields[TITLE]) 69 elif type_ == "Lt": 70 upper[code] = make_hex(fields[UPPER]) 71 lower[code] = make_hex(fields[LOWER]) 72 title[code] = make_hex(fields[LOWER]) 73 74 with open(filename_udata, encoding="utf-8") as fileobj: 75 last_code = -1 76 for line in fileobj: 77 line = line.strip() 78 fields = [f.strip() for f in line.split(";")] 79 if len(fields) != 15: 80 raise SystemExit( 81 "Entry for %s has wrong number of fields (%d)" % ( 82 fields[CODE], len(fields))) 83 84 code = int(fields[CODE], 16) 85 86 if code > last_code + 1: 87 # Found a gap 88 if fields[NAME].endswith("Last>"): 89 # Fill the gap with the last character read, 90 # since this was a range specified in the char database 91 gfields = fields 92 else: 93 # The gap represents undefined characters. Only the type 94 # matters. 95 gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '', 96 '', '', '', ''] 97 98 last_code += 1 99 while last_code < code: 100 gfields[CODE] = "%04x" % last_code 101 process_one(last_code, gfields) 102 last_code += 1 103 104 process_one(code, fields) 105 last_code = code 106 107 with open(filename_casing, encoding="utf-8") as fileobj: 108 last_code = -1 109 for line in fileobj: 110 # strip comments and skip empty lines 111 line = line.split("#", 1)[0].strip() 112 if not line: 113 continue 114 115 # all lines end with ";" so just remove it 116 line = line.rstrip(";").rstrip() 117 fields = [f.strip() for f in line.split(";")] 118 if len(fields) not in (4, 5): 119 raise SystemExit( 120 "Entry for %s has wrong number of fields (%d)" % ( 121 fields[CASE_CODE], len(fields))) 122 123 if len(fields) == 5: 124 # Ignore conditional special cases - we'll handle them manually 125 continue 126 127 code = int(fields[CASE_CODE], 16) 128 129 upper[code] = make_hex(fields[CASE_UPPER]) 130 lower[code] = make_hex(fields[CASE_LOWER]) 131 title[code] = make_hex(fields[CASE_TITLE]) 132 133 print_tests(version, upper, title, lower) 134 135 136def print_tests(version, upper, title, lower): 137 print("""\ 138# Test cases generated from Unicode {} data 139# by gen-casemap-txt.py. Do not edit. 140# 141# Some special hand crafted tests 142# 143tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE 144tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I 145tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I 146tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE 147tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I 148tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I 149# Test reordering of YPOGEGRAMMENI across other accents 150\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t 151\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t 152# Handling of final and nonfinal sigma 153\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ 154\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ 155\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ 156# Lithuanian rule of i followed by letter with dot. Not at all sure 157# about the titlecase part here 158lt_LT\ti\u0117\ti\u0117\tIe\tIE\t 159lt_LT\tie\u0307\tie\u0307\tIe\tIE\t 160lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE 161lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE 162lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE 163lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) 164lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) 165lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) 166lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) 167lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) 168lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) 169lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t 170lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t 171lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE 172lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE 173lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE 174lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent) 175lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent) 176lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above) 177lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) 178lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) 179lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) 180# Special case not at initial position 181\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04 182# 183# Now the automatic tests 184#""".format(version)) 185 186 for i in range(0x10ffff): 187 if i == 0x3A3: 188 # Greek sigma needs special tests 189 continue 190 191 up = upper.get(i, "") 192 lo = lower.get(i, "") 193 ti = title.get(i, "") 194 195 if any([up, lo, ti]): 196 print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i)) 197 198 199if __name__ == "__main__": 200 sys.exit(main(sys.argv)) 201