• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# Copyright (C) 1998, 1999 Tom Tromey
3# Copyright (C) 2001 Red Hat Software
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2, or (at your option)
8# any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, see <http://www.gnu.org/licenses/>.
17
18"""
19gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
20See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
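Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

    The generated test cases are written to standard output.

License of the output:
    I consider the output of this program to be unrestricted.
    Use it as you will.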
24"""
25
26import sys
27import argparse
28
29
def main(argv):
    """Build case-mapping tables from Unicode data files and print the tests.

    Args:
        argv: Full argument vector, program name first.  The three remaining
            items are the Unicode version string, the path to
            UnicodeData.txt and the path to SpecialCasing.txt.

    Raises:
        SystemExit: If a line in either data file does not have the expected
            number of ";"-separated fields (argparse also exits on a
            malformed command line).
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data")
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The positional names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    # Names of fields in Unicode data table.
    CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
        DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
        COMMENT, UPPER, LOWER, TITLE = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Code point (int) -> mapped string, one table per case form.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        An empty field yields an empty string.
        """
        return "".join(chr(int(c, 16)) for c in codes.split())

    def process_one(code, fields):
        """Record the simple case mappings of one UnicodeData.txt entry.

        Only cased letters (categories Ll, Lu and Lt) are recorded; every
        other category is ignored.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            # A titlecase letter is already its own titlecase form, mirroring
            # how Ll and Lu entries map to themselves above.  (Storing the
            # lowercase mapping here would be wrong.)
            title[code] = chr(code)

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            fields = [f.strip() for f in line.strip().split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CODE], len(fields)))

            code = int(fields[CODE], 16)

            if code > last_code + 1:
                # Found a gap
                if fields[NAME].endswith("Last>"):
                    # Fill the gap with the last character read,
                    # since this was a range specified in the char database.
                    # Work on a copy so that rewriting the CODE field below
                    # does not clobber the record that was just parsed.
                    gap_fields = list(fields)
                else:
                    # The gap represents undefined characters.  Only the type
                    # matters.
                    gap_fields = ['', '', 'Cn', '0', '', '', '', '', '', '',
                                  '', '', '', '', '']

                for gap_code in range(last_code + 1, code):
                    gap_fields[CODE] = "%04x" % gap_code
                    process_one(gap_code, gap_fields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CASE_CODE], len(fields)))

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # Unconditional special casings override the simple mappings
            # collected from UnicodeData.txt.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
134
135
def print_tests(version, upper, title, lower):
    """Print the case-mapping test file to standard output.

    A fixed header of hand-written test cases is printed first, with
    ``version`` substituted into its first line.  It is followed by one
    generated line for every code point that has at least one non-empty
    mapping.  Each generated line holds, tab separated: an empty first
    column, the character, its lowercase, titlecase and uppercase forms,
    and a comment with the code point in hex.

    Args:
        version: Unicode version string shown in the header comment.
        upper: Dict mapping code point (int) to its uppercase string.
        title: Dict mapping code point (int) to its titlecase string.
        lower: Dict mapping code point (int) to its lowercase string.
    """
    print("""\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ \tμάιος \tΜάιος \tΜΆΙΟΣ
\tΜΆΙΟΣ\tμάιος\tΜάιος\tΜΆΙΟΣ
\tΣΙΓΜΑ\tσιγμα\tΣιγμα\tΣΙΓΜΑ
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(version))

    # Only code points present in one of the tables can produce a line, so
    # walk those in ascending order instead of the entire codespace.  Unlike
    # range(0x10ffff), this does not leave out the last code point, U+10FFFF.
    for i in sorted(set(upper) | set(lower) | set(title)):
        if i == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")

        # Entries whose mappings are all empty carry no information.
        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
197
198
if __name__ == "__main__":
    # Script entry point: run main() on the process arguments and hand its
    # return value to sys.exit() as the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
201