• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# Copyright (C) 1998, 1999 Tom Tromey
3# Copyright (C) 2001 Red Hat Software
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2, or (at your option)
8# any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, see <http://www.gnu.org/licenses/>.
17
18"""
19gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
20See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
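21Usage: gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt
22The generated test cases are written to standard output.
23I consider the output of this program to be unrestricted. Use it as you will.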
24"""
25
26import sys
27import argparse
28
29
30# Disable line length warnings as wrapping the test templates would be hard
31# flake8: noqa: E501
32
33
def main(argv):
    """Read the Unicode data files and print case-mapping test cases.

    argv is the complete argument vector (program name first).  The three
    positional arguments that follow are the Unicode version string, the
    path of UnicodeData.txt and the path of SpecialCasing.txt.

    Upper/title/lower mappings are collected per code point from
    UnicodeData.txt, then overridden by the unconditional entries of
    SpecialCasing.txt, and finally handed to print_tests().

    Raises SystemExit when a record has an unexpected number of fields.
    """
    arg_parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data"
    )
    for positional in ("UNICODE-VERSION", "UnicodeData.txt", "SpecialCasing.txt"):
        arg_parser.add_argument(positional)
    options = vars(arg_parser.parse_args(argv[1:]))
    version = options["UNICODE-VERSION"]
    filename_udata = options["UnicodeData.txt"]
    filename_casing = options["SpecialCasing.txt"]

    # Column indices used from the 15-column UnicodeData.txt records.
    CODE, NAME, CATEGORY = 0, 1, 2
    UPPER, LOWER, TITLE = 12, 13, 14
    UDATA_FIELD_COUNT = 15

    # Column indices used from the SpecialCasing.txt records.
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER = 0, 1, 2, 3

    upper, title, lower = {}, {}, {}

    def make_hex(codes):
        """Turn whitespace-separated hexadecimal code points into a string.

        Surplus white space is ignored; an empty field gives "".
        """
        return "".join(chr(int(token, 16)) for token in codes.split())

    def process_one(code, fields):
        """Record the case mappings of one code point, if it is a cased letter.

        Only the general categories Ll, Lu and Lt produce entries.
        """
        category = fields[CATEGORY]
        if category == "Ll":
            up, lo = make_hex(fields[UPPER]), chr(code)
            ti = make_hex(fields[TITLE])
        elif category == "Lu":
            lo, up = make_hex(fields[LOWER]), chr(code)
            ti = make_hex(fields[TITLE])
        elif category == "Lt":
            up, lo = make_hex(fields[UPPER]), make_hex(fields[LOWER])
            # NOTE(review): the title entry is filled from the LOWER column,
            # not from TITLE.  Kept as-is because changing it would alter the
            # generated data - confirm whether this is intended.
            ti = lo
        else:
            return
        upper[code], lower[code], title[code] = up, lo, ti

    with open(filename_udata, encoding="utf-8") as udata:
        previous = -1
        for raw in udata:
            fields = [part.strip() for part in raw.strip().split(";")]
            if len(fields) != UDATA_FIELD_COUNT:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CODE], len(fields))
                )

            code = int(fields[CODE], 16)

            if code > previous + 1:
                # Code points were skipped since the previous record.
                if fields[NAME].endswith("Last>"):
                    # End of a range given in the database: every skipped
                    # code point is treated like this last record.
                    filler = fields
                else:
                    # Undefined characters; only the category matters.
                    filler = ["", "", "Cn", "0"] + [""] * 11

                for missing in range(previous + 1, code):
                    filler[CODE] = "%04x" % missing
                    process_one(missing, filler)

            process_one(code, fields)
            previous = code

    with open(filename_casing, encoding="utf-8") as casing:
        for raw in casing:
            # Drop the trailing comment; nothing left means nothing to do.
            content = raw.split("#", 1)[0].strip()
            if not content:
                continue

            # Every record ends with ";", so remove that terminator first.
            content = content.rstrip(";").rstrip()
            fields = [part.strip() for part in content.split(";")]
            field_count = len(fields)

            if field_count == 5:
                # Conditional special cases are handled manually elsewhere.
                continue
            if field_count != 4:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CASE_CODE], field_count)
                )

            code = int(fields[CASE_CODE], 16)

            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
170
171
def print_tests(version, upper, title, lower):
    """Print the case-mapping test file to standard output.

    The output starts with a fixed header of hand-crafted tests, in which
    the Unicode version is interpolated, followed by one generated line
    per code point that has at least one non-empty mapping.

    version -- Unicode version string placed in the header comment.
    upper, title, lower -- dicts mapping an integer code point to its
        upper-, title- and lowercase string; a missing entry counts as "".

    Each generated line has the tab-separated columns: empty locale, the
    character itself, lowercase, titlecase, uppercase, and a comment with
    the code point in hexadecimal.  U+03A3 (Greek capital sigma) is left
    out because the hand-crafted section covers it.
    """
    # Every tab in the header is written as a \t escape so that editors
    # cannot silently turn a significant tab into spaces.
    print(
        """\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ \tμάιος \tΜάιος \tΜΆΙΟΣ \t
\tΜΆΙΟΣ\tμάιος\tΜάιος\tΜΆΙΟΣ\t
\tΣΙΓΜΑ\tσιγμα\tΣιγμα\tΣΙΓΜΑ\t
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(
            version
        )
    )

    # Visit only code points that have an entry somewhere, in ascending
    # order.  The bound is inclusive: U+10FFFF is the last valid code point.
    max_code_point = 0x10FFFF
    candidates = set(upper) | set(title) | set(lower)
    for i in sorted(cp for cp in candidates if 0 <= cp <= max_code_point):
        if i == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")

        # Entries whose three mappings are all empty produce no line.
        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
237
238
# Script entry point: run main() with the process arguments and use its
# return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
241