• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/perl -w
2
3#    Copyright (C) 1998, 1999 Tom Tromey
4#    Copyright (C) 2001 Red Hat Software
5
6#    This program is free software; you can redistribute it and/or modify
7#    it under the terms of the GNU General Public License as published by
8#    the Free Software Foundation; either version 2, or (at your option)
9#    any later version.
10
11#    This program is distributed in the hope that it will be useful,
12#    but WITHOUT ANY WARRANTY; without even the implied warranty of
13#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#    GNU General Public License for more details.
15
16#    You should have received a copy of the GNU General Public License
17#    along with this program; if not, write to the Free Software
18#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19#    02111-1307, USA.
20
21# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
22# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
23# I consider the output of this program to be unrestricted.  Use it as
24# you will.
25
26require 5.006;
27use utf8;
28
29if (@ARGV != 3) {
30    $0 =~ s@.*/@@;
31    die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
32}
33
34use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
35
36# Names of fields in Unicode data table.
37$CODE = 0;
38$NAME = 1;
39$CATEGORY = 2;
40$COMBINING_CLASSES = 3;
41$BIDI_CATEGORY = 4;
42$DECOMPOSITION = 5;
43$DECIMAL_VALUE = 6;
44$DIGIT_VALUE = 7;
45$NUMERIC_VALUE = 8;
46$MIRRORED = 9;
47$OLD_NAME = 10;
48$COMMENT = 11;
49$UPPER = 12;
50$LOWER = 13;
51$TITLE = 14;
52
53# Names of fields in the SpecialCasing table
54$CASE_CODE = 0;
55$CASE_LOWER = 1;
56$CASE_TITLE = 2;
57$CASE_UPPER = 3;
58$CASE_CONDITION = 4;
59
60my @upper;
61my @title;
62my @lower;
63
64binmode STDOUT, ":utf8";
65open (INPUT, "< $ARGV[1]") || exit 1;
66
67$last_code = -1;
68while (<INPUT>)
69{
70    chop;
71    @fields = split (';', $_, 30);
72    if ($#fields != 14)
73    {
74	printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
75    }
76
77    $code = hex ($fields[$CODE]);
78
79    if ($code > $last_code + 1)
80    {
81	# Found a gap.
82	if ($fields[$NAME] =~ /Last>/)
83	{
84	    # Fill the gap with the last character read,
85            # since this was a range specified in the char database
86	    @gfields = @fields;
87	}
88	else
89	{
90	    # The gap represents undefined characters.  Only the type
91	    # matters.
92	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
93			'', '', '', '');
94	}
95	for (++$last_code; $last_code < $code; ++$last_code)
96	{
97	    $gfields{$CODE} = sprintf ("%04x", $last_code);
98	    &process_one ($last_code, @gfields);
99	}
100    }
101    &process_one ($code, @fields);
102    $last_code = $code;
103}
104
105close INPUT;
106
107open (INPUT, "< $ARGV[2]") || exit 1;
108
109while (<INPUT>)
110{
111    my $code;
112
113    chop;
114
115    next if /^#/;
116    next if /^\s*$/;
117
118    s/\s*#.*//;
119
120    @fields = split ('\s*;\s*', $_, 30);
121
122    $raw_code = $fields[$CASE_CODE];
123    $code = hex ($raw_code);
124
125    if ($#fields != 4 && $#fields != 5)
126    {
127	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
128	next;
129    }
130
131    if (defined $fields[5]) {
132	# Ignore conditional special cases - we'll handle them manually
133	next;
134    }
135
136    $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
137    $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
138    $title[$code] = &make_hex ($fields[$CASE_TITLE]);
139}
140
141close INPUT;
142
143print <<EOT;
144# Test cases generated from Unicode $ARGV[0] data
145# by gen-case-tests.pl. Do not edit.
146#
147# Some special hand crafted tests
148#
149tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
150tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
151tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
152tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
153tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
154tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
155# Test reordering of YPOGEGRAMMENI across other accents
156\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
157\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
158# Handling of final and nonfinal sigma
159	ΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ
160	ΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ
161	ΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ
162# Lithuanian rule of i followed by letter with dot. Not at all sure
163# about the titlecase part here
164lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
165lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
166lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
167lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
168lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
169lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
170lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
171lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
172lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
173lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
174lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
175lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
176lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
177lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
178lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
179lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
180lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
181lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
182lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
183lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
184lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
185lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
186# Special case not at initial position
187\ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
188#
189# Now the automatic tests
190#
191EOT
192&print_tests;
193
194exit 0;
195
196# Process a single character.
197sub process_one
198{
199    my ($code, @fields) = @_;
200
201    my $type =  $fields[$CATEGORY];
202    if ($type eq 'Ll')
203    {
204	$upper[$code] = make_hex ($fields[$UPPER]);
205	$lower[$code] = pack ("U", $code);
206	$title[$code] = make_hex ($fields[$TITLE]);
207    }
208    elsif ($type eq 'Lu')
209    {
210	$lower[$code] = make_hex ($fields[$LOWER]);
211	$upper[$code] = pack ("U", $code);
212	$title[$code] = make_hex ($fields[$TITLE]);
213    }
214
215    if ($type eq 'Lt')
216    {
217	$upper[$code] = make_hex ($fields[$UPPER]);
218	$lower[$code] = pack ("U", hex ($fields[$LOWER]));
219	$title[$code] = make_hex ($fields[$LOWER]);
220    }
221}
222
223sub print_tests
224{
225    for ($i = 0; $i < 0x10ffff; $i++) {
226	if ($i == 0x3A3) {
227	    # Greek sigma needs special tests
228	    next;
229	}
230
231	my $lower = $lower[$i];
232	my $title = $title[$i];
233	my $upper = $upper[$i];
234
235	if (defined $upper || defined $lower || defined $title) {
236	    printf "\t%s\t%s\t%s\t%s\t# %4X\n",
237		    pack ("U", $i),
238		    (defined $lower ? $lower : ""),
239		    (defined $title ? $title : ""),
240		    (defined $upper ? $upper : ""),
241                    $i;
242	}
243    }
244}
245
246sub make_hex
247{
248    my $codes = shift;
249
250    $codes =~ s/^\s+//;
251    $codes =~ s/\s+$//;
252
253    if ($codes eq "0" || $codes eq "") {
254	return "";
255    } else {
256	return pack ("U*", map { hex ($_) } split /\s+/, $codes);
257    }
258}
259