• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9 
10 package com.ibm.icu.dev.demo.translit;
11 import java.util.HashMap;
12 import java.util.HashSet;
13 import java.util.Iterator;
14 import java.util.Map;
15 import java.util.Set;
16 import java.util.TreeSet;
17 
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.text.Transliterator;
20 import com.ibm.icu.text.UTF16;
21 import com.ibm.icu.text.UnicodeSet;
22 
23 /**
24  * Incrementally returns the set of all strings that case-fold to the same value.
25  */
26 public class CaseIterator {
27 
28     // testing stuff
29     static Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");
30     static Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");
31     static Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");
32 
33     // global tables (could be precompiled)
34     private static Map fromCaseFold = new HashMap();
35     private static Map toCaseFold = new HashMap();
36     private static int maxLength = 0;
37 
38     // This exception list is generated on the console by turning on the GENERATED flag,
39     // which MUST be false for normal operation.
40     // Once the list is generated, it is pasted in here.
41     // A bit of a cludge, but this bootstrapping is the easiest way
42     // to get around certain complications in the data.
43 
44     private static final boolean GENERATE = false;
45 
46     private static final boolean DUMP = false;
47 
48     private static String[][] exceptionList = {
49         // a\N{MODIFIER LETTER RIGHT HALF RING}
50         {"a\u02BE","A\u02BE","a\u02BE",},
51         // ff
52         {"ff","FF","Ff","fF","ff",},
53         // ffi
54         {"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},
55         // ffl
56         {"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},
57         // fi
58         {"fi","FI","Fi","fI","fi",},
59         // fl
60         {"fl","FL","Fl","fL","fl",},
61         // h\N{COMBINING MACRON BELOW}
62         {"h\u0331","H\u0331","h\u0331",},
63         // i\N{COMBINING DOT ABOVE}
64         {"i\u0307","I\u0307","i\u0307",},
65         // j\N{COMBINING CARON}
66         {"j\u030C","J\u030C","j\u030C",},
67         // ss
68         {"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},
69         // st
70         {"st","ST","St","sT","st","\u017FT","\u017Ft",},
71         // t\N{COMBINING DIAERESIS}
72         {"t\u0308","T\u0308","t\u0308",},
73         // w\N{COMBINING RING ABOVE}
74         {"w\u030A","W\u030A","w\u030A",},
75         // y\N{COMBINING RING ABOVE}
76         {"y\u030A","Y\u030A","y\u030A",},
77         // \N{MODIFIER LETTER APOSTROPHE}n
78         {"\u02BCn","\u02BCN","\u02BCn",},
79         // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
80         {"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},
81         // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
82         {"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},
83         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
84         {"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},
85         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
86         {"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",
87             "\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",
88             "\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},
89         // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
90         {"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},
91         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
92         {"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},
93         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
94         {"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",
95             "\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",
96             "\u1FC6\u03B9","\u1FC6\u1FBE",},
97         // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
98         {"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},
99         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
100         {"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},
101         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
102         {"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},
103         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
104         {"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},
105         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
106         {"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},
107         // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
108         {"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},
109         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
110         {"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},
111         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
112         {"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},
113         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
114         {"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},
115         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
116         {"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},
117         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
118         {"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},
119         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
120         {"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},
121         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
122         {"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},
123         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
124         {"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},
125         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
126         {"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},
127         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
128         {"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",
129             "\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},
130         // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
131         {"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},
132         // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
133         {"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},
134         // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
135         {"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},
136         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
137         {"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},
138         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
139         {"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},
140         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
141         {"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},
142         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
143         {"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},
144         // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
145         {"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},
146         // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
147         {"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},
148         // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
149         {"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},
150         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
151         {"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},
152         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
153         {"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},
154         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
155         {"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},
156         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
157         {"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},
158         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
159         {"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},
160         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
161         {"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},
162         // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
163         {"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},
164         // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
165         {"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},
166         // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
167         {"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},
168         // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
169         {"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},
170         // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
171         {"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},
172         // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
173         {"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},
174         // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
175         {"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},
176         // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
177         {"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},
178         // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
179         {"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},
180         // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
181         {"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},
182         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
183         {"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},
184         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
185         {"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},
186         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
187         {"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},
188         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
189         {"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},
190         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
191         {"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},
192         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
193         {"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},
194         // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
195         {"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},
196         // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
197         {"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},
198         // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
199         {"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},
200     };
201 
202     // this initializes the data used to generated the case-equivalents
203 
204     static {
205 
206         // Gather up the exceptions in a form we can use
207 
208         if (!GENERATE) {
209             for (int i = 0; i < exceptionList.length; ++i) {
210                 String[] exception = exceptionList[i];
211                 Set s = new HashSet();
212                 // there has to be some method to do the following, but I can't find it in the collections
213                 for (int j = 0; j < exception.length; ++j) {
214                     s.add(exception[j]);
215                 }
fromCaseFold.put(exception[0], s)216                 fromCaseFold.put(exception[0], s);
217             }
218         }
219 
220         // walk through all the characters, and at every case fold result,
221         // put a set of all the characters that map to that result
222 
223         boolean defaultmapping = true; // false for turkish
224         for (int i = 0; i <= 0x10FFFF; ++i) {
225             int cat = UCharacter.getType(i);
226             if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) continue;
227 
228             String cp = UTF16.valueOf(i);
229             String mapped = UCharacter.foldCase(cp, defaultmapping);
230             if (mapped.equals(cp)) continue;
231 
232             if (maxLength < mapped.length()) maxLength = mapped.length();
233 
234             // at this point, have different case folding
235 
236             Set s = (Set) fromCaseFold.get(mapped);
237             if (s == null) {
238                 s = new HashSet();
239                 s.add(mapped); // add the case fold result itself
fromCaseFold.put(mapped, s)240                 fromCaseFold.put(mapped, s);
241             }
242             s.add(cp);
toCaseFold.put(cp, mapped)243             toCaseFold.put(cp, mapped);
toCaseFold.put(mapped, mapped)244             toCaseFold.put(mapped, mapped); // add mapping to self
245         }
246 
247         // Emit the final data
248 
249         if (DUMP) {
250             System.out.println("maxLength = " + maxLength);
251 
252             System.out.println("\nfromCaseFold:");
253             Iterator it = fromCaseFold.keySet().iterator();
254             while (it.hasNext()) {
255                 Object key = it.next();
256                 System.out.print(" " + toHex2.transliterate((String)key) + ": ");
257                 Set s = (Set) fromCaseFold.get(key);
258                 Iterator it2 = s.iterator();
259                 boolean first = true;
260                 while (it2.hasNext()) {
261                     if (first) {
262                         first = false;
263                     } else {
264                         System.out.print(", ");
265                     }
266                     System.out.print(toHex2.transliterate((String)it2.next()));
267                 }
268                 System.out.println("");
269             }
270 
271             System.out.println("\ntoCaseFold:");
272             it = toCaseFold.keySet().iterator();
273             while (it.hasNext()) {
274                 String key = (String) it.next();
275                 String value = (String) toCaseFold.get(key);
276                 System.out.println(" " + toHex2.transliterate(key) + ": " + toHex2.transliterate(value));
277             }
278         }
279 
280         // Now convert all those sets into linear arrays
281         // We can't do this in place in Java, so make a temporary target array
282 
283         // Note: This could be transformed into a single array, with offsets into it.
284         // Might be best choice in C.
285 
286 
287         Map fromCaseFold2 = new HashMap();
288         Iterator it = fromCaseFold.keySet().iterator();
289         while (it.hasNext()) {
290             Object key = it.next();
291             Set s = (Set) fromCaseFold.get(key);
292             String[] temp = new String[s.size()];
293             s.toArray(temp);
fromCaseFold2.put(key, temp)294             fromCaseFold2.put(key, temp);
295         }
296         fromCaseFold = fromCaseFold2;
297 
298         // We have processed everything, so the iterator will now work
299         // The following is normally OFF.
300         // It is here to generate (under the GENERATE flag) the static exception list.
301         // It must be at the very end of initialization, so that the iterator is functional.
302         // (easiest to do it that way)
303 
304         if (GENERATE) {
305 
306             // first get small set of items that have multiple characters
307 
308             Set multichars = new TreeSet();
309             it = fromCaseFold.keySet().iterator();
310             while (it.hasNext()) {
311                 String key = (String) it.next();
312                 if (UTF16.countCodePoint(key) < 2) continue;
313                 multichars.add(key);
314             }
315 
316             // now we will go through each of them.
317 
318             CaseIterator ci = new CaseIterator();
319             it = multichars.iterator();
320 
321             while (it.hasNext()) {
322                 String key = (String) it.next();
323 
324                 // here is a nasty complication. Take 'ffi' ligature. We
325                 // can't just close it, since we would miss the combination
326                 // that includes the 'fi' => "fi" ligature
327                 // so first do a pass through, and add substring combinations
328                 // we call this a 'partial closure'
329 
330                 Set partialClosure = new TreeSet();
331                 partialClosure.add(key);
332 
333                 if (UTF16.countCodePoint(key) > 2) {
334                     Iterator multiIt2 = multichars.iterator();
335                     while (multiIt2.hasNext()) {
336                         String otherKey = (String) multiIt2.next();
337                         if (otherKey.length() >= key.length()) continue;
338                         int pos = -1;
339                         while (true) {
340                             // The following is not completely general
341                             // but works for the actual cased stuff,
342                             // and should work for future characters, since we won't have
343                             // more ligatures & other oddities.
344                             pos = key.indexOf(otherKey, pos+1);
345                             if (pos < 0) break;
346                             int endPos = pos + otherKey.length();
347                             // we know we have a proper substring,
348                             // so get the combinations
349                             String[] choices = (String[]) fromCaseFold.get(otherKey);
350                             for (int ii = 0; ii < choices.length; ++ii) {
351                                 String patchwork = key.substring(0, pos)
352                                     + choices[ii]
353                                     + key.substring(endPos);
354                                 partialClosure.add(patchwork);
355                             }
356                         }
357                     }
358                 }
359 
360                 // now, for each thing in the partial closure, get its
361                 // case closure and add it to the final result.
362 
363                 Set closure = new TreeSet(); // this will be the real closure
364                 Iterator partialIt = partialClosure.iterator();
365                 while (partialIt.hasNext()) {
366                     String key2 = (String) partialIt.next();
367                     ci.reset(key2);
368                     for (String temp = ci.next(); temp != null; temp = ci.next()) {
369                         closure.add(temp);
370                     }
371                     // form closure
372                     /*String[] choices = (String[]) fromCaseFold.get(key2);
373                     for (int i = 0; i < choices.length; ++i) {
374                         ci.reset(choices[i]);
375                         String temp;
376                         while (null != (temp = ci.next())) {
377                             closure.add(temp);
378                         }
379                     }
380                     */
381                 }
382 
383                 // print it out, so that it can be cut and pasted back into this document.
384 
385                 Iterator it2 = closure.iterator();
386                 System.out.println("\t// " + toName.transliterate(key));
387                 System.out.print("\t{\"" + toHex.transliterate(key) + "\",");
388                 while (it2.hasNext()) {
389                     String item = (String)it2.next();
390                     System.out.print("\"" + toHex.transliterate(item) + "\",");
391                 }
392                 System.out.println("},");
393             }
394         }
395     }
396 
397     // ============ PRIVATE CLASS DATA ============
398 
399     // pieces that we will put together
400     // is not changed during iteration
401     private int count = 0;
402     private String[][] variants;
403 
404     // state information, changes during iteration
405     private boolean done = false;
406     private int[] counts;
407 
408     // internal buffer for efficiency
409     private StringBuffer nextBuffer = new StringBuffer();
410 
411     // ========================
412 
413     /**
414      * Reset to different source. Once reset, the iteration starts from the beginning.
415      * @param source The string to get case variants for
416      */
reset(String source)417     public void reset(String source) {
418 
419         // allocate arrays to store pieces
420         // using length might be slightly too long, but we don't care much
421 
422         counts = new int[source.length()];
423         variants = new String[source.length()][];
424 
425         // walk through the source, and break up into pieces
426         // each piece becomes an array of equivalent values
427         // TODO: could optimized this later to coalesce all single string pieces
428 
429         String piece = null;
430         count = 0;
431         for (int i = 0; i < source.length(); i += piece.length()) {
432 
433             // find *longest* matching piece
434             String caseFold = null;
435 
436             if (GENERATE) {
437                 // do exactly one CP
438                 piece = UTF16.valueOf(source, i);
439                 caseFold = (String) toCaseFold.get(piece);
440             } else {
441                 int max = i + maxLength;
442                 if (max > source.length()) max = source.length();
443                 for (int j = max; j > i; --j) {
444                     piece = source.substring(i, j);
445                     caseFold = (String) toCaseFold.get(piece);
446                     if (caseFold != null) break;
447                 }
448             }
449 
450             // if we fail, pick one code point
451             if (caseFold == null) {
452                 piece = UTF16.valueOf(source, i);
453                 variants[count++] = new String[] {piece}; // single item string
454             } else {
455                 variants[count++] = (String[])fromCaseFold.get(caseFold);
456             }
457         }
458         reset();
459     }
460 
461     /**
462      * Restart the iteration from the beginning, but with same source
463      */
reset()464     public void reset() {
465         done = false;
466         for (int i = 0; i < count; ++i) {
467             counts[i] = 0;
468         }
469     }
470 
471     /**
472      * Iterates through the case variants.
473      * @return next case variant. Each variant will case-fold to the same value as the source will.
474      * When the iteration is done, null is returned.
475      */
next()476     public String next() {
477 
478         if (done) return null;
479         int i;
480 
481         // TODO Optimize so we keep the piece before and after the current position
482         // so we don't have so much concatenation
483 
484         // get the result, a concatenation
485 
486         nextBuffer.setLength(0);
487         for (i = 0; i < count; ++i) {
488             nextBuffer.append(variants[i][counts[i]]);
489         }
490 
491         // find the next right set of pieces to concatenate
492 
493         for (i = count-1; i >= 0; --i) {
494             counts[i]++;
495             if (counts[i] < variants[i].length) break;
496             counts[i] = 0;
497         }
498 
499         // if we go too far, bail
500 
501         if (i < 0) {
502             done = true;
503         }
504 
505         return nextBuffer.toString();
506     }
507 
508 
509     /**
510      * Temporary test, just to see how the stuff works.
511      */
main(String[] args)512     static public void main(String[] args) {
513         String[] testCases = {"fiss", "h\u03a3"};
514         CaseIterator ci = new CaseIterator();
515 
516         for (int i = 0; i < testCases.length; ++i) {
517             String item = testCases[i];
518             System.out.println();
519             System.out.println("Testing: " + toName.transliterate(item));
520             System.out.println();
521             ci.reset(item);
522             int count = 0;
523             for (String temp = ci.next(); temp != null; temp = ci.next()) {
524                 System.out.println(toName.transliterate(temp));
525                 count++;
526             }
527             System.out.println("Total: " + count);
528         }
529 
530         // generate a list of all caseless characters -- characters whose
531         // case closure is themselves.
532 
533         UnicodeSet caseless = new UnicodeSet();
534 
535         for (int i = 0; i <= 0x10FFFF; ++i) {
536             String cp = UTF16.valueOf(i);
537             ci.reset(cp);
538             int count = 0;
539             String fold = null;
540             for (String temp = ci.next(); temp != null; temp = ci.next()) {
541                 fold = temp;
542                 if (++count > 1) break;
543             }
544             if (count==1 && fold.equals(cp)) {
545                 caseless.add(i);
546             }
547         }
548 
549         System.out.println("caseless = " + caseless.toPattern(true));
550 
551         UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
552 
553         UnicodeSet a = new UnicodeSet();
554         a.set(not_lc);
555         a.removeAll(caseless);
556         System.out.println("[:^lc:] - caseless = " + a.toPattern(true));
557 
558         a.set(caseless);
559         a.removeAll(not_lc);
560         System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
561     }
562 }
563