• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  * Copyright (C) 2008-2015, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.dev.test.collator;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.Collection;
11 import java.util.Iterator;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.Set;
16 import java.util.TreeSet;
17 
18 import com.ibm.icu.dev.test.TestFmwk;
19 import com.ibm.icu.dev.util.CollectionUtilities;
20 import com.ibm.icu.impl.ICUDebug;
21 import com.ibm.icu.impl.Row;
22 import com.ibm.icu.impl.Row.R4;
23 import com.ibm.icu.lang.UCharacter;
24 import com.ibm.icu.lang.UProperty;
25 import com.ibm.icu.lang.UScript;
26 import com.ibm.icu.text.AlphabeticIndex;
27 import com.ibm.icu.text.AlphabeticIndex.Bucket;
28 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
29 import com.ibm.icu.text.AlphabeticIndex.ImmutableIndex;
30 import com.ibm.icu.text.AlphabeticIndex.Record;
31 import com.ibm.icu.text.Collator;
32 import com.ibm.icu.text.Normalizer2;
33 import com.ibm.icu.text.RawCollationKey;
34 import com.ibm.icu.text.RuleBasedCollator;
35 import com.ibm.icu.text.UTF16;
36 import com.ibm.icu.text.UnicodeSet;
37 import com.ibm.icu.util.ULocale;
38 
/**
 * Tests for {@link AlphabeticIndex}.
 *
 * @author Mark Davis
 */
42 public class AlphabeticIndexTest extends TestFmwk {
43     /**
44      *
45      */
46     private static final String ARROW = "\u2192";
47     private static final boolean DEBUG = ICUDebug.enabled("alphabeticindex");
48 
49     public static Set<String> KEY_LOCALES = new LinkedHashSet(Arrays.asList(
50             "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl",
51             "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da",
52             "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr",
53             "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk",
54             "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta",
55             "te", "mr", "ur", "ml", "kn", "gu", "or"));
56     private String[][] localeAndIndexCharactersLists = new String[][] {
57             /* Arabic*/ {"ar", "\u0627:\u0628:\u062A:\u062B:\u062C:\u062D:\u062E:\u062F:\u0630:\u0631:\u0632:\u0633:\u0634:\u0635:\u0636:\u0637:\u0638:\u0639:\u063A:\u0641:\u0642:\u0643:\u0644:\u0645:\u0646:\u0647:\u0648:\u064A"},
58             /* Bulgarian*/  {"bg", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"},
59             /* Catalan*/    {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
60             /* Czech*/  {"cs", "A:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\u0158:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
61             /* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
62             /* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
63             /* Greek*/  {"el", "\u0391:\u0392:\u0393:\u0394:\u0395:\u0396:\u0397:\u0398:\u0399:\u039A:\u039B:\u039C:\u039D:\u039E:\u039F:\u03A0:\u03A1:\u03A3:\u03A4:\u03A5:\u03A6:\u03A7:\u03A8:\u03A9"},
64             /* English*/    {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
65             /* Spanish*/    {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
66             /* Estonian*/   {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:Z:\u017D:T:U:V:\u00D5:\u00C4:\u00D6:\u00DC:X:Y"},
67             /* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
68             /* Finnish*/    {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"},
69             /* Filipino*/   {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
70             /* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
71             /* Hebrew*/ {"he", "\u05D0:\u05D1:\u05D2:\u05D3:\u05D4:\u05D5:\u05D6:\u05D7:\u05D8:\u05D9:\u05DB:\u05DC:\u05DE:\u05E0:\u05E1:\u05E2:\u05E4:\u05E6:\u05E7:\u05E8:\u05E9:\u05EA"},
72             /* Icelandic*/  {"is", "A:\u00C1:B:C:D:\u00D0:E:\u00C9:F:G:H:I:\u00CD:J:K:L:M:N:O:\u00D3:P:Q:R:S:T:U:\u00DA:V:W:X:Y:\u00DD:Z:\u00DE:\u00C6:\u00D6"},
73             /* Italian*/    {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
74             /* Japanese*/   {"ja", "\u3042:\u304B:\u3055:\u305F:\u306A:\u306F:\u307E:\u3084:\u3089:\u308F"},
75             /* Korean*/ {"ko", "\u3131:\u3134:\u3137:\u3139:\u3141:\u3142:\u3145:\u3147:\u3148:\u314A:\u314B:\u314C:\u314D:\u314E"},
76             /* Lithuanian*/ {"lt", "A:B:C:\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\u0160:T:U:V:Z:\u017D"},
77             /* Latvian*/    {"lv", "A:B:C:\u010C:D:E:F:G:\u0122:H:I:J:K:\u0136:L:\u013B:M:N:\u0145:O:P:Q:R:S:\u0160:T:U:V:W:X:Z:\u017D"},
78             /* Norwegian Bokm\u00E5l*/  {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
79             /* Dutch*/  {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
80             /* Polish*/ {"pl", "A:\u0104:B:C:\u0106:D:E:\u0118:F:G:H:I:J:K:L:\u0141:M:N:\u0143:O:\u00D3:P:Q:R:S:\u015A:T:U:V:W:X:Y:Z:\u0179:\u017B"},
81             /* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
82             /* Romanian*/   {"ro", "A:\u0102:\u00C2:B:C:D:E:F:G:H:I:\u00CE:J:K:L:M:N:O:P:Q:R:S:\u0218:T:\u021A:U:V:W:X:Y:Z"},
83             /* Russian*/    {"ru", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042B:\u042D:\u042E:\u042F"},
84             /* Slovak*/ {"sk", "A:\u00C4:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\u00D4:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
85             /* Slovenian*/  {"sl", "A:B:C:\u010C:\u0106:D:\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
86             /* Serbian*/    {"sr", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0402:\u0415:\u0416:\u0417:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040B:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
87             /* Swedish*/    {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"},
88             /* Turkish*/    {"tr", "A:B:C:\u00C7:D:E:F:G:H:I:\u0130:J:K:L:M:N:O:\u00D6:P:Q:R:S:\u015E:T:U:\u00DC:V:W:X:Y:Z"},
89             /* Ukrainian*/  {"uk", "\u0410:\u0411:\u0412:\u0413:\u0490:\u0414:\u0415:\u0404:\u0416:\u0417:\u0418:\u0406:\u0407:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"},
90             /* Vietnamese*/ {"vi", "A:\u0102:\u00C2:B:C:D:\u0110:E:\u00CA:F:G:H:I:J:K:L:M:N:O:\u00D4:\u01A0:P:Q:R:S:T:U:\u01AF:V:W:X:Y:Z"},
91             /* Chinese*/    {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
92             /* Chinese (Traditional Han)*/  {"zh_Hant", "1\u5283:2\u5283:3\u5283:4\u5283:5\u5283:6\u5283:7\u5283:8\u5283:9\u5283:10\u5283:11\u5283:12\u5283:13\u5283:14\u5283:15\u5283:16\u5283:17\u5283:18\u5283:19\u5283:20\u5283:21\u5283:22\u5283:23\u5283:24\u5283:25\u5283:26\u5283:27\u5283:28\u5283:29\u5283:30\u5283:31\u5283:32\u5283:33\u5283:35\u5283:36\u5283:39\u5283:48\u5283"},
93 
            // The locales below are commented out to make the test run faster.
            // TODO: enable them when running in extended mode.
95 
96             //            /* Afrikaans*/  {"af", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
97             //            /* Akan*/   {"ak", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:O:\u0186:P:Q:R:S:T:U:V:W:X:Y:Z"},
98             //            /* Asu*/    {"asa", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
99             //            /* Azerbaijani*/    {"az", "A:B:C:\u00C7:D:E:\u018F:F:G:\u011E:H:X:I:\u0130:J:K:Q:L:M:N:O:\u00D6:P:R:S:\u015E:T:U:\u00DC:V:W:Y:Z"},
100             //            /* Belarusian*/ {"be", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0406:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u042B:\u042D:\u042E:\u042F"},
101             //            /* Bemba*/  {"bem", "A:B:C:E:F:G:I:J:K:L:M:N:O:P:S:T:U:W:Y"},
102             //            /* Bena*/   {"bez", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:Y:Z"},
103             //            /* Bambara*/    {"bm", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:\u0186:P:R:S:T:U:W:Y:Z"},
104             //            /* Tibetan*/    {"bo", "\u0F40:\u0F41:\u0F42:\u0F44:\u0F45:\u0F46:\u0F47:\u0F49:\u0F4F:\u0F50:\u0F51:\u0F53:\u0F54:\u0F55:\u0F56:\u0F58:\u0F59:\u0F5A:\u0F5B:\u0F5D:\u0F5E:\u0F5F:\u0F60:\u0F61:\u0F62:\u0F63:\u0F64:\u0F66:\u0F67:\u0F68"},
105             //            /* Chiga*/  {"cgg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
106             //            /* Cherokee*/   {"chr", "\u13A0:\u13A6:\u13AD:\u13B3:\u13B9:\u13BE:\u13C6:\u13CC:\u13D3:\u13DC:\u13E3:\u13E9:\u13EF"},
107             //            /* Welsh*/  {"cy", "A:B:C:CH:D:E:F:FF:G:H:I:J:L:LL:M:N:O:P:PH:R:RH:S:T:TH:U:W:Y"},
108             //            /* Taita*/  {"dav", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
109             //            /* Embu*/   {"ebu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
110             //            /* Ewe*/    {"ee", "A:B:C:D:\u0189:E:\u0190:F:\u0191:G:\u0194:H:I:J:K:L:M:N:\u014A:O:\u0186:P:Q:R:S:T:U:V:\u01B2:W:X:Y:Z"},
111             //            /* Esperanto*/  {"eo", "A:B:C:\u0108:D:E:F:G:\u011C:H:\u0124:I:J:\u0134:K:L:M:N:O:P:R:S:\u015C:T:U:\u016C:V:Z"},
112             //            /* Fulah*/  {"ff", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:L:M:N:\u014A:O:P:R:S:T:U:W:Y:\u01B3"},
113             //            /* Faroese*/    {"fo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8"},
114             //            /* Gusii*/  {"guz", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
115             //            /* Hausa*/  {"ha", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:\u0198:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
116             //            /* Igbo*/   {"ig", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
117             //            /* Machame*/    {"jmc", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
118             //            /* Kabyle*/ {"kab", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:P:Q:R:S:T:U:W:X:Y:Z"},
119             //            /* Kamba*/  {"kam", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
120             //            /* Makonde*/    {"kde", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
121             //            /* Kabuverdianu*/   {"kea", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:X:Z"},
122             //            /* Koyra Chiini*/   {"khq", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"},
123             //            /* Kikuyu*/ {"ki", "A:B:C:D:E:G:H:I:J:K:M:N:O:R:T:U:W:Y"},
124             //            /* Kalenjin*/   {"kln", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:W:Y"},
125             //            /* Langi*/  {"lag", "A:B:C:D:E:F:G:H:I:\u0197:J:K:L:M:N:O:P:Q:R:S:T:U:\u0244:V:W:X:Y:Z"},
126             //            /* Ganda*/  {"lg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
127             //            /* Luo*/    {"luo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"},
128             //            /* Luyia*/  {"luy", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
129             //            /* Masai*/  {"mas", "A:B:C:D:E:\u0190:G:H:I:\u0197:J:K:L:M:N:\u014A:O:\u0186:P:R:S:T:U:\u0244:W:Y"},
130             //            /* Meru*/   {"mer", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
131             //            /* Morisyen*/   {"mfe", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y:Z"},
132             //            /* Malagasy*/   {"mg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:V:Y:Z"},
133             // This should be the correct data.  Commented till it is fixed in CLDR collation data.
134             // {"mk", "\u0410:\u0411:\u0412:\u0413:\u0403:\u0414:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u040C:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
135             //            /* Macedonian*/ {"mk", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0403:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040C:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
136             // This should be the correct data.  Commented till it is fixed in CLDR collation data.
137             // {"mt", "A:B:C:\u010A:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"},
138             //            /* Maltese*/    {"mt", "A:B:\u010A:C:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"},
139             //            /* Nama*/   {"naq", "A:B:C:D:E:F:G:H:I:K:M:N:O:P:Q:R:S:T:U:W:X:Y:Z"},
140             //            /* North Ndebele*/  {"nd", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:S:T:U:V:W:X:Y:Z"},
141             //            /* Norwegian Nynorsk*/  {"nn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
142             //            /* Nyankole*/   {"nyn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
143             //            /* Oromo*/  {"om", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
144             //            /* Romansh*/    {"rm", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
145             //            /* Rombo*/  {"rof", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
146             //            /* Kinyarwanda*/    {"rw", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
147             //            /* Rwa*/    {"rwk", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
148             //            /* Samburu*/    {"saq", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"},
149             //            /* Sena*/   {"seh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
150             //            /* Koyraboro Senni*/    {"ses", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"},
151             //            /* Sango*/  {"sg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
152             //            /* Tachelhit*/  {"shi", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"},
153             //            /* Tachelhit (Tifinagh)*/   {"shi_Tfng", "\u2D30:\u2D31:\u2D33:\u2D37:\u2D39:\u2D3B:\u2D3C:\u2D3D:\u2D40:\u2D43:\u2D44:\u2D45:\u2D47:\u2D49:\u2D4A:\u2D4D:\u2D4E:\u2D4F:\u2D53:\u2D54:\u2D55:\u2D56:\u2D59:\u2D5A:\u2D5B:\u2D5C:\u2D5F:\u2D61:\u2D62:\u2D63:\u2D65"},
154             //            /* Shona*/  {"sn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
155             //            /* Teso*/   {"teo", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y"},
156             //            /* Tonga*/  {"to", "A:B:C:D:E:F:G:H:\u02BB:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
157             //            /* Central Morocco Tamazight*/  {"tzm", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"},
158             //            /* Uzbek (Latin)*/  {"uz_Latn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u02BF"},
159             //            /* Vunjo*/  {"vun", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
160             //            /* Soga*/   {"xog", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
161             //            /* Yoruba*/ {"yo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
162 
163     };
main(String[] args)164     public static void main(String[] args) throws Exception{
165         new AlphabeticIndexTest().run(args);
166     }
167 
168 //    public void TestAAKeyword() {
169 //    ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
170 //            ICUResourceBundle.ICU_COLLATION_BASE_NAME, "zh");
171 //    showBundle(rb, 0);
172 //        String[] keywords = Collator.getKeywords();
173 //        System.out.println(Arrays.asList(keywords));
174 //        String locale = "zh";
175 //        ULocale ulocale = new ULocale(locale);
176 //        for (String keyword : keywords) {
177 //            List<String> values = Arrays.asList(Collator.getKeywordValuesForLocale(keyword, ulocale, false));
178 //            List<String> allValues = Arrays.asList(Collator.getKeywordValues(keyword));
179 //            for (String value : allValues) {
180 //                System.out.println(keyword + "=" + value);
181 //                checkKeyword(locale, value, values.contains(value));
182 //            }
183 //        }
184 //    }
185 //
186 //    private void checkKeyword(String locale, String collationValue, boolean shouldExist) {
187 //        final ULocale base = new ULocale(locale);
188 //        final ULocale desired = new ULocale(locale + "@collation=" + collationValue);
189 //        Collator foo = Collator.getInstance(desired);
190 //        ULocale actual = foo.getLocale(ULocale.ACTUAL_LOCALE);
191 //        if (shouldExist) {
192 //            assertEquals("actual should match desired", desired, actual);
193 //        } else {
194 //            assertEquals("actual should match base", base, actual);
195 //        }
196 //        int comp = foo.compare("a", "ā");
197 //        assertEquals("should fall back to default for zh", -1, comp);
198 //    }
199 //
200 //    /**
201 //     * @param rb
202 //     * @param i
203 //     */
204 //    private static void showBundle(UResourceBundle rb, int i) {
205 //        for (String key : rb.keySet()) {
206 //            System.out.print("\n" + Utility.repeat("  ", i) + key);
207 //            UResourceBundle rb2 = rb.get(key);
208 //            showBundle(rb2, i+1);
209 //        }
210 //    }
211 
212 
TestA()213     public void TestA() {
214         String[][] tests = {{"zh_Hant", "渡辺", "12劃"},
215                 {"zh", "渡辺", "D"}
216                 /*, "zh@collation=unihan", "ja@collation=unihan", "ko@collation=unihan"*/
217                 };
218         for (String[] test : tests) {
219             AlphabeticIndex<Integer> alphabeticIndex = new AlphabeticIndex<Integer>(new ULocale(test[0]));
220             final String probe = test[1];
221             final String expectedLabel = test[2];
222             alphabeticIndex.addRecord(probe, 1);
223             List labels = alphabeticIndex.getBucketLabels();
224             logln(labels.toString());
225             Bucket<Integer> bucket = find(alphabeticIndex, probe);
226             assertEquals("locale " + test[0] + " name=" + probe + " in bucket",
227                     expectedLabel, bucket.getLabel());
228         }
229     }
230 
find(AlphabeticIndex<Integer> alphabeticIndex, final String probe)231     private Bucket<Integer> find(AlphabeticIndex<Integer> alphabeticIndex, final String probe) {
232         for (Bucket<Integer> bucket : alphabeticIndex) {
233             for (Record<Integer> record : bucket) {
234                 if (record.getName().equals(probe)) {
235                     return bucket;
236                 }
237             }
238         }
239         return null;
240     }
241 
TestFirstCharacters()242     public void TestFirstCharacters() {
243 
244         AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH);
245         RuleBasedCollator collator = alphabeticIndex.getCollator();
246         collator.setStrength(Collator.IDENTICAL);
247         Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts();
248         // Verify that each script is represented exactly once.
249         // Exclude pseudo-scripts like Common (no letters).
250         // Exclude scripts like Braille and Sutton SignWriting
251         // because they only have symbols, not letters.
252         UnicodeSet missingScripts = new UnicodeSet(
253                 "[^[:inherited:][:unknown:][:common:][:Braille:][:SignWriting:]]");
254         String last = "";
255         for (String index : firsts) {
256             if (collator.compare(last,index) >= 0) {
257                 errln("Characters not in order: " + last + " !< " + index);
258             }
259             int script = getFirstRealScript(index);
260             if (script == UScript.UNKNOWN) { continue; }
261             UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script);
262             if (missingScripts.containsNone(s)) {
263                 errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false));
264             }
265             missingScripts.removeAll(s);
266         }
267         if (missingScripts.size() != 0) {
268             String missingScriptNames = "";
269             UnicodeSet missingChars = new UnicodeSet(missingScripts);
270             for(;;) {
271                 int c = missingChars.charAt(0);
272                 if (c < 0) {
273                     break;
274                 }
275                 int script = UScript.getScript(c);
276                 missingScriptNames += " " +
277                         UCharacter.getPropertyValueName(
278                                 UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
279                 missingChars.removeAll(new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script));
280             }
281             errln("Missing character from:" + missingScriptNames + " -- " + missingScripts);
282         }
283     }
284 
getFirstRealScript(CharSequence s)285     private static final int getFirstRealScript(CharSequence s) {
286         for (int i = 0; i < s.length();) {
287             int c = Character.codePointAt(s, i);
288             int script = UScript.getScript(c);
289             if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) {
290                 return script;
291             }
292             i += Character.charCount(c);
293         }
294         return UScript.UNKNOWN;
295     }
296 
TestBuckets()297     public void TestBuckets() {
298         ULocale additionalLocale = ULocale.ENGLISH;
299 
300         for (String[] pair : localeAndIndexCharactersLists) {
301             checkBuckets(pair[0], SimpleTests, additionalLocale, "E", "edgar", "Effron", "Effron");
302         }
303     }
304 
TestEmpty()305     public void TestEmpty() {
306         // just verify that it doesn't blow up.
307         Set<ULocale> locales = new LinkedHashSet<ULocale>();
308         locales.add(ULocale.ROOT);
309         locales.addAll(Arrays.asList(ULocale.getAvailableLocales()));
310         for (ULocale locale : locales) {
311             try {
312                 AlphabeticIndex<String> alphabeticIndex = new AlphabeticIndex(locale);
313                 alphabeticIndex.addRecord("hi", "HI");
314                 for (Bucket<String> bucket : alphabeticIndex) {
315                     @SuppressWarnings("unused")
316                     LabelType labelType = bucket.getLabelType();
317                 }
318             } catch (Exception e) {
319                 errln("Exception when creating AlphabeticIndex for:\t" + locale.toLanguageTag());
320                 errln(e.toString());
321             }
322         }
323     }
324 
TestInflow()325     public void TestInflow() {
326         Object[][] tests = {
327                 {0, ULocale.ENGLISH},
328                 {0, ULocale.ENGLISH, new ULocale("el")},
329                 {1, ULocale.ENGLISH, new ULocale("ru")},
330                 {0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru")},
331                 {0, ULocale.ENGLISH},
332                 {2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE},
333         };
334         for (Object[] test : tests) {
335             int expected = (Integer) test[0];
336             AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex((ULocale)test[1]);
337             for (int i = 2; i < test.length; ++i) {
338                 if (test[i] instanceof ULocale) {
339                     alphabeticIndex.addLabels((ULocale)test[i]);
340                 } else {
341                     alphabeticIndex.addLabels((UnicodeSet)test[i]);
342                 }
343             }
344             Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter();
345             for (Bucket<Double> bucket : alphabeticIndex) {
346                 LabelType labelType = bucket.getLabelType();
347                 counter.add(labelType, 1);
348             }
349             String printList = Arrays.asList(test).toString();
350             assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW));
351             assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW));
352             if (expected != counter.get(LabelType.INFLOW)) {
353                 // for debugging
354                 AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex((ULocale)test[1]);
355                 for (int i = 2; i < test.length; ++i) {
356                     if (test[i] instanceof ULocale) {
357                         indexCharacters2.addLabels((ULocale)test[i]);
358                     } else {
359                         indexCharacters2.addLabels((UnicodeSet)test[i]);
360                     }
361                 }
362                 List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>());
363                 logln(buckets.toString());
364             }
365             assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW));
366         }
367     }
368 
checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items)369     private void checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items) {
370         StringBuilder UI = new StringBuilder();
371         ULocale desiredLocale = new ULocale(localeString);
372 
373         // Create a simple index where the values for the strings are Integers, and add the strings
374         AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale);
375         int counter = 0;
376         Counter<String> itemCount = new Counter();
377         for (String item : test) {
378             index.addRecord(item, counter++);
379             itemCount.add(item, 1);
380         }
381 
382         List<String> labels = index.getBucketLabels();
383         ImmutableIndex<Integer> immIndex = index.buildImmutableIndex();
384 
385         logln(desiredLocale + "\t" + desiredLocale.getDisplayName(ULocale.ENGLISH) + " - " + desiredLocale.getDisplayName(desiredLocale) + "\t"
386                 + index.getCollator().getLocale(ULocale.ACTUAL_LOCALE));
387         UI.setLength(0);
388         UI.append(desiredLocale + "\t");
389         boolean showAll = true;
390 
391         // Show index at top. We could skip or gray out empty buckets
392         for (AlphabeticIndex.Bucket<Integer> bucket : index) {
393             if (showAll || bucket.size() != 0) {
394                 showLabelAtTop(UI, bucket.getLabel());
395             }
396         }
397         logln(UI.toString());
398 
399         // Show the buckets with their contents, skipping empty buckets
400         int bucketIndex = 0;
401         for (Bucket<Integer> bucket : index) {
402             assertEquals("bucket label vs. iterator",
403                     labels.get(bucketIndex), bucket.getLabel());
404             assertEquals("bucket label vs. immutable",
405                     labels.get(bucketIndex), immIndex.getBucket(bucketIndex).getLabel());
406             assertEquals("bucket label type vs. immutable",
407                     bucket.getLabelType(), immIndex.getBucket(bucketIndex).getLabelType());
408             for (Record<Integer> r : bucket) {
409                 CharSequence name = r.getName();
410                 assertEquals("getBucketIndex(" + name + ")",
411                         bucketIndex, index.getBucketIndex(name));
412                 assertEquals("immutable getBucketIndex(" + name + ")",
413                         bucketIndex, immIndex.getBucketIndex(name));
414             }
415             if (bucket.getLabel().equals(testBucket)) {
416                 Counter<String> keys = getKeys(bucket);
417                 for (String item : items) {
418                     long globalCount = itemCount.get(item);
419                     long localeCount = keys.get(item);
420                     if (globalCount != localeCount) {
421                         errln("Error: in " + "'" + testBucket + "', '" + item + "' should have count "
422                                 + globalCount + " but has count " + localeCount);
423                     }
424 
425                 }
426             }
427 
428             if (bucket.size() != 0) {
429                 showLabelInList(UI, bucket.getLabel());
430                 for (AlphabeticIndex.Record<Integer> item : bucket) {
431                     showIndexedItem(UI, item.getName(), item.getData());
432                 }
433                 logln(UI.toString());
434             }
435             ++bucketIndex;
436         }
437         assertEquals("getBucketCount()", bucketIndex, index.getBucketCount());
438         assertEquals("immutable getBucketCount()", bucketIndex, immIndex.getBucketCount());
439 
440         assertNull("immutable getBucket(-1)", immIndex.getBucket(-1));
441         assertNull("immutable getBucket(count)", immIndex.getBucket(bucketIndex));
442 
443         for (Bucket<Integer> bucket : immIndex) {
444             assertEquals("immutable bucket size", 0, bucket.size());
445             assertFalse("immutable bucket iterator.hasNext()", bucket.iterator().hasNext());
446         }
447     }
448 
showIndex(AlphabeticIndex<T> index, boolean showEmpty)449     public <T> void showIndex(AlphabeticIndex<T> index, boolean showEmpty) {
450         logln("Actual");
451         StringBuilder UI = new StringBuilder();
452         for (Bucket<T> bucket : index) {
453             if (showEmpty || bucket.size() != 0) {
454                 showLabelInList(UI, bucket.getLabel());
455                 for (Record<T> item : bucket) {
456                     showIndexedItem(UI, item.getName(), item.getData());
457                 }
458                 logln(UI.toString());
459             }
460         }
461     }
462 
463     /**
464      * @param myBucketLabels
465      * @param myBucketContents
466      * @param b
467      */
showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty)468     private void showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty) {
469         logln("Alternative");
470         StringBuilder UI = new StringBuilder();
471 
472         for (int i = 0; i < myBucketLabels.size(); ++i) {
473             Set<R4<RawCollationKey, String, Integer, Double>> bucket = myBucketContents.get(i);
474             if (!showEmpty && bucket.size() == 0) {
475                 continue;
476             }
477             UI.setLength(0);
478             UI.append("*").append(myBucketLabels.get(i));
479             for (R4<RawCollationKey, String, Integer, Double> item : bucket) {
480                 UI.append("\t ").append(item.get1().toString()).append(ARROW).append(item.get3().toString());
481             }
482             logln(UI.toString());
483         }
484     }
485 
showLabelAtTop(StringBuilder buffer, String label)486     private void showLabelAtTop(StringBuilder buffer, String label) {
487         buffer.append(label + " ");
488     }
489 
showIndexedItem(StringBuilder buffer, CharSequence key, T value)490     private <T> void showIndexedItem(StringBuilder buffer, CharSequence key, T value) {
491         buffer.append("\t " + key + ARROW + value);
492     }
493 
showLabelInList(StringBuilder buffer, String label)494     private void showLabelInList(StringBuilder buffer, String label) {
495         buffer.setLength(0);
496         buffer.append(label);
497     }
498 
getKeys(AlphabeticIndex.Bucket<Integer> entry)499     private Counter<String> getKeys(AlphabeticIndex.Bucket<Integer> entry) {
500         Counter<String> keys = new Counter<String>();
501         for (AlphabeticIndex.Record x : entry) {
502             String key = x.getName().toString();
503             keys.add(key, 1);
504         }
505         return keys;
506     }
507 
TestIndexCharactersList()508     public void TestIndexCharactersList() {
509         for (String[] localeAndIndexCharacters : localeAndIndexCharactersLists) {
510             ULocale locale = new ULocale(localeAndIndexCharacters[0]);
511             String expectedIndexCharacters = "\u2026:" + localeAndIndexCharacters[1] + ":\u2026";
512             Collection<String> alphabeticIndex = new AlphabeticIndex(locale).getBucketLabels();
513 
514             // Join the elements of the list to a string with delimiter ":"
515             StringBuilder sb = new StringBuilder();
516             Iterator<String> iter = alphabeticIndex.iterator();
517             while (iter.hasNext()) {
518                 sb.append(iter.next());
519                 if (!iter.hasNext()) {
520                     break;
521                 }
522                 sb.append(":");
523             }
524             String actualIndexCharacters = sb.toString();
525             if (!expectedIndexCharacters.equals(actualIndexCharacters)) {
526                 errln("Test failed for locale " + localeAndIndexCharacters[0] +
527                         "\n  Expected = |" + expectedIndexCharacters + "|\n  actual   = |" + actualIndexCharacters + "|");
528             }
529         }
530     }
531 
TestBasics()532     public void TestBasics() {
533         ULocale[] list = ULocale.getAvailableLocales();
534         // get keywords combinations
535         // don't bother with multiple combinations at this point
536         List keywords = new ArrayList();
537         keywords.add("");
538 
539         String[] collationValues = Collator.getKeywordValues("collation");
540         for (int j = 0; j < collationValues.length; ++j) {
541             keywords.add("@collation=" + collationValues[j]);
542         }
543 
544         for (int i = 0; i < list.length; ++i) {
545             for (Iterator it = keywords.iterator(); it.hasNext();) {
546                 String collationValue = (String) it.next();
547                 String localeString = list[i].toString();
548                 if (!KEY_LOCALES.contains(localeString)) continue; // TODO change in exhaustive
549                 ULocale locale = new ULocale(localeString + collationValue);
550                 if (collationValue.length() > 0 && !Collator.getFunctionalEquivalent("collation", locale).equals(locale)) {
551                     //logln("Skipping " + locale);
552                     continue;
553                 }
554 
555                 if (locale.getCountry().length() != 0) {
556                     continue;
557                 }
558                 boolean isUnihan = collationValue.contains("unihan");
559                 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale);
560                 if (isUnihan) {
561                     // Unihan tailorings have a label per radical, and there are at least 214,
562                     // if not more when simplified radicals are distinguished.
563                     alphabeticIndex.setMaxLabelCount(500);
564                 }
565                 final Collection mainChars = alphabeticIndex.getBucketLabels();
566                 String mainCharString = mainChars.toString();
567                 if (mainCharString.length() > 500) {
568                     mainCharString = mainCharString.substring(0,500) + "...";
569                 }
570                 logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH));
571                 logln("Index:\t" + mainCharString);
572                 if (!isUnihan && mainChars.size() > 100) {
573                     errln("Index character set too large: " +
574                             locale + " [" + mainChars.size() + "]:\n    " + mainChars);
575                 }
576             }
577         }
578     }
579 
TestClientSupport()580     public void TestClientSupport() {
581         for (String localeString : new String[] {"zh"}) { // KEY_LOCALES, new String[] {"zh"}
582             ULocale ulocale = new ULocale(localeString);
583             AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>(ulocale).addLabels(ULocale.ENGLISH);
584             RuleBasedCollator collator = alphabeticIndex.getCollator();
585             String [][] tests;
586 
587             if (!localeString.equals("zh") ) {
588                 tests = new String[][] {SimpleTests};
589             } else {
590                 tests = new String[][] {SimpleTests, hackPinyin, simplifiedNames};
591             }
592 
593             for (String [] shortTest : tests) {
594                 double testValue = 100;
595                 alphabeticIndex.clearRecords();
596                 for (String name : shortTest) {
597                     alphabeticIndex.addRecord(name, testValue++);
598                 }
599 
600                 if (DEBUG) showIndex(alphabeticIndex, false);
601 
602                 // make my own copy
603                 testValue = 100;
604                 List<String> myBucketLabels = alphabeticIndex.getBucketLabels();
605                 ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents = new ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>>(myBucketLabels.size());
606                 for (int i = 0; i < myBucketLabels.size(); ++i) {
607                     myBucketContents.add(new TreeSet<R4<RawCollationKey, String, Integer, Double>>());
608                 }
609                 for (String name : shortTest) {
610                     int bucketIndex = alphabeticIndex.getBucketIndex(name);
611                     if (bucketIndex > myBucketContents.size()) {
612                         alphabeticIndex.getBucketIndex(name); // call again for debugging
613                     }
614                     Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(bucketIndex);
615                     RawCollationKey rawCollationKey = collator.getRawCollationKey(name, null);
616                     R4<RawCollationKey, String, Integer, Double> row = Row.of(rawCollationKey, name, name.length(), testValue++);
617                     myBucket.add(row);
618                 }
619                 if (DEBUG) showIndex(myBucketLabels, myBucketContents, false);
620 
621                 // now compare
622                 int index = 0;
623                 boolean gotError = false;
624                 for (AlphabeticIndex.Bucket<Double> bucket : alphabeticIndex) {
625                     String bucketLabel = bucket.getLabel();
626                     String myLabel = myBucketLabels.get(index);
627                     if (!bucketLabel.equals(myLabel)) {
628                         gotError |= !assertEquals(ulocale + "\tBucket Labels (" + index + ")", bucketLabel, myLabel);
629                     }
630                     Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(index);
631                     Iterator<R4<RawCollationKey, String, Integer, Double>> myBucketIterator = myBucket.iterator();
632                     int recordIndex = 0;
633                     for (Record<Double> record : bucket) {
634                         String myName = null;
635                         if (myBucketIterator.hasNext()) {
636                             R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next();
637                             myName = (String) myRecord.get1();
638                         }
639                         if (!record.getName().equals(myName)) {
640                             gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", record.getName(), myName);
641                         }
642                     }
643                     while (myBucketIterator.hasNext()) {
644                         R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next();
645                         String myName = (String) myRecord.get1();
646                         gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", null, myName);
647                     }
648                     index++;
649                 }
650                 if (gotError) {
651                     showIndex(myBucketLabels, myBucketContents, false);
652                     showIndex(alphabeticIndex, false);
653                 }
654             }
655         }
656     }
657 
TestFirstScriptCharacters()658     public void TestFirstScriptCharacters() {
659         Collection<String> firstCharacters =
660                 new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts();
661         Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT));
662         Collection<String> diff = new TreeSet<String>(firstCharacters);
663         diff.removeAll(expectedFirstCharacters);
664         assertTrue("First Characters contains unexpected ones: " + diff, diff.isEmpty());
665         diff.clear();
666         diff.addAll(expectedFirstCharacters);
667         diff.removeAll(firstCharacters);
668         assertTrue("First Characters missing expected ones: " + diff, diff.isEmpty());
669     }
670 
671     private static final UnicodeSet TO_TRY = new UnicodeSet("[[:^nfcqc=no:]-[:sc=Common:]-[:sc=Inherited:]-[:sc=Unknown:]]").freeze();
672 
673     /**
674      * Returns a collection of all the "First" characters of scripts, according to the collation.
675      */
firstStringsInScript(RuleBasedCollator ruleBasedCollator)676     private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) {
677         String[] results = new String[UScript.CODE_LIMIT];
678         for (String current : TO_TRY) {
679             if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols.
680                 continue;
681             }
682             int script = UScript.getScript(current.codePointAt(0));
683             if (results[script] == null) {
684                 results[script] = current;
685             } else if (ruleBasedCollator.compare(current, results[script]) < 0) {
686                 results[script] = current;
687             }
688         }
689 
690         try {
691             UnicodeSet extras = new UnicodeSet();
692             UnicodeSet expansions = new UnicodeSet();
693             ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true);
694             extras.addAll(expansions).removeAll(TO_TRY);
695             if (extras.size() != 0) {
696                 Normalizer2 normalizer = Normalizer2.getNFKCInstance();
697                 for (String current : extras) {
698                     if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) {
699                         continue;
700                     }
701                     int script = getFirstRealScript(current);
702                     if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; }
703                     if (results[script] == null) {
704                         results[script] = current;
705                     } else if (ruleBasedCollator.compare(current, results[script]) < 0) {
706                         results[script] = current;
707                     }
708                 }
709             }
710         } catch (Exception e) {
711         } // why have a checked exception???
712 
713         // TODO: We should not test that we get the same strings, but that we
714         // get strings that sort primary-equal to those from the implementation.
715 
716         Collection<String> result = new ArrayList<String>();
717         for (int i = 0; i < results.length; ++i) {
718             if (results[i] != null) {
719                 result.add(results[i]);
720             }
721         }
722         return result;
723     }
724 
isUnassignedBoundary(CharSequence s)725     private static final boolean isUnassignedBoundary(CharSequence s) {
726         // The root collator provides a script-first-primary boundary contraction
727         // for the unassigned-implicit range.
728         return s.charAt(0) == 0xfdd1 &&
729                 UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN;
730     }
731 
TestZZZ()732     public void TestZZZ() {
733         //            int x = 3;
734         //            AlphabeticIndex index = new AlphabeticIndex(ULocale.ENGLISH);
735         //            UnicodeSet additions = new UnicodeSet();
736         //            additions.add(0x410).add(0x415);  // Cyrillic
737         //            // additions.add(0x391).add(0x393);     // Greek
738         //            index.addLabels(additions);
739         //            int lc = index.getLabels().size();
740         //            List  labels = index.getLabels();
741         //            System.out.println("Label Count = " + lc + "\t" + labels);
742         //            System.out.println("Bucket Count =" + index.getBucketCount());
743     }
744 
TestSimplified()745     public void TestSimplified() {
746         checkBuckets("zh", simplifiedNames, ULocale.ENGLISH, "W", "\u897f");
747     }
TestTraditional()748     public void TestTraditional() {
749         checkBuckets("zh_Hant", traditionalNames, ULocale.ENGLISH, "\u4e9f", "\u5357\u9580");
750     }
751 
752     static final String[] SimpleTests = {
753         "斎藤",
754         "\u1f2d\u03c1\u03b1",
755         "$", "\u00a3", "12", "2",
756         "Davis", "Davis", "Abbot", "\u1D05avis", "Zach", "\u1D05avis", "\u01b5", "\u0130stanbul", "Istanbul", "istanbul", "\u0131stanbul",
757         "\u00deor", "\u00c5berg", "\u00d6stlund",
758         "\u1f2d\u03c1\u03b1", "\u1f08\u03b8\u03b7\u03bd\u1fb6",
759         "\u0396\u03b5\u03cd\u03c2", "\u03a0\u03bf\u03c3\u03b5\u03b9\u03b4\u1f63\u03bd", "\u1f0d\u03b9\u03b4\u03b7\u03c2", "\u0394\u03b7\u03bc\u03ae\u03c4\u03b7\u03c1", "\u1f19\u03c3\u03c4\u03b9\u03ac",
760         //"\u1f08\u03c0\u03cc\u03bb\u03bb\u03c9\u03bd", "\u1f0c\u03c1\u03c4\u03b5\u03bc\u03b9\u03c2", "\u1f19\u03c1\u03bc\u1f23\u03c2", "\u1f0c\u03c1\u03b7\u03c2", "\u1f08\u03c6\u03c1\u03bf\u03b4\u03af\u03c4\u03b7", "\u1f2d\u03c6\u03b1\u03b9\u03c3\u03c4\u03bf\u03c2", "\u0394\u03b9\u03cc\u03bd\u03c5\u03c3\u03bf\u03c2",
761         "\u6589\u85e4", "\u4f50\u85e4", "\u9234\u6728", "\u9ad8\u6a4b", "\u7530\u4e2d", "\u6e21\u8fba", "\u4f0a\u85e4", "\u5c71\u672c", "\u4e2d\u6751", "\u5c0f\u6797", "\u658e\u85e4", "\u52a0\u85e4",
762         //"\u5409\u7530", "\u5c71\u7530", "\u4f50\u3005\u6728", "\u5c71\u53e3", "\u677e\u672c", "\u4e95\u4e0a", "\u6728\u6751", "\u6797", "\u6e05\u6c34"
763     };
764 
765     static final String[] hackPinyin = {
766         "a", "\u5416", "\u58ba", //
767         "b", "\u516b", "\u62d4", "\u8500", //
768         "c", "\u5693", "\u7938", "\u9e7e", //
769         "d", "\u5491", "\u8fcf", "\u964a", //
770         "e","\u59b8", "\u92e8", "\u834b", //
771         "f", "\u53d1", "\u9197", "\u99a5", //
772         "g", "\u7324", "\u91d3", "\u8142", //
773         "h", "\u598e", "\u927f", "\u593b", //
774         "j", "\u4e0c", "\u6785", "\u9d58", //
775         "k", "\u5494", "\u958b", "\u7a52", //
776         "l", "\u5783", "\u62c9", "\u9ba5", //
777         "m", "\u5638", "\u9ebb", "\u65c0", //
778         "n", "\u62ff", "\u80ad", "\u685b", //
779         "o", "\u5662", "\u6bee", "\u8bb4", //
780         "p", "\u5991", "\u8019", "\u8c31", //
781         "q", "\u4e03", "\u6053", "\u7f56", //
782         "r", "\u5465", "\u72aa", "\u6e03", //
783         "s", "\u4ee8", "\u9491", "\u93c1", //
784         "t", "\u4ed6", "\u9248", "\u67dd", //
785         "w", "\u5c72", "\u5558", "\u5a7a", //
786         "x", "\u5915", "\u5438", "\u6bbe", //
787         "y", "\u4e2b", "\u82bd", "\u8574", //
788         "z", "\u5e00", "\u707d", "\u5c0a"
789     };
790 
791     static final String[] simplifiedNames = {
792         "Abbot", "Morton", "Zachary", "Williams", "\u8d75", "\u94b1", "\u5b59", "\u674e", "\u5468", "\u5434", "\u90d1", "\u738b", "\u51af", "\u9648", "\u696e", "\u536b", "\u848b", "\u6c88",
793         "\u97e9", "\u6768", "\u6731", "\u79e6", "\u5c24", "\u8bb8", "\u4f55", "\u5415", "\u65bd", "\u5f20", "\u5b54", "\u66f9", "\u4e25", "\u534e", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", "\u8c22", "\u90b9",
794         "\u55bb", "\u67cf", "\u6c34", "\u7aa6", "\u7ae0", "\u4e91", "\u82cf", "\u6f58", "\u845b", "\u595a", "\u8303", "\u5f6d", "\u90ce", "\u9c81", "\u97e6", "\u660c", "\u9a6c", "\u82d7", "\u51e4", "\u82b1", "\u65b9",
795         "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9c8d", "\u53f2", "\u5510", "\u8d39", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8d3a", "\u502a", "\u6c64", "\u6ed5", "\u6bb7", "\u7f57", "\u6bd5", "\u90dd",
796         "\u90ac", "\u5b89", "\u5e38", "\u4e50", "\u4e8e", "\u65f6", "\u5085", "\u76ae", "\u535e", "\u9f50", "\u5eb7", "\u4f0d", "\u4f59", "\u5143", "\u535c", "\u987e", "\u5b5f", "\u5e73", "\u9ec4", "\u548c", "\u7a46",
797         "\u8427", "\u5c39", "\u59da", "\u90b5", "\u6e5b", "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8d1d", "\u660e", "\u81e7", "\u8ba1", "\u4f0f", "\u6210", "\u6234", "\u8c08", "\u5b8b", "\u8305",
798         "\u5e9e", "\u718a", "\u7eaa", "\u8212", "\u5c48", "\u9879", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u84dd", "\u95fd", "\u5e2d", "\u5b63", "\u9ebb", "\u5f3a", "\u8d3e", "\u8def", "\u5a04", "\u5371",
799         "\u6c5f", "\u7ae5", "\u989c", "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u953a", "\u5f90", "\u4e18", "\u9a86", "\u9ad8", "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e",
800         "\u4e07", "\u652f", "\u67ef", "\u661d", "\u7ba1", "\u5362", "\u83ab", "\u7ecf", "\u623f", "\u88d8", "\u7f2a", "\u5e72", "\u89e3", "\u5e94", "\u5b97", "\u4e01", "\u5ba3", "\u8d32", "\u9093", "\u90c1", "\u5355",
801         "\u676d", "\u6d2a", "\u5305", "\u8bf8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u94ae", "\u9f9a", "\u7a0b", "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9646", "\u8363", "\u7fc1", "\u8340", "\u7f8a", "\u65bc",
802         "\u60e0", "\u7504", "\u9eb9", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u50a8", "\u9773", "\u6c72", "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u4e4c", "\u7126", "\u5df4", "\u5f13",
803         "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8f66", "\u4faf", "\u5b93", "\u84ec", "\u5168", "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bab", "\u5b81", "\u4ec7", "\u683e", "\u66b4", "\u7518",
804         "\u659c", "\u5389", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5218", "\u666f", "\u8a79", "\u675f", "\u9f99", "\u53f6", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u84df", "\u8584", "\u5370", "\u5bbf",
805         "\u767d", "\u6000", "\u84b2", "\u90b0", "\u4ece", "\u9102", "\u7d22", "\u54b8", "\u7c4d", "\u8d56", "\u5353", "\u853a", "\u5c60", "\u8499", "\u6c60", "\u4e54", "\u9634", "\u90c1", "\u80e5", "\u80fd", "\u82cd",
806         "\u53cc", "\u95fb", "\u8398", "\u515a", "\u7fdf", "\u8c2d", "\u8d21", "\u52b3", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u90e6", "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842",
807         "\u6fee", "\u725b", "\u5bff", "\u901a", "\u8fb9", "\u6248", "\u71d5", "\u5180", "\u90cf", "\u6d66", "\u5c1a", "\u519c", "\u6e29", "\u522b", "\u5e84", "\u664f", "\u67f4", "\u77bf", "\u960e", "\u5145", "\u6155",
808         "\u8fde", "\u8339", "\u4e60", "\u5ba6", "\u827e", "\u9c7c", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", "\u7ec8", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f",
809         "\u6ee1", "\u5f18", "\u5321", "\u56fd", "\u6587", "\u5bc7", "\u5e7f", "\u7984", "\u9619", "\u4e1c", "\u6b27", "\u6bb3", "\u6c83", "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e08", "\u5de9", "\u538d",
810         "\u8042", "\u6641", "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u961a", "\u90a3", "\u7b80", "\u9976", "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u517b", "\u97a0", "\u987b", "\u4e30",
811         "\u5de2", "\u5173", "\u84af", "\u76f8", "\u67e5", "\u540e", "\u8346", "\u7ea2", "\u6e38", "\u7afa", "\u6743", "\u9011", "\u76d6", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u9a6c", "\u4e0a\u5b98", "\u6b27\u9633",
812         "\u590f\u4faf", "\u8bf8\u845b", "\u95fb\u4eba", "\u4e1c\u65b9", "\u8d6b\u8fde", "\u7687\u752b", "\u5c09\u8fdf", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", "\u6fee\u9633", "\u6df3\u4e8e", "\u5355\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b59", "\u4ef2\u5b59",
813         "\u8f69\u8f95", "\u4ee4\u72d0", "\u953a\u79bb", "\u5b87\u6587", "\u957f\u5b59", "\u6155\u5bb9", "\u9c9c\u4e8e", "\u95fe\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8f66", "\u989b\u5b59", "\u7aef\u6728", "\u5deb\u9a6c",
814         "\u516c\u897f", "\u6f06\u96d5", "\u4e50\u6b63", "\u58e4\u9a77", "\u516c\u826f", "\u62d3\u62d4", "\u5939\u8c37", "\u5bb0\u7236", "\u8c37\u6881", "\u664b", "\u695a", "\u960e", "\u6cd5", "\u6c5d", "\u9122", "\u6d82", "\u94a6", "\u6bb5\u5e72", "\u767e\u91cc",
815         "\u4e1c\u90ed", "\u5357\u95e8", "\u547c\u5ef6", "\u5f52", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e05", "\u7f11", "\u4ea2", "\u51b5", "\u540e", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u4e1c\u95e8", "\u897f\u95e8",
816         "\u5546", "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8d4f", "\u5357\u5bab", "\u58a8", "\u54c8", "\u8c2f", "\u7b2a", "\u5e74", "\u7231", "\u9633", "\u4f5f"
817     };
818 
819     static final String[] traditionalNames = { "丁", "Abbot", "Morton", "Zachary", "Williams", "\u8d99", "\u9322", "\u5b6b",
820             "\u674e", "\u5468", "\u5433", "\u912d", "\u738b", "\u99ae", "\u9673", "\u696e", "\u885b", "\u8523",
821             "\u6c88", "\u97d3", "\u694a", "\u6731", "\u79e6", "\u5c24", "\u8a31", "\u4f55", "\u5442", "\u65bd",
822             "\u5f35", "\u5b54", "\u66f9", "\u56b4", "\u83ef", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a",
823             "\u8b1d", "\u9112", "\u55bb", "\u67cf", "\u6c34", "\u7ac7", "\u7ae0", "\u96f2", "\u8607", "\u6f58",
824             "\u845b", "\u595a", "\u7bc4", "\u5f6d", "\u90ce", "\u9b6f", "\u97cb", "\u660c", "\u99ac", "\u82d7",
825             "\u9cf3", "\u82b1", "\u65b9", "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9b91", "\u53f2",
826             "\u5510", "\u8cbb", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8cc0", "\u502a", "\u6e6f", "\u6ed5",
827             "\u6bb7", "\u7f85", "\u7562", "\u90dd", "\u9114", "\u5b89", "\u5e38", "\u6a02", "\u65bc", "\u6642",
828             "\u5085", "\u76ae", "\u535e", "\u9f4a", "\u5eb7", "\u4f0d", "\u9918", "\u5143", "\u535c", "\u9867",
829             "\u5b5f", "\u5e73", "\u9ec3", "\u548c", "\u7a46", "\u856d", "\u5c39", "\u59da", "\u90b5", "\u6e5b",
830             "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8c9d", "\u660e", "\u81e7", "\u8a08",
831             "\u4f0f", "\u6210", "\u6234", "\u8ac7", "\u5b8b", "\u8305", "\u9f90", "\u718a", "\u7d00", "\u8212",
832             "\u5c48", "\u9805", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u85cd", "\u95a9", "\u5e2d",
833             "\u5b63", "\u9ebb", "\u5f37", "\u8cc8", "\u8def", "\u5a41", "\u5371", "\u6c5f", "\u7ae5", "\u984f",
834             "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u937e", "\u5f90", "\u4e18", "\u99f1", "\u9ad8",
835             "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", "\u842c", "\u652f",
836             "\u67ef", "\u661d", "\u7ba1", "\u76e7", "\u83ab", "\u7d93", "\u623f", "\u88d8", "\u7e46", "\u5e79",
837             "\u89e3", "\u61c9", "\u5b97", "\u4e01", "\u5ba3", "\u8cc1", "\u9127", "\u9b31", "\u55ae", "\u676d",
838             "\u6d2a", "\u5305", "\u8af8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u9215", "\u9f94", "\u7a0b",
839             "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9678", "\u69ae", "\u7fc1", "\u8340", "\u7f8a", "\u65bc",
840             "\u60e0", "\u7504", "\u9eb4", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u5132", "\u9773", "\u6c72",
841             "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u70cf", "\u7126", "\u5df4",
842             "\u5f13", "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8eca", "\u4faf", "\u5b93", "\u84ec", "\u5168",
843             "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bae", "\u5be7", "\u4ec7", "\u6b12",
844             "\u66b4", "\u7518", "\u659c", "\u53b2", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5289", "\u666f",
845             "\u8a79", "\u675f", "\u9f8d", "\u8449", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u858a",
846             "\u8584", "\u5370", "\u5bbf", "\u767d", "\u61f7", "\u84b2", "\u90b0", "\u5f9e", "\u9102", "\u7d22",
847             "\u54b8", "\u7c4d", "\u8cf4", "\u5353", "\u85fa", "\u5c60", "\u8499", "\u6c60", "\u55ac", "\u9670",
848             "\u9b31", "\u80e5", "\u80fd", "\u84bc", "\u96d9", "\u805e", "\u8398", "\u9ee8", "\u7fdf", "\u8b5a",
849             "\u8ca2", "\u52de", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u9148",
850             "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", "\u6fee", "\u725b", "\u58fd", "\u901a", "\u908a",
851             "\u6248", "\u71d5", "\u5180", "\u90df", "\u6d66", "\u5c1a", "\u8fb2", "\u6eab", "\u5225", "\u838a",
852             "\u664f", "\u67f4", "\u77bf", "\u95bb", "\u5145", "\u6155", "\u9023", "\u8339", "\u7fd2", "\u5ba6",
853             "\u827e", "\u9b5a", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe",
854             "\u7d42", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", "\u6eff", "\u5f18", "\u5321",
855             "\u570b", "\u6587", "\u5bc7", "\u5ee3", "\u797f", "\u95d5", "\u6771", "\u6b50", "\u6bb3", "\u6c83",
856             "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e2b", "\u978f", "\u5399", "\u8076", "\u6641",
857             "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u95de", "\u90a3", "\u7c21", "\u9952",
858             "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u990a", "\u97a0", "\u9808", "\u8c50", "\u5de2",
859             "\u95dc", "\u84af", "\u76f8", "\u67e5", "\u5f8c", "\u834a", "\u7d05", "\u904a", "\u7afa", "\u6b0a",
860             "\u9011", "\u84cb", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u99ac", "\u4e0a\u5b98",
861             "\u6b50\u967d", "\u590f\u4faf", "\u8af8\u845b", "\u805e\u4eba", "\u6771\u65b9", "\u8d6b\u9023",
862             "\u7687\u752b", "\u5c09\u9072", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f",
863             "\u6fee\u967d", "\u6df3\u4e8e", "\u55ae\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b6b",
864             "\u4ef2\u5b6b", "\u8ed2\u8f45", "\u4ee4\u72d0", "\u937e\u96e2", "\u5b87\u6587", "\u9577\u5b6b",
865             "\u6155\u5bb9", "\u9bae\u4e8e", "\u95ad\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98",
866             "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8eca", "\u9853\u5b6b", "\u7aef\u6728", "\u5deb\u99ac",
867             "\u516c\u897f", "\u6f06\u96d5", "\u6a02\u6b63", "\u58e4\u99df", "\u516c\u826f", "\u62d3\u62d4",
868             "\u593e\u8c37", "\u5bb0\u7236", "\u7a40\u6881", "\u6649", "\u695a", "\u95bb", "\u6cd5", "\u6c5d", "\u9122",
869             "\u5857", "\u6b3d", "\u6bb5\u5e72", "\u767e\u91cc", "\u6771\u90ed", "\u5357\u9580", "\u547c\u5ef6",
870             "\u6b78", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e25", "\u7df1", "\u4ea2", "\u6cc1",
871             "\u5f8c", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u6771\u9580", "\u897f\u9580", "\u5546",
872             "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8cde", "\u5357\u5bae", "\u58a8", "\u54c8", "\u8b59", "\u7b2a",
873             "\u5e74", "\u611b", "\u967d", "\u4f5f", "\u3401", "\u3422", "\u3426", "\u3493", "\u34A5", "\u34A7",
874             "\u34AA", "\u3536", "\u4A3B", "\u4E00", "\u4E01", "\u4E07", "\u4E0D", "\u4E17", "\u4E23", "\u4E26",
875             "\u4E34", "\u4E82", "\u4EB8", "\u4EB9", "\u511F", "\u512D", "\u513D", "\u513E", "\u53B5", "\u56D4",
876             "\u56D6", "\u7065", "\u7069", "\u706A", "\u7E9E", "\u9750", "\u9F49", "\u9F7E", "\u9F98", "\uD840\uDC35",
877             "\uD840\uDC3D", "\uD840\uDC3E", "\uD840\uDC41", "\uD840\uDC46", "\uD840\uDC4C", "\uD840\uDC4E",
878             "\uD840\uDC53", "\uD840\uDC55", "\uD840\uDC56", "\uD840\uDC5F", "\uD840\uDC60", "\uD840\uDC7A",
879             "\uD840\uDC7B", "\uD840\uDCC8", "\uD840\uDD9E", "\uD840\uDD9F", "\uD840\uDDA0", "\uD840\uDDA1",
880             "\uD841\uDD3B", "\uD842\uDCCA", "\uD842\uDCCB", "\uD842\uDD6C", "\uD842\uDE0B", "\uD842\uDE0C",
881             "\uD842\uDED1", "\uD844\uDD9F", "\uD845\uDD19", "\uD845\uDD1A", "\uD846\uDD3B", "\uD84C\uDF5C",
882             "\uD85A\uDDC4", "\uD85A\uDDC5", "\uD85C\uDD98", "\uD85E\uDCB1", "\uD861\uDC04", "\uD864\uDDD3",
883             "\uD865\uDE63", "\uD869\uDCCA", "\uD86B\uDE9A", };
884 
885     /**
886      * Test AlphabeticIndex vs. root with script reordering.
887      */
TestHaniFirst()888     public void TestHaniFirst() {
889         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
890         coll.setReorderCodes(UScript.HAN);
891         AlphabeticIndex index = new AlphabeticIndex(coll);
892         assertEquals("getBucketCount()", 1, index.getBucketCount());   // ... (underflow only)
893         index.addLabels(ULocale.ENGLISH);
894         assertEquals("getBucketCount()", 28, index.getBucketCount());  // ... A-Z ...
895         int bucketIndex = index.getBucketIndex("\u897f");
896         assertEquals("getBucketIndex(U+897F)", 0, bucketIndex);  // underflow bucket
897         bucketIndex = index.getBucketIndex("i");
898         assertEquals("getBucketIndex(i)", 9, bucketIndex);
899         bucketIndex = index.getBucketIndex("\u03B1");
900         assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
901         // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
902         bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
903         assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
904         bucketIndex = index.getBucketIndex("\uFFFF");
905         assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
906     }
907 
908     /**
909      * Test AlphabeticIndex vs. Pinyin with script reordering.
910      */
TestPinyinFirst()911     public void TestPinyinFirst() {
912         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE);
913         coll.setReorderCodes(UScript.HAN);
914         AlphabeticIndex index = new AlphabeticIndex(coll);
915         assertEquals("getBucketCount()", 28, index.getBucketCount());   // ... A-Z ...
916         index.addLabels(ULocale.CHINESE);
917         assertEquals("getBucketCount()", 28, index.getBucketCount());  // ... A-Z ...
918         int bucketIndex = index.getBucketIndex("\u897f");
919         assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex);
920         bucketIndex = index.getBucketIndex("i");
921         assertEquals("getBucketIndex(i)", 9, bucketIndex);
922         bucketIndex = index.getBucketIndex("\u03B1");
923         assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
924         // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
925         bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
926         assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
927         bucketIndex = index.getBucketIndex("\uFFFF");
928         assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
929     }
930 
931     /**
932      * Test labels with multiple primary weights.
933      */
TestSchSt()934     public void TestSchSt() {
935         AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN);
936         index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]"));
937         // ... A Æ B-R S Sch St T-Z ...
938         ImmutableIndex immIndex = index.buildImmutableIndex();
939         assertEquals("getBucketCount()", 31, index.getBucketCount());
940         assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount());
941         String[][] testCases = new String[][] {
942             // name, bucket index, bucket label
943             { "Adelbert", "1", "A" },
944             { "Afrika", "1", "A" },
945             { "Æsculap", "2", "Æ" },
946             { "Aesthet", "2", "Æ" },
947             { "Berlin", "3", "B" },
948             { "Rilke", "19", "R" },
949             { "Sacher", "20", "S" },
950             { "Seiler", "20", "S" },
951             { "Sultan", "20", "S" },
952             { "Schiller", "21", "Sch" },
953             { "Steiff", "22", "St" },
954             { "Thomas", "23", "T" }
955         };
956         List<String> labels = index.getBucketLabels();
957         for (String[] testCase : testCases) {
958             String name = testCase[0];
959             int bucketIndex = Integer.valueOf(testCase[1]);
960             String label = testCase[2];
961             String msg = "getBucketIndex(" + name + ")";
962             assertEquals(msg, bucketIndex, index.getBucketIndex(name));
963             msg = "immutable " + msg;
964             assertEquals(msg, bucketIndex, immIndex.getBucketIndex(name));
965             msg = "bucket label (" + name + ")";
966             assertEquals(msg, label, labels.get(index.getBucketIndex(name)));
967             msg = "immutable " + msg;
968             assertEquals(msg, label, immIndex.getBucket(bucketIndex).getLabel());
969         }
970     }
971 
972     /**
973      * With no real labels, there should be only the underflow label.
974      */
TestNoLabels()975     public void TestNoLabels() {
976         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
977         AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(coll);
978         index.addRecord("\u897f", 0);
979         index.addRecord("i", 0);
980         index.addRecord("\u03B1", 0);
981         assertEquals("getBucketCount()", 1, index.getBucketCount());  // ...
982         Bucket<Integer> bucket = index.iterator().next();
983         assertEquals("underflow label type", LabelType.UNDERFLOW, bucket.getLabelType());
984         assertEquals("all records in the underflow bucket", 3, bucket.size());
985     }
986 
987     /**
988      * Test with the Bopomofo-phonetic tailoring.
989      */
TestChineseZhuyin()990     public void TestChineseZhuyin() {
991         AlphabeticIndex index = new AlphabeticIndex(ULocale.forLanguageTag("zh-u-co-zhuyin"));
992         ImmutableIndex immIndex = index.buildImmutableIndex();
993         assertEquals("getBucketCount()", 38, immIndex.getBucketCount());  // ... ㄅ ㄆ ㄇ ㄈ ㄉ -- ㄩ ...
994         assertEquals("label 1", "ㄅ", immIndex.getBucket(1).getLabel());
995         assertEquals("label 2", "ㄆ", immIndex.getBucket(2).getLabel());
996         assertEquals("label 3", "ㄇ", immIndex.getBucket(3).getLabel());
997         assertEquals("label 4", "ㄈ", immIndex.getBucket(4).getLabel());
998         assertEquals("label 5", "ㄉ", immIndex.getBucket(5).getLabel());
999     }
1000 
TestJapaneseKanji()1001     public void TestJapaneseKanji() {
1002         AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE);
1003         AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
1004         // There are no index characters for Kanji in the Japanese standard collator.
1005         // They should all go into the overflow bucket.
1006         final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };
1007         int overflowIndex = immIndex.getBucketCount() - 1;
1008         for(int i = 0; i < kanji.length; ++i) {
1009             String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]);
1010             assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i])));
1011         }
1012     }
1013 
TestFrozenCollator()1014     public void TestFrozenCollator() {
1015         // Ticket #9472
1016         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da"));
1017         coll.setStrength(Collator.IDENTICAL);
1018         coll.freeze();
1019         // The AlphabeticIndex constructor used to throw an exception
1020         // because it cloned the collator (which preserves frozenness)
1021         // and set the clone's strength to PRIMARY.
1022         AlphabeticIndex index = new AlphabeticIndex(coll);
1023         assertEquals("same strength as input Collator",
1024                 Collator.IDENTICAL, index.getCollator().getStrength());
1025     }
1026 
TestChineseUnihan()1027     public void TestChineseUnihan() {
1028         AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan"));
1029         index.setMaxLabelCount(500);  // ICU 54 default is 99.
1030         AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
1031         int bucketCount = immIndex.getBucketCount();
1032         if(bucketCount < 216) {
1033             // There should be at least an underflow and overflow label,
1034             // and one for each of 214 radicals,
1035             // and maybe additional labels for simplified radicals.
1036             // (ICU4C: dataerrln(), prints only a warning if the data is missing)
1037             errln("too few buckets/labels for Chinese/unihan: " + bucketCount +
1038                     " (is zh/unihan data available?)");
1039             return;
1040         } else {
1041             logln("Chinese/unihan has " + bucketCount + " buckets/labels");
1042         }
1043         // bucketIndex = radical number, adjusted for simplified radicals in lower buckets.
1044         int bucketIndex = index.getBucketIndex("\u4e5d");
1045         assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex);
1046         // radical 100, and there is a 90' since Unicode 8
1047         bucketIndex = index.getBucketIndex("\u7527");
1048         assertEquals("getBucketIndex(U+7527)", 101, bucketIndex);
1049     }
1050 }
1051