1 /* 2 ******************************************************************************* 3 * Copyright (C) 2008-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.dev.test.collator; 8 import java.util.ArrayList; 9 import java.util.Arrays; 10 import java.util.Collection; 11 import java.util.Iterator; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Locale; 15 import java.util.Set; 16 import java.util.TreeSet; 17 18 import com.ibm.icu.dev.test.TestFmwk; 19 import com.ibm.icu.dev.util.CollectionUtilities; 20 import com.ibm.icu.impl.ICUDebug; 21 import com.ibm.icu.impl.Row; 22 import com.ibm.icu.impl.Row.R4; 23 import com.ibm.icu.lang.UCharacter; 24 import com.ibm.icu.lang.UProperty; 25 import com.ibm.icu.lang.UScript; 26 import com.ibm.icu.text.AlphabeticIndex; 27 import com.ibm.icu.text.AlphabeticIndex.Bucket; 28 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType; 29 import com.ibm.icu.text.AlphabeticIndex.ImmutableIndex; 30 import com.ibm.icu.text.AlphabeticIndex.Record; 31 import com.ibm.icu.text.Collator; 32 import com.ibm.icu.text.Normalizer2; 33 import com.ibm.icu.text.RawCollationKey; 34 import com.ibm.icu.text.RuleBasedCollator; 35 import com.ibm.icu.text.UTF16; 36 import com.ibm.icu.text.UnicodeSet; 37 import com.ibm.icu.util.ULocale; 38 39 /** 40 * @author Mark Davis 41 */ 42 public class AlphabeticIndexTest extends TestFmwk { 43 /** 44 * 45 */ 46 private static final String ARROW = "\u2192"; 47 private static final boolean DEBUG = ICUDebug.enabled("alphabeticindex"); 48 49 public static Set<String> KEY_LOCALES = new LinkedHashSet(Arrays.asList( 50 "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", 51 "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", 52 "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr", 53 "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk", 54 "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta", 55 "te", "mr", "ur", "ml", "kn", "gu", "or")); 56 private String[][] localeAndIndexCharactersLists = new String[][] { 57 /* Arabic*/ {"ar", "\u0627:\u0628:\u062A:\u062B:\u062C:\u062D:\u062E:\u062F:\u0630:\u0631:\u0632:\u0633:\u0634:\u0635:\u0636:\u0637:\u0638:\u0639:\u063A:\u0641:\u0642:\u0643:\u0644:\u0645:\u0646:\u0647:\u0648:\u064A"}, 58 /* Bulgarian*/ {"bg", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 59 /* Catalan*/ {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 60 /* Czech*/ {"cs", "A:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\u0158:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 61 /* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 62 /* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 63 /* Greek*/ {"el", "\u0391:\u0392:\u0393:\u0394:\u0395:\u0396:\u0397:\u0398:\u0399:\u039A:\u039B:\u039C:\u039D:\u039E:\u039F:\u03A0:\u03A1:\u03A3:\u03A4:\u03A5:\u03A6:\u03A7:\u03A8:\u03A9"}, 64 /* English*/ {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 65 /* Spanish*/ {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 66 /* Estonian*/ {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:Z:\u017D:T:U:V:\u00D5:\u00C4:\u00D6:\u00DC:X:Y"}, 67 /* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 68 /* Finnish*/ {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 69 /* Filipino*/ {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 70 /* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 71 /* Hebrew*/ {"he", "\u05D0:\u05D1:\u05D2:\u05D3:\u05D4:\u05D5:\u05D6:\u05D7:\u05D8:\u05D9:\u05DB:\u05DC:\u05DE:\u05E0:\u05E1:\u05E2:\u05E4:\u05E6:\u05E7:\u05E8:\u05E9:\u05EA"}, 72 /* Icelandic*/ {"is", "A:\u00C1:B:C:D:\u00D0:E:\u00C9:F:G:H:I:\u00CD:J:K:L:M:N:O:\u00D3:P:Q:R:S:T:U:\u00DA:V:W:X:Y:\u00DD:Z:\u00DE:\u00C6:\u00D6"}, 73 /* Italian*/ {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 74 /* Japanese*/ {"ja", "\u3042:\u304B:\u3055:\u305F:\u306A:\u306F:\u307E:\u3084:\u3089:\u308F"}, 75 /* Korean*/ {"ko", "\u3131:\u3134:\u3137:\u3139:\u3141:\u3142:\u3145:\u3147:\u3148:\u314A:\u314B:\u314C:\u314D:\u314E"}, 76 /* Lithuanian*/ {"lt", "A:B:C:\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\u0160:T:U:V:Z:\u017D"}, 77 /* Latvian*/ {"lv", "A:B:C:\u010C:D:E:F:G:\u0122:H:I:J:K:\u0136:L:\u013B:M:N:\u0145:O:P:Q:R:S:\u0160:T:U:V:W:X:Z:\u017D"}, 78 /* Norwegian Bokm\u00E5l*/ {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 79 /* Dutch*/ {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 80 /* Polish*/ {"pl", "A:\u0104:B:C:\u0106:D:E:\u0118:F:G:H:I:J:K:L:\u0141:M:N:\u0143:O:\u00D3:P:Q:R:S:\u015A:T:U:V:W:X:Y:Z:\u0179:\u017B"}, 81 /* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 82 /* Romanian*/ {"ro", "A:\u0102:\u00C2:B:C:D:E:F:G:H:I:\u00CE:J:K:L:M:N:O:P:Q:R:S:\u0218:T:\u021A:U:V:W:X:Y:Z"}, 83 /* Russian*/ {"ru", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042B:\u042D:\u042E:\u042F"}, 84 /* Slovak*/ {"sk", "A:\u00C4:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\u00D4:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 85 /* Slovenian*/ {"sl", "A:B:C:\u010C:\u0106:D:\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 86 /* Serbian*/ {"sr", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0402:\u0415:\u0416:\u0417:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040B:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 87 /* Swedish*/ {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 88 /* Turkish*/ {"tr", "A:B:C:\u00C7:D:E:F:G:H:I:\u0130:J:K:L:M:N:O:\u00D6:P:Q:R:S:\u015E:T:U:\u00DC:V:W:X:Y:Z"}, 89 /* Ukrainian*/ {"uk", "\u0410:\u0411:\u0412:\u0413:\u0490:\u0414:\u0415:\u0404:\u0416:\u0417:\u0418:\u0406:\u0407:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 90 /* Vietnamese*/ {"vi", "A:\u0102:\u00C2:B:C:D:\u0110:E:\u00CA:F:G:H:I:J:K:L:M:N:O:\u00D4:\u01A0:P:Q:R:S:T:U:\u01AF:V:W:X:Y:Z"}, 91 /* Chinese*/ {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 92 /* Chinese (Traditional Han)*/ {"zh_Hant", "1\u5283:2\u5283:3\u5283:4\u5283:5\u5283:6\u5283:7\u5283:8\u5283:9\u5283:10\u5283:11\u5283:12\u5283:13\u5283:14\u5283:15\u5283:16\u5283:17\u5283:18\u5283:19\u5283:20\u5283:21\u5283:22\u5283:23\u5283:24\u5283:25\u5283:26\u5283:27\u5283:28\u5283:29\u5283:30\u5283:31\u5283:32\u5283:33\u5283:35\u5283:36\u5283:39\u5283:48\u5283"}, 93 94 // Comment these out to make the test run faster. Later, make these run under extended 95 96 // /* Afrikaans*/ {"af", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 97 // /* Akan*/ {"ak", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:O:\u0186:P:Q:R:S:T:U:V:W:X:Y:Z"}, 98 // /* Asu*/ {"asa", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 99 // /* Azerbaijani*/ {"az", "A:B:C:\u00C7:D:E:\u018F:F:G:\u011E:H:X:I:\u0130:J:K:Q:L:M:N:O:\u00D6:P:R:S:\u015E:T:U:\u00DC:V:W:Y:Z"}, 100 // /* Belarusian*/ {"be", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0406:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u042B:\u042D:\u042E:\u042F"}, 101 // /* Bemba*/ {"bem", "A:B:C:E:F:G:I:J:K:L:M:N:O:P:S:T:U:W:Y"}, 102 // /* Bena*/ {"bez", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:Y:Z"}, 103 // /* Bambara*/ {"bm", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:\u0186:P:R:S:T:U:W:Y:Z"}, 104 // /* Tibetan*/ {"bo", "\u0F40:\u0F41:\u0F42:\u0F44:\u0F45:\u0F46:\u0F47:\u0F49:\u0F4F:\u0F50:\u0F51:\u0F53:\u0F54:\u0F55:\u0F56:\u0F58:\u0F59:\u0F5A:\u0F5B:\u0F5D:\u0F5E:\u0F5F:\u0F60:\u0F61:\u0F62:\u0F63:\u0F64:\u0F66:\u0F67:\u0F68"}, 105 // /* Chiga*/ {"cgg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 106 // /* Cherokee*/ {"chr", "\u13A0:\u13A6:\u13AD:\u13B3:\u13B9:\u13BE:\u13C6:\u13CC:\u13D3:\u13DC:\u13E3:\u13E9:\u13EF"}, 107 // /* Welsh*/ {"cy", "A:B:C:CH:D:E:F:FF:G:H:I:J:L:LL:M:N:O:P:PH:R:RH:S:T:TH:U:W:Y"}, 108 // /* Taita*/ {"dav", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 109 // /* Embu*/ {"ebu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 110 // /* Ewe*/ {"ee", "A:B:C:D:\u0189:E:\u0190:F:\u0191:G:\u0194:H:I:J:K:L:M:N:\u014A:O:\u0186:P:Q:R:S:T:U:V:\u01B2:W:X:Y:Z"}, 111 // /* Esperanto*/ {"eo", "A:B:C:\u0108:D:E:F:G:\u011C:H:\u0124:I:J:\u0134:K:L:M:N:O:P:R:S:\u015C:T:U:\u016C:V:Z"}, 112 // /* Fulah*/ {"ff", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:L:M:N:\u014A:O:P:R:S:T:U:W:Y:\u01B3"}, 113 // /* Faroese*/ {"fo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8"}, 114 // /* Gusii*/ {"guz", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 115 // /* Hausa*/ {"ha", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:\u0198:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 116 // /* Igbo*/ {"ig", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 117 // /* Machame*/ {"jmc", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 118 // /* Kabyle*/ {"kab", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:P:Q:R:S:T:U:W:X:Y:Z"}, 119 // /* Kamba*/ {"kam", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 120 // /* Makonde*/ {"kde", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 121 // /* Kabuverdianu*/ {"kea", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:X:Z"}, 122 // /* Koyra Chiini*/ {"khq", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 123 // /* Kikuyu*/ {"ki", "A:B:C:D:E:G:H:I:J:K:M:N:O:R:T:U:W:Y"}, 124 // /* Kalenjin*/ {"kln", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:W:Y"}, 125 // /* Langi*/ {"lag", "A:B:C:D:E:F:G:H:I:\u0197:J:K:L:M:N:O:P:Q:R:S:T:U:\u0244:V:W:X:Y:Z"}, 126 // /* Ganda*/ {"lg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 127 // /* Luo*/ {"luo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 128 // /* Luyia*/ {"luy", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 129 // /* Masai*/ {"mas", "A:B:C:D:E:\u0190:G:H:I:\u0197:J:K:L:M:N:\u014A:O:\u0186:P:R:S:T:U:\u0244:W:Y"}, 130 // /* Meru*/ {"mer", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 131 // /* Morisyen*/ {"mfe", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y:Z"}, 132 // /* Malagasy*/ {"mg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:V:Y:Z"}, 133 // This should be the correct data. Commented till it is fixed in CLDR collation data. 134 // {"mk", "\u0410:\u0411:\u0412:\u0413:\u0403:\u0414:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u040C:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 135 // /* Macedonian*/ {"mk", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0403:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040C:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 136 // This should be the correct data. Commented till it is fixed in CLDR collation data. 137 // {"mt", "A:B:C:\u010A:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 138 // /* Maltese*/ {"mt", "A:B:\u010A:C:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 139 // /* Nama*/ {"naq", "A:B:C:D:E:F:G:H:I:K:M:N:O:P:Q:R:S:T:U:W:X:Y:Z"}, 140 // /* North Ndebele*/ {"nd", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:S:T:U:V:W:X:Y:Z"}, 141 // /* Norwegian Nynorsk*/ {"nn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 142 // /* Nyankole*/ {"nyn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 143 // /* Oromo*/ {"om", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 144 // /* Romansh*/ {"rm", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 145 // /* Rombo*/ {"rof", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 146 // /* Kinyarwanda*/ {"rw", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 147 // /* Rwa*/ {"rwk", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 148 // /* Samburu*/ {"saq", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 149 // /* Sena*/ {"seh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 150 // /* Koyraboro Senni*/ {"ses", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 151 // /* Sango*/ {"sg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 152 // /* Tachelhit*/ {"shi", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 153 // /* Tachelhit (Tifinagh)*/ {"shi_Tfng", "\u2D30:\u2D31:\u2D33:\u2D37:\u2D39:\u2D3B:\u2D3C:\u2D3D:\u2D40:\u2D43:\u2D44:\u2D45:\u2D47:\u2D49:\u2D4A:\u2D4D:\u2D4E:\u2D4F:\u2D53:\u2D54:\u2D55:\u2D56:\u2D59:\u2D5A:\u2D5B:\u2D5C:\u2D5F:\u2D61:\u2D62:\u2D63:\u2D65"}, 154 // /* Shona*/ {"sn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 155 // /* Teso*/ {"teo", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y"}, 156 // /* Tonga*/ {"to", "A:B:C:D:E:F:G:H:\u02BB:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 157 // /* Central Morocco Tamazight*/ {"tzm", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 158 // /* Uzbek (Latin)*/ {"uz_Latn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u02BF"}, 159 // /* Vunjo*/ {"vun", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 160 // /* Soga*/ {"xog", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 161 // /* Yoruba*/ {"yo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 162 163 }; main(String[] args)164 public static void main(String[] args) throws Exception{ 165 new AlphabeticIndexTest().run(args); 166 } 167 168 // public void TestAAKeyword() { 169 // ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( 170 // ICUResourceBundle.ICU_COLLATION_BASE_NAME, "zh"); 171 // showBundle(rb, 0); 172 // String[] keywords = Collator.getKeywords(); 173 // System.out.println(Arrays.asList(keywords)); 174 // String locale = "zh"; 175 // ULocale ulocale = new ULocale(locale); 176 // for (String keyword : keywords) { 177 // List<String> values = Arrays.asList(Collator.getKeywordValuesForLocale(keyword, ulocale, false)); 178 // List<String> allValues = Arrays.asList(Collator.getKeywordValues(keyword)); 179 // for (String value : allValues) { 180 // System.out.println(keyword + "=" + value); 181 // checkKeyword(locale, value, values.contains(value)); 182 // } 183 // } 184 // } 185 // 186 // private void checkKeyword(String locale, String collationValue, boolean shouldExist) { 187 // final ULocale base = new ULocale(locale); 188 // final ULocale desired = new ULocale(locale + "@collation=" + collationValue); 189 // Collator foo = Collator.getInstance(desired); 190 // ULocale actual = foo.getLocale(ULocale.ACTUAL_LOCALE); 191 // if (shouldExist) { 192 // assertEquals("actual should match desired", desired, actual); 193 // } else { 194 // assertEquals("actual should match base", base, actual); 195 // } 196 // int comp = foo.compare("a", "ā"); 197 // assertEquals("should fall back to default for zh", -1, comp); 198 // } 199 // 200 // /** 201 // * @param rb 202 // * @param i 203 // */ 204 // private static void showBundle(UResourceBundle rb, int i) { 205 // for (String key : rb.keySet()) { 206 // System.out.print("\n" + Utility.repeat(" ", i) + key); 207 // UResourceBundle rb2 = rb.get(key); 208 // showBundle(rb2, i+1); 209 // } 210 // } 211 212 TestA()213 public void TestA() { 214 String[][] tests = {{"zh_Hant", "渡辺", "12劃"}, 215 {"zh", "渡辺", "D"} 216 /*, "zh@collation=unihan", "ja@collation=unihan", "ko@collation=unihan"*/ 217 }; 218 for (String[] test : tests) { 219 AlphabeticIndex<Integer> alphabeticIndex = new AlphabeticIndex<Integer>(new ULocale(test[0])); 220 final String probe = test[1]; 221 final String expectedLabel = test[2]; 222 alphabeticIndex.addRecord(probe, 1); 223 List labels = alphabeticIndex.getBucketLabels(); 224 logln(labels.toString()); 225 Bucket<Integer> bucket = find(alphabeticIndex, probe); 226 assertEquals("locale " + test[0] + " name=" + probe + " in bucket", 227 expectedLabel, bucket.getLabel()); 228 } 229 } 230 find(AlphabeticIndex<Integer> alphabeticIndex, final String probe)231 private Bucket<Integer> find(AlphabeticIndex<Integer> alphabeticIndex, final String probe) { 232 for (Bucket<Integer> bucket : alphabeticIndex) { 233 for (Record<Integer> record : bucket) { 234 if (record.getName().equals(probe)) { 235 return bucket; 236 } 237 } 238 } 239 return null; 240 } 241 TestFirstCharacters()242 public void TestFirstCharacters() { 243 244 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH); 245 RuleBasedCollator collator = alphabeticIndex.getCollator(); 246 collator.setStrength(Collator.IDENTICAL); 247 Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts(); 248 // Verify that each script is represented exactly once. 249 // Exclude pseudo-scripts like Common (no letters). 250 // Exclude scripts like Braille and Sutton SignWriting 251 // because they only have symbols, not letters. 252 UnicodeSet missingScripts = new UnicodeSet( 253 "[^[:inherited:][:unknown:][:common:][:Braille:][:SignWriting:]]"); 254 String last = ""; 255 for (String index : firsts) { 256 if (collator.compare(last,index) >= 0) { 257 errln("Characters not in order: " + last + " !< " + index); 258 } 259 int script = getFirstRealScript(index); 260 if (script == UScript.UNKNOWN) { continue; } 261 UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script); 262 if (missingScripts.containsNone(s)) { 263 errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false)); 264 } 265 missingScripts.removeAll(s); 266 } 267 if (missingScripts.size() != 0) { 268 String missingScriptNames = ""; 269 UnicodeSet missingChars = new UnicodeSet(missingScripts); 270 for(;;) { 271 int c = missingChars.charAt(0); 272 if (c < 0) { 273 break; 274 } 275 int script = UScript.getScript(c); 276 missingScriptNames += " " + 277 UCharacter.getPropertyValueName( 278 UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); 279 missingChars.removeAll(new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script)); 280 } 281 errln("Missing character from:" + missingScriptNames + " -- " + missingScripts); 282 } 283 } 284 getFirstRealScript(CharSequence s)285 private static final int getFirstRealScript(CharSequence s) { 286 for (int i = 0; i < s.length();) { 287 int c = Character.codePointAt(s, i); 288 int script = UScript.getScript(c); 289 if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) { 290 return script; 291 } 292 i += Character.charCount(c); 293 } 294 return UScript.UNKNOWN; 295 } 296 TestBuckets()297 public void TestBuckets() { 298 ULocale additionalLocale = ULocale.ENGLISH; 299 300 for (String[] pair : localeAndIndexCharactersLists) { 301 checkBuckets(pair[0], SimpleTests, additionalLocale, "E", "edgar", "Effron", "Effron"); 302 } 303 } 304 TestEmpty()305 public void TestEmpty() { 306 // just verify that it doesn't blow up. 307 Set<ULocale> locales = new LinkedHashSet<ULocale>(); 308 locales.add(ULocale.ROOT); 309 locales.addAll(Arrays.asList(ULocale.getAvailableLocales())); 310 for (ULocale locale : locales) { 311 try { 312 AlphabeticIndex<String> alphabeticIndex = new AlphabeticIndex(locale); 313 alphabeticIndex.addRecord("hi", "HI"); 314 for (Bucket<String> bucket : alphabeticIndex) { 315 @SuppressWarnings("unused") 316 LabelType labelType = bucket.getLabelType(); 317 } 318 } catch (Exception e) { 319 errln("Exception when creating AlphabeticIndex for:\t" + locale.toLanguageTag()); 320 errln(e.toString()); 321 } 322 } 323 } 324 TestInflow()325 public void TestInflow() { 326 Object[][] tests = { 327 {0, ULocale.ENGLISH}, 328 {0, ULocale.ENGLISH, new ULocale("el")}, 329 {1, ULocale.ENGLISH, new ULocale("ru")}, 330 {0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru")}, 331 {0, ULocale.ENGLISH}, 332 {2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE}, 333 }; 334 for (Object[] test : tests) { 335 int expected = (Integer) test[0]; 336 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex((ULocale)test[1]); 337 for (int i = 2; i < test.length; ++i) { 338 if (test[i] instanceof ULocale) { 339 alphabeticIndex.addLabels((ULocale)test[i]); 340 } else { 341 alphabeticIndex.addLabels((UnicodeSet)test[i]); 342 } 343 } 344 Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter(); 345 for (Bucket<Double> bucket : alphabeticIndex) { 346 LabelType labelType = bucket.getLabelType(); 347 counter.add(labelType, 1); 348 } 349 String printList = Arrays.asList(test).toString(); 350 assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW)); 351 assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW)); 352 if (expected != counter.get(LabelType.INFLOW)) { 353 // for debugging 354 AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex((ULocale)test[1]); 355 for (int i = 2; i < test.length; ++i) { 356 if (test[i] instanceof ULocale) { 357 indexCharacters2.addLabels((ULocale)test[i]); 358 } else { 359 indexCharacters2.addLabels((UnicodeSet)test[i]); 360 } 361 } 362 List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>()); 363 logln(buckets.toString()); 364 } 365 assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW)); 366 } 367 } 368 checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items)369 private void checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items) { 370 StringBuilder UI = new StringBuilder(); 371 ULocale desiredLocale = new ULocale(localeString); 372 373 // Create a simple index where the values for the strings are Integers, and add the strings 374 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale); 375 int counter = 0; 376 Counter<String> itemCount = new Counter(); 377 for (String item : test) { 378 index.addRecord(item, counter++); 379 itemCount.add(item, 1); 380 } 381 382 List<String> labels = index.getBucketLabels(); 383 ImmutableIndex<Integer> immIndex = index.buildImmutableIndex(); 384 385 logln(desiredLocale + "\t" + desiredLocale.getDisplayName(ULocale.ENGLISH) + " - " + desiredLocale.getDisplayName(desiredLocale) + "\t" 386 + index.getCollator().getLocale(ULocale.ACTUAL_LOCALE)); 387 UI.setLength(0); 388 UI.append(desiredLocale + "\t"); 389 boolean showAll = true; 390 391 // Show index at top. We could skip or gray out empty buckets 392 for (AlphabeticIndex.Bucket<Integer> bucket : index) { 393 if (showAll || bucket.size() != 0) { 394 showLabelAtTop(UI, bucket.getLabel()); 395 } 396 } 397 logln(UI.toString()); 398 399 // Show the buckets with their contents, skipping empty buckets 400 int bucketIndex = 0; 401 for (Bucket<Integer> bucket : index) { 402 assertEquals("bucket label vs. iterator", 403 labels.get(bucketIndex), bucket.getLabel()); 404 assertEquals("bucket label vs. immutable", 405 labels.get(bucketIndex), immIndex.getBucket(bucketIndex).getLabel()); 406 assertEquals("bucket label type vs. immutable", 407 bucket.getLabelType(), immIndex.getBucket(bucketIndex).getLabelType()); 408 for (Record<Integer> r : bucket) { 409 CharSequence name = r.getName(); 410 assertEquals("getBucketIndex(" + name + ")", 411 bucketIndex, index.getBucketIndex(name)); 412 assertEquals("immutable getBucketIndex(" + name + ")", 413 bucketIndex, immIndex.getBucketIndex(name)); 414 } 415 if (bucket.getLabel().equals(testBucket)) { 416 Counter<String> keys = getKeys(bucket); 417 for (String item : items) { 418 long globalCount = itemCount.get(item); 419 long localeCount = keys.get(item); 420 if (globalCount != localeCount) { 421 errln("Error: in " + "'" + testBucket + "', '" + item + "' should have count " 422 + globalCount + " but has count " + localeCount); 423 } 424 425 } 426 } 427 428 if (bucket.size() != 0) { 429 showLabelInList(UI, bucket.getLabel()); 430 for (AlphabeticIndex.Record<Integer> item : bucket) { 431 showIndexedItem(UI, item.getName(), item.getData()); 432 } 433 logln(UI.toString()); 434 } 435 ++bucketIndex; 436 } 437 assertEquals("getBucketCount()", bucketIndex, index.getBucketCount()); 438 assertEquals("immutable getBucketCount()", bucketIndex, immIndex.getBucketCount()); 439 440 assertNull("immutable getBucket(-1)", immIndex.getBucket(-1)); 441 assertNull("immutable getBucket(count)", immIndex.getBucket(bucketIndex)); 442 443 for (Bucket<Integer> bucket : immIndex) { 444 assertEquals("immutable bucket size", 0, bucket.size()); 445 assertFalse("immutable bucket iterator.hasNext()", bucket.iterator().hasNext()); 446 } 447 } 448 showIndex(AlphabeticIndex<T> index, boolean showEmpty)449 public <T> void showIndex(AlphabeticIndex<T> index, boolean showEmpty) { 450 logln("Actual"); 451 StringBuilder UI = new StringBuilder(); 452 for (Bucket<T> bucket : index) { 453 if (showEmpty || bucket.size() != 0) { 454 showLabelInList(UI, bucket.getLabel()); 455 for (Record<T> item : bucket) { 456 showIndexedItem(UI, item.getName(), item.getData()); 457 } 458 logln(UI.toString()); 459 } 460 } 461 } 462 463 /** 464 * @param myBucketLabels 465 * @param myBucketContents 466 * @param b 467 */ showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty)468 private void showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty) { 469 logln("Alternative"); 470 StringBuilder UI = new StringBuilder(); 471 472 for (int i = 0; i < myBucketLabels.size(); ++i) { 473 Set<R4<RawCollationKey, String, Integer, Double>> bucket = myBucketContents.get(i); 474 if (!showEmpty && bucket.size() == 0) { 475 continue; 476 } 477 UI.setLength(0); 478 UI.append("*").append(myBucketLabels.get(i)); 479 for (R4<RawCollationKey, String, Integer, Double> item : bucket) { 480 UI.append("\t ").append(item.get1().toString()).append(ARROW).append(item.get3().toString()); 481 } 482 logln(UI.toString()); 483 } 484 } 485 showLabelAtTop(StringBuilder buffer, String label)486 private void showLabelAtTop(StringBuilder buffer, String label) { 487 buffer.append(label + " "); 488 } 489 showIndexedItem(StringBuilder buffer, CharSequence key, T value)490 private <T> void showIndexedItem(StringBuilder buffer, CharSequence key, T value) { 491 buffer.append("\t " + key + ARROW + value); 492 } 493 showLabelInList(StringBuilder buffer, String label)494 private void showLabelInList(StringBuilder buffer, String label) { 495 buffer.setLength(0); 496 buffer.append(label); 497 } 498 getKeys(AlphabeticIndex.Bucket<Integer> entry)499 private Counter<String> getKeys(AlphabeticIndex.Bucket<Integer> entry) { 500 Counter<String> keys = new Counter<String>(); 501 for (AlphabeticIndex.Record x : entry) { 502 String key = x.getName().toString(); 503 keys.add(key, 1); 504 } 505 return keys; 506 } 507 TestIndexCharactersList()508 public void TestIndexCharactersList() { 509 for (String[] localeAndIndexCharacters : localeAndIndexCharactersLists) { 510 ULocale locale = new ULocale(localeAndIndexCharacters[0]); 511 String expectedIndexCharacters = "\u2026:" + localeAndIndexCharacters[1] + ":\u2026"; 512 Collection<String> alphabeticIndex = new AlphabeticIndex(locale).getBucketLabels(); 513 514 // Join the elements of the list to a string with delimiter ":" 515 StringBuilder sb = new StringBuilder(); 516 Iterator<String> iter = alphabeticIndex.iterator(); 517 while (iter.hasNext()) { 518 sb.append(iter.next()); 519 if (!iter.hasNext()) { 520 break; 521 } 522 sb.append(":"); 523 } 524 String actualIndexCharacters = sb.toString(); 525 if (!expectedIndexCharacters.equals(actualIndexCharacters)) { 526 errln("Test failed for locale " + localeAndIndexCharacters[0] + 527 "\n Expected = |" + expectedIndexCharacters + "|\n actual = |" + actualIndexCharacters + "|"); 528 } 529 } 530 } 531 TestBasics()532 public void TestBasics() { 533 ULocale[] list = ULocale.getAvailableLocales(); 534 // get keywords combinations 535 // don't bother with multiple combinations at this point 536 List keywords = new ArrayList(); 537 keywords.add(""); 538 539 String[] collationValues = Collator.getKeywordValues("collation"); 540 for (int j = 0; j < collationValues.length; ++j) { 541 keywords.add("@collation=" + collationValues[j]); 542 } 543 544 for (int i = 0; i < list.length; ++i) { 545 for (Iterator it = keywords.iterator(); it.hasNext();) { 546 String collationValue = (String) it.next(); 547 String localeString = list[i].toString(); 548 if (!KEY_LOCALES.contains(localeString)) continue; // TODO change in exhaustive 549 ULocale locale = new ULocale(localeString + collationValue); 550 if (collationValue.length() > 0 && !Collator.getFunctionalEquivalent("collation", locale).equals(locale)) { 551 //logln("Skipping " + locale); 552 continue; 553 } 554 555 if (locale.getCountry().length() != 0) { 556 continue; 557 } 558 boolean isUnihan = collationValue.contains("unihan"); 559 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale); 560 if (isUnihan) { 561 // Unihan tailorings have a label per radical, and there are at least 214, 562 // if not more when simplified radicals are distinguished. 563 alphabeticIndex.setMaxLabelCount(500); 564 } 565 final Collection mainChars = alphabeticIndex.getBucketLabels(); 566 String mainCharString = mainChars.toString(); 567 if (mainCharString.length() > 500) { 568 mainCharString = mainCharString.substring(0,500) + "..."; 569 } 570 logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH)); 571 logln("Index:\t" + mainCharString); 572 if (!isUnihan && mainChars.size() > 100) { 573 errln("Index character set too large: " + 574 locale + " [" + mainChars.size() + "]:\n " + mainChars); 575 } 576 } 577 } 578 } 579 TestClientSupport()580 public void TestClientSupport() { 581 for (String localeString : new String[] {"zh"}) { // KEY_LOCALES, new String[] {"zh"} 582 ULocale ulocale = new ULocale(localeString); 583 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>(ulocale).addLabels(ULocale.ENGLISH); 584 RuleBasedCollator collator = alphabeticIndex.getCollator(); 585 String [][] tests; 586 587 if (!localeString.equals("zh") ) { 588 tests = new String[][] {SimpleTests}; 589 } else { 590 tests = new String[][] {SimpleTests, hackPinyin, simplifiedNames}; 591 } 592 593 for (String [] shortTest : tests) { 594 double testValue = 100; 595 alphabeticIndex.clearRecords(); 596 for (String name : shortTest) { 597 alphabeticIndex.addRecord(name, testValue++); 598 } 599 600 if (DEBUG) showIndex(alphabeticIndex, false); 601 602 // make my own copy 603 testValue = 100; 604 List<String> myBucketLabels = alphabeticIndex.getBucketLabels(); 605 ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents = new ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>>(myBucketLabels.size()); 606 for (int i = 0; i < myBucketLabels.size(); ++i) { 607 myBucketContents.add(new TreeSet<R4<RawCollationKey, String, Integer, Double>>()); 608 } 609 for (String name : shortTest) { 610 int bucketIndex = alphabeticIndex.getBucketIndex(name); 611 if (bucketIndex > myBucketContents.size()) { 612 alphabeticIndex.getBucketIndex(name); // call again for debugging 613 } 614 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(bucketIndex); 615 RawCollationKey rawCollationKey = collator.getRawCollationKey(name, null); 616 R4<RawCollationKey, String, Integer, Double> row = Row.of(rawCollationKey, name, name.length(), testValue++); 617 myBucket.add(row); 618 } 619 if (DEBUG) showIndex(myBucketLabels, myBucketContents, false); 620 621 // now compare 622 int index = 0; 623 boolean gotError = false; 624 for (AlphabeticIndex.Bucket<Double> bucket : alphabeticIndex) { 625 String bucketLabel = bucket.getLabel(); 626 String myLabel = myBucketLabels.get(index); 627 if (!bucketLabel.equals(myLabel)) { 628 gotError |= !assertEquals(ulocale + "\tBucket Labels (" + index + ")", bucketLabel, myLabel); 629 } 630 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(index); 631 Iterator<R4<RawCollationKey, String, Integer, Double>> myBucketIterator = myBucket.iterator(); 632 int recordIndex = 0; 633 for (Record<Double> record : bucket) { 634 String myName = null; 635 if (myBucketIterator.hasNext()) { 636 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 637 myName = (String) myRecord.get1(); 638 } 639 if (!record.getName().equals(myName)) { 640 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", record.getName(), myName); 641 } 642 } 643 while (myBucketIterator.hasNext()) { 644 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 645 String myName = (String) myRecord.get1(); 646 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", null, myName); 647 } 648 index++; 649 } 650 if (gotError) { 651 showIndex(myBucketLabels, myBucketContents, false); 652 showIndex(alphabeticIndex, false); 653 } 654 } 655 } 656 } 657 TestFirstScriptCharacters()658 public void TestFirstScriptCharacters() { 659 Collection<String> firstCharacters = 660 new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts(); 661 Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT)); 662 Collection<String> diff = new TreeSet<String>(firstCharacters); 663 diff.removeAll(expectedFirstCharacters); 664 assertTrue("First Characters contains unexpected ones: " + diff, diff.isEmpty()); 665 diff.clear(); 666 diff.addAll(expectedFirstCharacters); 667 diff.removeAll(firstCharacters); 668 assertTrue("First Characters missing expected ones: " + diff, diff.isEmpty()); 669 } 670 671 private static final UnicodeSet TO_TRY = new UnicodeSet("[[:^nfcqc=no:]-[:sc=Common:]-[:sc=Inherited:]-[:sc=Unknown:]]").freeze(); 672 673 /** 674 * Returns a collection of all the "First" characters of scripts, according to the collation. 675 */ firstStringsInScript(RuleBasedCollator ruleBasedCollator)676 private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) { 677 String[] results = new String[UScript.CODE_LIMIT]; 678 for (String current : TO_TRY) { 679 if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols. 680 continue; 681 } 682 int script = UScript.getScript(current.codePointAt(0)); 683 if (results[script] == null) { 684 results[script] = current; 685 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 686 results[script] = current; 687 } 688 } 689 690 try { 691 UnicodeSet extras = new UnicodeSet(); 692 UnicodeSet expansions = new UnicodeSet(); 693 ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true); 694 extras.addAll(expansions).removeAll(TO_TRY); 695 if (extras.size() != 0) { 696 Normalizer2 normalizer = Normalizer2.getNFKCInstance(); 697 for (String current : extras) { 698 if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) { 699 continue; 700 } 701 int script = getFirstRealScript(current); 702 if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; } 703 if (results[script] == null) { 704 results[script] = current; 705 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 706 results[script] = current; 707 } 708 } 709 } 710 } catch (Exception e) { 711 } // why have a checked exception??? 712 713 // TODO: We should not test that we get the same strings, but that we 714 // get strings that sort primary-equal to those from the implementation. 715 716 Collection<String> result = new ArrayList<String>(); 717 for (int i = 0; i < results.length; ++i) { 718 if (results[i] != null) { 719 result.add(results[i]); 720 } 721 } 722 return result; 723 } 724 isUnassignedBoundary(CharSequence s)725 private static final boolean isUnassignedBoundary(CharSequence s) { 726 // The root collator provides a script-first-primary boundary contraction 727 // for the unassigned-implicit range. 728 return s.charAt(0) == 0xfdd1 && 729 UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN; 730 } 731 TestZZZ()732 public void TestZZZ() { 733 // int x = 3; 734 // AlphabeticIndex index = new AlphabeticIndex(ULocale.ENGLISH); 735 // UnicodeSet additions = new UnicodeSet(); 736 // additions.add(0x410).add(0x415); // Cyrillic 737 // // additions.add(0x391).add(0x393); // Greek 738 // index.addLabels(additions); 739 // int lc = index.getLabels().size(); 740 // List labels = index.getLabels(); 741 // System.out.println("Label Count = " + lc + "\t" + labels); 742 // System.out.println("Bucket Count =" + index.getBucketCount()); 743 } 744 TestSimplified()745 public void TestSimplified() { 746 checkBuckets("zh", simplifiedNames, ULocale.ENGLISH, "W", "\u897f"); 747 } TestTraditional()748 public void TestTraditional() { 749 checkBuckets("zh_Hant", traditionalNames, ULocale.ENGLISH, "\u4e9f", "\u5357\u9580"); 750 } 751 752 static final String[] SimpleTests = { 753 "斎藤", 754 "\u1f2d\u03c1\u03b1", 755 "$", "\u00a3", "12", "2", 756 "Davis", "Davis", "Abbot", "\u1D05avis", "Zach", "\u1D05avis", "\u01b5", "\u0130stanbul", "Istanbul", "istanbul", "\u0131stanbul", 757 "\u00deor", "\u00c5berg", "\u00d6stlund", 758 "\u1f2d\u03c1\u03b1", "\u1f08\u03b8\u03b7\u03bd\u1fb6", 759 "\u0396\u03b5\u03cd\u03c2", "\u03a0\u03bf\u03c3\u03b5\u03b9\u03b4\u1f63\u03bd", "\u1f0d\u03b9\u03b4\u03b7\u03c2", "\u0394\u03b7\u03bc\u03ae\u03c4\u03b7\u03c1", "\u1f19\u03c3\u03c4\u03b9\u03ac", 760 //"\u1f08\u03c0\u03cc\u03bb\u03bb\u03c9\u03bd", "\u1f0c\u03c1\u03c4\u03b5\u03bc\u03b9\u03c2", "\u1f19\u03c1\u03bc\u1f23\u03c2", "\u1f0c\u03c1\u03b7\u03c2", "\u1f08\u03c6\u03c1\u03bf\u03b4\u03af\u03c4\u03b7", "\u1f2d\u03c6\u03b1\u03b9\u03c3\u03c4\u03bf\u03c2", "\u0394\u03b9\u03cc\u03bd\u03c5\u03c3\u03bf\u03c2", 761 "\u6589\u85e4", "\u4f50\u85e4", "\u9234\u6728", "\u9ad8\u6a4b", "\u7530\u4e2d", "\u6e21\u8fba", "\u4f0a\u85e4", "\u5c71\u672c", "\u4e2d\u6751", "\u5c0f\u6797", "\u658e\u85e4", "\u52a0\u85e4", 762 //"\u5409\u7530", "\u5c71\u7530", "\u4f50\u3005\u6728", "\u5c71\u53e3", "\u677e\u672c", "\u4e95\u4e0a", "\u6728\u6751", "\u6797", "\u6e05\u6c34" 763 }; 764 765 static final String[] hackPinyin = { 766 "a", "\u5416", "\u58ba", // 767 "b", "\u516b", "\u62d4", "\u8500", // 768 "c", "\u5693", "\u7938", "\u9e7e", // 769 "d", "\u5491", "\u8fcf", "\u964a", // 770 "e","\u59b8", "\u92e8", "\u834b", // 771 "f", "\u53d1", "\u9197", "\u99a5", // 772 "g", "\u7324", "\u91d3", "\u8142", // 773 "h", "\u598e", "\u927f", "\u593b", // 774 "j", "\u4e0c", "\u6785", "\u9d58", // 775 "k", "\u5494", "\u958b", "\u7a52", // 776 "l", "\u5783", "\u62c9", "\u9ba5", // 777 "m", "\u5638", "\u9ebb", "\u65c0", // 778 "n", "\u62ff", "\u80ad", "\u685b", // 779 "o", "\u5662", "\u6bee", "\u8bb4", // 780 "p", "\u5991", "\u8019", "\u8c31", // 781 "q", "\u4e03", "\u6053", "\u7f56", // 782 "r", "\u5465", "\u72aa", "\u6e03", // 783 "s", "\u4ee8", "\u9491", "\u93c1", // 784 "t", "\u4ed6", "\u9248", "\u67dd", // 785 "w", "\u5c72", "\u5558", "\u5a7a", // 786 "x", "\u5915", "\u5438", "\u6bbe", // 787 "y", "\u4e2b", "\u82bd", "\u8574", // 788 "z", "\u5e00", "\u707d", "\u5c0a" 789 }; 790 791 static final String[] simplifiedNames = { 792 "Abbot", "Morton", "Zachary", "Williams", "\u8d75", "\u94b1", "\u5b59", "\u674e", "\u5468", "\u5434", "\u90d1", "\u738b", "\u51af", "\u9648", "\u696e", "\u536b", "\u848b", "\u6c88", 793 "\u97e9", "\u6768", "\u6731", "\u79e6", "\u5c24", "\u8bb8", "\u4f55", "\u5415", "\u65bd", "\u5f20", "\u5b54", "\u66f9", "\u4e25", "\u534e", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", "\u8c22", "\u90b9", 794 "\u55bb", "\u67cf", "\u6c34", "\u7aa6", "\u7ae0", "\u4e91", "\u82cf", "\u6f58", "\u845b", "\u595a", "\u8303", "\u5f6d", "\u90ce", "\u9c81", "\u97e6", "\u660c", "\u9a6c", "\u82d7", "\u51e4", "\u82b1", "\u65b9", 795 "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9c8d", "\u53f2", "\u5510", "\u8d39", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8d3a", "\u502a", "\u6c64", "\u6ed5", "\u6bb7", "\u7f57", "\u6bd5", "\u90dd", 796 "\u90ac", "\u5b89", "\u5e38", "\u4e50", "\u4e8e", "\u65f6", "\u5085", "\u76ae", "\u535e", "\u9f50", "\u5eb7", "\u4f0d", "\u4f59", "\u5143", "\u535c", "\u987e", "\u5b5f", "\u5e73", "\u9ec4", "\u548c", "\u7a46", 797 "\u8427", "\u5c39", "\u59da", "\u90b5", "\u6e5b", "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8d1d", "\u660e", "\u81e7", "\u8ba1", "\u4f0f", "\u6210", "\u6234", "\u8c08", "\u5b8b", "\u8305", 798 "\u5e9e", "\u718a", "\u7eaa", "\u8212", "\u5c48", "\u9879", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u84dd", "\u95fd", "\u5e2d", "\u5b63", "\u9ebb", "\u5f3a", "\u8d3e", "\u8def", "\u5a04", "\u5371", 799 "\u6c5f", "\u7ae5", "\u989c", "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u953a", "\u5f90", "\u4e18", "\u9a86", "\u9ad8", "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", 800 "\u4e07", "\u652f", "\u67ef", "\u661d", "\u7ba1", "\u5362", "\u83ab", "\u7ecf", "\u623f", "\u88d8", "\u7f2a", "\u5e72", "\u89e3", "\u5e94", "\u5b97", "\u4e01", "\u5ba3", "\u8d32", "\u9093", "\u90c1", "\u5355", 801 "\u676d", "\u6d2a", "\u5305", "\u8bf8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u94ae", "\u9f9a", "\u7a0b", "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9646", "\u8363", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 802 "\u60e0", "\u7504", "\u9eb9", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u50a8", "\u9773", "\u6c72", "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u4e4c", "\u7126", "\u5df4", "\u5f13", 803 "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8f66", "\u4faf", "\u5b93", "\u84ec", "\u5168", "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bab", "\u5b81", "\u4ec7", "\u683e", "\u66b4", "\u7518", 804 "\u659c", "\u5389", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5218", "\u666f", "\u8a79", "\u675f", "\u9f99", "\u53f6", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u84df", "\u8584", "\u5370", "\u5bbf", 805 "\u767d", "\u6000", "\u84b2", "\u90b0", "\u4ece", "\u9102", "\u7d22", "\u54b8", "\u7c4d", "\u8d56", "\u5353", "\u853a", "\u5c60", "\u8499", "\u6c60", "\u4e54", "\u9634", "\u90c1", "\u80e5", "\u80fd", "\u82cd", 806 "\u53cc", "\u95fb", "\u8398", "\u515a", "\u7fdf", "\u8c2d", "\u8d21", "\u52b3", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u90e6", "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", 807 "\u6fee", "\u725b", "\u5bff", "\u901a", "\u8fb9", "\u6248", "\u71d5", "\u5180", "\u90cf", "\u6d66", "\u5c1a", "\u519c", "\u6e29", "\u522b", "\u5e84", "\u664f", "\u67f4", "\u77bf", "\u960e", "\u5145", "\u6155", 808 "\u8fde", "\u8339", "\u4e60", "\u5ba6", "\u827e", "\u9c7c", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", "\u7ec8", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", 809 "\u6ee1", "\u5f18", "\u5321", "\u56fd", "\u6587", "\u5bc7", "\u5e7f", "\u7984", "\u9619", "\u4e1c", "\u6b27", "\u6bb3", "\u6c83", "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e08", "\u5de9", "\u538d", 810 "\u8042", "\u6641", "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u961a", "\u90a3", "\u7b80", "\u9976", "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u517b", "\u97a0", "\u987b", "\u4e30", 811 "\u5de2", "\u5173", "\u84af", "\u76f8", "\u67e5", "\u540e", "\u8346", "\u7ea2", "\u6e38", "\u7afa", "\u6743", "\u9011", "\u76d6", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u9a6c", "\u4e0a\u5b98", "\u6b27\u9633", 812 "\u590f\u4faf", "\u8bf8\u845b", "\u95fb\u4eba", "\u4e1c\u65b9", "\u8d6b\u8fde", "\u7687\u752b", "\u5c09\u8fdf", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", "\u6fee\u9633", "\u6df3\u4e8e", "\u5355\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b59", "\u4ef2\u5b59", 813 "\u8f69\u8f95", "\u4ee4\u72d0", "\u953a\u79bb", "\u5b87\u6587", "\u957f\u5b59", "\u6155\u5bb9", "\u9c9c\u4e8e", "\u95fe\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8f66", "\u989b\u5b59", "\u7aef\u6728", "\u5deb\u9a6c", 814 "\u516c\u897f", "\u6f06\u96d5", "\u4e50\u6b63", "\u58e4\u9a77", "\u516c\u826f", "\u62d3\u62d4", "\u5939\u8c37", "\u5bb0\u7236", "\u8c37\u6881", "\u664b", "\u695a", "\u960e", "\u6cd5", "\u6c5d", "\u9122", "\u6d82", "\u94a6", "\u6bb5\u5e72", "\u767e\u91cc", 815 "\u4e1c\u90ed", "\u5357\u95e8", "\u547c\u5ef6", "\u5f52", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e05", "\u7f11", "\u4ea2", "\u51b5", "\u540e", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u4e1c\u95e8", "\u897f\u95e8", 816 "\u5546", "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8d4f", "\u5357\u5bab", "\u58a8", "\u54c8", "\u8c2f", "\u7b2a", "\u5e74", "\u7231", "\u9633", "\u4f5f" 817 }; 818 819 static final String[] traditionalNames = { "丁", "Abbot", "Morton", "Zachary", "Williams", "\u8d99", "\u9322", "\u5b6b", 820 "\u674e", "\u5468", "\u5433", "\u912d", "\u738b", "\u99ae", "\u9673", "\u696e", "\u885b", "\u8523", 821 "\u6c88", "\u97d3", "\u694a", "\u6731", "\u79e6", "\u5c24", "\u8a31", "\u4f55", "\u5442", "\u65bd", 822 "\u5f35", "\u5b54", "\u66f9", "\u56b4", "\u83ef", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", 823 "\u8b1d", "\u9112", "\u55bb", "\u67cf", "\u6c34", "\u7ac7", "\u7ae0", "\u96f2", "\u8607", "\u6f58", 824 "\u845b", "\u595a", "\u7bc4", "\u5f6d", "\u90ce", "\u9b6f", "\u97cb", "\u660c", "\u99ac", "\u82d7", 825 "\u9cf3", "\u82b1", "\u65b9", "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9b91", "\u53f2", 826 "\u5510", "\u8cbb", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8cc0", "\u502a", "\u6e6f", "\u6ed5", 827 "\u6bb7", "\u7f85", "\u7562", "\u90dd", "\u9114", "\u5b89", "\u5e38", "\u6a02", "\u65bc", "\u6642", 828 "\u5085", "\u76ae", "\u535e", "\u9f4a", "\u5eb7", "\u4f0d", "\u9918", "\u5143", "\u535c", "\u9867", 829 "\u5b5f", "\u5e73", "\u9ec3", "\u548c", "\u7a46", "\u856d", "\u5c39", "\u59da", "\u90b5", "\u6e5b", 830 "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8c9d", "\u660e", "\u81e7", "\u8a08", 831 "\u4f0f", "\u6210", "\u6234", "\u8ac7", "\u5b8b", "\u8305", "\u9f90", "\u718a", "\u7d00", "\u8212", 832 "\u5c48", "\u9805", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u85cd", "\u95a9", "\u5e2d", 833 "\u5b63", "\u9ebb", "\u5f37", "\u8cc8", "\u8def", "\u5a41", "\u5371", "\u6c5f", "\u7ae5", "\u984f", 834 "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u937e", "\u5f90", "\u4e18", "\u99f1", "\u9ad8", 835 "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", "\u842c", "\u652f", 836 "\u67ef", "\u661d", "\u7ba1", "\u76e7", "\u83ab", "\u7d93", "\u623f", "\u88d8", "\u7e46", "\u5e79", 837 "\u89e3", "\u61c9", "\u5b97", "\u4e01", "\u5ba3", "\u8cc1", "\u9127", "\u9b31", "\u55ae", "\u676d", 838 "\u6d2a", "\u5305", "\u8af8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u9215", "\u9f94", "\u7a0b", 839 "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9678", "\u69ae", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 840 "\u60e0", "\u7504", "\u9eb4", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u5132", "\u9773", "\u6c72", 841 "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u70cf", "\u7126", "\u5df4", 842 "\u5f13", "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8eca", "\u4faf", "\u5b93", "\u84ec", "\u5168", 843 "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bae", "\u5be7", "\u4ec7", "\u6b12", 844 "\u66b4", "\u7518", "\u659c", "\u53b2", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5289", "\u666f", 845 "\u8a79", "\u675f", "\u9f8d", "\u8449", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u858a", 846 "\u8584", "\u5370", "\u5bbf", "\u767d", "\u61f7", "\u84b2", "\u90b0", "\u5f9e", "\u9102", "\u7d22", 847 "\u54b8", "\u7c4d", "\u8cf4", "\u5353", "\u85fa", "\u5c60", "\u8499", "\u6c60", "\u55ac", "\u9670", 848 "\u9b31", "\u80e5", "\u80fd", "\u84bc", "\u96d9", "\u805e", "\u8398", "\u9ee8", "\u7fdf", "\u8b5a", 849 "\u8ca2", "\u52de", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u9148", 850 "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", "\u6fee", "\u725b", "\u58fd", "\u901a", "\u908a", 851 "\u6248", "\u71d5", "\u5180", "\u90df", "\u6d66", "\u5c1a", "\u8fb2", "\u6eab", "\u5225", "\u838a", 852 "\u664f", "\u67f4", "\u77bf", "\u95bb", "\u5145", "\u6155", "\u9023", "\u8339", "\u7fd2", "\u5ba6", 853 "\u827e", "\u9b5a", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", 854 "\u7d42", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", "\u6eff", "\u5f18", "\u5321", 855 "\u570b", "\u6587", "\u5bc7", "\u5ee3", "\u797f", "\u95d5", "\u6771", "\u6b50", "\u6bb3", "\u6c83", 856 "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e2b", "\u978f", "\u5399", "\u8076", "\u6641", 857 "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u95de", "\u90a3", "\u7c21", "\u9952", 858 "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u990a", "\u97a0", "\u9808", "\u8c50", "\u5de2", 859 "\u95dc", "\u84af", "\u76f8", "\u67e5", "\u5f8c", "\u834a", "\u7d05", "\u904a", "\u7afa", "\u6b0a", 860 "\u9011", "\u84cb", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u99ac", "\u4e0a\u5b98", 861 "\u6b50\u967d", "\u590f\u4faf", "\u8af8\u845b", "\u805e\u4eba", "\u6771\u65b9", "\u8d6b\u9023", 862 "\u7687\u752b", "\u5c09\u9072", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", 863 "\u6fee\u967d", "\u6df3\u4e8e", "\u55ae\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b6b", 864 "\u4ef2\u5b6b", "\u8ed2\u8f45", "\u4ee4\u72d0", "\u937e\u96e2", "\u5b87\u6587", "\u9577\u5b6b", 865 "\u6155\u5bb9", "\u9bae\u4e8e", "\u95ad\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", 866 "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8eca", "\u9853\u5b6b", "\u7aef\u6728", "\u5deb\u99ac", 867 "\u516c\u897f", "\u6f06\u96d5", "\u6a02\u6b63", "\u58e4\u99df", "\u516c\u826f", "\u62d3\u62d4", 868 "\u593e\u8c37", "\u5bb0\u7236", "\u7a40\u6881", "\u6649", "\u695a", "\u95bb", "\u6cd5", "\u6c5d", "\u9122", 869 "\u5857", "\u6b3d", "\u6bb5\u5e72", "\u767e\u91cc", "\u6771\u90ed", "\u5357\u9580", "\u547c\u5ef6", 870 "\u6b78", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e25", "\u7df1", "\u4ea2", "\u6cc1", 871 "\u5f8c", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u6771\u9580", "\u897f\u9580", "\u5546", 872 "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8cde", "\u5357\u5bae", "\u58a8", "\u54c8", "\u8b59", "\u7b2a", 873 "\u5e74", "\u611b", "\u967d", "\u4f5f", "\u3401", "\u3422", "\u3426", "\u3493", "\u34A5", "\u34A7", 874 "\u34AA", "\u3536", "\u4A3B", "\u4E00", "\u4E01", "\u4E07", "\u4E0D", "\u4E17", "\u4E23", "\u4E26", 875 "\u4E34", "\u4E82", "\u4EB8", "\u4EB9", "\u511F", "\u512D", "\u513D", "\u513E", "\u53B5", "\u56D4", 876 "\u56D6", "\u7065", "\u7069", "\u706A", "\u7E9E", "\u9750", "\u9F49", "\u9F7E", "\u9F98", "\uD840\uDC35", 877 "\uD840\uDC3D", "\uD840\uDC3E", "\uD840\uDC41", "\uD840\uDC46", "\uD840\uDC4C", "\uD840\uDC4E", 878 "\uD840\uDC53", "\uD840\uDC55", "\uD840\uDC56", "\uD840\uDC5F", "\uD840\uDC60", "\uD840\uDC7A", 879 "\uD840\uDC7B", "\uD840\uDCC8", "\uD840\uDD9E", "\uD840\uDD9F", "\uD840\uDDA0", "\uD840\uDDA1", 880 "\uD841\uDD3B", "\uD842\uDCCA", "\uD842\uDCCB", "\uD842\uDD6C", "\uD842\uDE0B", "\uD842\uDE0C", 881 "\uD842\uDED1", "\uD844\uDD9F", "\uD845\uDD19", "\uD845\uDD1A", "\uD846\uDD3B", "\uD84C\uDF5C", 882 "\uD85A\uDDC4", "\uD85A\uDDC5", "\uD85C\uDD98", "\uD85E\uDCB1", "\uD861\uDC04", "\uD864\uDDD3", 883 "\uD865\uDE63", "\uD869\uDCCA", "\uD86B\uDE9A", }; 884 885 /** 886 * Test AlphabeticIndex vs. root with script reordering. 887 */ TestHaniFirst()888 public void TestHaniFirst() { 889 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 890 coll.setReorderCodes(UScript.HAN); 891 AlphabeticIndex index = new AlphabeticIndex(coll); 892 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... (underflow only) 893 index.addLabels(ULocale.ENGLISH); 894 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 895 int bucketIndex = index.getBucketIndex("\u897f"); 896 assertEquals("getBucketIndex(U+897F)", 0, bucketIndex); // underflow bucket 897 bucketIndex = index.getBucketIndex("i"); 898 assertEquals("getBucketIndex(i)", 9, bucketIndex); 899 bucketIndex = index.getBucketIndex("\u03B1"); 900 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 901 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 902 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 903 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 904 bucketIndex = index.getBucketIndex("\uFFFF"); 905 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 906 } 907 908 /** 909 * Test AlphabeticIndex vs. Pinyin with script reordering. 910 */ TestPinyinFirst()911 public void TestPinyinFirst() { 912 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE); 913 coll.setReorderCodes(UScript.HAN); 914 AlphabeticIndex index = new AlphabeticIndex(coll); 915 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 916 index.addLabels(ULocale.CHINESE); 917 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 918 int bucketIndex = index.getBucketIndex("\u897f"); 919 assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex); 920 bucketIndex = index.getBucketIndex("i"); 921 assertEquals("getBucketIndex(i)", 9, bucketIndex); 922 bucketIndex = index.getBucketIndex("\u03B1"); 923 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 924 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 925 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 926 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 927 bucketIndex = index.getBucketIndex("\uFFFF"); 928 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 929 } 930 931 /** 932 * Test labels with multiple primary weights. 933 */ TestSchSt()934 public void TestSchSt() { 935 AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN); 936 index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]")); 937 // ... A Æ B-R S Sch St T-Z ... 938 ImmutableIndex immIndex = index.buildImmutableIndex(); 939 assertEquals("getBucketCount()", 31, index.getBucketCount()); 940 assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount()); 941 String[][] testCases = new String[][] { 942 // name, bucket index, bucket label 943 { "Adelbert", "1", "A" }, 944 { "Afrika", "1", "A" }, 945 { "Æsculap", "2", "Æ" }, 946 { "Aesthet", "2", "Æ" }, 947 { "Berlin", "3", "B" }, 948 { "Rilke", "19", "R" }, 949 { "Sacher", "20", "S" }, 950 { "Seiler", "20", "S" }, 951 { "Sultan", "20", "S" }, 952 { "Schiller", "21", "Sch" }, 953 { "Steiff", "22", "St" }, 954 { "Thomas", "23", "T" } 955 }; 956 List<String> labels = index.getBucketLabels(); 957 for (String[] testCase : testCases) { 958 String name = testCase[0]; 959 int bucketIndex = Integer.valueOf(testCase[1]); 960 String label = testCase[2]; 961 String msg = "getBucketIndex(" + name + ")"; 962 assertEquals(msg, bucketIndex, index.getBucketIndex(name)); 963 msg = "immutable " + msg; 964 assertEquals(msg, bucketIndex, immIndex.getBucketIndex(name)); 965 msg = "bucket label (" + name + ")"; 966 assertEquals(msg, label, labels.get(index.getBucketIndex(name))); 967 msg = "immutable " + msg; 968 assertEquals(msg, label, immIndex.getBucket(bucketIndex).getLabel()); 969 } 970 } 971 972 /** 973 * With no real labels, there should be only the underflow label. 974 */ TestNoLabels()975 public void TestNoLabels() { 976 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 977 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(coll); 978 index.addRecord("\u897f", 0); 979 index.addRecord("i", 0); 980 index.addRecord("\u03B1", 0); 981 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... 982 Bucket<Integer> bucket = index.iterator().next(); 983 assertEquals("underflow label type", LabelType.UNDERFLOW, bucket.getLabelType()); 984 assertEquals("all records in the underflow bucket", 3, bucket.size()); 985 } 986 987 /** 988 * Test with the Bopomofo-phonetic tailoring. 989 */ TestChineseZhuyin()990 public void TestChineseZhuyin() { 991 AlphabeticIndex index = new AlphabeticIndex(ULocale.forLanguageTag("zh-u-co-zhuyin")); 992 ImmutableIndex immIndex = index.buildImmutableIndex(); 993 assertEquals("getBucketCount()", 38, immIndex.getBucketCount()); // ... ㄅ ㄆ ㄇ ㄈ ㄉ -- ㄩ ... 994 assertEquals("label 1", "ㄅ", immIndex.getBucket(1).getLabel()); 995 assertEquals("label 2", "ㄆ", immIndex.getBucket(2).getLabel()); 996 assertEquals("label 3", "ㄇ", immIndex.getBucket(3).getLabel()); 997 assertEquals("label 4", "ㄈ", immIndex.getBucket(4).getLabel()); 998 assertEquals("label 5", "ㄉ", immIndex.getBucket(5).getLabel()); 999 } 1000 TestJapaneseKanji()1001 public void TestJapaneseKanji() { 1002 AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE); 1003 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1004 // There are no index characters for Kanji in the Japanese standard collator. 1005 // They should all go into the overflow bucket. 1006 final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 }; 1007 int overflowIndex = immIndex.getBucketCount() - 1; 1008 for(int i = 0; i < kanji.length; ++i) { 1009 String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]); 1010 assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i]))); 1011 } 1012 } 1013 TestFrozenCollator()1014 public void TestFrozenCollator() { 1015 // Ticket #9472 1016 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da")); 1017 coll.setStrength(Collator.IDENTICAL); 1018 coll.freeze(); 1019 // The AlphabeticIndex constructor used to throw an exception 1020 // because it cloned the collator (which preserves frozenness) 1021 // and set the clone's strength to PRIMARY. 1022 AlphabeticIndex index = new AlphabeticIndex(coll); 1023 assertEquals("same strength as input Collator", 1024 Collator.IDENTICAL, index.getCollator().getStrength()); 1025 } 1026 TestChineseUnihan()1027 public void TestChineseUnihan() { 1028 AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan")); 1029 index.setMaxLabelCount(500); // ICU 54 default is 99. 1030 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1031 int bucketCount = immIndex.getBucketCount(); 1032 if(bucketCount < 216) { 1033 // There should be at least an underflow and overflow label, 1034 // and one for each of 214 radicals, 1035 // and maybe additional labels for simplified radicals. 1036 // (ICU4C: dataerrln(), prints only a warning if the data is missing) 1037 errln("too few buckets/labels for Chinese/unihan: " + bucketCount + 1038 " (is zh/unihan data available?)"); 1039 return; 1040 } else { 1041 logln("Chinese/unihan has " + bucketCount + " buckets/labels"); 1042 } 1043 // bucketIndex = radical number, adjusted for simplified radicals in lower buckets. 1044 int bucketIndex = index.getBucketIndex("\u4e5d"); 1045 assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex); 1046 // radical 100, and there is a 90' since Unicode 8 1047 bucketIndex = index.getBucketIndex("\u7527"); 1048 assertEquals("getBucketIndex(U+7527)", 101, bucketIndex); 1049 } 1050 } 1051