• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 /*
3  *******************************************************************************
4  * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
5  * and others. All Rights Reserved.
6  *******************************************************************************
7  */
8 package android.icu.text;
9 
10 import java.util.ArrayList;
11 import java.util.Collections;
12 import java.util.Comparator;
13 import java.util.Iterator;
14 import java.util.List;
15 import java.util.Locale;
16 
17 import android.icu.lang.UCharacter;
18 import android.icu.text.AlphabeticIndex.Bucket;
19 import android.icu.text.AlphabeticIndex.Bucket.LabelType;
20 import android.icu.util.LocaleData;
21 import android.icu.util.ULocale;
22 
23 /**
24  * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
25  * It can support either direct use, or use with a client that doesn't support localized collation.
26  * The following is an example of what an index might look like in a UI:
27  *
28  * <pre>
29  *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
30  *
31  *  <b>A</b>
32  *     Addison
33  *     Albertson
34  *     Azensky
35  *  <b>B</b>
36  *     Baecker
37  *  ...
38  * </pre>
39  *
40  * The class can generate a list of labels for use as a UI "index", that is, a list of
41  * clickable characters (or character sequences) that allow the user to see a segment
42  * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
43  * the target list, where everything in the bucket is greater than or equal to the character
44  * (according to the locale's collation). Strings can be added to the index;
45  * they will be in sorted order in the right bucket.
46  * <p>
47  * The class also supports having buckets for strings before the first (underflow),
48  * after the last (overflow), and between scripts (inflow). For example, if the index
49  * is constructed with labels for Russian and English, Greek characters would fall
50  * into an inflow bucket between the other two scripts.
51  *
52  * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
53  * as well as characters from the user's language,
54  * then it is a good idea to call addLabels(ULocale.English).
55  *
56  * <h2>Direct Use</h2>
57  * <p>The following shows an example of building an index directly.
58  *  The "show..." methods below are just to illustrate usage.
59  *
60  * <pre>
61  * // Create a simple index where the values for the strings are Integers, and add the strings
62  *
63  * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
64  * int counter = 0;
65  * for (String item : test) {
66  *     index.addRecord(item, counter++);
67  * }
68  * ...
69  * // Show index at top. We could skip or gray out empty buckets
70  *
71  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
72  *     if (showAll || bucket.size() != 0) {
73  *         showLabelAtTop(UI, bucket.getLabel());
74  *     }
75  * }
76  *  ...
77  * // Show the buckets with their contents, skipping empty buckets
78  *
79  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
80  *     if (bucket.size() != 0) {
81  *         showLabelInList(UI, bucket.getLabel());
82  *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
83  *             showIndexedItem(UI, item.getName(), item.getData());
84  *         }
85  * </pre>
86  *
87  * The caller can build different UIs using this class.
88  * For example, an index character could be omitted or grayed-out
89  * if its bucket is empty. Small buckets could also be combined based on size, such as:
90  *
91  * <pre>
92  * <b>... A-F G-N O-Z ...</b>
93  * </pre>
94  *
95  * <h2>Client Support</h2>
96  * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
97  * to support sorting on a client that doesn't support AlphabeticIndex functionality.
98  *
99  * <p>The ImmutableIndex is both immutable and thread-safe.
100  * The corresponding AlphabeticIndex methods are not thread-safe because
101  * they "lazily" build the index buckets.
102  * <ul>
103  * <li>ImmutableIndex.getBucket(index) provides random access to all
104  *     buckets and their labels and label types.
105  * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
106  *     can be used to get a list of the labels,
107  *     such as "...", "A", "B",..., and send that list to the client.
108  * <li>When the client has a new name, it sends that name to the server.
109  * The server needs to call the following methods,
110  * and communicate the bucketIndex and collationKey back to the client.
111  *
112  * <pre>
113  * int bucketIndex = index.getBucketIndex(name);
114  * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
115  * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
116  * </pre>
117  *
118  * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
119  * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
120  * </ul>
121  *
122  * @author Mark Davis
123  */
124 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
125     /**
126      * Prefix string for Chinese index buckets.
127      * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
128      */
129     private static final String BASE = "\uFDD0";
130 
131     private static final char CGJ = '\u034F';
132 
133     private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
134 
135     private final RuleBasedCollator collatorOriginal;
136     private final RuleBasedCollator collatorPrimaryOnly;
137     private RuleBasedCollator collatorExternal;
138 
139     // Comparator for records, so that the Record class can be static.
140     private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
141         public int compare(Record<V> o1, Record<V> o2) {
142             return collatorOriginal.compare(o1.name, o2.name);
143         }
144     };
145 
146     private final List<String> firstCharsInScripts;
147 
148     // We accumulate these as we build up the input parameters
149     private final UnicodeSet initialLabels = new UnicodeSet();
150     private List<Record<V>> inputList;
151 
152     // Lazy evaluated: null means that we have not built yet.
153     private BucketList<V> buckets;
154 
155     private String overflowLabel = "\u2026";
156     private String underflowLabel = "\u2026";
157     private String inflowLabel = "\u2026";
158 
159     /**
160      * Immutable, thread-safe version of {@link AlphabeticIndex}.
161      * This class provides thread-safe methods for bucketing,
162      * and random access to buckets and their properties,
163      * but does not offer adding records to the index.
164      *
165      * @param <V> The Record value type is unused. It can be omitted for this class
166      * if it was omitted for the AlphabeticIndex that built it.
167      */
168     public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
169         private final BucketList<V> buckets;
170         private final Collator collatorPrimaryOnly;
171 
ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly)172         private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
173             this.buckets = bucketList;
174             this.collatorPrimaryOnly = collatorPrimaryOnly;
175         }
176 
177         /**
178          * Returns the number of index buckets and labels, including underflow/inflow/overflow.
179          *
180          * @return the number of index buckets
181          */
getBucketCount()182         public int getBucketCount() {
183             return buckets.getBucketCount();
184         }
185 
186         /**
187          * Finds the index bucket for the given name and returns the number of that bucket.
188          * Use {@link #getBucket(int)} to get the bucket's properties.
189          *
190          * @param name the string to be sorted into an index bucket
191          * @return the bucket number for the name
192          */
getBucketIndex(CharSequence name)193         public int getBucketIndex(CharSequence name) {
194             return buckets.getBucketIndex(name, collatorPrimaryOnly);
195         }
196 
197         /**
198          * Returns the index-th bucket. Returns null if the index is out of range.
199          *
200          * @param index bucket number
201          * @return the index-th bucket
202          */
getBucket(int index)203         public Bucket<V> getBucket(int index) {
204             if (0 <= index && index < buckets.getBucketCount()) {
205                 return buckets.immutableVisibleList.get(index);
206             } else {
207                 return null;
208             }
209         }
210 
211         /**
212          * {@inheritDoc}
213          */
iterator()214         public Iterator<Bucket<V>> iterator() {
215             return buckets.iterator();
216         }
217     }
218 
219     /**
220      * Create the index object.
221      *
222      * @param locale
223      *            The locale for the index.
224      */
AlphabeticIndex(ULocale locale)225     public AlphabeticIndex(ULocale locale) {
226         this(locale, null);
227     }
228 
229     /**
230      * Create the index object.
231      *
232      * @param locale
233      *            The locale for the index.
234      */
AlphabeticIndex(Locale locale)235     public AlphabeticIndex(Locale locale) {
236         this(ULocale.forLocale(locale), null);
237     }
238 
239     /**
240      * Create an AlphabeticIndex that uses a specific collator.
241      *
242      * <p>The index will be created with no labels; the addLabels() function must be called
243      * after creation to add the desired labels to the index.
244      *
245      * <p>The index will work directly with the supplied collator. If the caller will need to
246      * continue working with the collator it should be cloned first, so that the
247      * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
248      *
249      * @param collator The collator to use to order the contents of this index.
250      */
AlphabeticIndex(RuleBasedCollator collator)251     public AlphabeticIndex(RuleBasedCollator collator) {
252         this(null, collator);
253     }
254 
255     /**
256      * Internal constructor containing implementation used by public constructors.
257      */
AlphabeticIndex(ULocale locale, RuleBasedCollator collator)258     private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
259         collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
260         try {
261             collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
262         } catch (Exception e) {
263             // should never happen
264             throw new IllegalStateException("Collator cannot be cloned", e);
265         }
266         collatorPrimaryOnly.setStrength(Collator.PRIMARY);
267         collatorPrimaryOnly.freeze();
268 
269         firstCharsInScripts = getFirstCharactersInScripts();
270         Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
271         // Guard against a degenerate collator where
272         // some script boundary strings are primary ignorable.
273         for (;;) {
274             if (firstCharsInScripts.isEmpty()) {
275                 throw new IllegalArgumentException(
276                         "AlphabeticIndex requires some non-ignorable script boundary strings");
277             }
278             if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
279                 firstCharsInScripts.remove(0);
280             } else {
281                 break;
282             }
283         }
284 
285         // Chinese index characters, which are specific to each of the several Chinese tailorings,
286         // take precedence over the single locale data exemplar set per language.
287         if (!addChineseIndexCharacters() && locale != null) {
288             addIndexExemplars(locale);
289         }
290     }
291 
292     /**
293      * Add more index characters (aside from what are in the locale)
294      * @param additions additional characters to add to the index, such as A-Z.
295      * @return this, for chaining
296      */
addLabels(UnicodeSet additions)297     public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
298         initialLabels.addAll(additions);
299         buckets = null;
300         return this;
301     }
302 
303     /**
304      * Add more index characters (aside from what are in the locale)
305      * @param additions additional characters to add to the index, such as those in Swedish.
306      * @return this, for chaining
307      */
addLabels(ULocale... additions)308     public AlphabeticIndex<V> addLabels(ULocale... additions) {
309         for (ULocale addition : additions) {
310             addIndexExemplars(addition);
311         }
312         buckets = null;
313         return this;
314     }
315 
316     /**
317      * Add more index characters (aside from what are in the locale)
318      * @param additions additional characters to add to the index, such as those in Swedish.
319      * @return this, for chaining
320      */
addLabels(Locale... additions)321     public AlphabeticIndex<V> addLabels(Locale... additions) {
322         for (Locale addition : additions) {
323             addIndexExemplars(ULocale.forLocale(addition));
324         }
325         buckets = null;
326         return this;
327     }
328 
329     /**
330      * Set the overflow label
331      * @param overflowLabel see class description
332      * @return this, for chaining
333      */
setOverflowLabel(String overflowLabel)334     public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
335         this.overflowLabel = overflowLabel;
336         buckets = null;
337         return this;
338     }
339 
340     /**
341      * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
342      *
343      * @return underflow label
344      */
getUnderflowLabel()345     public String getUnderflowLabel() {
346         return underflowLabel; // TODO get localized version
347     }
348 
349 
350     /**
351      * Set the underflowLabel label
352      * @param underflowLabel see class description
353      * @return this, for chaining
354      */
setUnderflowLabel(String underflowLabel)355     public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
356         this.underflowLabel = underflowLabel;
357         buckets = null;
358         return this;
359     }
360 
361     /**
362      * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
363      *
364      * @return overflow label
365      */
getOverflowLabel()366     public String getOverflowLabel() {
367         return overflowLabel; // TODO get localized version
368     }
369 
370 
371     /**
372      * Set the inflowLabel label
373      * @param inflowLabel see class description
374      * @return this, for chaining
375      */
setInflowLabel(String inflowLabel)376     public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
377         this.inflowLabel = inflowLabel;
378         buckets = null;
379         return this;
380     }
381 
382     /**
383      * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
384      * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
385      *
386      * @return inflow label
387      */
getInflowLabel()388     public String getInflowLabel() {
389         return inflowLabel; // TODO get localized version
390     }
391 
392 
393     /**
394      * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
395      *
396      * @return maxLabelCount maximum number of labels.
397      */
getMaxLabelCount()398     public int getMaxLabelCount() {
399         return maxLabelCount;
400     }
401 
402     /**
403      * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
404      * getBucketCount().
405      *
406      * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
407      *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
408      *         future.
409      * @return this, for chaining
410      */
setMaxLabelCount(int maxLabelCount)411     public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
412         this.maxLabelCount = maxLabelCount;
413         buckets = null;
414         return this;
415     }
416 
417     /**
418      * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
419      * and sort differently, and that the overall list is small enough.
420      */
initLabels()421     private List<String> initLabels() {
422         Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
423         List<String> indexCharacters = new ArrayList<String>();
424 
425         String firstScriptBoundary = firstCharsInScripts.get(0);
426         String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
427 
428         // We make a sorted array of elements.
429         // Some of the input may be redundant.
430         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
431         // We filter out those cases.
432         for (String item : initialLabels) {
433             boolean checkDistinct;
434             if (!UTF16.hasMoreCodePointsThan(item, 1)) {
435                 checkDistinct = false;
436             } else if(item.charAt(item.length() - 1) == '*' &&
437                     item.charAt(item.length() - 2) != '*') {
438                 // Use a label if it is marked with one trailing star,
439                 // even if the label string sorts the same when all contractions are suppressed.
440                 item = item.substring(0, item.length() - 1);
441                 checkDistinct = false;
442             } else {
443                 checkDistinct = true;
444             }
445             if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
446                 // Ignore a primary-ignorable or non-alphabetic index character.
447             } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
448                 // Ignore an index character that will land in the overflow bucket.
449             } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
450                 // Ignore a multi-code point index character that does not sort distinctly
451                 // from the sequence of its separate characters.
452             } else {
453                 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
454                 if (insertionPoint < 0) {
455                     indexCharacters.add(~insertionPoint, item);
456                 } else {
457                     String itemAlreadyIn = indexCharacters.get(insertionPoint);
458                     if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
459                         indexCharacters.set(insertionPoint, item);
460                     }
461                 }
462             }
463         }
464 
465         // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
466 
467         final int size = indexCharacters.size() - 1;
468         if (size > maxLabelCount) {
469             int count = 0;
470             int old = -1;
471             for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
472                 ++count;
473                 it.next();
474                 final int bump = count * maxLabelCount / size;
475                 if (bump == old) {
476                     it.remove();
477                 } else {
478                     old = bump;
479                 }
480             }
481         }
482 
483         return indexCharacters;
484     }
485 
fixLabel(String current)486     private static String fixLabel(String current) {
487         if (!current.startsWith(BASE)) {
488             return current;
489         }
490         int rest = current.charAt(BASE.length());
491         if (0x2800 < rest && rest <= 0x28FF) { // stroke count
492             return (rest-0x2800) + "\u5283";
493         }
494         return current.substring(BASE.length());
495     }
496 
497     /**
498      * This method is called to get the index exemplars. Normally these come from the locale directly,
499      * but if they aren't available, we have to synthesize them.
500      */
addIndexExemplars(ULocale locale)501     private void addIndexExemplars(ULocale locale) {
502         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
503         if (exemplars != null) {
504             initialLabels.addAll(exemplars);
505             return;
506         }
507 
508         // The locale data did not include explicit Index characters.
509         // Synthesize a set of them from the locale's standard exemplar characters.
510         exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
511 
512         exemplars = exemplars.cloneAsThawed();
513         // question: should we add auxiliary exemplars?
514         if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
515             exemplars.addAll('a', 'z');
516         }
517         if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
518             // cut down to small list
519             exemplars.remove(0xAC00, 0xD7A3).
520                 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
521                 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
522                 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
523                 add(0xD30C).add(0xD558);
524         }
525         if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
526             // cut down to small list
527             // make use of the fact that Ethiopic is allocated in 8's, where
528             // the base is 0 mod 8.
529             UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
530             UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
531             while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
532                 if ((it.codepoint & 0x7) != 0) {
533                     exemplars.remove(it.codepoint);
534                 }
535             }
536         }
537 
538         // Upper-case any that aren't already so.
539         //   (We only do this for synthesized index characters.)
540         for (String item : exemplars) {
541             initialLabels.add(UCharacter.toUpperCase(locale, item));
542         }
543     }
544 
545     /**
546      * Add Chinese index characters from the tailoring.
547      */
addChineseIndexCharacters()548     private boolean addChineseIndexCharacters() {
549         UnicodeSet contractions = new UnicodeSet();
550         try {
551             collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
552         } catch (Exception e) {
553             return false;
554         }
555         if (contractions.isEmpty()) { return false; }
556         initialLabels.addAll(contractions);
557         for (String s : contractions) {
558             assert(s.startsWith(BASE));
559             char c = s.charAt(s.length() - 1);
560             if (0x41 <= c && c <= 0x5A) {  // A-Z
561                 // There are Pinyin labels, add ASCII A-Z labels as well.
562                 initialLabels.add(0x41, 0x5A);  // A-Z
563                 break;
564             }
565         }
566         return true;
567     }
568 
569     /**
570      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
571      * <p>This is used to test whether contractions sort differently from their components.
572      */
separated(String item)573     private String separated(String item) {
574         StringBuilder result = new StringBuilder();
575         // add a CGJ except within surrogates
576         char last = item.charAt(0);
577         result.append(last);
578         for (int i = 1; i < item.length(); ++i) {
579             char ch = item.charAt(i);
580             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
581                 result.append(CGJ);
582             }
583             result.append(ch);
584             last = ch;
585         }
586         return result.toString();
587     }
588 
589     /**
590      * Builds an immutable, thread-safe version of this instance, without data records.
591      *
592      * @return an immutable index instance
593      */
buildImmutableIndex()594     public ImmutableIndex<V> buildImmutableIndex() {
595         // The current AlphabeticIndex Java code never modifies the bucket list once built.
596         // If it contains no records, we can use it.
597         // addRecord() sets buckets=null rather than inserting the new record into it.
598         BucketList<V> immutableBucketList;
599         if (inputList != null && !inputList.isEmpty()) {
600             // We need a bucket list with no records.
601             immutableBucketList = createBucketList();
602         } else {
603             if (buckets == null) {
604                 buckets = createBucketList();
605             }
606             immutableBucketList = buckets;
607         }
608         return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
609     }
610 
611     /**
612      * Get the labels.
613      *
614      * @return The list of bucket labels, after processing.
615      */
getBucketLabels()616     public List<String> getBucketLabels() {
617         initBuckets();
618         ArrayList<String> result = new ArrayList<String>();
619         for (Bucket<V> bucket : buckets) {
620             result.add(bucket.getLabel());
621         }
622         return result;
623     }
624 
625     /**
626      * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
627      * then stored. The next time it is accessed, the same instance is returned.
628      * <p>
629      * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
630      * synchronizing.</i></b>
631      *
632      * @return a clone of the collator used internally
633      */
getCollator()634     public RuleBasedCollator getCollator() {
635         if (collatorExternal == null) {
636             try {
637                 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
638             } catch (Exception e) {
639                 // should never happen
640                 throw new IllegalStateException("Collator cannot be cloned", e);
641             }
642         }
643         return collatorExternal;
644     }
645 
646     /**
647      * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
648      * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
649      * the first added comes first.
650      *
651      * @param name
652      *            Name, such as a name
653      * @param data
654      *            Data, such as an address or link
655      * @return this, for chaining
656      */
addRecord(CharSequence name, V data)657     public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
658         // TODO instead of invalidating, just add to unprocessed list.
659         buckets = null; // invalidate old bucketlist
660         if (inputList == null) {
661             inputList = new ArrayList<Record<V>>();
662         }
663         inputList.add(new Record<V>(name, data));
664         return this;
665     }
666 
667     /**
668      * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
669      * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
670      * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
671      * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
672      * the index or collator.
673      * <p>
674      * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
675      * those are changed, then the bucket number and sort key must be regenerated.
676      *
677      * @param name
678      *            Name, such as a name
679      * @return the bucket index for the name
680      */
getBucketIndex(CharSequence name)681     public int getBucketIndex(CharSequence name) {
682         initBuckets();
683         return buckets.getBucketIndex(name, collatorPrimaryOnly);
684     }
685 
686     /**
687      * Clear the index.
688      *
689      * @return this, for chaining
690      */
clearRecords()691     public AlphabeticIndex<V> clearRecords() {
692         if (inputList != null && !inputList.isEmpty()) {
693             inputList.clear();
694             buckets = null;
695         }
696         return this;
697     }
698 
699     /**
700      * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
701      *
702      * @return number of buckets
703      */
getBucketCount()704     public int getBucketCount() {
705         initBuckets();
706         return buckets.getBucketCount();
707     }
708 
709     /**
710      * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
711      *
712      * @return total number of records in buckets
713      */
getRecordCount()714     public int getRecordCount() {
715         return inputList != null ? inputList.size() : 0;
716     }
717 
718     /**
719      * Return an iterator over the buckets.
720      *
721      * @return iterator over buckets.
722      */
iterator()723     public Iterator<Bucket<V>> iterator() {
724         initBuckets();
725         return buckets.iterator();
726     }
727 
728     /**
729      * Creates an index, and buckets and sorts the list of records into the index.
730      */
initBuckets()731     private void initBuckets() {
732         if (buckets != null) {
733             return;
734         }
735         buckets = createBucketList();
736         if (inputList == null || inputList.isEmpty()) {
737             return;
738         }
739 
740         // Sort the records by name.
741         // Stable sort preserves input order of collation duplicates.
742         Collections.sort(inputList, recordComparator);
743 
744         // Now, we traverse all of the input, which is now sorted.
745         // If the item doesn't go in the current bucket, we find the next bucket that contains it.
746         // This makes the process order n*log(n), since we just sort the list and then do a linear process.
747         // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
748         // we need to improve it for that case.
749 
750         Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
751         Bucket<V> currentBucket = bucketIterator.next();
752         Bucket<V> nextBucket;
753         String upperBoundary;
754         if (bucketIterator.hasNext()) {
755             nextBucket = bucketIterator.next();
756             upperBoundary = nextBucket.lowerBoundary;
757         } else {
758             nextBucket = null;
759             upperBoundary = null;
760         }
761         for (Record<V> r : inputList) {
762             // if the current bucket isn't the right one, find the one that is
763             // We have a special flag for the last bucket so that we don't look any further
764             while (upperBoundary != null &&
765                     collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
766                 currentBucket = nextBucket;
767                 // now reset the boundary that we compare against
768                 if (bucketIterator.hasNext()) {
769                     nextBucket = bucketIterator.next();
770                     upperBoundary = nextBucket.lowerBoundary;
771                 } else {
772                     upperBoundary = null;
773                 }
774             }
775             // now put the record into the bucket.
776             Bucket<V> bucket = currentBucket;
777             if (bucket.displayBucket != null) {
778                 bucket = bucket.displayBucket;
779             }
780             if (bucket.records == null) {
781                 bucket.records = new ArrayList<Record<V>>();
782             }
783             bucket.records.add(r);
784         }
785     }
786 
787     private int maxLabelCount = 99;
788 
789     /**
790      * Returns true if one index character string is "better" than the other.
791      * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
792      * better, and otherwise binary-less-than is better.
793      */
isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other)794     private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
795         // This is called with primary-equal strings, but never with one.equals(other).
796         String n1 = nfkdNormalizer.normalize(one);
797         String n2 = nfkdNormalizer.normalize(other);
798         int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
799         if (result != 0) {
800             return result < 0;
801         }
802         result = binaryCmp.compare(n1, n2);
803         if (result != 0) {
804             return result < 0;
805         }
806         return binaryCmp.compare(one, other) < 0;
807     }
808 
809     /**
810      * A (name, data) pair, to be sorted by name into one of the index buckets.
811      * The user data is not used by the index implementation.
812      */
813     public static class Record<V> {
814         private final CharSequence name;
815         private final V data;
816 
Record(CharSequence name, V data)817         private Record(CharSequence name, V data) {
818             this.name = name;
819             this.data = data;
820         }
821 
822         /**
823          * Get the name
824          *
825          * @return the name
826          */
getName()827         public CharSequence getName() {
828             return name;
829         }
830 
831         /**
832          * Get the data
833          *
834          * @return the data
835          */
getData()836         public V getData() {
837             return data;
838         }
839 
840         /**
841          * Standard toString()
842          */
toString()843         public String toString() {
844             return name + "=" + data;
845         }
846     }
847 
848     /**
849      * An index "bucket" with a label string and type.
850      * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
851      * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
852      * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
853      * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
854      * into a bucket according to the record's name.
855      *
856      * @param <V>
857      *            Data type
858      */
859     public static class Bucket<V> implements Iterable<Record<V>> {
860         private final String label;
861         private final String lowerBoundary;
862         private final LabelType labelType;
863         private Bucket<V> displayBucket;
864         private int displayIndex;
865         private List<Record<V>> records;
866 
867         /**
868          * Type of the label
869          */
870         public enum LabelType {
871             /**
872              * Normal
873              */
874             NORMAL,
875             /**
876              * Underflow (before the first)
877              */
878             UNDERFLOW,
879             /**
880              * Inflow (between scripts)
881              */
882             INFLOW,
883             /**
884              * Overflow (after the last)
885              */
886             OVERFLOW
887         }
888 
889         /**
890          * Set up the bucket.
891          *
892          * @param label
893          *            label for the bucket
894          * @param labelType
895          *            is an underflow, overflow, or inflow bucket
896          */
Bucket(String label, String lowerBoundary, LabelType labelType)897         private Bucket(String label, String lowerBoundary, LabelType labelType) {
898             this.label = label;
899             this.lowerBoundary = lowerBoundary;
900             this.labelType = labelType;
901         }
902 
903         /**
904          * Get the label
905          *
906          * @return label for the bucket
907          */
getLabel()908         public String getLabel() {
909             return label;
910         }
911 
912         /**
913          * Is a normal, underflow, overflow, or inflow bucket
914          *
915          * @return is an underflow, overflow, or inflow bucket
916          */
getLabelType()917         public LabelType getLabelType() {
918             return labelType;
919         }
920 
921         /**
922          * Get the number of records in the bucket.
923          *
924          * @return number of records in bucket
925          */
size()926         public int size() {
927             return records == null ? 0 : records.size();
928         }
929 
930         /**
931          * Iterator over the records in the bucket
932          */
iterator()933         public Iterator<Record<V>> iterator() {
934             if (records == null) {
935                 return Collections.<Record<V>>emptyList().iterator();
936             }
937             return records.iterator();
938         }
939 
940         /**
941          * Standard toString()
942          */
943         @Override
toString()944         public String toString() {
945             return "{" +
946             "labelType=" + labelType
947             + ", " +
948             "lowerBoundary=" + lowerBoundary
949             + ", " +
950             "label=" + label
951             + "}"
952             ;
953         }
954     }
955 
createBucketList()956     private BucketList<V> createBucketList() {
957         // Initialize indexCharacters.
958         List<String> indexCharacters = initLabels();
959 
960         // Variables for hasMultiplePrimaryWeights().
961         long variableTop;
962         if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
963             variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
964         } else {
965             variableTop = 0;
966         }
967         boolean hasInvisibleBuckets = false;
968 
969         // Helper arrays for Chinese Pinyin collation.
970         @SuppressWarnings({ "rawtypes", "unchecked" })
971         Bucket<V>[] asciiBuckets = new Bucket[26];
972         @SuppressWarnings({ "rawtypes", "unchecked" })
973         Bucket<V>[] pinyinBuckets = new Bucket[26];
974         boolean hasPinyin = false;
975 
976         ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
977 
978         // underflow bucket
979         bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
980 
981         // fix up the list, adding underflow, additions, overflow
982         // Insert inflow labels as needed.
983         int scriptIndex = -1;
984         String scriptUpperBoundary = "";
985         for (String current : indexCharacters) {
986             if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
987                 // We crossed the script boundary into a new script.
988                 String inflowBoundary = scriptUpperBoundary;
989                 boolean skippedScript = false;
990                 for (;;) {
991                     scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
992                     if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
993                         break;
994                     }
995                     skippedScript = true;
996                 }
997                 if (skippedScript && bucketList.size() > 1) {
998                     // We are skipping one or more scripts,
999                     // and we are not just getting out of the underflow label.
1000                     bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1001                             LabelType.INFLOW));
1002                 }
1003             }
1004             // Add a bucket with the current label.
1005             Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1006             bucketList.add(bucket);
1007             // Remember ASCII and Pinyin buckets for Pinyin redirects.
1008             char c;
1009             if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1010                 asciiBuckets[c - 'A'] = bucket;
1011             } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1012                     'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1013                 pinyinBuckets[c - 'A'] = bucket;
1014                 hasPinyin = true;
1015             }
1016             // Check for multiple primary weights.
1017             if (!current.startsWith(BASE) &&
1018                     hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
1019                     !current.endsWith("\uffff")) {
1020                 // "Æ" or "Sch" etc.
1021                 for (int i = bucketList.size() - 2;; --i) {
1022                     Bucket<V> singleBucket = bucketList.get(i);
1023                     if (singleBucket.labelType != LabelType.NORMAL) {
1024                         // There is no single-character bucket since the last
1025                         // underflow or inflow label.
1026                         break;
1027                     }
1028                     if (singleBucket.displayBucket == null &&
1029                             !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
1030                         // Add an invisible bucket that redirects strings greater than the expansion
1031                         // to the previous single-character bucket.
1032                         // For example, after ... Q R S Sch we add Sch\uFFFF->S
1033                         // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1034                         bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1035                         bucket.displayBucket = singleBucket;
1036                         bucketList.add(bucket);
1037                         hasInvisibleBuckets = true;
1038                         break;
1039                     }
1040                 }
1041             }
1042         }
1043         if (bucketList.size() == 1) {
1044             // No real labels, show only the underflow label.
1045             return new BucketList<V>(bucketList, bucketList);
1046         }
1047         // overflow bucket
1048         bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1049 
1050         if (hasPinyin) {
1051             // Redirect Pinyin buckets.
1052             Bucket<V> asciiBucket = null;
1053             for (int i = 0; i < 26; ++i) {
1054                 if (asciiBuckets[i] != null) {
1055                     asciiBucket = asciiBuckets[i];
1056                 }
1057                 if (pinyinBuckets[i] != null && asciiBucket != null) {
1058                     pinyinBuckets[i].displayBucket = asciiBucket;
1059                     hasInvisibleBuckets = true;
1060                 }
1061             }
1062         }
1063 
1064         if (!hasInvisibleBuckets) {
1065             return new BucketList<V>(bucketList, bucketList);
1066         }
1067         // Merge inflow buckets that are visually adjacent.
1068         // Iterate backwards: Merge inflow into overflow rather than the other way around.
1069         int i = bucketList.size() - 1;
1070         Bucket<V> nextBucket = bucketList.get(i);
1071         while (--i > 0) {
1072             Bucket<V> bucket = bucketList.get(i);
1073             if (bucket.displayBucket != null) {
1074                 continue;  // skip invisible buckets
1075             }
1076             if (bucket.labelType == LabelType.INFLOW) {
1077                 if (nextBucket.labelType != LabelType.NORMAL) {
1078                     bucket.displayBucket = nextBucket;
1079                     continue;
1080                 }
1081             }
1082             nextBucket = bucket;
1083         }
1084 
1085         ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1086         for (Bucket<V> bucket : bucketList) {
1087             if (bucket.displayBucket == null) {
1088                 publicBucketList.add(bucket);
1089             }
1090         }
1091         return new BucketList<V>(bucketList, publicBucketList);
1092     }
1093 
1094     private static class BucketList<V> implements Iterable<Bucket<V>> {
1095         private final ArrayList<Bucket<V>> bucketList;
1096         private final List<Bucket<V>> immutableVisibleList;
1097 
BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList)1098         private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1099             this.bucketList = bucketList;
1100 
1101             int displayIndex = 0;
1102             for (Bucket<V> bucket : publicBucketList) {
1103                 bucket.displayIndex = displayIndex++;
1104             }
1105             immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1106         }
1107 
getBucketCount()1108         private int getBucketCount() {
1109             return immutableVisibleList.size();
1110         }
1111 
getBucketIndex(CharSequence name, Collator collatorPrimaryOnly)1112         private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1113             // binary search
1114             int start = 0;
1115             int limit = bucketList.size();
1116             while ((start + 1) < limit) {
1117                 int i = (start + limit) / 2;
1118                 Bucket<V> bucket = bucketList.get(i);
1119                 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1120                 if (nameVsBucket < 0) {
1121                     limit = i;
1122                 } else {
1123                     start = i;
1124                 }
1125             }
1126             Bucket<V> bucket = bucketList.get(start);
1127             if (bucket.displayBucket != null) {
1128                 bucket = bucket.displayBucket;
1129             }
1130             return bucket.displayIndex;
1131         }
1132 
1133         /**
1134          * Private iterator over all the buckets, visible and invisible
1135          */
fullIterator()1136         private Iterator<Bucket<V>> fullIterator() {
1137             return bucketList.iterator();
1138         }
1139 
1140         /**
1141          * Iterator over just the visible buckets.
1142          */
iterator()1143         public Iterator<Bucket<V>> iterator() {
1144             return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1145         }
1146     }
1147 
hasMultiplePrimaryWeights( RuleBasedCollator coll, long variableTop, String s)1148     private static boolean hasMultiplePrimaryWeights(
1149             RuleBasedCollator coll, long variableTop, String s) {
1150         long[] ces = coll.internalGetCEs(s);
1151         boolean seenPrimary = false;
1152         for (int i = 0; i < ces.length; ++i) {
1153             long ce = ces[i];
1154             long p = ce >>> 32;
1155             if (p > variableTop) {
1156                 // not primary ignorable
1157                 if (seenPrimary) {
1158                     return true;
1159                 }
1160                 seenPrimary = true;
1161             }
1162         }
1163         return false;
1164     }
1165 
1166     // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
1167     private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
1168     private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
1169     private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
1170     private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
1171     private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
1172     private static final int GC_L_MASK =
1173             GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
1174     private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
1175 
1176     /**
1177      * Return a list of the first character in each script. Only exposed for testing.
1178      *
1179      * @return list of first characters in each script
1180      * @deprecated This API is ICU internal, only for testing.
1181      * @hide original deprecated declaration
1182      * @hide draft / provisional / internal are hidden on Android
1183      */
1184     @Deprecated
getFirstCharactersInScripts()1185     public List<String> getFirstCharactersInScripts() {
1186         List<String> dest = new ArrayList<String>(200);
1187         // Fetch the script-first-primary contractions which are defined in the root collator.
1188         // They all start with U+FDD1.
1189         UnicodeSet set = new UnicodeSet();
1190         collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
1191         if (set.isEmpty()) {
1192             throw new UnsupportedOperationException(
1193                     "AlphabeticIndex requires script-first-primary contractions");
1194         }
1195         for (String boundary : set) {
1196             int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
1197             if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
1198                 // Ignore boundaries for the special reordering groups.
1199                 // Take only those for "real scripts" (where the sample character is a Letter,
1200                 // and the one for unassigned implicit weights (Cn).
1201                 continue;
1202             }
1203             dest.add(boundary);
1204         }
1205         return dest;
1206     }
1207 }
1208