1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.util.ArrayList; 12 import java.util.Collections; 13 import java.util.Comparator; 14 import java.util.Iterator; 15 import java.util.List; 16 import java.util.Locale; 17 18 import com.ibm.icu.lang.UCharacter; 19 import com.ibm.icu.text.AlphabeticIndex.Bucket; 20 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType; 21 import com.ibm.icu.util.LocaleData; 22 import com.ibm.icu.util.ULocale; 23 24 /** 25 * AlphabeticIndex supports the creation of a UI index appropriate for a given language. 26 * It can support either direct use, or use with a client that doesn't support localized collation. 27 * The following is an example of what an index might look like in a UI: 28 * 29 * <pre> 30 * <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b> 31 * 32 * <b>A</b> 33 * Addison 34 * Albertson 35 * Azensky 36 * <b>B</b> 37 * Baecker 38 * ... 39 * </pre> 40 * 41 * The class can generate a list of labels for use as a UI "index", that is, a list of 42 * clickable characters (or character sequences) that allow the user to see a segment 43 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in 44 * the target list, where everything in the bucket is greater than or equal to the character 45 * (according to the locale's collation). Strings can be added to the index; 46 * they will be in sorted order in the right bucket. 47 * <p> 48 * The class also supports having buckets for strings before the first (underflow), 49 * after the last (overflow), and between scripts (inflow). For example, if the index 50 * is constructed with labels for Russian and English, Greek characters would fall 51 * into an inflow bucket between the other two scripts. 52 * 53 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters 54 * as well as characters from the user's language, 55 * then it is a good idea to call addLabels(ULocale.English). 56 * 57 * <h2>Direct Use</h2> 58 * <p>The following shows an example of building an index directly. 59 * The "show..." methods below are just to illustrate usage. 60 * 61 * <pre> 62 * // Create a simple index where the values for the strings are Integers, and add the strings 63 * 64 * AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale); 65 * int counter = 0; 66 * for (String item : test) { 67 * index.addRecord(item, counter++); 68 * } 69 * ... 70 * // Show index at top. We could skip or gray out empty buckets 71 * 72 * for (AlphabeticIndex.Bucket<Integer> bucket : index) { 73 * if (showAll || bucket.size() != 0) { 74 * showLabelAtTop(UI, bucket.getLabel()); 75 * } 76 * } 77 * ... 78 * // Show the buckets with their contents, skipping empty buckets 79 * 80 * for (AlphabeticIndex.Bucket<Integer> bucket : index) { 81 * if (bucket.size() != 0) { 82 * showLabelInList(UI, bucket.getLabel()); 83 * for (AlphabeticIndex.Record<Integer> item : bucket) { 84 * showIndexedItem(UI, item.getName(), item.getData()); 85 * } 86 * </pre> 87 * 88 * The caller can build different UIs using this class. 89 * For example, an index character could be omitted or grayed-out 90 * if its bucket is empty. Small buckets could also be combined based on size, such as: 91 * 92 * <pre> 93 * <b>... A-F G-N O-Z ...</b> 94 * </pre> 95 * 96 * <h2>Client Support</h2> 97 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself, 98 * to support sorting on a client that doesn't support AlphabeticIndex functionality. 99 * 100 * <p>The ImmutableIndex is both immutable and thread-safe. 101 * The corresponding AlphabeticIndex methods are not thread-safe because 102 * they "lazily" build the index buckets. 103 * <ul> 104 * <li>ImmutableIndex.getBucket(index) provides random access to all 105 * buckets and their labels and label types. 106 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class 107 * can be used to get a list of the labels, 108 * such as "...", "A", "B",..., and send that list to the client. 109 * <li>When the client has a new name, it sends that name to the server. 110 * The server needs to call the following methods, 111 * and communicate the bucketIndex and collationKey back to the client. 112 * 113 * <pre> 114 * int bucketIndex = index.getBucketIndex(name); 115 * String label = immutableIndex.getBucket(bucketIndex).getLabel(); // optional 116 * RawCollationKey collationKey = collator.getRawCollationKey(name, null); 117 * </pre> 118 * 119 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a 120 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li> 121 * </ul> 122 * 123 * @author Mark Davis 124 * @stable ICU 4.8 125 */ 126 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> { 127 /** 128 * Prefix string for Chinese index buckets. 129 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 130 */ 131 private static final String BASE = "\uFDD0"; 132 133 private static final char CGJ = '\u034F'; 134 135 private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0); 136 137 private final RuleBasedCollator collatorOriginal; 138 private final RuleBasedCollator collatorPrimaryOnly; 139 private RuleBasedCollator collatorExternal; 140 141 // Comparator for records, so that the Record class can be static. 142 private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() { 143 @Override 144 public int compare(Record<V> o1, Record<V> o2) { 145 return collatorOriginal.compare(o1.name, o2.name); 146 } 147 }; 148 149 private final List<String> firstCharsInScripts; 150 151 // We accumulate these as we build up the input parameters 152 private final UnicodeSet initialLabels = new UnicodeSet(); 153 private List<Record<V>> inputList; 154 155 // Lazy evaluated: null means that we have not built yet. 156 private BucketList<V> buckets; 157 158 private String overflowLabel = "\u2026"; 159 private String underflowLabel = "\u2026"; 160 private String inflowLabel = "\u2026"; 161 162 /** 163 * Immutable, thread-safe version of {@link AlphabeticIndex}. 164 * This class provides thread-safe methods for bucketing, 165 * and random access to buckets and their properties, 166 * but does not offer adding records to the index. 167 * 168 * @param <V> The Record value type is unused. It can be omitted for this class 169 * if it was omitted for the AlphabeticIndex that built it. 170 * @stable ICU 51 171 */ 172 public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> { 173 private final BucketList<V> buckets; 174 private final Collator collatorPrimaryOnly; 175 ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly)176 private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) { 177 this.buckets = bucketList; 178 this.collatorPrimaryOnly = collatorPrimaryOnly; 179 } 180 181 /** 182 * Returns the number of index buckets and labels, including underflow/inflow/overflow. 183 * 184 * @return the number of index buckets 185 * @stable ICU 51 186 */ getBucketCount()187 public int getBucketCount() { 188 return buckets.getBucketCount(); 189 } 190 191 /** 192 * Finds the index bucket for the given name and returns the number of that bucket. 193 * Use {@link #getBucket(int)} to get the bucket's properties. 194 * 195 * @param name the string to be sorted into an index bucket 196 * @return the bucket number for the name 197 * @stable ICU 51 198 */ getBucketIndex(CharSequence name)199 public int getBucketIndex(CharSequence name) { 200 return buckets.getBucketIndex(name, collatorPrimaryOnly); 201 } 202 203 /** 204 * Returns the index-th bucket. Returns null if the index is out of range. 205 * 206 * @param index bucket number 207 * @return the index-th bucket 208 * @stable ICU 51 209 */ getBucket(int index)210 public Bucket<V> getBucket(int index) { 211 if (0 <= index && index < buckets.getBucketCount()) { 212 return buckets.immutableVisibleList.get(index); 213 } else { 214 return null; 215 } 216 } 217 218 /** 219 * {@inheritDoc} 220 * @stable ICU 51 221 */ 222 @Override iterator()223 public Iterator<Bucket<V>> iterator() { 224 return buckets.iterator(); 225 } 226 } 227 228 /** 229 * Create the index object. 230 * 231 * @param locale 232 * The locale for the index. 233 * @stable ICU 4.8 234 */ AlphabeticIndex(ULocale locale)235 public AlphabeticIndex(ULocale locale) { 236 this(locale, null); 237 } 238 239 /** 240 * Create the index object. 241 * 242 * @param locale 243 * The locale for the index. 244 * @stable ICU 4.8 245 */ AlphabeticIndex(Locale locale)246 public AlphabeticIndex(Locale locale) { 247 this(ULocale.forLocale(locale), null); 248 } 249 250 /** 251 * Create an AlphabeticIndex that uses a specific collator. 252 * 253 * <p>The index will be created with no labels; the addLabels() function must be called 254 * after creation to add the desired labels to the index. 255 * 256 * <p>The index will work directly with the supplied collator. If the caller will need to 257 * continue working with the collator it should be cloned first, so that the 258 * collator provided to the AlphabeticIndex remains unchanged after creation of the index. 259 * 260 * @param collator The collator to use to order the contents of this index. 261 * @stable ICU 51 262 */ AlphabeticIndex(RuleBasedCollator collator)263 public AlphabeticIndex(RuleBasedCollator collator) { 264 this(null, collator); 265 } 266 267 /** 268 * Internal constructor containing implementation used by public constructors. 269 */ AlphabeticIndex(ULocale locale, RuleBasedCollator collator)270 private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) { 271 collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale); 272 try { 273 collatorPrimaryOnly = collatorOriginal.cloneAsThawed(); 274 } catch (Exception e) { 275 // should never happen 276 throw new IllegalStateException("Collator cannot be cloned", e); 277 } 278 collatorPrimaryOnly.setStrength(Collator.PRIMARY); 279 collatorPrimaryOnly.freeze(); 280 281 firstCharsInScripts = getFirstCharactersInScripts(); 282 Collections.sort(firstCharsInScripts, collatorPrimaryOnly); 283 // Guard against a degenerate collator where 284 // some script boundary strings are primary ignorable. 285 for (;;) { 286 if (firstCharsInScripts.isEmpty()) { 287 throw new IllegalArgumentException( 288 "AlphabeticIndex requires some non-ignorable script boundary strings"); 289 } 290 if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) { 291 firstCharsInScripts.remove(0); 292 } else { 293 break; 294 } 295 } 296 297 // Chinese index characters, which are specific to each of the several Chinese tailorings, 298 // take precedence over the single locale data exemplar set per language. 299 if (!addChineseIndexCharacters() && locale != null) { 300 addIndexExemplars(locale); 301 } 302 } 303 304 /** 305 * Add more index characters (aside from what are in the locale) 306 * @param additions additional characters to add to the index, such as A-Z. 307 * @return this, for chaining 308 * @stable ICU 4.8 309 */ addLabels(UnicodeSet additions)310 public AlphabeticIndex<V> addLabels(UnicodeSet additions) { 311 initialLabels.addAll(additions); 312 buckets = null; 313 return this; 314 } 315 316 /** 317 * Add more index characters (aside from what are in the locale) 318 * @param additions additional characters to add to the index, such as those in Swedish. 319 * @return this, for chaining 320 * @stable ICU 4.8 321 */ addLabels(ULocale... additions)322 public AlphabeticIndex<V> addLabels(ULocale... additions) { 323 for (ULocale addition : additions) { 324 addIndexExemplars(addition); 325 } 326 buckets = null; 327 return this; 328 } 329 330 /** 331 * Add more index characters (aside from what are in the locale) 332 * @param additions additional characters to add to the index, such as those in Swedish. 333 * @return this, for chaining 334 * @stable ICU 4.8 335 */ addLabels(Locale... additions)336 public AlphabeticIndex<V> addLabels(Locale... additions) { 337 for (Locale addition : additions) { 338 addIndexExemplars(ULocale.forLocale(addition)); 339 } 340 buckets = null; 341 return this; 342 } 343 344 /** 345 * Set the overflow label 346 * @param overflowLabel see class description 347 * @return this, for chaining 348 * @stable ICU 4.8 349 */ setOverflowLabel(String overflowLabel)350 public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) { 351 this.overflowLabel = overflowLabel; 352 buckets = null; 353 return this; 354 } 355 356 /** 357 * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ... 358 * 359 * @return underflow label 360 * @stable ICU 4.8 361 */ getUnderflowLabel()362 public String getUnderflowLabel() { 363 return underflowLabel; // TODO get localized version 364 } 365 366 367 /** 368 * Set the underflowLabel label 369 * @param underflowLabel see class description 370 * @return this, for chaining 371 * @stable ICU 4.8 372 */ setUnderflowLabel(String underflowLabel)373 public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) { 374 this.underflowLabel = underflowLabel; 375 buckets = null; 376 return this; 377 } 378 379 /** 380 * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C 381 * 382 * @return overflow label 383 * @stable ICU 4.8 384 */ getOverflowLabel()385 public String getOverflowLabel() { 386 return overflowLabel; // TODO get localized version 387 } 388 389 390 /** 391 * Set the inflowLabel label 392 * @param inflowLabel see class description 393 * @return this, for chaining 394 * @stable ICU 4.8 395 */ setInflowLabel(String inflowLabel)396 public AlphabeticIndex<V> setInflowLabel(String inflowLabel) { 397 this.inflowLabel = inflowLabel; 398 buckets = null; 399 return this; 400 } 401 402 /** 403 * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels 404 * for Latin and Greek are used: X Y Z ... Α Β Γ. 405 * 406 * @return inflow label 407 * @stable ICU 4.8 408 */ getInflowLabel()409 public String getInflowLabel() { 410 return inflowLabel; // TODO get localized version 411 } 412 413 414 /** 415 * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount(). 416 * 417 * @return maxLabelCount maximum number of labels. 418 * @stable ICU 4.8 419 */ getMaxLabelCount()420 public int getMaxLabelCount() { 421 return maxLabelCount; 422 } 423 424 /** 425 * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see 426 * getBucketCount(). 427 * 428 * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every 429 * nth item is removed to bring the count down. A more sophisticated mechanism may be available in the 430 * future. 431 * @return this, for chaining 432 * @stable ICU 4.8 433 */ setMaxLabelCount(int maxLabelCount)434 public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) { 435 this.maxLabelCount = maxLabelCount; 436 buckets = null; 437 return this; 438 } 439 440 /** 441 * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique, 442 * and sort differently, and that the overall list is small enough. 443 */ initLabels()444 private List<String> initLabels() { 445 Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance(); 446 List<String> indexCharacters = new ArrayList<String>(); 447 448 String firstScriptBoundary = firstCharsInScripts.get(0); 449 String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1); 450 451 // We make a sorted array of elements. 452 // Some of the input may be redundant. 453 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 454 // We filter out those cases. 455 for (String item : initialLabels) { 456 boolean checkDistinct; 457 if (!UTF16.hasMoreCodePointsThan(item, 1)) { 458 checkDistinct = false; 459 } else if(item.charAt(item.length() - 1) == '*' && 460 item.charAt(item.length() - 2) != '*') { 461 // Use a label if it is marked with one trailing star, 462 // even if the label string sorts the same when all contractions are suppressed. 463 item = item.substring(0, item.length() - 1); 464 checkDistinct = false; 465 } else { 466 checkDistinct = true; 467 } 468 if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) { 469 // Ignore a primary-ignorable or non-alphabetic index character. 470 } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) { 471 // Ignore an index character that will land in the overflow bucket. 472 } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) { 473 // Ignore a multi-code point index character that does not sort distinctly 474 // from the sequence of its separate characters. 475 } else { 476 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly); 477 if (insertionPoint < 0) { 478 indexCharacters.add(~insertionPoint, item); 479 } else { 480 String itemAlreadyIn = indexCharacters.get(insertionPoint); 481 if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) { 482 indexCharacters.set(insertionPoint, item); 483 } 484 } 485 } 486 } 487 488 // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element 489 490 final int size = indexCharacters.size() - 1; 491 if (size > maxLabelCount) { 492 int count = 0; 493 int old = -1; 494 for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) { 495 ++count; 496 it.next(); 497 final int bump = count * maxLabelCount / size; 498 if (bump == old) { 499 it.remove(); 500 } else { 501 old = bump; 502 } 503 } 504 } 505 506 return indexCharacters; 507 } 508 fixLabel(String current)509 private static String fixLabel(String current) { 510 if (!current.startsWith(BASE)) { 511 return current; 512 } 513 int rest = current.charAt(BASE.length()); 514 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 515 return (rest-0x2800) + "\u5283"; 516 } 517 return current.substring(BASE.length()); 518 } 519 520 /** 521 * This method is called to get the index exemplars. Normally these come from the locale directly, 522 * but if they aren't available, we have to synthesize them. 523 */ addIndexExemplars(ULocale locale)524 private void addIndexExemplars(ULocale locale) { 525 UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX); 526 if (exemplars != null && !exemplars.isEmpty()) { 527 initialLabels.addAll(exemplars); 528 return; 529 } 530 531 // The locale data did not include explicit Index characters. 532 // Synthesize a set of them from the locale's standard exemplar characters. 533 exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD); 534 535 exemplars = exemplars.cloneAsThawed(); 536 // question: should we add auxiliary exemplars? 537 if (exemplars.containsSome('a', 'z') || exemplars.isEmpty()) { 538 exemplars.addAll('a', 'z'); 539 } 540 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 541 // cut down to small list 542 exemplars.remove(0xAC00, 0xD7A3). 543 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 544 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 545 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 546 add(0xD30C).add(0xD558); 547 } 548 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 549 // cut down to small list 550 // make use of the fact that Ethiopic is allocated in 8's, where 551 // the base is 0 mod 8. 552 UnicodeSet ethiopic = new UnicodeSet("[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"); 553 ethiopic.retainAll(exemplars); 554 exemplars.remove('ሀ', 0x137F).addAll(ethiopic); 555 } 556 557 // Upper-case any that aren't already so. 558 // (We only do this for synthesized index characters.) 559 for (String item : exemplars) { 560 initialLabels.add(UCharacter.toUpperCase(locale, item)); 561 } 562 } 563 564 /** 565 * Add Chinese index characters from the tailoring. 566 */ addChineseIndexCharacters()567 private boolean addChineseIndexCharacters() { 568 UnicodeSet contractions = new UnicodeSet(); 569 try { 570 collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions); 571 } catch (Exception e) { 572 return false; 573 } 574 if (contractions.isEmpty()) { return false; } 575 initialLabels.addAll(contractions); 576 for (String s : contractions) { 577 assert(s.startsWith(BASE)); 578 char c = s.charAt(s.length() - 1); 579 if (0x41 <= c && c <= 0x5A) { // A-Z 580 // There are Pinyin labels, add ASCII A-Z labels as well. 581 initialLabels.add(0x41, 0x5A); // A-Z 582 break; 583 } 584 } 585 return true; 586 } 587 588 /** 589 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 590 * <p>This is used to test whether contractions sort differently from their components. 591 */ separated(String item)592 private String separated(String item) { 593 StringBuilder result = new StringBuilder(); 594 // add a CGJ except within surrogates 595 char last = item.charAt(0); 596 result.append(last); 597 for (int i = 1; i < item.length(); ++i) { 598 char ch = item.charAt(i); 599 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { 600 result.append(CGJ); 601 } 602 result.append(ch); 603 last = ch; 604 } 605 return result.toString(); 606 } 607 608 /** 609 * Builds an immutable, thread-safe version of this instance, without data records. 610 * 611 * @return an immutable index instance 612 * @stable ICU 51 613 */ buildImmutableIndex()614 public ImmutableIndex<V> buildImmutableIndex() { 615 // The current AlphabeticIndex Java code never modifies the bucket list once built. 616 // If it contains no records, we can use it. 617 // addRecord() sets buckets=null rather than inserting the new record into it. 618 BucketList<V> immutableBucketList; 619 if (inputList != null && !inputList.isEmpty()) { 620 // We need a bucket list with no records. 621 immutableBucketList = createBucketList(); 622 } else { 623 if (buckets == null) { 624 buckets = createBucketList(); 625 } 626 immutableBucketList = buckets; 627 } 628 return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly); 629 } 630 631 /** 632 * Get the labels. 633 * 634 * @return The list of bucket labels, after processing. 635 * @stable ICU 4.8 636 */ getBucketLabels()637 public List<String> getBucketLabels() { 638 initBuckets(); 639 ArrayList<String> result = new ArrayList<String>(); 640 for (Bucket<V> bucket : buckets) { 641 result.add(bucket.getLabel()); 642 } 643 return result; 644 } 645 646 /** 647 * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and 648 * then stored. The next time it is accessed, the same instance is returned. 649 * <p> 650 * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without 651 * synchronizing.</i></b> 652 * 653 * @return a clone of the collator used internally 654 * @stable ICU 4.8 655 */ getCollator()656 public RuleBasedCollator getCollator() { 657 if (collatorExternal == null) { 658 try { 659 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone()); 660 } catch (Exception e) { 661 // should never happen 662 throw new IllegalStateException("Collator cannot be cloned", e); 663 } 664 } 665 return collatorExternal; 666 } 667 668 /** 669 * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort 670 * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added: 671 * the first added comes first. 672 * 673 * @param name 674 * Name, such as a name 675 * @param data 676 * Data, such as an address or link 677 * @return this, for chaining 678 * @stable ICU 4.8 679 */ addRecord(CharSequence name, V data)680 public AlphabeticIndex<V> addRecord(CharSequence name, V data) { 681 // TODO instead of invalidating, just add to unprocessed list. 682 buckets = null; // invalidate old bucketlist 683 if (inputList == null) { 684 inputList = new ArrayList<Record<V>>(); 685 } 686 inputList.add(new Record<V>(name, data)); 687 return this; 688 } 689 690 /** 691 * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling 692 * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask 693 * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that 694 * information, it can put the name into the right bucket, and sort it within that bucket, without having access to 695 * the index or collator. 696 * <p> 697 * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if 698 * those are changed, then the bucket number and sort key must be regenerated. 699 * 700 * @param name 701 * Name, such as a name 702 * @return the bucket index for the name 703 * @stable ICU 4.8 704 */ getBucketIndex(CharSequence name)705 public int getBucketIndex(CharSequence name) { 706 initBuckets(); 707 return buckets.getBucketIndex(name, collatorPrimaryOnly); 708 } 709 710 /** 711 * Clear the index. 712 * 713 * @return this, for chaining 714 * @stable ICU 4.8 715 */ clearRecords()716 public AlphabeticIndex<V> clearRecords() { 717 if (inputList != null && !inputList.isEmpty()) { 718 inputList.clear(); 719 buckets = null; 720 } 721 return this; 722 } 723 724 /** 725 * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s). 726 * 727 * @return number of buckets 728 * @stable ICU 4.8 729 */ getBucketCount()730 public int getBucketCount() { 731 initBuckets(); 732 return buckets.getBucketCount(); 733 } 734 735 /** 736 * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets. 737 * 738 * @return total number of records in buckets 739 * @stable ICU 4.8 740 */ getRecordCount()741 public int getRecordCount() { 742 return inputList != null ? inputList.size() : 0; 743 } 744 745 /** 746 * Return an iterator over the buckets. 747 * 748 * @return iterator over buckets. 749 * @stable ICU 4.8 750 */ 751 @Override iterator()752 public Iterator<Bucket<V>> iterator() { 753 initBuckets(); 754 return buckets.iterator(); 755 } 756 757 /** 758 * Creates an index, and buckets and sorts the list of records into the index. 759 */ initBuckets()760 private void initBuckets() { 761 if (buckets != null) { 762 return; 763 } 764 buckets = createBucketList(); 765 if (inputList == null || inputList.isEmpty()) { 766 return; 767 } 768 769 // Sort the records by name. 770 // Stable sort preserves input order of collation duplicates. 771 Collections.sort(inputList, recordComparator); 772 773 // Now, we traverse all of the input, which is now sorted. 774 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 775 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 776 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 777 // we need to improve it for that case. 778 779 Iterator<Bucket<V>> bucketIterator = buckets.fullIterator(); 780 Bucket<V> currentBucket = bucketIterator.next(); 781 Bucket<V> nextBucket; 782 String upperBoundary; 783 if (bucketIterator.hasNext()) { 784 nextBucket = bucketIterator.next(); 785 upperBoundary = nextBucket.lowerBoundary; 786 } else { 787 nextBucket = null; 788 upperBoundary = null; 789 } 790 for (Record<V> r : inputList) { 791 // if the current bucket isn't the right one, find the one that is 792 // We have a special flag for the last bucket so that we don't look any further 793 while (upperBoundary != null && 794 collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) { 795 currentBucket = nextBucket; 796 // now reset the boundary that we compare against 797 if (bucketIterator.hasNext()) { 798 nextBucket = bucketIterator.next(); 799 upperBoundary = nextBucket.lowerBoundary; 800 } else { 801 upperBoundary = null; 802 } 803 } 804 // now put the record into the bucket. 805 Bucket<V> bucket = currentBucket; 806 if (bucket.displayBucket != null) { 807 bucket = bucket.displayBucket; 808 } 809 if (bucket.records == null) { 810 bucket.records = new ArrayList<Record<V>>(); 811 } 812 bucket.records.add(r); 813 } 814 } 815 816 private int maxLabelCount = 99; 817 818 /** 819 * Returns true if one index character string is "better" than the other. 820 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 821 * better, and otherwise binary-less-than is better. 822 */ isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other)823 private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) { 824 // This is called with primary-equal strings, but never with one.equals(other). 825 String n1 = nfkdNormalizer.normalize(one); 826 String n2 = nfkdNormalizer.normalize(other); 827 int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length()); 828 if (result != 0) { 829 return result < 0; 830 } 831 result = binaryCmp.compare(n1, n2); 832 if (result != 0) { 833 return result < 0; 834 } 835 return binaryCmp.compare(one, other) < 0; 836 } 837 838 /** 839 * A (name, data) pair, to be sorted by name into one of the index buckets. 840 * The user data is not used by the index implementation. 841 * 842 * @stable ICU 4.8 843 */ 844 public static class Record<V> { 845 private final CharSequence name; 846 private final V data; 847 Record(CharSequence name, V data)848 private Record(CharSequence name, V data) { 849 this.name = name; 850 this.data = data; 851 } 852 853 /** 854 * Get the name 855 * 856 * @return the name 857 * @stable ICU 4.8 858 */ getName()859 public CharSequence getName() { 860 return name; 861 } 862 863 /** 864 * Get the data 865 * 866 * @return the data 867 * @stable ICU 4.8 868 */ getData()869 public V getData() { 870 return data; 871 } 872 873 /** 874 * Standard toString() 875 * @stable ICU 4.8 876 */ 877 @Override toString()878 public String toString() { 879 return name + "=" + data; 880 } 881 } 882 883 /** 884 * An index "bucket" with a label string and type. 885 * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)} 886 * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)}, 887 * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)}, 888 * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record 889 * into a bucket according to the record's name. 890 * 891 * @param <V> 892 * Data type 893 * @stable ICU 4.8 894 */ 895 public static class Bucket<V> implements Iterable<Record<V>> { 896 private final String label; 897 private final String lowerBoundary; 898 private final LabelType labelType; 899 private Bucket<V> displayBucket; 900 private int displayIndex; 901 private List<Record<V>> records; 902 903 /** 904 * Type of the label 905 * 906 * @stable ICU 4.8 907 */ 908 public enum LabelType { 909 /** 910 * Normal 911 * @stable ICU 4.8 912 */ 913 NORMAL, 914 /** 915 * Underflow (before the first) 916 * @stable ICU 4.8 917 */ 918 UNDERFLOW, 919 /** 920 * Inflow (between scripts) 921 * @stable ICU 4.8 922 */ 923 INFLOW, 924 /** 925 * Overflow (after the last) 926 * @stable ICU 4.8 927 */ 928 OVERFLOW 929 } 930 931 /** 932 * Set up the bucket. 933 * 934 * @param label 935 * label for the bucket 936 * @param labelType 937 * is an underflow, overflow, or inflow bucket 938 * @stable ICU 4.8 939 */ Bucket(String label, String lowerBoundary, LabelType labelType)940 private Bucket(String label, String lowerBoundary, LabelType labelType) { 941 this.label = label; 942 this.lowerBoundary = lowerBoundary; 943 this.labelType = labelType; 944 } 945 946 /** 947 * Get the label 948 * 949 * @return label for the bucket 950 * @stable ICU 4.8 951 */ getLabel()952 public String getLabel() { 953 return label; 954 } 955 956 /** 957 * Is a normal, underflow, overflow, or inflow bucket 958 * 959 * @return is an underflow, overflow, or inflow bucket 960 * @stable ICU 4.8 961 */ getLabelType()962 public LabelType getLabelType() { 963 return labelType; 964 } 965 966 /** 967 * Get the number of records in the bucket. 968 * 969 * @return number of records in bucket 970 * @stable ICU 4.8 971 */ size()972 public int size() { 973 return records == null ? 0 : records.size(); 974 } 975 976 /** 977 * Iterator over the records in the bucket 978 * @stable ICU 4.8 979 */ 980 @Override iterator()981 public Iterator<Record<V>> iterator() { 982 if (records == null) { 983 return Collections.<Record<V>>emptyList().iterator(); 984 } 985 return records.iterator(); 986 } 987 988 /** 989 * Standard toString() 990 * @stable ICU 4.8 991 */ 992 @Override toString()993 public String toString() { 994 return "{" + 995 "labelType=" + labelType 996 + ", " + 997 "lowerBoundary=" + lowerBoundary 998 + ", " + 999 "label=" + label 1000 + "}" 1001 ; 1002 } 1003 } 1004 createBucketList()1005 private BucketList<V> createBucketList() { 1006 // Initialize indexCharacters. 1007 List<String> indexCharacters = initLabels(); 1008 1009 // Variables for hasMultiplePrimaryWeights(). 1010 long variableTop; 1011 if (collatorPrimaryOnly.isAlternateHandlingShifted()) { 1012 variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL; 1013 } else { 1014 variableTop = 0; 1015 } 1016 boolean hasInvisibleBuckets = false; 1017 1018 // Helper arrays for Chinese Pinyin collation. 1019 @SuppressWarnings({ "rawtypes", "unchecked" }) 1020 Bucket<V>[] asciiBuckets = new Bucket[26]; 1021 @SuppressWarnings({ "rawtypes", "unchecked" }) 1022 Bucket<V>[] pinyinBuckets = new Bucket[26]; 1023 boolean hasPinyin = false; 1024 1025 ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>(); 1026 1027 // underflow bucket 1028 bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW)); 1029 1030 // fix up the list, adding underflow, additions, overflow 1031 // Insert inflow labels as needed. 1032 int scriptIndex = -1; 1033 String scriptUpperBoundary = ""; 1034 for (String current : indexCharacters) { 1035 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) { 1036 // We crossed the script boundary into a new script. 1037 String inflowBoundary = scriptUpperBoundary; 1038 boolean skippedScript = false; 1039 for (;;) { 1040 scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex); 1041 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) { 1042 break; 1043 } 1044 skippedScript = true; 1045 } 1046 if (skippedScript && bucketList.size() > 1) { 1047 // We are skipping one or more scripts, 1048 // and we are not just getting out of the underflow label. 1049 bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary, 1050 LabelType.INFLOW)); 1051 } 1052 } 1053 // Add a bucket with the current label. 1054 Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL); 1055 bucketList.add(bucket); 1056 // Remember ASCII and Pinyin buckets for Pinyin redirects. 1057 char c; 1058 if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') { 1059 asciiBuckets[c - 'A'] = bucket; 1060 } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) && 1061 'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') { 1062 pinyinBuckets[c - 'A'] = bucket; 1063 hasPinyin = true; 1064 } 1065 // Check for multiple primary weights. 1066 if (!current.startsWith(BASE) && 1067 hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) && 1068 !current.endsWith("\uffff")) { 1069 // "Æ" or "Sch" etc. 1070 for (int i = bucketList.size() - 2;; --i) { 1071 Bucket<V> singleBucket = bucketList.get(i); 1072 if (singleBucket.labelType != LabelType.NORMAL) { 1073 // There is no single-character bucket since the last 1074 // underflow or inflow label. 1075 break; 1076 } 1077 if (singleBucket.displayBucket == null && 1078 !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) { 1079 // Add an invisible bucket that redirects strings greater than the expansion 1080 // to the previous single-character bucket. 1081 // For example, after ... Q R S Sch we add Sch\uFFFF->S 1082 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 1083 bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL); 1084 bucket.displayBucket = singleBucket; 1085 bucketList.add(bucket); 1086 hasInvisibleBuckets = true; 1087 break; 1088 } 1089 } 1090 } 1091 } 1092 if (bucketList.size() == 1) { 1093 // No real labels, show only the underflow label. 1094 return new BucketList<V>(bucketList, bucketList); 1095 } 1096 // overflow bucket 1097 bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final 1098 1099 if (hasPinyin) { 1100 // Redirect Pinyin buckets. 1101 Bucket<V> asciiBucket = null; 1102 for (int i = 0; i < 26; ++i) { 1103 if (asciiBuckets[i] != null) { 1104 asciiBucket = asciiBuckets[i]; 1105 } 1106 if (pinyinBuckets[i] != null && asciiBucket != null) { 1107 pinyinBuckets[i].displayBucket = asciiBucket; 1108 hasInvisibleBuckets = true; 1109 } 1110 } 1111 } 1112 1113 if (!hasInvisibleBuckets) { 1114 return new BucketList<V>(bucketList, bucketList); 1115 } 1116 // Merge inflow buckets that are visually adjacent. 1117 // Iterate backwards: Merge inflow into overflow rather than the other way around. 1118 int i = bucketList.size() - 1; 1119 Bucket<V> nextBucket = bucketList.get(i); 1120 while (--i > 0) { 1121 Bucket<V> bucket = bucketList.get(i); 1122 if (bucket.displayBucket != null) { 1123 continue; // skip invisible buckets 1124 } 1125 if (bucket.labelType == LabelType.INFLOW) { 1126 if (nextBucket.labelType != LabelType.NORMAL) { 1127 bucket.displayBucket = nextBucket; 1128 continue; 1129 } 1130 } 1131 nextBucket = bucket; 1132 } 1133 1134 ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>(); 1135 for (Bucket<V> bucket : bucketList) { 1136 if (bucket.displayBucket == null) { 1137 publicBucketList.add(bucket); 1138 } 1139 } 1140 return new BucketList<V>(bucketList, publicBucketList); 1141 } 1142 1143 private static class BucketList<V> implements Iterable<Bucket<V>> { 1144 private final ArrayList<Bucket<V>> bucketList; 1145 private final List<Bucket<V>> immutableVisibleList; 1146 BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList)1147 private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) { 1148 this.bucketList = bucketList; 1149 1150 int displayIndex = 0; 1151 for (Bucket<V> bucket : publicBucketList) { 1152 bucket.displayIndex = displayIndex++; 1153 } 1154 immutableVisibleList = Collections.unmodifiableList(publicBucketList); 1155 } 1156 getBucketCount()1157 private int getBucketCount() { 1158 return immutableVisibleList.size(); 1159 } 1160 getBucketIndex(CharSequence name, Collator collatorPrimaryOnly)1161 private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) { 1162 // binary search 1163 int start = 0; 1164 int limit = bucketList.size(); 1165 while ((start + 1) < limit) { 1166 int i = (start + limit) / 2; 1167 Bucket<V> bucket = bucketList.get(i); 1168 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary); 1169 if (nameVsBucket < 0) { 1170 limit = i; 1171 } else { 1172 start = i; 1173 } 1174 } 1175 Bucket<V> bucket = bucketList.get(start); 1176 if (bucket.displayBucket != null) { 1177 bucket = bucket.displayBucket; 1178 } 1179 return bucket.displayIndex; 1180 } 1181 1182 /** 1183 * Private iterator over all the buckets, visible and invisible 1184 */ fullIterator()1185 private Iterator<Bucket<V>> fullIterator() { 1186 return bucketList.iterator(); 1187 } 1188 1189 /** 1190 * Iterator over just the visible buckets. 1191 */ 1192 @Override iterator()1193 public Iterator<Bucket<V>> iterator() { 1194 return immutableVisibleList.iterator(); // use immutable list to prevent remove(). 1195 } 1196 } 1197 hasMultiplePrimaryWeights( RuleBasedCollator coll, long variableTop, String s)1198 private static boolean hasMultiplePrimaryWeights( 1199 RuleBasedCollator coll, long variableTop, String s) { 1200 long[] ces = coll.internalGetCEs(s); 1201 boolean seenPrimary = false; 1202 for (int i = 0; i < ces.length; ++i) { 1203 long ce = ces[i]; 1204 long p = ce >>> 32; 1205 if (p > variableTop) { 1206 // not primary ignorable 1207 if (seenPrimary) { 1208 return true; 1209 } 1210 seenPrimary = true; 1211 } 1212 } 1213 return false; 1214 } 1215 1216 // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?! 1217 private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER; 1218 private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER; 1219 private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER; 1220 private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER; 1221 private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER; 1222 private static final int GC_L_MASK = 1223 GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK; 1224 private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES; 1225 1226 /** 1227 * Return a list of the first character in each script. Only exposed for testing. 1228 * 1229 * @return list of first characters in each script 1230 * @internal 1231 * @deprecated This API is ICU internal, only for testing. 1232 */ 1233 @Deprecated getFirstCharactersInScripts()1234 public List<String> getFirstCharactersInScripts() { 1235 List<String> dest = new ArrayList<String>(200); 1236 // Fetch the script-first-primary contractions which are defined in the root collator. 1237 // They all start with U+FDD1. 1238 UnicodeSet set = new UnicodeSet(); 1239 collatorPrimaryOnly.internalAddContractions(0xFDD1, set); 1240 if (set.isEmpty()) { 1241 throw new UnsupportedOperationException( 1242 "AlphabeticIndex requires script-first-primary contractions"); 1243 } 1244 for (String boundary : set) { 1245 int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1)); 1246 if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) { 1247 // Ignore boundaries for the special reordering groups. 1248 // Take only those for "real scripts" (where the sample character is a Letter, 1249 // and the one for unassigned implicit weights (Cn). 1250 continue; 1251 } 1252 dest.add(boundary); 1253 } 1254 return dest; 1255 } 1256 } 1257