• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2009-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
11 
12 #include "unicode/alphaindex.h"
13 #include "unicode/coleitr.h"
14 #include "unicode/coll.h"
15 #include "unicode/localpointer.h"
16 #include "unicode/normalizer2.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/ulocdata.h"
19 #include "unicode/uniset.h"
20 #include "unicode/uobject.h"
21 #include "unicode/usetiter.h"
22 #include "unicode/utf16.h"
23 
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "uassert.h"
27 #include "uvector.h"
28 
29 //#include <string>
30 //#include <iostream>
31 
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33 
34 U_NAMESPACE_BEGIN
35 
36 namespace {
37 
38 /**
39  * Prefix string for Chinese index buckets.
40  * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41  */
42 const UChar BASE[1] = { 0xFDD0 };
43 const int32_t BASE_LENGTH = 1;
44 
45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
46                                 const UnicodeString &one, const UnicodeString &other);
47 
48 }  // namespace
49 
50 static int32_t U_CALLCONV
51 collatorComparator(const void *context, const void *left, const void *right);
52 
53 static int32_t U_CALLCONV
54 recordCompareFn(const void *context, const void *left, const void *right);
55 
56 //  UVector<Record *> support function, delete a Record.
57 static void U_CALLCONV
alphaIndex_deleteRecord(void * obj)58 alphaIndex_deleteRecord(void *obj) {
59     delete static_cast<AlphabeticIndex::Record *>(obj);
60 }
61 
62 namespace {
63 
ownedString(const UnicodeString & s,LocalPointer<UnicodeString> & owned,UErrorCode & errorCode)64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
65                            UErrorCode &errorCode) {
66     if (U_FAILURE(errorCode)) { return NULL; }
67     if (owned.isValid()) {
68         return owned.orphan();
69     }
70     UnicodeString *p = new UnicodeString(s);
71     if (p == NULL) {
72         errorCode = U_MEMORY_ALLOCATION_ERROR;
73     }
74     return p;
75 }
76 
getString(const UVector & list,int32_t i)77 inline UnicodeString *getString(const UVector &list, int32_t i) {
78     return static_cast<UnicodeString *>(list[i]);
79 }
80 
getBucket(const UVector & list,int32_t i)81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
82     return static_cast<AlphabeticIndex::Bucket *>(list[i]);
83 }
84 
getRecord(const UVector & list,int32_t i)85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
86     return static_cast<AlphabeticIndex::Record *>(list[i]);
87 }
88 
89 /**
90  * Like Java Collections.binarySearch(List, String, Comparator).
91  *
92  * @return the index>=0 where the item was found,
93  *         or the index<0 for inserting the string at ~index in sorted order
94  */
binarySearch(const UVector & list,const UnicodeString & s,const Collator & coll)95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
96     if (list.size() == 0) { return ~0; }
97     int32_t start = 0;
98     int32_t limit = list.size();
99     for (;;) {
100         int32_t i = (start + limit) / 2;
101         const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
102         UErrorCode errorCode = U_ZERO_ERROR;
103         UCollationResult cmp = coll.compare(s, *si, errorCode);
104         if (cmp == UCOL_EQUAL) {
105             return i;
106         } else if (cmp < 0) {
107             if (i == start) {
108                 return ~start;  // insert s before *si
109             }
110             limit = i;
111         } else {
112             if (i == start) {
113                 return ~(start + 1);  // insert s after *si
114             }
115             start = i;
116         }
117     }
118 }
119 
120 }  // namespace
121 
122 // The BucketList is not in the anonymous namespace because only Clang
123 // seems to support its use in other classes from there.
124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
125 class BucketList : public UObject {
126 public:
BucketList(UVector * bucketList,UVector * publicBucketList)127     BucketList(UVector *bucketList, UVector *publicBucketList)
128             : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
129         int32_t displayIndex = 0;
130         for (int32_t i = 0; i < publicBucketList->size(); ++i) {
131             getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
132         }
133     }
134 
135     // The virtual destructor must not be inline.
136     // See ticket #8454 for details.
137     virtual ~BucketList();
138 
getBucketCount() const139     int32_t getBucketCount() const {
140         return immutableVisibleList_->size();
141     }
142 
getBucketIndex(const UnicodeString & name,const Collator & collatorPrimaryOnly,UErrorCode & errorCode)143     int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
144                            UErrorCode &errorCode) {
145         // binary search
146         int32_t start = 0;
147         int32_t limit = bucketList_->size();
148         while ((start + 1) < limit) {
149             int32_t i = (start + limit) / 2;
150             const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
151             UCollationResult nameVsBucket =
152                 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
153             if (nameVsBucket < 0) {
154                 limit = i;
155             } else {
156                 start = i;
157             }
158         }
159         const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
160         if (bucket->displayBucket_ != NULL) {
161             bucket = bucket->displayBucket_;
162         }
163         return bucket->displayIndex_;
164     }
165 
166     /** All of the buckets, visible and invisible. */
167     UVector *bucketList_;
168     /** Just the visible buckets. */
169     UVector *immutableVisibleList_;
170 };
171 
~BucketList()172 BucketList::~BucketList() {
173     delete bucketList_;
174     if (immutableVisibleList_ != bucketList_) {
175         delete immutableVisibleList_;
176     }
177 }
178 
~ImmutableIndex()179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
180     delete buckets_;
181     delete collatorPrimaryOnly_;
182 }
183 
184 int32_t
getBucketCount() const185 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
186     return buckets_->getBucketCount();
187 }
188 
189 int32_t
getBucketIndex(const UnicodeString & name,UErrorCode & errorCode) const190 AlphabeticIndex::ImmutableIndex::getBucketIndex(
191         const UnicodeString &name, UErrorCode &errorCode) const {
192     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
193 }
194 
195 const AlphabeticIndex::Bucket *
getBucket(int32_t index) const196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
197     if (0 <= index && index < buckets_->getBucketCount()) {
198         return icu::getBucket(*buckets_->immutableVisibleList_, index);
199     } else {
200         return NULL;
201     }
202 }
203 
AlphabeticIndex(const Locale & locale,UErrorCode & status)204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
205         : inputList_(NULL),
206           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
207           maxLabelCount_(99),
208           initialLabels_(NULL), firstCharsInScripts_(NULL),
209           collator_(NULL), collatorPrimaryOnly_(NULL),
210           buckets_(NULL) {
211     init(&locale, status);
212 }
213 
214 
AlphabeticIndex(RuleBasedCollator * collator,UErrorCode & status)215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
216         : inputList_(NULL),
217           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
218           maxLabelCount_(99),
219           initialLabels_(NULL), firstCharsInScripts_(NULL),
220           collator_(collator), collatorPrimaryOnly_(NULL),
221           buckets_(NULL) {
222     init(NULL, status);
223 }
224 
225 
226 
~AlphabeticIndex()227 AlphabeticIndex::~AlphabeticIndex() {
228     delete collator_;
229     delete collatorPrimaryOnly_;
230     delete firstCharsInScripts_;
231     delete buckets_;
232     delete inputList_;
233     delete initialLabels_;
234 }
235 
236 
addLabels(const UnicodeSet & additions,UErrorCode & status)237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
238     if (U_FAILURE(status)) {
239         return *this;
240     }
241     initialLabels_->addAll(additions);
242     clearBuckets();
243     return *this;
244 }
245 
246 
addLabels(const Locale & locale,UErrorCode & status)247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
248     addIndexExemplars(&locale, status);
249     clearBuckets();
250     return *this;
251 }
252 
253 
buildImmutableIndex(UErrorCode & errorCode)254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
255     if (U_FAILURE(errorCode)) { return NULL; }
256     // In C++, the ImmutableIndex must own its copy of the BucketList,
257     // even if it contains no records, for proper memory management.
258     // We could clone the buckets_ if they are not NULL,
259     // but that would be worth it only if this method is called multiple times,
260     // or called after using the old-style bucket iterator API.
261     LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
262     LocalPointer<RuleBasedCollator> coll(
263         static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
264     if (immutableBucketList.isNull() || coll.isNull()) {
265         errorCode = U_MEMORY_ALLOCATION_ERROR;
266         return NULL;
267     }
268     ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269     if (immIndex == NULL) {
270         errorCode = U_MEMORY_ALLOCATION_ERROR;
271         return NULL;
272     }
273     // The ImmutableIndex adopted its parameter objects.
274     immutableBucketList.orphan();
275     coll.orphan();
276     return immIndex;
277 }
278 
getBucketCount(UErrorCode & status)279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280     initBuckets(status);
281     if (U_FAILURE(status)) {
282         return 0;
283     }
284     return buckets_->getBucketCount();
285 }
286 
287 
getRecordCount(UErrorCode & status)288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289     if (U_FAILURE(status) || inputList_ == NULL) {
290         return 0;
291     }
292     return inputList_->size();
293 }
294 
initLabels(UVector & indexCharacters,UErrorCode & errorCode) const295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296     const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297     if (U_FAILURE(errorCode)) { return; }
298 
299     const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300     const UnicodeString &overflowBoundary =
301         *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
302 
303     // We make a sorted array of elements.
304     // Some of the input may be redundant.
305     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306     // We filter out those cases.
307     UnicodeSetIterator iter(*initialLabels_);
308     while (iter.next()) {
309         const UnicodeString *item = &iter.getString();
310         LocalPointer<UnicodeString> ownedItem;
311         UBool checkDistinct;
312         int32_t itemLength = item->length();
313         if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314             checkDistinct = FALSE;
315         } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
316                 item->charAt(itemLength - 2) != 0x2a) {
317             // Use a label if it is marked with one trailing star,
318             // even if the label string sorts the same when all contractions are suppressed.
319             ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320             item = ownedItem.getAlias();
321             if (item == NULL) {
322                 errorCode = U_MEMORY_ALLOCATION_ERROR;
323                 return;
324             }
325             checkDistinct = FALSE;
326         } else {
327             checkDistinct = TRUE;
328         }
329         if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330             // Ignore a primary-ignorable or non-alphabetic index character.
331         } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332             // Ignore an index characters that will land in the overflow bucket.
333         } else if (checkDistinct &&
334                 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335             // Ignore a multi-code point index character that does not sort distinctly
336             // from the sequence of its separate characters.
337         } else {
338             int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339             if (insertionPoint < 0) {
340                 indexCharacters.insertElementAt(
341                     ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342             } else {
343                 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344                 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345                     indexCharacters.setElementAt(
346                         ownedString(*item, ownedItem, errorCode), insertionPoint);
347                 }
348             }
349         }
350     }
351     if (U_FAILURE(errorCode)) { return; }
352 
353     // if the result is still too large, cut down to maxCount elements, by removing every nth element
354 
355     int32_t size = indexCharacters.size() - 1;
356     if (size > maxLabelCount_) {
357         int32_t count = 0;
358         int32_t old = -1;
359         for (int32_t i = 0; i < indexCharacters.size();) {
360             ++count;
361             int32_t bump = count * maxLabelCount_ / size;
362             if (bump == old) {
363                 indexCharacters.removeElementAt(i);
364             } else {
365                 old = bump;
366                 ++i;
367             }
368         }
369     }
370 }
371 
372 namespace {
373 
fixLabel(const UnicodeString & current,UnicodeString & temp)374 const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
375     if (!current.startsWith(BASE, BASE_LENGTH)) {
376         return current;
377     }
378     UChar rest = current.charAt(BASE_LENGTH);
379     if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380         int32_t count = rest-0x2800;
381         temp.setTo((UChar)(0x30 + count % 10));
382         if (count >= 10) {
383             count /= 10;
384             temp.insert(0, (UChar)(0x30 + count % 10));
385             if (count >= 10) {
386                 count /= 10;
387                 temp.insert(0, (UChar)(0x30 + count));
388             }
389         }
390         return temp.append((UChar)0x5283);
391     }
392     return temp.setTo(current, BASE_LENGTH);
393 }
394 
hasMultiplePrimaryWeights(CollationElementIterator & cei,int32_t variableTop,const UnicodeString & s,UErrorCode & errorCode)395 UBool hasMultiplePrimaryWeights(
396         CollationElementIterator &cei, int32_t variableTop,
397         const UnicodeString &s, UErrorCode &errorCode) {
398     cei.setText(s, errorCode);
399     UBool seenPrimary = FALSE;
400     for (;;) {
401         int32_t ce32 = cei.next(errorCode);
402         if (ce32 == CollationElementIterator::NULLORDER) {
403             break;
404         }
405         int32_t p = CollationElementIterator::primaryOrder(ce32);
406         if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407             // not primary ignorable, and not a continuation CE
408             if (seenPrimary) {
409                 return TRUE;
410             }
411             seenPrimary = TRUE;
412         }
413     }
414     return FALSE;
415 }
416 
417 }  // namespace
418 
createBucketList(UErrorCode & errorCode) const419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420     // Initialize indexCharacters.
421     UVector indexCharacters(errorCode);
422     indexCharacters.setDeleter(uprv_deleteUObject);
423     initLabels(indexCharacters, errorCode);
424     if (U_FAILURE(errorCode)) { return NULL; }
425 
426     // Variables for hasMultiplePrimaryWeights().
427     LocalPointer<CollationElementIterator> cei(
428         collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
429     if (cei.isNull()) {
430         errorCode = U_MEMORY_ALLOCATION_ERROR;
431         return NULL;
432     }
433     int32_t variableTop;
434     if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
435         variableTop = CollationElementIterator::primaryOrder(
436             (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
437     } else {
438         variableTop = 0;
439     }
440     UBool hasInvisibleBuckets = FALSE;
441 
442     // Helper arrays for Chinese Pinyin collation.
443     Bucket *asciiBuckets[26] = {
444         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
446     };
447     Bucket *pinyinBuckets[26] = {
448         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
449         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
450     };
451     UBool hasPinyin = FALSE;
452 
453     LocalPointer<UVector> bucketList(new UVector(errorCode));
454     if (bucketList.isNull()) {
455         errorCode = U_MEMORY_ALLOCATION_ERROR;
456         return NULL;
457     }
458     bucketList->setDeleter(uprv_deleteUObject);
459 
460     // underflow bucket
461     Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
462     if (bucket == NULL) {
463         errorCode = U_MEMORY_ALLOCATION_ERROR;
464         return NULL;
465     }
466     bucketList->addElement(bucket, errorCode);
467     if (U_FAILURE(errorCode)) { return NULL; }
468 
469     UnicodeString temp;
470 
471     // fix up the list, adding underflow, additions, overflow
472     // Insert inflow labels as needed.
473     int32_t scriptIndex = -1;
474     const UnicodeString *scriptUpperBoundary = &emptyString_;
475     for (int32_t i = 0; i < indexCharacters.size(); ++i) {
476         UnicodeString &current = *getString(indexCharacters, i);
477         if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
478             // We crossed the script boundary into a new script.
479             const UnicodeString &inflowBoundary = *scriptUpperBoundary;
480             UBool skippedScript = FALSE;
481             for (;;) {
482                 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
483                 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
484                     break;
485                 }
486                 skippedScript = TRUE;
487             }
488             if (skippedScript && bucketList->size() > 1) {
489                 // We are skipping one or more scripts,
490                 // and we are not just getting out of the underflow label.
491                 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
492                 if (bucket == NULL) {
493                     errorCode = U_MEMORY_ALLOCATION_ERROR;
494                     return NULL;
495                 }
496                 bucketList->addElement(bucket, errorCode);
497             }
498         }
499         // Add a bucket with the current label.
500         bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
501         if (bucket == NULL) {
502             errorCode = U_MEMORY_ALLOCATION_ERROR;
503             return NULL;
504         }
505         bucketList->addElement(bucket, errorCode);
506         // Remember ASCII and Pinyin buckets for Pinyin redirects.
507         UChar c;
508         if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
509             asciiBuckets[c - 0x41] = bucket;
510         } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
511                 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512             pinyinBuckets[c - 0x41] = bucket;
513             hasPinyin = TRUE;
514         }
515         // Check for multiple primary weights.
516         if (!current.startsWith(BASE, BASE_LENGTH) &&
517                 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
518                 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
519             // "AE-ligature" or "Sch" etc.
520             for (int32_t i = bucketList->size() - 2;; --i) {
521                 Bucket *singleBucket = getBucket(*bucketList, i);
522                 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523                     // There is no single-character bucket since the last
524                     // underflow or inflow label.
525                     break;
526                 }
527                 if (singleBucket->displayBucket_ == NULL &&
528                         !hasMultiplePrimaryWeights(
529                             *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
530                     // Add an invisible bucket that redirects strings greater than the expansion
531                     // to the previous single-character bucket.
532                     // For example, after ... Q R S Sch we add Sch\uFFFF->S
533                     // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534                     bucket = new Bucket(emptyString_,
535                         UnicodeString(current).append((UChar)0xFFFF),
536                         U_ALPHAINDEX_NORMAL);
537                     if (bucket == NULL) {
538                         errorCode = U_MEMORY_ALLOCATION_ERROR;
539                         return NULL;
540                     }
541                     bucket->displayBucket_ = singleBucket;
542                     bucketList->addElement(bucket, errorCode);
543                     hasInvisibleBuckets = TRUE;
544                     break;
545                 }
546             }
547         }
548     }
549     if (U_FAILURE(errorCode)) { return NULL; }
550     if (bucketList->size() == 1) {
551         // No real labels, show only the underflow label.
552         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
553         if (bl == NULL) {
554             errorCode = U_MEMORY_ALLOCATION_ERROR;
555             return NULL;
556         }
557         bucketList.orphan();
558         return bl;
559     }
560     // overflow bucket
561     bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
562     if (bucket == NULL) {
563         errorCode = U_MEMORY_ALLOCATION_ERROR;
564         return NULL;
565     }
566     bucketList->addElement(bucket, errorCode); // final
567 
568     if (hasPinyin) {
569         // Redirect Pinyin buckets.
570         Bucket *asciiBucket = NULL;
571         for (int32_t i = 0; i < 26; ++i) {
572             if (asciiBuckets[i] != NULL) {
573                 asciiBucket = asciiBuckets[i];
574             }
575             if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
576                 pinyinBuckets[i]->displayBucket_ = asciiBucket;
577                 hasInvisibleBuckets = TRUE;
578             }
579         }
580     }
581 
582     if (U_FAILURE(errorCode)) { return NULL; }
583     if (!hasInvisibleBuckets) {
584         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
585         if (bl == NULL) {
586             errorCode = U_MEMORY_ALLOCATION_ERROR;
587             return NULL;
588         }
589         bucketList.orphan();
590         return bl;
591     }
592     // Merge inflow buckets that are visually adjacent.
593     // Iterate backwards: Merge inflow into overflow rather than the other way around.
594     int32_t i = bucketList->size() - 1;
595     Bucket *nextBucket = getBucket(*bucketList, i);
596     while (--i > 0) {
597         bucket = getBucket(*bucketList, i);
598         if (bucket->displayBucket_ != NULL) {
599             continue;  // skip invisible buckets
600         }
601         if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
602             if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
603                 bucket->displayBucket_ = nextBucket;
604                 continue;
605             }
606         }
607         nextBucket = bucket;
608     }
609 
610     LocalPointer<UVector> publicBucketList(new UVector(errorCode));
611     if (bucketList.isNull()) {
612         errorCode = U_MEMORY_ALLOCATION_ERROR;
613         return NULL;
614     }
615     // Do not call publicBucketList->setDeleter():
616     // This vector shares its objects with the bucketList.
617     for (int32_t i = 0; i < bucketList->size(); ++i) {
618         bucket = getBucket(*bucketList, i);
619         if (bucket->displayBucket_ == NULL) {
620             publicBucketList->addElement(bucket, errorCode);
621         }
622     }
623     if (U_FAILURE(errorCode)) { return NULL; }
624     BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
625     if (bl == NULL) {
626         errorCode = U_MEMORY_ALLOCATION_ERROR;
627         return NULL;
628     }
629     bucketList.orphan();
630     publicBucketList.orphan();
631     return bl;
632 }
633 
634 /**
635  * Creates an index, and buckets and sorts the list of records into the index.
636  */
initBuckets(UErrorCode & errorCode)637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
638     if (U_FAILURE(errorCode) || buckets_ != NULL) {
639         return;
640     }
641     buckets_ = createBucketList(errorCode);
642     if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
643         return;
644     }
645 
646     // Sort the records by name.
647     // Stable sort preserves input order of collation duplicates.
648     inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
649 
650     // Now, we traverse all of the input, which is now sorted.
651     // If the item doesn't go in the current bucket, we find the next bucket that contains it.
652     // This makes the process order n*log(n), since we just sort the list and then do a linear process.
653     // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
654     // we need to improve it for that case.
655 
656     Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
657     int32_t bucketIndex = 1;
658     Bucket *nextBucket;
659     const UnicodeString *upperBoundary;
660     if (bucketIndex < buckets_->bucketList_->size()) {
661         nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
662         upperBoundary = &nextBucket->lowerBoundary_;
663     } else {
664         nextBucket = NULL;
665         upperBoundary = NULL;
666     }
667     for (int32_t i = 0; i < inputList_->size(); ++i) {
668         Record *r = getRecord(*inputList_, i);
669         // if the current bucket isn't the right one, find the one that is
670         // We have a special flag for the last bucket so that we don't look any further
671         while (upperBoundary != NULL &&
672                 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
673             currentBucket = nextBucket;
674             // now reset the boundary that we compare against
675             if (bucketIndex < buckets_->bucketList_->size()) {
676                 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
677                 upperBoundary = &nextBucket->lowerBoundary_;
678             } else {
679                 upperBoundary = NULL;
680             }
681         }
682         // now put the record into the bucket.
683         Bucket *bucket = currentBucket;
684         if (bucket->displayBucket_ != NULL) {
685             bucket = bucket->displayBucket_;
686         }
687         if (bucket->records_ == NULL) {
688             bucket->records_ = new UVector(errorCode);
689             if (bucket->records_ == NULL) {
690                 errorCode = U_MEMORY_ALLOCATION_ERROR;
691                 return;
692             }
693         }
694         bucket->records_->addElement(r, errorCode);
695     }
696 }
697 
clearBuckets()698 void AlphabeticIndex::clearBuckets() {
699     if (buckets_ != NULL) {
700         delete buckets_;
701         buckets_ = NULL;
702         internalResetBucketIterator();
703     }
704 }
705 
internalResetBucketIterator()706 void AlphabeticIndex::internalResetBucketIterator() {
707     labelsIterIndex_ = -1;
708     currentBucket_ = NULL;
709 }
710 
711 
addIndexExemplars(const Locale * locale,UErrorCode & status)712 void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) {
713     if (U_FAILURE(status)) { return; }
714     // Chinese index characters, which are specific to each of the several Chinese tailorings,
715     // take precedence over the single locale data exemplar set per language.
716     const char *language = locale == NULL ? NULL : locale->getLanguage();
717     if (language == NULL ||
718             uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
719             uprv_strcmp(language, "ko") == 0) {
720         // TODO: This should be done regardless of the language, but it's expensive.
721         // We should add a Collator function (can be @internal)
722         // to enumerate just the contractions that start with a given code point or string.
723         if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
724             return;
725         }
726     }
727     if (locale == NULL) { return; }
728 
729     LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status));
730     if (U_FAILURE(status)) {
731         return;
732     }
733 
734     UnicodeSet exemplars;
735     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
736     if (U_SUCCESS(status)) {
737         initialLabels_->addAll(exemplars);
738         return;
739     }
740     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
741 
742     // The locale data did not include explicit Index characters.
743     // Synthesize a set of them from the locale's standard exemplar characters.
744     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
745     if (U_FAILURE(status)) {
746         return;
747     }
748 
749     // question: should we add auxiliary exemplars?
750     if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
751         exemplars.add(0x61, 0x7A);
752     }
753     if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
754         // cut down to small list
755         exemplars.remove(0xAC00, 0xD7A3).
756             add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
757             add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
758             add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
759             add(0xD30C).add(0xD558);
760     }
761     if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
762         // cut down to small list
763         // make use of the fact that Ethiopic is allocated in 8's, where
764         // the base is 0 mod 8.
765         UnicodeSet ethiopic(
766             UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
767         UnicodeSetIterator it(ethiopic);
768         while (it.next() && !it.isString()) {
769             if ((it.getCodepoint() & 0x7) != 0) {
770                 exemplars.remove(it.getCodepoint());
771             }
772         }
773     }
774 
775     // Upper-case any that aren't already so.
776     //   (We only do this for synthesized index characters.)
777     UnicodeSetIterator it(exemplars);
778     UnicodeString upperC;
779     while (it.next()) {
780         const UnicodeString &exemplarC = it.getString();
781         upperC = exemplarC;
782         upperC.toUpper(*locale);
783         initialLabels_->add(upperC);
784     }
785 }
786 
addChineseIndexCharacters(UErrorCode & errorCode)787 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
788     UnicodeSet contractions;
789     ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
790                                       contractions.toUSet(), NULL, FALSE, &errorCode);
791     if (U_FAILURE(errorCode)) { return FALSE; }
792     UnicodeString firstHanBoundary;
793     UBool hasPinyin = FALSE;
794     UnicodeSetIterator iter(contractions);
795     while (iter.next()) {
796         const UnicodeString &s = iter.getString();
797         if (s.startsWith(BASE, BASE_LENGTH)) {
798             initialLabels_->add(s);
799             if (firstHanBoundary.isEmpty() ||
800                     collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
801                 firstHanBoundary = s;
802             }
803             UChar c = s.charAt(s.length() - 1);
804             if (0x41 <= c && c <= 0x5A) {  // A-Z
805                 hasPinyin = TRUE;
806             }
807         }
808     }
809     if (hasPinyin) {
810         initialLabels_->add(0x41, 0x5A);  // A-Z
811     }
812     if (!firstHanBoundary.isEmpty()) {
813         // The hardcoded list of script boundaries includes U+4E00
814         // which is tailored to not be the first primary
815         // in all Chinese tailorings except "unihan".
816         // Replace U+4E00 with the first boundary string from the tailoring.
817         // TODO: This becomes obsolete when the root collator gets
818         // reliable script-first-primary mappings.
819         int32_t hanIndex = binarySearch(
820                 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
821         if (hanIndex >= 0) {
822             UnicodeString *fh = new UnicodeString(firstHanBoundary);
823             if (fh == NULL) {
824                 errorCode = U_MEMORY_ALLOCATION_ERROR;
825                 return FALSE;
826             }
827             firstCharsInScripts_->setElementAt(fh, hanIndex);
828         }
829         return TRUE;
830     } else {
831         return FALSE;
832     }
833 }
834 
835 
836 /*
837  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
838  */
839 static const UChar CGJ = 0x034F;
separated(const UnicodeString & item)840 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
841     UnicodeString result;
842     if (item.length() == 0) {
843         return result;
844     }
845     int32_t i = 0;
846     for (;;) {
847         UChar32  cp = item.char32At(i);
848         result.append(cp);
849         i = item.moveIndex32(i, 1);
850         if (i >= item.length()) {
851             break;
852         }
853         result.append(CGJ);
854     }
855     return result;
856 }
857 
858 
operator ==(const AlphabeticIndex &) const859 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
860     return FALSE;
861 }
862 
863 
operator !=(const AlphabeticIndex &) const864 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
865     return FALSE;
866 }
867 
868 
getCollator() const869 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
870     // There are no known non-RuleBasedCollator collators, and none ever expected.
871     // But, in case that changes, better a null pointer than a wrong type.
872     return *dynamic_cast<RuleBasedCollator *>(collator_);
873 }
874 
875 
getInflowLabel() const876 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
877     return inflowLabel_;
878 }
879 
getOverflowLabel() const880 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
881     return overflowLabel_;
882 }
883 
884 
getUnderflowLabel() const885 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
886     return underflowLabel_;
887 }
888 
889 
setInflowLabel(const UnicodeString & label,UErrorCode &)890 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
891     inflowLabel_ = label;
892     clearBuckets();
893     return *this;
894 }
895 
896 
setOverflowLabel(const UnicodeString & label,UErrorCode &)897 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
898     overflowLabel_ = label;
899     clearBuckets();
900     return *this;
901 }
902 
903 
setUnderflowLabel(const UnicodeString & label,UErrorCode &)904 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
905     underflowLabel_ = label;
906     clearBuckets();
907     return *this;
908 }
909 
910 
getMaxLabelCount() const911 int32_t AlphabeticIndex::getMaxLabelCount() const {
912     return maxLabelCount_;
913 }
914 
915 
setMaxLabelCount(int32_t maxLabelCount,UErrorCode & status)916 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
917     if (U_FAILURE(status)) {
918         return *this;
919     }
920     if (maxLabelCount <= 0) {
921         status = U_ILLEGAL_ARGUMENT_ERROR;
922         return *this;
923     }
924     maxLabelCount_ = maxLabelCount;
925     clearBuckets();
926     return *this;
927 }
928 
929 
930 //
931 //  init() - Common code for constructors.
932 //
933 
init(const Locale * locale,UErrorCode & status)934 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
935     if (U_FAILURE(status)) { return; }
936     if (locale == NULL && collator_ == NULL) {
937         status = U_ILLEGAL_ARGUMENT_ERROR;
938         return;
939     }
940 
941     initialLabels_         = new UnicodeSet();
942     if (initialLabels_ == NULL) {
943         status = U_MEMORY_ALLOCATION_ERROR;
944         return;
945     }
946 
947     inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
948     overflowLabel_ = inflowLabel_;
949     underflowLabel_ = inflowLabel_;
950 
951     if (collator_ == NULL) {
952         collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
953         if (U_FAILURE(status)) { return; }
954         if (collator_ == NULL) {
955             status = U_MEMORY_ALLOCATION_ERROR;
956             return;
957         }
958     }
959     collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
960     if (collatorPrimaryOnly_ == NULL) {
961         status = U_MEMORY_ALLOCATION_ERROR;
962         return;
963     }
964     collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
965     firstCharsInScripts_ = firstStringsInScript(status);
966     if (U_FAILURE(status)) { return; }
967     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
968 
969     // Add index exemplar characters before checking the script boundaries,
970     // since this might modify them.
971     addIndexExemplars(locale, status);
972 
973     UnicodeString _4E00((UChar)0x4E00);
974     int32_t hanIndex = binarySearch(
975             *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
976     if (hanIndex >= 0) {
977         // Adjust the Han script boundary if necessary.
978         // TODO: This becomes obsolete when the root collator gets
979         // reliable script-first-primary mappings.
980         UnicodeString _1100((UChar)0x1100);
981         UnicodeString _1112((UChar)0x1112);
982         UnicodeString _4E9C((UChar)0x4E9C);
983         if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
984                 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
985             // The standard Korean tailoring sorts Hanja (Han characters)
986             // as secondary differences from Hangul syllables.
987             // This makes U+4E00 not useful as a Han-script boundary.
988             firstCharsInScripts_->removeElementAt(hanIndex);
989         } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) {
990             // The standard Japanese tailoring sorts U+4E9C first among Kanji.
991             UnicodeString *fh = new UnicodeString(_4E9C);
992             if (fh == NULL) {
993                 status = U_MEMORY_ALLOCATION_ERROR;
994                 return;
995             }
996             firstCharsInScripts_->setElementAt(fh, hanIndex);
997         }
998     }
999 
1000     // Guard against a degenerate collator where
1001     // some script boundary strings are primary ignorable.
1002     for (;;) {
1003         if (U_FAILURE(status)) { return; }
1004         if (firstCharsInScripts_->isEmpty()) {
1005             // AlphabeticIndex requires some non-ignorable script boundary strings.
1006             status = U_ILLEGAL_ARGUMENT_ERROR;
1007             return;
1008         }
1009         if (collatorPrimaryOnly_->compare(
1010                 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
1011                 emptyString_, status) == UCOL_EQUAL) {
1012             firstCharsInScripts_->removeElementAt(0);
1013         } else {
1014             break;
1015         }
1016     }
1017 }
1018 
1019 
1020 //
1021 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
1022 //
1023 static int32_t U_CALLCONV
collatorComparator(const void * context,const void * left,const void * right)1024 collatorComparator(const void *context, const void *left, const void *right) {
1025     const UElement *leftElement = static_cast<const UElement *>(left);
1026     const UElement *rightElement = static_cast<const UElement *>(right);
1027     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
1028     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
1029 
1030     if (leftString == rightString) {
1031         // Catches case where both are NULL
1032         return 0;
1033     }
1034     if (leftString == NULL) {
1035         return 1;
1036     };
1037     if (rightString == NULL) {
1038         return -1;
1039     }
1040     const Collator *col = static_cast<const Collator *>(context);
1041     UErrorCode errorCode = U_ZERO_ERROR;
1042     return col->compare(*leftString, *rightString, errorCode);
1043 }
1044 
1045 //
1046 //  Comparison function for UVector<Record *> sorting with a collator.
1047 //
1048 static int32_t U_CALLCONV
recordCompareFn(const void * context,const void * left,const void * right)1049 recordCompareFn(const void *context, const void *left, const void *right) {
1050     const UElement *leftElement = static_cast<const UElement *>(left);
1051     const UElement *rightElement = static_cast<const UElement *>(right);
1052     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1053     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1054     const Collator *col = static_cast<const Collator *>(context);
1055     UErrorCode errorCode = U_ZERO_ERROR;
1056     return col->compare(leftRec->name_, rightRec->name_, errorCode);
1057 }
1058 
1059 
1060 /**
1061  * This list contains one character per script that has the
1062  * lowest primary weight for that script in the root collator.
1063  * This list will be copied and sorted to account for script reordering.
1064  *
1065  * <p>TODO: This is fragile. If the first character of a script is tailored
1066  * so that it does not map to the script's lowest primary weight any more,
1067  * then the buckets will be off.
1068  * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1069  *
1070  * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1071  *
1072  * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1073  * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1074  */
1075 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
1076     0x41, 0, 0x03B1, 0,
1077     0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1078     0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1079     0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1080     0xAAF2, 0,  // Meetei Mayek
1081     0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1082     U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
1083     U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
1084     0x1B83, 0,
1085     0xD802, 0xDE00, 0, 0x0E01, 0,
1086     0x0EDE, 0,  // Lao
1087     0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1088     0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1089     U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
1090     0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1091     0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1092     0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1093     U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
1094     0xD800, 0xDE80, 0,
1095     0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1096     0xD801, 0xDC80, 0,
1097     U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
1098     0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1099     0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1100     U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
1101     U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
1102     0x4E00, 0,
1103     // TODO: The overflow bucket's lowerBoundary string should be the
1104     // first item after the last reordering group in the collator's script order.
1105     // This should normally be the first Unicode code point
1106     // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1107     // However, at least up to ICU 51 the Hani reordering group includes
1108     // unassigned code points,
1109     // and there is no stable string for the start of the trailing-weights range.
1110     // The only known string that sorts "high" is U+FFFF.
1111     // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1112     // and fix relevant test code.
1113     // Ideally, FractionalUCA.txt will have a "script first primary"
1114     // for unassigned code points.
1115     0xFFFF, 0
1116 };
1117 
firstStringsInScript(UErrorCode & status)1118 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1119     if (U_FAILURE(status)) {
1120         return NULL;
1121     }
1122     UVector *dest = new UVector(status);
1123     if (dest == NULL) {
1124         status = U_MEMORY_ALLOCATION_ERROR;
1125         return NULL;
1126     }
1127     dest->setDeleter(uprv_deleteUObject);
1128     const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
1129     const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
1130     do {
1131         if (U_FAILURE(status)) {
1132             return dest;
1133         }
1134         UnicodeString *str = new UnicodeString(src, -1);
1135         if (str == NULL) {
1136             status = U_MEMORY_ALLOCATION_ERROR;
1137             return dest;
1138         }
1139         dest->addElement(str, status);
1140         src += str->length() + 1;
1141     } while (src < limit);
1142     return dest;
1143 }
1144 
1145 
1146 namespace {
1147 
1148 /**
1149  * Returns true if one index character string is "better" than the other.
1150  * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1151  * better, and otherwise binary-less-than is better.
1152  */
isOneLabelBetterThanOther(const Normalizer2 & nfkdNormalizer,const UnicodeString & one,const UnicodeString & other)1153 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1154                                 const UnicodeString &one, const UnicodeString &other) {
1155     // This is called with primary-equal strings, but never with one.equals(other).
1156     UErrorCode status = U_ZERO_ERROR;
1157     UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1158     UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1159     if (U_FAILURE(status)) { return FALSE; }
1160     int32_t result = n1.countChar32() - n2.countChar32();
1161     if (result != 0) {
1162         return result < 0;
1163     }
1164     result = n1.compareCodePointOrder(n2);
1165     if (result != 0) {
1166         return result < 0;
1167     }
1168     return one.compareCodePointOrder(other) < 0;
1169 }
1170 
1171 }  // namespace
1172 
1173 //
1174 //  Constructor & Destructor for AlphabeticIndex::Record
1175 //
1176 //     Records are internal only, instances are not directly surfaced in the public API.
1177 //     This class is mostly struct-like, with all public fields.
1178 
Record(const UnicodeString & name,const void * data)1179 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1180         : name_(name), data_(data) {}
1181 
~Record()1182 AlphabeticIndex::Record::~Record() {
1183 }
1184 
1185 
addRecord(const UnicodeString & name,const void * data,UErrorCode & status)1186 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1187     if (U_FAILURE(status)) {
1188         return *this;
1189     }
1190     if (inputList_ == NULL) {
1191         inputList_ = new UVector(status);
1192         if (inputList_ == NULL) {
1193             status = U_MEMORY_ALLOCATION_ERROR;
1194             return *this;
1195         }
1196         inputList_->setDeleter(alphaIndex_deleteRecord);
1197     }
1198     Record *r = new Record(name, data);
1199     if (r == NULL) {
1200         status = U_MEMORY_ALLOCATION_ERROR;
1201         return *this;
1202     }
1203     inputList_->addElement(r, status);
1204     clearBuckets();
1205     //std::string ss;
1206     //std::string ss2;
1207     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1208     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1209     return *this;
1210 }
1211 
1212 
clearRecords(UErrorCode & status)1213 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1214     if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1215         inputList_->removeAllElements();
1216         clearBuckets();
1217     }
1218     return *this;
1219 }
1220 
getBucketIndex(const UnicodeString & name,UErrorCode & status)1221 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1222     initBuckets(status);
1223     if (U_FAILURE(status)) {
1224         return 0;
1225     }
1226     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1227 }
1228 
1229 
getBucketIndex() const1230 int32_t AlphabeticIndex::getBucketIndex() const {
1231     return labelsIterIndex_;
1232 }
1233 
1234 
nextBucket(UErrorCode & status)1235 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1236     if (U_FAILURE(status)) {
1237         return FALSE;
1238     }
1239     if (buckets_ == NULL && currentBucket_ != NULL) {
1240         status = U_ENUM_OUT_OF_SYNC_ERROR;
1241         return FALSE;
1242     }
1243     initBuckets(status);
1244     if (U_FAILURE(status)) {
1245         return FALSE;
1246     }
1247     ++labelsIterIndex_;
1248     if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1249         labelsIterIndex_ = buckets_->getBucketCount();
1250         return FALSE;
1251     }
1252     currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1253     resetRecordIterator();
1254     return TRUE;
1255 }
1256 
getBucketLabel() const1257 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1258     if (currentBucket_ != NULL) {
1259         return currentBucket_->label_;
1260     } else {
1261         return emptyString_;
1262     }
1263 }
1264 
1265 
getBucketLabelType() const1266 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1267     if (currentBucket_ != NULL) {
1268         return currentBucket_->labelType_;
1269     } else {
1270         return U_ALPHAINDEX_NORMAL;
1271     }
1272 }
1273 
1274 
getBucketRecordCount() const1275 int32_t AlphabeticIndex::getBucketRecordCount() const {
1276     if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1277         return currentBucket_->records_->size();
1278     } else {
1279         return 0;
1280     }
1281 }
1282 
1283 
resetBucketIterator(UErrorCode & status)1284 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1285     if (U_FAILURE(status)) {
1286         return *this;
1287     }
1288     internalResetBucketIterator();
1289     return *this;
1290 }
1291 
1292 
nextRecord(UErrorCode & status)1293 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1294     if (U_FAILURE(status)) {
1295         return FALSE;
1296     }
1297     if (currentBucket_ == NULL) {
1298         // We are trying to iterate over the items in a bucket, but there is no
1299         // current bucket from the enumeration of buckets.
1300         status = U_INVALID_STATE_ERROR;
1301         return FALSE;
1302     }
1303     if (buckets_ == NULL) {
1304         status = U_ENUM_OUT_OF_SYNC_ERROR;
1305         return FALSE;
1306     }
1307     if (currentBucket_->records_ == NULL) {
1308         return FALSE;
1309     }
1310     ++itemsIterIndex_;
1311     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1312         itemsIterIndex_  = currentBucket_->records_->size();
1313         return FALSE;
1314     }
1315     return TRUE;
1316 }
1317 
1318 
getRecordName() const1319 const UnicodeString &AlphabeticIndex::getRecordName() const {
1320     const UnicodeString *retStr = &emptyString_;
1321     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1322         itemsIterIndex_ >= 0 &&
1323         itemsIterIndex_ < currentBucket_->records_->size()) {
1324             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1325             retStr = &item->name_;
1326     }
1327     return *retStr;
1328 }
1329 
getRecordData() const1330 const void *AlphabeticIndex::getRecordData() const {
1331     const void *retPtr = NULL;
1332     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1333         itemsIterIndex_ >= 0 &&
1334         itemsIterIndex_ < currentBucket_->records_->size()) {
1335             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1336             retPtr = item->data_;
1337     }
1338     return retPtr;
1339 }
1340 
1341 
resetRecordIterator()1342 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1343     itemsIterIndex_ = -1;
1344     return *this;
1345 }
1346 
1347 
1348 
Bucket(const UnicodeString & label,const UnicodeString & lowerBoundary,UAlphabeticIndexLabelType type)1349 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1350                                 const UnicodeString &lowerBoundary,
1351                                 UAlphabeticIndexLabelType type)
1352         : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1353           displayBucket_(NULL), displayIndex_(-1),
1354           records_(NULL) {
1355 }
1356 
1357 
~Bucket()1358 AlphabeticIndex::Bucket::~Bucket() {
1359     delete records_;
1360 }
1361 
1362 U_NAMESPACE_END
1363 
1364 #endif
1365