• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2009-2011, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7 
8 /**
9  * \file
10  * \brief C API: AlphabeticIndex class
11  */
12 
13 #include "unicode/utypes.h"
14 
15 #include "unicode/alphaindex.h"
16 #include "unicode/coll.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/strenum.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/ulocdata.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uobject.h"
23 #include "unicode/uscript.h"
24 #include "unicode/usetiter.h"
25 #include "unicode/ustring.h"
26 
27 #include "cstring.h"
28 #include "mutex.h"
29 #include "uassert.h"
30 #include "ucln_in.h"
31 #include "uhash.h"
32 #include "uvector.h"
33 
34 //#include <string>
35 // BEGIN android-removed
36 // Apply the change from ICU trunk.
37 // #include <iostream>
38 // END android-removed
39 U_NAMESPACE_BEGIN
40 
41 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
42 
43 // Forward Declarations
44 static int32_t U_CALLCONV
45 PreferenceComparator(const void *context, const void *left, const void *right);
46 
47 static int32_t U_CALLCONV
48 sortCollateComparator(const void *context, const void *left, const void *right);
49 
50 static int32_t U_CALLCONV
51 recordCompareFn(const void *context, const void *left, const void *right);
52 
53 //
54 //  UHash support function, delete a UnicodeSet
55 //     TODO:  move this function into uhash.
56 //
57 static void U_CALLCONV
uhash_deleteUnicodeSet(void * obj)58 uhash_deleteUnicodeSet(void *obj) {
59     delete static_cast<UnicodeSet *>(obj);
60 }
61 
62 //  UVector<Bucket *> support function, delete a Bucket.
63 static void U_CALLCONV
alphaIndex_deleteBucket(void * obj)64 alphaIndex_deleteBucket(void *obj) {
65     delete static_cast<AlphabeticIndex::Bucket *>(obj);
66 }
67 
68 //  UVector<Record *> support function, delete a Record.
69 static void U_CALLCONV
alphaIndex_deleteRecord(void * obj)70 alphaIndex_deleteRecord(void *obj) {
71     delete static_cast<AlphabeticIndex::Record *>(obj);
72 }
73 
74 
75 
76 static const Normalizer2 *nfkdNormalizer;
77 
78 //
79 //  Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
80 //  Append everything - individual characters are handled as strings of length 1.
81 //  The destination vector owns the appended strings.
82 
appendUnicodeSetToUVector(UVector & dest,const UnicodeSet & source,UErrorCode & status)83 static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
84     UnicodeSetIterator setIter(source);
85     while (setIter.next()) {
86         const UnicodeString &str = setIter.getString();
87         dest.addElement(str.clone(), status);
88     }
89 }
90 
91 
AlphabeticIndex(const Locale & locale,UErrorCode & status)92 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
93     init(status);
94     if (U_FAILURE(status)) {
95         return;
96     }
97     locale_ = locale;
98     langType_ = langTypeFromLocale(locale_);
99 
100     collator_ = Collator::createInstance(locale, status);
101     if (collator_ != NULL) {
102         collatorPrimaryOnly_ = collator_->clone();
103     }
104     if (collatorPrimaryOnly_ != NULL) {
105         collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
106     }
107     getIndexExemplars(*initialLabels_, locale, status);
108     indexBuildRequired_ = TRUE;
109     if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
110         status = U_MEMORY_ALLOCATION_ERROR;
111     }
112     firstScriptCharacters_ = firstStringsInScript(status);
113 }
114 
115 
~AlphabeticIndex()116 AlphabeticIndex::~AlphabeticIndex() {
117     uhash_close(alreadyIn_);
118     delete bucketList_;
119     delete collator_;
120     delete collatorPrimaryOnly_;
121     delete firstScriptCharacters_;
122     delete labels_;
123     delete inputRecords_;
124     delete noDistinctSorting_;
125     delete notAlphabetic_;
126     delete initialLabels_;
127 }
128 
129 
addLabels(const UnicodeSet & additions,UErrorCode & status)130 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
131     if (U_FAILURE(status)) {
132         return *this;
133     }
134     initialLabels_->addAll(additions);
135     return *this;
136 }
137 
138 
addLabels(const Locale & locale,UErrorCode & status)139 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
140     if (U_FAILURE(status)) {
141         return *this;
142     }
143     UnicodeSet additions;
144     getIndexExemplars(additions, locale, status);
145     initialLabels_->addAll(additions);
146     return *this;
147 }
148 
149 
getBucketCount(UErrorCode & status)150 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
151     buildIndex(status);
152     if (U_FAILURE(status)) {
153         return 0;
154     }
155     return bucketList_->size();
156 }
157 
158 
getRecordCount(UErrorCode & status)159 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
160     if (U_FAILURE(status)) {
161         return 0;
162     }
163     return inputRecords_->size();
164 }
165 
166 
buildIndex(UErrorCode & status)167 void AlphabeticIndex::buildIndex(UErrorCode &status) {
168     if (U_FAILURE(status)) {
169         return;
170     }
171     if (!indexBuildRequired_) {
172         return;
173     }
174 
175     // Discard any already-built data.
176     // This is important when the user builds and uses an index, then subsequently modifies it,
177     // necessitating a rebuild.
178 
179     bucketList_->removeAllElements();
180     labels_->removeAllElements();
181     uhash_removeAll(alreadyIn_);
182     noDistinctSorting_->clear();
183     notAlphabetic_->clear();
184 
185     // first sort the incoming Labels, with a "best" ordering among items
186     // that are the same according to the collator
187 
188     UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
189     preferenceSorting.setDeleter(uhash_deleteUnicodeString);
190     appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
191     preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
192 
193     // We now make a set of Labels.
194     // Some of the input may, however, be redundant.
195     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
196     // So we make a pass through, filtering out those cases.
197     // TODO: filtering these out would seem to be at odds with the eventual goal
198     //       of being able to split buckets that contain too many items.
199 
200     UnicodeSet labelSet;
201     for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
202         UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
203         // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
204         //        is it even possible for duplicates to show up in this check?
205         if (labelSet.contains(item)) {
206             UnicodeSetIterator itemAlreadyInIter(labelSet);
207             while (itemAlreadyInIter.next()) {
208                 const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
209                 if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
210                     UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
211                     if (targets == NULL) {
212                         // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
213                         targets = new UnicodeSet();
214                         uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
215                     }
216                     targets->add(item);
217                     break;
218                 }
219             }
220         } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
221                    collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
222             noDistinctSorting_->add(item);
223         } else if (!ALPHABETIC->containsSome(item)) {
224             notAlphabetic_->add(item);
225         } else {
226             labelSet.add(item);
227         }
228     }
229 
230     // Move the set of Labels from the set into a vector, and sort
231     // according to the collator.
232 
233     appendUnicodeSetToUVector(*labels_, labelSet, status);
234     labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
235 
236     // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
237     //    Implemented by copying the elements to be retained to a new UVector.
238 
239     const int32_t size = labelSet.size() - 1;
240     if (size > maxLabelCount_) {
241         UVector *newLabels = new UVector(status);
242         newLabels->setDeleter(uhash_deleteUnicodeString);
243         int32_t count = 0;
244         int32_t old = -1;
245         for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
246             const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
247             ++count;
248             const int32_t bump = count * maxLabelCount_ / size;
249             if (bump == old) {
250                 // it.remove();
251             } else {
252                 newLabels->addElement(str->clone(), status);
253                 old = bump;
254             }
255         }
256         delete labels_;
257         labels_ = newLabels;
258     }
259 
260     // We now know the list of labels.
261     // Create a corresponding list of buckets, one per label.
262 
263     buildBucketList(status);    // Corresponds to Java BucketList constructor.
264 
265     // Bin the Records into the Buckets.
266     bucketRecords(status);
267 
268     indexBuildRequired_ = FALSE;
269     resetBucketIterator(status);
270 }
271 
272 //
273 //  buildBucketList()    Corresponds to the BucketList constructor in the Java version.
274 
buildBucketList(UErrorCode & status)275 void AlphabeticIndex::buildBucketList(UErrorCode &status) {
276     UnicodeString labelStr = getUnderflowLabel();
277     Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
278     bucketList_->addElement(b, status);
279 
280     // Build up the list, adding underflow, additions, overflow
281     // insert infix labels as needed, using \uFFFF.
282     const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
283     b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
284     bucketList_->addElement(b, status);
285 
286     UnicodeSet lastSet;
287     UnicodeSet set;
288     AlphabeticIndex::getScriptSet(lastSet, *last, status);
289     lastSet.removeAll(*IGNORE_SCRIPTS);
290 
291     for (int i = 1; i < labels_->size(); ++i) {
292         UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
293         getScriptSet(set, *current, status);
294         set.removeAll(*IGNORE_SCRIPTS);
295         if (lastSet.containsNone(set)) {
296             // check for adjacent
297             const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
298             if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
299                 labelStr = getInflowLabel();
300                 b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
301                 bucketList_->addElement(b, status);
302                 i++;
303                 lastSet = set;
304             }
305         }
306         b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
307         bucketList_->addElement(b, status);
308         last = current;
309         lastSet = set;
310     }
311     const UnicodeString &limitString = getOverflowComparisonString(*last, status);
312     b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
313     bucketList_->addElement(b, status);
314     // final overflow bucket
315 }
316 
317 
318 //
319 //   Place all of the raw input records into the correct bucket.
320 //
321 //       Begin by sorting the input records; this lets us bin them in a single pass.
322 //
323 //       Note on storage management:  The input records are owned by the
324 //       inputRecords_ vector, and will (eventually) be auto-deleted by it.
325 //       The Bucket objects have pointers to the Record objects, but do not own them.
326 //
bucketRecords(UErrorCode & status)327 void AlphabeticIndex::bucketRecords(UErrorCode &status) {
328     if (U_FAILURE(status)) {
329         return;
330     }
331 
332     inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
333     U_ASSERT(bucketList_->size() > 0);   // Should always have at least an overflow
334                                          //   bucket, even if no user labels.
335     int32_t bucketIndex = 0;
336     Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
337     Bucket *nextBucket = NULL;
338     if (bucketIndex+1 < bucketList_->size()) {
339         nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
340     }
341     int32_t recordIndex = 0;
342     Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
343     while (recordIndex < inputRecords_->size()) {
344         if (nextBucket == NULL ||
345             collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
346                 // Record goes in current bucket.  Advance to next record,
347                 // stay on current bucket.
348                 destBucket->records_->addElement(r, status);
349                 ++recordIndex;
350                 r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
351         } else {
352             // Advance to the next bucket, stay on current record.
353             bucketIndex++;
354             destBucket = nextBucket;
355             if (bucketIndex+1 < bucketList_->size()) {
356                 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
357             } else {
358                 nextBucket = NULL;
359             }
360             U_ASSERT(destBucket != NULL);
361         }
362     }
363 
364 }
365 
366 
getIndexExemplars(UnicodeSet & dest,const Locale & locale,UErrorCode & status)367 void AlphabeticIndex::getIndexExemplars(UnicodeSet  &dest, const Locale &locale, UErrorCode &status) {
368     if (U_FAILURE(status)) {
369         return;
370     }
371 
372     LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
373     UnicodeSet exemplars;
374     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
375     if (U_SUCCESS(status)) {
376         dest.addAll(exemplars);
377         return;
378     }
379     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
380 
381     // Locale data did not include explicit Index characters.
382     // Synthesize a set of them from the locale's standard exemplar characters.
383 
384     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
385     if (U_FAILURE(status)) {
386         return;
387     }
388 
389     // Upper-case any that aren't already so.
390     //   (We only do this for synthesized index characters.)
391 
392     UnicodeSetIterator it(exemplars);
393     UnicodeString upperC;
394     UnicodeSet  lowersToRemove;
395     UnicodeSet  uppersToAdd;
396     while (it.next()) {
397         const UnicodeString &exemplarC = it.getString();
398         upperC = exemplarC;
399         upperC.toUpper(locale);
400         if (exemplarC != upperC) {
401             lowersToRemove.add(exemplarC);
402             uppersToAdd.add(upperC);
403         }
404     }
405     exemplars.removeAll(lowersToRemove);
406     exemplars.addAll(uppersToAdd);
407 
408     // get the exemplars, and handle special cases
409 
410     // question: should we add auxiliary exemplars?
411     if (exemplars.containsSome(*CORE_LATIN)) {
412         exemplars.addAll(*CORE_LATIN);
413     }
414     if (exemplars.containsSome(*HANGUL)) {
415         // cut down to small list
416         UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
417         exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
418         exemplars.addAll(*HANGUL);
419     }
420     if (exemplars.containsSome(*ETHIOPIC)) {
421         // cut down to small list
422         // make use of the fact that Ethiopic is allocated in 8's, where
423         // the base is 0 mod 8.
424         UnicodeSetIterator  it(*ETHIOPIC);
425         while (it.next() && !it.isString()) {
426             if ((it.getCodepoint() & 0x7) != 0) {
427                 exemplars.remove(it.getCodepoint());
428             }
429         }
430     }
431     dest.addAll(exemplars);
432 }
433 
434 
435 /*
436  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
437  */
438 static const UChar32 CGJ = (UChar)0x034F;
separated(const UnicodeString & item)439 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
440     UnicodeString result;
441     if (item.length() == 0) {
442         return result;
443     }
444     int32_t i = 0;
445     for (;;) {
446         UChar32  cp = item.char32At(i);
447         result.append(cp);
448         i = item.moveIndex32(i, 1);
449         if (i >= item.length()) {
450             break;
451         }
452         result.append(CGJ);
453     }
454     return result;
455 }
456 
457 
operator ==(const AlphabeticIndex &) const458 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
459     return FALSE;
460 }
461 
462 
operator !=(const AlphabeticIndex &) const463 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
464     return FALSE;
465 }
466 
467 
getCollator() const468 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
469     // There are no known non-RuleBasedCollator collators, and none ever expected.
470     // But, in case that changes, better a null pointer than a wrong type.
471     return *dynamic_cast<RuleBasedCollator *>(collator_);
472 }
473 
474 
getInflowLabel() const475 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
476     return inflowLabel_;
477 }
478 
getOverflowLabel() const479 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
480     return overflowLabel_;
481 }
482 
483 
getUnderflowLabel() const484 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
485     return underflowLabel_;
486 }
487 
488 
setInflowLabel(const UnicodeString & label,UErrorCode &)489 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
490     inflowLabel_ = label;
491     indexBuildRequired_ = TRUE;
492     return *this;
493 }
494 
495 
setOverflowLabel(const UnicodeString & label,UErrorCode &)496 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
497     overflowLabel_ = label;
498     indexBuildRequired_ = TRUE;
499     return *this;
500 }
501 
502 
setUnderflowLabel(const UnicodeString & label,UErrorCode &)503 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
504     underflowLabel_ = label;
505     indexBuildRequired_ = TRUE;
506     return *this;
507 }
508 
509 
getMaxLabelCount() const510 int32_t AlphabeticIndex::getMaxLabelCount() const {
511     return maxLabelCount_;
512 }
513 
514 
setMaxLabelCount(int32_t maxLabelCount,UErrorCode & status)515 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
516     if (U_FAILURE(status)) {
517         return *this;
518     }
519     if (maxLabelCount <= 0) {
520         status = U_ILLEGAL_ARGUMENT_ERROR;
521         return *this;
522     }
523     maxLabelCount_ = maxLabelCount;
524     if (maxLabelCount < bucketList_->size()) {
525         indexBuildRequired_ = TRUE;
526     }
527     return *this;
528 }
529 
530 
getOverflowComparisonString(const UnicodeString & lowerLimit,UErrorCode &)531 const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
532     for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
533         const UnicodeString *s =
534                 static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
535         if (collator_->compare(*s, lowerLimit) > 0) {
536             return *s;
537         }
538     }
539     return *EMPTY_STRING;
540 }
541 
getScriptSet(UnicodeSet & dest,const UnicodeString & codePoint,UErrorCode & status)542 UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
543     if (U_FAILURE(status)) {
544         return &dest;
545     }
546     UChar32 cp = codePoint.char32At(0);
547     UScriptCode scriptCode = uscript_getScript(cp, &status);
548     dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
549     return &dest;
550 }
551 
552 //
553 //  init() - Common code for constructors.
554 //
555 
init(UErrorCode & status)556 void AlphabeticIndex::init(UErrorCode &status) {
557     // Initialize statics if needed.
558     AlphabeticIndex::staticInit(status);
559 
560     // Put the object into a known state so that the destructor will function.
561 
562     alreadyIn_             = NULL;
563     bucketList_            = NULL;
564     collator_              = NULL;
565     collatorPrimaryOnly_   = NULL;
566     currentBucket_         = NULL;
567     firstScriptCharacters_ = NULL;
568     initialLabels_         = NULL;
569     indexBuildRequired_    = TRUE;
570     inputRecords_          = NULL;
571     itemsIterIndex_        = 0;
572     labels_                = NULL;
573     labelsIterIndex_       = 0;
574     maxLabelCount_         = 99;
575     noDistinctSorting_     = NULL;
576     notAlphabetic_         = NULL;
577     recordCounter_         = 0;
578 
579     if (U_FAILURE(status)) {
580         return;
581     }
582     alreadyIn_             = uhash_open(uhash_hashUnicodeString,    // Key Hash,
583                                         uhash_compareUnicodeString, // key Comparator,
584                                         NULL,                       // value Comparator
585                                         &status);
586     uhash_setKeyDeleter(alreadyIn_, uhash_deleteUnicodeString);
587     uhash_setValueDeleter(alreadyIn_, uhash_deleteUnicodeSet);
588 
589     bucketList_            = new UVector(status);
590     bucketList_->setDeleter(alphaIndex_deleteBucket);
591     labels_                = new UVector(status);
592     labels_->setDeleter(uhash_deleteUnicodeString);
593     labels_->setComparer(uhash_compareUnicodeString);
594     inputRecords_          = new UVector(status);
595     inputRecords_->setDeleter(alphaIndex_deleteRecord);
596 
597     noDistinctSorting_     = new UnicodeSet();
598     notAlphabetic_         = new UnicodeSet();
599     initialLabels_         = new UnicodeSet();
600 
601     inflowLabel_.remove();
602     inflowLabel_.append((UChar)0x2026);    // Ellipsis
603     overflowLabel_ = inflowLabel_;
604     underflowLabel_ = inflowLabel_;
605 
606     // TODO:  check for memory allocation failures.
607 }
608 
609 
610 static  UBool  indexCharactersAreInitialized = FALSE;
611 
612 //  Index Characters Clean up function.  Delete statically allocated constant stuff.
613 U_CDECL_BEGIN
indexCharacters_cleanup(void)614 static UBool U_CALLCONV indexCharacters_cleanup(void) {
615     AlphabeticIndex::staticCleanup();
616     return TRUE;
617 }
618 U_CDECL_END
619 
staticCleanup()620 void AlphabeticIndex::staticCleanup() {
621     delete ALPHABETIC;
622     ALPHABETIC = NULL;
623     delete HANGUL;
624     HANGUL = NULL;
625     delete ETHIOPIC;
626     ETHIOPIC = NULL;
627     delete CORE_LATIN;
628     CORE_LATIN = NULL;
629     delete IGNORE_SCRIPTS;
630     IGNORE_SCRIPTS = NULL;
631     delete TO_TRY;
632     TO_TRY = NULL;
633     delete UNIHAN;
634     UNIHAN = NULL;
635     delete EMPTY_STRING;
636     EMPTY_STRING = NULL;
637     nfkdNormalizer = NULL;  // ref to a singleton.  Do not delete.
638     indexCharactersAreInitialized = FALSE;
639 }
640 
641 
642 UnicodeSet *AlphabeticIndex::ALPHABETIC;
643 UnicodeSet *AlphabeticIndex::HANGUL;
644 UnicodeSet *AlphabeticIndex::ETHIOPIC;
645 UnicodeSet *AlphabeticIndex::CORE_LATIN;
646 UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
647 UnicodeSet *AlphabeticIndex::TO_TRY;
648 UnicodeSet *AlphabeticIndex::UNIHAN;
649 const UnicodeString *AlphabeticIndex::EMPTY_STRING;
650 
651 //
652 //  staticInit()    One-time initialization of constants.
653 //                  Thread safe.  Called from constructors.
654 //                  Mutex overhead is not a concern.  AlphabeticIndex constructors are
655 //                  sufficiently heavy that the cost of the mutex check is not significant.
656 
staticInit(UErrorCode & status)657 void AlphabeticIndex::staticInit(UErrorCode &status) {
658     static UMTX IndexCharsInitMutex;
659 
660     Mutex mutex(&IndexCharsInitMutex);
661     if (indexCharactersAreInitialized || U_FAILURE(status)) {
662         return;
663     }
664     UBool finishedInit = FALSE;
665 
666     {
667         UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
668         ALPHABETIC = new UnicodeSet(alphaString, status);
669         if (ALPHABETIC == NULL) {
670             goto err;
671         }
672 
673         HANGUL = new UnicodeSet();
674         HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
675                 add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
676         if (HANGUL== NULL) {
677             goto err;
678         }
679 
680 
681         UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
682         ETHIOPIC = new UnicodeSet(EthiopicStr, status);
683         if (ETHIOPIC == NULL) {
684             goto err;
685         }
686 
687         CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a);  // ('a', 'z');
688         if (CORE_LATIN == NULL) {
689             goto err;
690         }
691 
692         UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
693                 "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
694         IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
695         IGNORE_SCRIPTS->freeze();
696         if (IGNORE_SCRIPTS == NULL) {
697             goto err;
698         }
699 
700         UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
701         TO_TRY = new UnicodeSet(nfcqcStr, status);
702         if (TO_TRY == NULL) {
703             goto err;
704         }
705 
706         UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
707         UNIHAN = new UnicodeSet(unihanStr, status);
708         if (UNIHAN == NULL) {
709             goto err;
710         }
711 
712         EMPTY_STRING = new UnicodeString();
713 
714         nfkdNormalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, status);
715         if (nfkdNormalizer == NULL) {
716             goto err;
717         }
718     }
719     finishedInit = TRUE;
720 
721   err:
722     if (!finishedInit && U_SUCCESS(status)) {
723         status = U_MEMORY_ALLOCATION_ERROR;
724     }
725     if (U_FAILURE(status)) {
726         indexCharacters_cleanup();
727         return;
728     }
729     ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
730     indexCharactersAreInitialized = TRUE;
731 }
732 
733 
734 //
735 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
736 //
737 static int32_t U_CALLCONV
sortCollateComparator(const void * context,const void * left,const void * right)738 sortCollateComparator(const void *context, const void *left, const void *right) {
739     const UHashTok *leftTok = static_cast<const UHashTok *>(left);
740     const UHashTok *rightTok = static_cast<const UHashTok *>(right);
741     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftTok->pointer);
742     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightTok->pointer);
743     const Collator *col = static_cast<const Collator *>(context);
744 
745     if (leftString == rightString) {
746         // Catches case where both are NULL
747         return 0;
748     }
749     if (leftString == NULL) {
750         return 1;
751     };
752     if (rightString == NULL) {
753         return -1;
754     }
755     Collator::EComparisonResult r = col->compare(*leftString, *rightString);
756     return (int32_t) r;
757 }
758 
759 //
760 //  Comparison function for UVector<Record *> sorting with a collator.
761 //
762 static int32_t U_CALLCONV
recordCompareFn(const void * context,const void * left,const void * right)763 recordCompareFn(const void *context, const void *left, const void *right) {
764     const UHashTok *leftTok = static_cast<const UHashTok *>(left);
765     const UHashTok *rightTok = static_cast<const UHashTok *>(right);
766     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftTok->pointer);
767     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightTok->pointer);
768     const Collator *col = static_cast<const Collator *>(context);
769 
770     Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
771     if (r == Collator::EQUAL) {
772         if (leftRec->serialNumber_ < rightRec->serialNumber_) {
773             r = Collator::LESS;
774         } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
775             r = Collator::GREATER;
776         }
777     }
778     return (int32_t) r;
779 }
780 
781 
782 #if 0
783 //
784 //  First characters in scripts.
785 //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
786 //  The vector is sorted according to this index's collation.
787 //
788 //  This code is too slow to use, so for now hard code the data.
789 //    Hard coded implementation is follows.
790 //
791 UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
792 
793     if (U_FAILURE(status)) {
794         return NULL;
795     }
796 
797     UnicodeString results[USCRIPT_CODE_LIMIT];
798     UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
799 
800     UnicodeSetIterator siter(*TO_TRY);
801     while (siter.next()) {
802         const UnicodeString &current = siter.getString();
803         Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
804         if (r < 0) {  // TODO fix; we only want "real" script characters, not
805                       // symbols.
806             continue;
807         }
808 
809         int script = uscript_getScript(current.char32At(0), &status);
810         if (results[script].length() == 0) {
811             results[script] = current;
812         }
813         else if (ruleBasedCollator->compare(current, results[script]) < 0) {
814             results[script] = current;
815         }
816     }
817 
818     UnicodeSet extras;
819     UnicodeSet expansions;
820     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
821     const UCollator *uRuleBasedCollator = rbc->getUCollator();
822     ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
823     extras.addAll(expansions).removeAll(*TO_TRY);
824     if (extras.size() != 0) {
825         const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_COMPOSE, status);
826         UnicodeSetIterator extrasIter(extras);
827         while (extrasIter.next()) {
828             const UnicodeString &current = extrasIter.next();
829             if (!TO_TRY->containsAll(current))
830                 continue;
831             if (!normalizer->isNormalized(current, status) ||
832                 ruleBasedCollator->compare(current, LOWER_A) < 0) {
833                 continue;
834             }
835             int script = uscript_getScript(current.char32At(0), &status);
836             if (results[script].length() == 0) {
837                 results[script] = current;
838             } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
839                 results[script] = current;
840             }
841         }
842     }
843 
844     UVector *dest = new UVector(status);
845     dest->setDeleter(uhash_deleteUnicodeString);
846     for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
847         if (results[i].length() > 0) {
848             dest->addElement(results[i].clone(), status);
849         }
850     }
851     dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
852     return dest;
853 }
854 #endif
855 
856 
857 //
858 //  First characters in scripts.
859 //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
860 //  The vector is sorted according to this index's collation.
861 //
862 //  It takes too much time to compute this from character properties, so hard code it for now.
863 //  Character constants copied from corresponding declaration in ICU4J.
864 
865 static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  { 0x61, 0, 0x03B1, 0,
866             0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
867             0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
868             0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 0xABC0, 0, 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 0x1B83, 0,
869             0xD802, 0xDE00, 0, 0x0E01, 0, 0x0E81, 0, 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
870             0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
871             0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
872             0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 0xD800, 0xDE80, 0,
873             0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
874             0xD801, 0xDC80, 0, 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
875             0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 0x4E00, 0 };
876 
firstStringsInScript(UErrorCode & status)877 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
878     if (U_FAILURE(status)) {
879         return NULL;
880     }
881     UVector *dest = new UVector(status);
882     if (dest == NULL && U_SUCCESS(status)) {
883         status = U_MEMORY_ALLOCATION_ERROR;
884         return NULL;
885     }
886     dest->setDeleter(uhash_deleteUnicodeString);
887     const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
888     const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
889     do {
890         if (U_FAILURE(status)) {
891             return dest;
892         }
893         UnicodeString *str = new UnicodeString(src, -1);
894         if (str == NULL) {
895             status = U_MEMORY_ALLOCATION_ERROR;
896         }
897         dest->addElement(str, status);
898         src += str->length() + 1;
899     } while (src < limit);
900     dest->sortWithUComparator(sortCollateComparator, collator_, status);
901     return dest;
902 }
903 
904 
langTypeFromLocale(const Locale & loc)905 AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
906     const char *lang = loc.getLanguage();
907     if (uprv_strcmp(lang, "zh") != 0) {
908         return kNormal;
909     }
910     const char *script = loc.getScript();
911     if (uprv_strcmp(script, "Hant") == 0) {
912         return kTraditional;
913     }
914     const char *country = loc.getCountry();
915     if (uprv_strcmp(country, "TW") == 0) {
916         return kTraditional;
917     }
918     return kSimplified;
919 }
920 
921 
922 //
923 // Pinyin Hacks.  Direct port from Java.
924 //
925 
926 static const UChar32  probeCharInLong = 0x28EAD;
927 
928 
929 static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = {      // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
930             0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
931             /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
932 
933 
934 // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
935 
936 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
937         {(UChar)0,      (UChar)0, (UChar)0}, // A
938         {(UChar)0x516B, (UChar)0, (UChar)0}, // B
939         {(UChar)0x5693, (UChar)0, (UChar)0}, // C
940         {(UChar)0x5491, (UChar)0, (UChar)0}, // D
941         {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
942         {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
943         {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
944         {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
945         {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
946         {(UChar)0x5494, (UChar)0, (UChar)0}, // K
947         {(UChar)0x5783, (UChar)0, (UChar)0}, // L
948         {(UChar)0x5452, (UChar)0, (UChar)0}, // M
949         {(UChar)0x5514, (UChar)0, (UChar)0}, // N
950         {(UChar)0x5594, (UChar)0, (UChar)0}, // O
951         {(UChar)0x5991, (UChar)0, (UChar)0}, // P
952         {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
953         {(UChar)0x513F, (UChar)0, (UChar)0}, // R
954         {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
955         {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
956         {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
957         {(UChar)0x5915, (UChar)0, (UChar)0}, // X
958         {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
959         {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
960         {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
961     };
962 
963 static const UChar PINYIN_LOWER_BOUNDS_LONG[] = {   // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
964             0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
965             /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
966 
967 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
968         {(UChar)0,      (UChar)0,      (UChar)0}, // A
969         {(UChar)0x516B, (UChar)0,      (UChar)0}, // b
970         {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
971         {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
972         {(UChar)0x59B8, (UChar)0,      (UChar)0}, // e
973         {(UChar)0x53D1, (UChar)0,      (UChar)0}, // f
974         {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
975         {(UChar)0x54C8, (UChar)0,      (UChar)0}, // h
976         {(UChar)0x4E0C, (UChar)0,      (UChar)0}, // j
977         {(UChar)0x5494, (UChar)0,      (UChar)0}, // k
978         {(UChar)0x3547, (UChar)0,      (UChar)0}, // l
979         {(UChar)0x5452, (UChar)0,      (UChar)0}, // m
980         {(UChar)0x5514, (UChar)0,      (UChar)0}, // n
981         {(UChar)0x5594, (UChar)0,      (UChar)0}, // o
982         {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
983         {(UChar)0x4E03, (UChar)0,      (UChar)0}, // q
984         {(UChar)0x513F, (UChar)0,      (UChar)0}, // r
985         {(UChar)0x4EE8, (UChar)0,      (UChar)0}, // s
986         {(UChar)0x4ED6, (UChar)0,      (UChar)0}, // t
987         {(UChar)0x7A75, (UChar)0,      (UChar)0}, // w
988         {(UChar)0x5915, (UChar)0,      (UChar)0}, // x
989         {(UChar)0x4E2B, (UChar)0,      (UChar)0}, // y
990         {(UChar)0x5E00, (UChar)0,      (UChar)0}, // z
991         {(UChar)0xFFFF, (UChar)0,      (UChar)0}, // mark end of array
992     };
993 
994 
995 //
996 //  Probe the collation data, and decide which Pinyin tables should be used
997 //
998 //  ICU can be built with a choice between two Chinese collations.
999 //  The hack Pinyin tables to use depend on which one is in use.
1000 //  We can assume that any given copy of ICU will have only one of the collations available,
1001 //  and that there is no way, in a given process, to create two alphabetic indexes using
1002 //  different Chinese collations.  Which means the probe can be done once
1003 //  and the results cached.
1004 //
1005 //  This whole arrangement is temporary.
1006 //
1007 AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP  = NULL;
1008 const UChar  *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
1009 
initPinyinBounds(const Collator * col,UErrorCode & status)1010 void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
1011     {
1012         Mutex m;
1013         if (PINYIN_LOWER_BOUNDS != NULL) {
1014             return;
1015         }
1016     }
1017     UnicodeSet *colSet = col->getTailoredSet(status);
1018     if (U_FAILURE(status) || colSet == NULL) {
1019         delete colSet;
1020         if (U_SUCCESS(status))  {
1021             status = U_MEMORY_ALLOCATION_ERROR;
1022         }
1023         return;
1024     }
1025     UBool useLongTables = colSet->contains(probeCharInLong);
1026     delete colSet;
1027     {
1028         Mutex m;
1029         if (useLongTables) {
1030             PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
1031             HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_LONG;
1032         } else {
1033             PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
1034             HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_SHORT;
1035         }
1036     }
1037 }
1038 
1039 // Pinyin Hack:
1040 //    Modify a Chinese name by prepending a Latin letter.  The modified name is used
1041 //      when putting records (names) into buckets, to put the name under a Latin index heading.
1042 
hackName(UnicodeString & dest,const UnicodeString & name,const Collator * col)1043 void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
1044 
1045     if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
1046         dest = name;
1047         return;
1048     }
1049 
1050     UErrorCode status = U_ZERO_ERROR;
1051     initPinyinBounds(col, status);
1052     if (U_FAILURE(status)) {
1053         dest = name;
1054         return;
1055     }
1056     // TODO:  use binary search
1057     int index;
1058     for (index=0; ; index++) {
1059         if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
1060             index--;
1061             break;
1062         }
1063         int32_t compareResult = col->compare(name, (*HACK_PINYIN_LOOKUP)[index]);
1064         if (compareResult < 0) {
1065             index--;
1066         }
1067         if (compareResult <= 0) {
1068             break;
1069         }
1070     }
1071     UChar c = PINYIN_LOWER_BOUNDS[index];
1072     dest.setTo(c);
1073     dest.append(name);
1074     return;
1075 }
1076 
1077 
1078 
1079 /**
1080  * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
1081  * better, and otherwise binary order is better.
1082  *
1083  * For use with array sort or UVector.
1084  * @param context  A UErrorCode pointer.
1085  * @param left     A UHashTok pointer, which must refer to a UnicodeString *
1086  * @param right    A UHashTok pointer, which must refer to a UnicodeString *
1087  */
1088 
1089 static int32_t U_CALLCONV
PreferenceComparator(const void * context,const void * left,const void * right)1090 PreferenceComparator(const void *context, const void *left, const void *right) {
1091     const UHashTok *leftTok  = static_cast<const UHashTok *>(left);
1092     const UHashTok *rightTok = static_cast<const UHashTok *>(right);
1093     const UnicodeString *s1  = static_cast<const UnicodeString *>(leftTok->pointer);
1094     const UnicodeString *s2  = static_cast<const UnicodeString *>(rightTok->pointer);
1095     UErrorCode &status       = *(UErrorCode *)(context);   // Cast off both static and const.
1096     if (s1 == s2) {
1097         return 0;
1098     }
1099 
1100     UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
1101     UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
1102     int32_t result = n1.length() - n2.length();
1103     if (result != 0) {
1104         return result;
1105     }
1106 
1107     result = n1.compareCodePointOrder(n2);
1108     if (result != 0) {
1109         return result;
1110     }
1111     return s1->compareCodePointOrder(*s2);
1112 }
1113 
1114 
1115 //
1116 //  Constructor & Destructor for AlphabeticIndex::Record
1117 //
1118 //     Records are internal only, instances are not directly surfaced in the public API.
1119 //     This class is mostly struct-like, with all public fields.
1120 
Record(AlphabeticIndex * alphaIndex,const UnicodeString & name,const void * data)1121 AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
1122     alphaIndex_(alphaIndex), name_(name), data_(data)
1123 {
1124     UnicodeString prefixedName;
1125     alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
1126     serialNumber_ = ++alphaIndex->recordCounter_;
1127 }
1128 
~Record()1129 AlphabeticIndex::Record::~Record() {
1130 }
1131 
1132 
addRecord(const UnicodeString & name,const void * data,UErrorCode & status)1133 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1134     if (U_FAILURE(status)) {
1135         return *this;
1136     }
1137     Record *r = new Record(this, name, data);
1138     inputRecords_->addElement(r, status);
1139     indexBuildRequired_ = TRUE;
1140     //std::string ss;
1141     //std::string ss2;
1142     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1143     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1144     return *this;
1145 }
1146 
1147 
clearRecords(UErrorCode & status)1148 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1149     if (U_FAILURE(status)) {
1150         return *this;
1151     }
1152     inputRecords_->removeAllElements();
1153     indexBuildRequired_ = TRUE;
1154     return *this;
1155 }
1156 
1157 
getBucketIndex(const UnicodeString & name,UErrorCode & status)1158 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1159     buildIndex(status);
1160     if (U_FAILURE(status)) {
1161         return 0;
1162     }
1163 
1164     // For simplified Chinese prepend a prefix to the name.
1165     //   For non-Chinese locales or non-Chinese names, the name is not modified.
1166 
1167     UnicodeString prefixedName;
1168     hackName(prefixedName, name, collatorPrimaryOnly_);
1169 
1170     // TODO:  use a binary search.
1171     for (int32_t i = 0; i < bucketList_->size(); ++i) {
1172         Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
1173         Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
1174         if (comp < 0) {
1175             return i - 1;
1176         }
1177     }
1178     // Loop runs until we find the bucket following the one that would hold prefixedName.
1179     // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
1180     //  than returning from the middle.
1181 
1182     return bucketList_->size() - 1;
1183 }
1184 
1185 
getBucketIndex() const1186 int32_t AlphabeticIndex::getBucketIndex() const {
1187     return labelsIterIndex_;
1188 }
1189 
1190 
nextBucket(UErrorCode & status)1191 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1192     if (U_FAILURE(status)) {
1193         return FALSE;
1194     }
1195     if (indexBuildRequired_ && currentBucket_ != NULL) {
1196         status = U_ENUM_OUT_OF_SYNC_ERROR;
1197         return FALSE;
1198     }
1199     buildIndex(status);
1200     if (U_FAILURE(status)) {
1201         return FALSE;
1202     }
1203     ++labelsIterIndex_;
1204     if (labelsIterIndex_ >= bucketList_->size()) {
1205         labelsIterIndex_ = bucketList_->size();
1206         return FALSE;
1207     }
1208     currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
1209     resetRecordIterator();
1210     return TRUE;
1211 }
1212 
getBucketLabel() const1213 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1214     if (currentBucket_ != NULL) {
1215         return currentBucket_->label_;
1216     } else {
1217         return *EMPTY_STRING;
1218     }
1219 }
1220 
1221 
getBucketLabelType() const1222 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1223     if (currentBucket_ != NULL) {
1224         return currentBucket_->labelType_;
1225     } else {
1226         return U_ALPHAINDEX_NORMAL;
1227     }
1228 }
1229 
1230 
getBucketRecordCount() const1231 int32_t AlphabeticIndex::getBucketRecordCount() const {
1232     if (currentBucket_ != NULL) {
1233         return currentBucket_->records_->size();
1234     } else {
1235         return 0;
1236     }
1237 }
1238 
1239 
resetBucketIterator(UErrorCode & status)1240 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1241     if (U_FAILURE(status)) {
1242         return *this;
1243     }
1244     buildIndex(status);
1245     labelsIterIndex_ = -1;
1246     currentBucket_ = NULL;
1247     return *this;
1248 }
1249 
1250 
nextRecord(UErrorCode & status)1251 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1252     if (U_FAILURE(status)) {
1253         return FALSE;
1254     }
1255     if (currentBucket_ == NULL) {
1256         // We are trying to iterate over the items in a bucket, but there is no
1257         // current bucket from the enumeration of buckets.
1258         status = U_INVALID_STATE_ERROR;
1259         return FALSE;
1260     }
1261     if (indexBuildRequired_) {
1262         status = U_ENUM_OUT_OF_SYNC_ERROR;
1263         return FALSE;
1264     }
1265     ++itemsIterIndex_;
1266     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1267         itemsIterIndex_  = currentBucket_->records_->size();
1268         return FALSE;
1269     }
1270     return TRUE;
1271 }
1272 
1273 
getRecordName() const1274 const UnicodeString &AlphabeticIndex::getRecordName() const {
1275     const UnicodeString *retStr = EMPTY_STRING;
1276     if (currentBucket_ != NULL &&
1277         itemsIterIndex_ >= 0 &&
1278         itemsIterIndex_ < currentBucket_->records_->size()) {
1279             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1280             retStr = &item->name_;
1281     }
1282     return *retStr;
1283 }
1284 
getRecordData() const1285 const void *AlphabeticIndex::getRecordData() const {
1286     const void *retPtr = NULL;
1287     if (currentBucket_ != NULL &&
1288         itemsIterIndex_ >= 0 &&
1289         itemsIterIndex_ < currentBucket_->records_->size()) {
1290             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1291             retPtr = item->data_;
1292     }
1293     return retPtr;
1294 }
1295 
1296 
resetRecordIterator()1297 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1298     itemsIterIndex_ = -1;
1299     return *this;
1300 }
1301 
1302 
1303 
Bucket(const UnicodeString & label,const UnicodeString & lowerBoundary,UAlphabeticIndexLabelType type,UErrorCode & status)1304 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1305                                 const UnicodeString &lowerBoundary,
1306                                 UAlphabeticIndexLabelType type,
1307                                 UErrorCode &status):
1308          label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
1309     if (U_FAILURE(status)) {
1310         return;
1311     }
1312     records_ = new UVector(status);
1313     if (records_ == NULL && U_SUCCESS(status)) {
1314         status = U_MEMORY_ALLOCATION_ERROR;
1315     }
1316 }
1317 
1318 
~Bucket()1319 AlphabeticIndex::Bucket::~Bucket() {
1320     delete records_;
1321 }
1322 
1323 U_NAMESPACE_END
1324