1 /*
2 *******************************************************************************
3 * Copyright (C) 2009-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 /**
9 * \file
10 * \brief C API: AlphabeticIndex class
11 */
12
13 #include "unicode/utypes.h"
14
15 #include "unicode/alphaindex.h"
16 #include "unicode/coll.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/strenum.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/ulocdata.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uobject.h"
23 #include "unicode/uscript.h"
24 #include "unicode/usetiter.h"
25 #include "unicode/ustring.h"
26
27 #include "cstring.h"
28 #include "mutex.h"
29 #include "uassert.h"
30 #include "ucln_in.h"
31 #include "uhash.h"
32 #include "uvector.h"
33
34 //#include <string>
35 // BEGIN android-removed
36 // Apply the change from ICU trunk.
37 // #include <iostream>
38 // END android-removed
39 U_NAMESPACE_BEGIN
40
41 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
42
43 // Forward Declarations
44 static int32_t U_CALLCONV
45 PreferenceComparator(const void *context, const void *left, const void *right);
46
47 static int32_t U_CALLCONV
48 sortCollateComparator(const void *context, const void *left, const void *right);
49
50 static int32_t U_CALLCONV
51 recordCompareFn(const void *context, const void *left, const void *right);
52
53 //
54 // UHash support function, delete a UnicodeSet
55 // TODO: move this function into uhash.
56 //
57 static void U_CALLCONV
uhash_deleteUnicodeSet(void * obj)58 uhash_deleteUnicodeSet(void *obj) {
59 delete static_cast<UnicodeSet *>(obj);
60 }
61
62 // UVector<Bucket *> support function, delete a Bucket.
63 static void U_CALLCONV
alphaIndex_deleteBucket(void * obj)64 alphaIndex_deleteBucket(void *obj) {
65 delete static_cast<AlphabeticIndex::Bucket *>(obj);
66 }
67
68 // UVector<Record *> support function, delete a Record.
69 static void U_CALLCONV
alphaIndex_deleteRecord(void * obj)70 alphaIndex_deleteRecord(void *obj) {
71 delete static_cast<AlphabeticIndex::Record *>(obj);
72 }
73
74
75
76 static const Normalizer2 *nfkdNormalizer;
77
78 //
79 // Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
80 // Append everything - individual characters are handled as strings of length 1.
81 // The destination vector owns the appended strings.
82
appendUnicodeSetToUVector(UVector & dest,const UnicodeSet & source,UErrorCode & status)83 static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
84 UnicodeSetIterator setIter(source);
85 while (setIter.next()) {
86 const UnicodeString &str = setIter.getString();
87 dest.addElement(str.clone(), status);
88 }
89 }
90
91
AlphabeticIndex(const Locale & locale,UErrorCode & status)92 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
93 init(status);
94 if (U_FAILURE(status)) {
95 return;
96 }
97 locale_ = locale;
98 langType_ = langTypeFromLocale(locale_);
99
100 collator_ = Collator::createInstance(locale, status);
101 if (collator_ != NULL) {
102 collatorPrimaryOnly_ = collator_->clone();
103 }
104 if (collatorPrimaryOnly_ != NULL) {
105 collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
106 }
107 getIndexExemplars(*initialLabels_, locale, status);
108 indexBuildRequired_ = TRUE;
109 if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
110 status = U_MEMORY_ALLOCATION_ERROR;
111 }
112 firstScriptCharacters_ = firstStringsInScript(status);
113 }
114
115
~AlphabeticIndex()116 AlphabeticIndex::~AlphabeticIndex() {
117 uhash_close(alreadyIn_);
118 delete bucketList_;
119 delete collator_;
120 delete collatorPrimaryOnly_;
121 delete firstScriptCharacters_;
122 delete labels_;
123 delete inputRecords_;
124 delete noDistinctSorting_;
125 delete notAlphabetic_;
126 delete initialLabels_;
127 }
128
129
addLabels(const UnicodeSet & additions,UErrorCode & status)130 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
131 if (U_FAILURE(status)) {
132 return *this;
133 }
134 initialLabels_->addAll(additions);
135 return *this;
136 }
137
138
addLabels(const Locale & locale,UErrorCode & status)139 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
140 if (U_FAILURE(status)) {
141 return *this;
142 }
143 UnicodeSet additions;
144 getIndexExemplars(additions, locale, status);
145 initialLabels_->addAll(additions);
146 return *this;
147 }
148
149
getBucketCount(UErrorCode & status)150 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
151 buildIndex(status);
152 if (U_FAILURE(status)) {
153 return 0;
154 }
155 return bucketList_->size();
156 }
157
158
getRecordCount(UErrorCode & status)159 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
160 if (U_FAILURE(status)) {
161 return 0;
162 }
163 return inputRecords_->size();
164 }
165
166
buildIndex(UErrorCode & status)167 void AlphabeticIndex::buildIndex(UErrorCode &status) {
168 if (U_FAILURE(status)) {
169 return;
170 }
171 if (!indexBuildRequired_) {
172 return;
173 }
174
175 // Discard any already-built data.
176 // This is important when the user builds and uses an index, then subsequently modifies it,
177 // necessitating a rebuild.
178
179 bucketList_->removeAllElements();
180 labels_->removeAllElements();
181 uhash_removeAll(alreadyIn_);
182 noDistinctSorting_->clear();
183 notAlphabetic_->clear();
184
185 // first sort the incoming Labels, with a "best" ordering among items
186 // that are the same according to the collator
187
188 UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector.
189 preferenceSorting.setDeleter(uhash_deleteUnicodeString);
190 appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
191 preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
192
193 // We now make a set of Labels.
194 // Some of the input may, however, be redundant.
195 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
196 // So we make a pass through, filtering out those cases.
197 // TODO: filtering these out would seem to be at odds with the eventual goal
198 // of being able to split buckets that contain too many items.
199
200 UnicodeSet labelSet;
201 for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
202 UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
203 // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
204 // is it even possible for duplicates to show up in this check?
205 if (labelSet.contains(item)) {
206 UnicodeSetIterator itemAlreadyInIter(labelSet);
207 while (itemAlreadyInIter.next()) {
208 const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
209 if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
210 UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
211 if (targets == NULL) {
212 // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
213 targets = new UnicodeSet();
214 uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
215 }
216 targets->add(item);
217 break;
218 }
219 }
220 } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point.
221 collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
222 noDistinctSorting_->add(item);
223 } else if (!ALPHABETIC->containsSome(item)) {
224 notAlphabetic_->add(item);
225 } else {
226 labelSet.add(item);
227 }
228 }
229
230 // Move the set of Labels from the set into a vector, and sort
231 // according to the collator.
232
233 appendUnicodeSetToUVector(*labels_, labelSet, status);
234 labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
235
236 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
237 // Implemented by copying the elements to be retained to a new UVector.
238
239 const int32_t size = labelSet.size() - 1;
240 if (size > maxLabelCount_) {
241 UVector *newLabels = new UVector(status);
242 newLabels->setDeleter(uhash_deleteUnicodeString);
243 int32_t count = 0;
244 int32_t old = -1;
245 for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
246 const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
247 ++count;
248 const int32_t bump = count * maxLabelCount_ / size;
249 if (bump == old) {
250 // it.remove();
251 } else {
252 newLabels->addElement(str->clone(), status);
253 old = bump;
254 }
255 }
256 delete labels_;
257 labels_ = newLabels;
258 }
259
260 // We now know the list of labels.
261 // Create a corresponding list of buckets, one per label.
262
263 buildBucketList(status); // Corresponds to Java BucketList constructor.
264
265 // Bin the Records into the Buckets.
266 bucketRecords(status);
267
268 indexBuildRequired_ = FALSE;
269 resetBucketIterator(status);
270 }
271
272 //
273 // buildBucketList() Corresponds to the BucketList constructor in the Java version.
274
buildBucketList(UErrorCode & status)275 void AlphabeticIndex::buildBucketList(UErrorCode &status) {
276 UnicodeString labelStr = getUnderflowLabel();
277 Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
278 bucketList_->addElement(b, status);
279
280 // Build up the list, adding underflow, additions, overflow
281 // insert infix labels as needed, using \uFFFF.
282 const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
283 b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
284 bucketList_->addElement(b, status);
285
286 UnicodeSet lastSet;
287 UnicodeSet set;
288 AlphabeticIndex::getScriptSet(lastSet, *last, status);
289 lastSet.removeAll(*IGNORE_SCRIPTS);
290
291 for (int i = 1; i < labels_->size(); ++i) {
292 UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
293 getScriptSet(set, *current, status);
294 set.removeAll(*IGNORE_SCRIPTS);
295 if (lastSet.containsNone(set)) {
296 // check for adjacent
297 const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
298 if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
299 labelStr = getInflowLabel();
300 b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
301 bucketList_->addElement(b, status);
302 i++;
303 lastSet = set;
304 }
305 }
306 b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
307 bucketList_->addElement(b, status);
308 last = current;
309 lastSet = set;
310 }
311 const UnicodeString &limitString = getOverflowComparisonString(*last, status);
312 b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
313 bucketList_->addElement(b, status);
314 // final overflow bucket
315 }
316
317
318 //
319 // Place all of the raw input records into the correct bucket.
320 //
321 // Begin by sorting the input records; this lets us bin them in a single pass.
322 //
323 // Note on storage management: The input records are owned by the
324 // inputRecords_ vector, and will (eventually) be auto-deleted by it.
325 // The Bucket objects have pointers to the Record objects, but do not own them.
326 //
bucketRecords(UErrorCode & status)327 void AlphabeticIndex::bucketRecords(UErrorCode &status) {
328 if (U_FAILURE(status)) {
329 return;
330 }
331
332 inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
333 U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow
334 // bucket, even if no user labels.
335 int32_t bucketIndex = 0;
336 Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
337 Bucket *nextBucket = NULL;
338 if (bucketIndex+1 < bucketList_->size()) {
339 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
340 }
341 int32_t recordIndex = 0;
342 Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
343 while (recordIndex < inputRecords_->size()) {
344 if (nextBucket == NULL ||
345 collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
346 // Record goes in current bucket. Advance to next record,
347 // stay on current bucket.
348 destBucket->records_->addElement(r, status);
349 ++recordIndex;
350 r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
351 } else {
352 // Advance to the next bucket, stay on current record.
353 bucketIndex++;
354 destBucket = nextBucket;
355 if (bucketIndex+1 < bucketList_->size()) {
356 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
357 } else {
358 nextBucket = NULL;
359 }
360 U_ASSERT(destBucket != NULL);
361 }
362 }
363
364 }
365
366
getIndexExemplars(UnicodeSet & dest,const Locale & locale,UErrorCode & status)367 void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
368 if (U_FAILURE(status)) {
369 return;
370 }
371
372 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
373 UnicodeSet exemplars;
374 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
375 if (U_SUCCESS(status)) {
376 dest.addAll(exemplars);
377 return;
378 }
379 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
380
381 // Locale data did not include explicit Index characters.
382 // Synthesize a set of them from the locale's standard exemplar characters.
383
384 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
385 if (U_FAILURE(status)) {
386 return;
387 }
388
389 // Upper-case any that aren't already so.
390 // (We only do this for synthesized index characters.)
391
392 UnicodeSetIterator it(exemplars);
393 UnicodeString upperC;
394 UnicodeSet lowersToRemove;
395 UnicodeSet uppersToAdd;
396 while (it.next()) {
397 const UnicodeString &exemplarC = it.getString();
398 upperC = exemplarC;
399 upperC.toUpper(locale);
400 if (exemplarC != upperC) {
401 lowersToRemove.add(exemplarC);
402 uppersToAdd.add(upperC);
403 }
404 }
405 exemplars.removeAll(lowersToRemove);
406 exemplars.addAll(uppersToAdd);
407
408 // get the exemplars, and handle special cases
409
410 // question: should we add auxiliary exemplars?
411 if (exemplars.containsSome(*CORE_LATIN)) {
412 exemplars.addAll(*CORE_LATIN);
413 }
414 if (exemplars.containsSome(*HANGUL)) {
415 // cut down to small list
416 UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
417 exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
418 exemplars.addAll(*HANGUL);
419 }
420 if (exemplars.containsSome(*ETHIOPIC)) {
421 // cut down to small list
422 // make use of the fact that Ethiopic is allocated in 8's, where
423 // the base is 0 mod 8.
424 UnicodeSetIterator it(*ETHIOPIC);
425 while (it.next() && !it.isString()) {
426 if ((it.getCodepoint() & 0x7) != 0) {
427 exemplars.remove(it.getCodepoint());
428 }
429 }
430 }
431 dest.addAll(exemplars);
432 }
433
434
435 /*
436 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
437 */
438 static const UChar32 CGJ = (UChar)0x034F;
separated(const UnicodeString & item)439 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
440 UnicodeString result;
441 if (item.length() == 0) {
442 return result;
443 }
444 int32_t i = 0;
445 for (;;) {
446 UChar32 cp = item.char32At(i);
447 result.append(cp);
448 i = item.moveIndex32(i, 1);
449 if (i >= item.length()) {
450 break;
451 }
452 result.append(CGJ);
453 }
454 return result;
455 }
456
457
operator ==(const AlphabeticIndex &) const458 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
459 return FALSE;
460 }
461
462
operator !=(const AlphabeticIndex &) const463 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
464 return FALSE;
465 }
466
467
getCollator() const468 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
469 // There are no known non-RuleBasedCollator collators, and none ever expected.
470 // But, in case that changes, better a null pointer than a wrong type.
471 return *dynamic_cast<RuleBasedCollator *>(collator_);
472 }
473
474
getInflowLabel() const475 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
476 return inflowLabel_;
477 }
478
getOverflowLabel() const479 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
480 return overflowLabel_;
481 }
482
483
getUnderflowLabel() const484 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
485 return underflowLabel_;
486 }
487
488
setInflowLabel(const UnicodeString & label,UErrorCode &)489 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
490 inflowLabel_ = label;
491 indexBuildRequired_ = TRUE;
492 return *this;
493 }
494
495
setOverflowLabel(const UnicodeString & label,UErrorCode &)496 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
497 overflowLabel_ = label;
498 indexBuildRequired_ = TRUE;
499 return *this;
500 }
501
502
setUnderflowLabel(const UnicodeString & label,UErrorCode &)503 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
504 underflowLabel_ = label;
505 indexBuildRequired_ = TRUE;
506 return *this;
507 }
508
509
getMaxLabelCount() const510 int32_t AlphabeticIndex::getMaxLabelCount() const {
511 return maxLabelCount_;
512 }
513
514
setMaxLabelCount(int32_t maxLabelCount,UErrorCode & status)515 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
516 if (U_FAILURE(status)) {
517 return *this;
518 }
519 if (maxLabelCount <= 0) {
520 status = U_ILLEGAL_ARGUMENT_ERROR;
521 return *this;
522 }
523 maxLabelCount_ = maxLabelCount;
524 if (maxLabelCount < bucketList_->size()) {
525 indexBuildRequired_ = TRUE;
526 }
527 return *this;
528 }
529
530
getOverflowComparisonString(const UnicodeString & lowerLimit,UErrorCode &)531 const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
532 for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
533 const UnicodeString *s =
534 static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
535 if (collator_->compare(*s, lowerLimit) > 0) {
536 return *s;
537 }
538 }
539 return *EMPTY_STRING;
540 }
541
getScriptSet(UnicodeSet & dest,const UnicodeString & codePoint,UErrorCode & status)542 UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
543 if (U_FAILURE(status)) {
544 return &dest;
545 }
546 UChar32 cp = codePoint.char32At(0);
547 UScriptCode scriptCode = uscript_getScript(cp, &status);
548 dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
549 return &dest;
550 }
551
552 //
553 // init() - Common code for constructors.
554 //
555
init(UErrorCode & status)556 void AlphabeticIndex::init(UErrorCode &status) {
557 // Initialize statics if needed.
558 AlphabeticIndex::staticInit(status);
559
560 // Put the object into a known state so that the destructor will function.
561
562 alreadyIn_ = NULL;
563 bucketList_ = NULL;
564 collator_ = NULL;
565 collatorPrimaryOnly_ = NULL;
566 currentBucket_ = NULL;
567 firstScriptCharacters_ = NULL;
568 initialLabels_ = NULL;
569 indexBuildRequired_ = TRUE;
570 inputRecords_ = NULL;
571 itemsIterIndex_ = 0;
572 labels_ = NULL;
573 labelsIterIndex_ = 0;
574 maxLabelCount_ = 99;
575 noDistinctSorting_ = NULL;
576 notAlphabetic_ = NULL;
577 recordCounter_ = 0;
578
579 if (U_FAILURE(status)) {
580 return;
581 }
582 alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash,
583 uhash_compareUnicodeString, // key Comparator,
584 NULL, // value Comparator
585 &status);
586 uhash_setKeyDeleter(alreadyIn_, uhash_deleteUnicodeString);
587 uhash_setValueDeleter(alreadyIn_, uhash_deleteUnicodeSet);
588
589 bucketList_ = new UVector(status);
590 bucketList_->setDeleter(alphaIndex_deleteBucket);
591 labels_ = new UVector(status);
592 labels_->setDeleter(uhash_deleteUnicodeString);
593 labels_->setComparer(uhash_compareUnicodeString);
594 inputRecords_ = new UVector(status);
595 inputRecords_->setDeleter(alphaIndex_deleteRecord);
596
597 noDistinctSorting_ = new UnicodeSet();
598 notAlphabetic_ = new UnicodeSet();
599 initialLabels_ = new UnicodeSet();
600
601 inflowLabel_.remove();
602 inflowLabel_.append((UChar)0x2026); // Ellipsis
603 overflowLabel_ = inflowLabel_;
604 underflowLabel_ = inflowLabel_;
605
606 // TODO: check for memory allocation failures.
607 }
608
609
610 static UBool indexCharactersAreInitialized = FALSE;
611
612 // Index Characters Clean up function. Delete statically allocated constant stuff.
613 U_CDECL_BEGIN
indexCharacters_cleanup(void)614 static UBool U_CALLCONV indexCharacters_cleanup(void) {
615 AlphabeticIndex::staticCleanup();
616 return TRUE;
617 }
618 U_CDECL_END
619
staticCleanup()620 void AlphabeticIndex::staticCleanup() {
621 delete ALPHABETIC;
622 ALPHABETIC = NULL;
623 delete HANGUL;
624 HANGUL = NULL;
625 delete ETHIOPIC;
626 ETHIOPIC = NULL;
627 delete CORE_LATIN;
628 CORE_LATIN = NULL;
629 delete IGNORE_SCRIPTS;
630 IGNORE_SCRIPTS = NULL;
631 delete TO_TRY;
632 TO_TRY = NULL;
633 delete UNIHAN;
634 UNIHAN = NULL;
635 delete EMPTY_STRING;
636 EMPTY_STRING = NULL;
637 nfkdNormalizer = NULL; // ref to a singleton. Do not delete.
638 indexCharactersAreInitialized = FALSE;
639 }
640
641
642 UnicodeSet *AlphabeticIndex::ALPHABETIC;
643 UnicodeSet *AlphabeticIndex::HANGUL;
644 UnicodeSet *AlphabeticIndex::ETHIOPIC;
645 UnicodeSet *AlphabeticIndex::CORE_LATIN;
646 UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
647 UnicodeSet *AlphabeticIndex::TO_TRY;
648 UnicodeSet *AlphabeticIndex::UNIHAN;
649 const UnicodeString *AlphabeticIndex::EMPTY_STRING;
650
651 //
652 // staticInit() One-time initialization of constants.
653 // Thread safe. Called from constructors.
654 // Mutex overhead is not a concern. AlphabeticIndex constructors are
655 // sufficiently heavy that the cost of the mutex check is not significant.
656
staticInit(UErrorCode & status)657 void AlphabeticIndex::staticInit(UErrorCode &status) {
658 static UMTX IndexCharsInitMutex;
659
660 Mutex mutex(&IndexCharsInitMutex);
661 if (indexCharactersAreInitialized || U_FAILURE(status)) {
662 return;
663 }
664 UBool finishedInit = FALSE;
665
666 {
667 UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
668 ALPHABETIC = new UnicodeSet(alphaString, status);
669 if (ALPHABETIC == NULL) {
670 goto err;
671 }
672
673 HANGUL = new UnicodeSet();
674 HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
675 add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
676 if (HANGUL== NULL) {
677 goto err;
678 }
679
680
681 UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
682 ETHIOPIC = new UnicodeSet(EthiopicStr, status);
683 if (ETHIOPIC == NULL) {
684 goto err;
685 }
686
687 CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z');
688 if (CORE_LATIN == NULL) {
689 goto err;
690 }
691
692 UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
693 "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
694 IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
695 IGNORE_SCRIPTS->freeze();
696 if (IGNORE_SCRIPTS == NULL) {
697 goto err;
698 }
699
700 UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
701 TO_TRY = new UnicodeSet(nfcqcStr, status);
702 if (TO_TRY == NULL) {
703 goto err;
704 }
705
706 UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
707 UNIHAN = new UnicodeSet(unihanStr, status);
708 if (UNIHAN == NULL) {
709 goto err;
710 }
711
712 EMPTY_STRING = new UnicodeString();
713
714 nfkdNormalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, status);
715 if (nfkdNormalizer == NULL) {
716 goto err;
717 }
718 }
719 finishedInit = TRUE;
720
721 err:
722 if (!finishedInit && U_SUCCESS(status)) {
723 status = U_MEMORY_ALLOCATION_ERROR;
724 }
725 if (U_FAILURE(status)) {
726 indexCharacters_cleanup();
727 return;
728 }
729 ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
730 indexCharactersAreInitialized = TRUE;
731 }
732
733
734 //
735 // Comparison function for UVector<UnicodeString *> sorting with a collator.
736 //
737 static int32_t U_CALLCONV
sortCollateComparator(const void * context,const void * left,const void * right)738 sortCollateComparator(const void *context, const void *left, const void *right) {
739 const UHashTok *leftTok = static_cast<const UHashTok *>(left);
740 const UHashTok *rightTok = static_cast<const UHashTok *>(right);
741 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftTok->pointer);
742 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightTok->pointer);
743 const Collator *col = static_cast<const Collator *>(context);
744
745 if (leftString == rightString) {
746 // Catches case where both are NULL
747 return 0;
748 }
749 if (leftString == NULL) {
750 return 1;
751 };
752 if (rightString == NULL) {
753 return -1;
754 }
755 Collator::EComparisonResult r = col->compare(*leftString, *rightString);
756 return (int32_t) r;
757 }
758
759 //
760 // Comparison function for UVector<Record *> sorting with a collator.
761 //
762 static int32_t U_CALLCONV
recordCompareFn(const void * context,const void * left,const void * right)763 recordCompareFn(const void *context, const void *left, const void *right) {
764 const UHashTok *leftTok = static_cast<const UHashTok *>(left);
765 const UHashTok *rightTok = static_cast<const UHashTok *>(right);
766 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftTok->pointer);
767 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightTok->pointer);
768 const Collator *col = static_cast<const Collator *>(context);
769
770 Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
771 if (r == Collator::EQUAL) {
772 if (leftRec->serialNumber_ < rightRec->serialNumber_) {
773 r = Collator::LESS;
774 } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
775 r = Collator::GREATER;
776 }
777 }
778 return (int32_t) r;
779 }
780
781
782 #if 0
783 //
784 // First characters in scripts.
785 // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
786 // The vector is sorted according to this index's collation.
787 //
788 // This code is too slow to use, so for now hard code the data.
789 // Hard coded implementation is follows.
790 //
791 UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
792
793 if (U_FAILURE(status)) {
794 return NULL;
795 }
796
797 UnicodeString results[USCRIPT_CODE_LIMIT];
798 UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
799
800 UnicodeSetIterator siter(*TO_TRY);
801 while (siter.next()) {
802 const UnicodeString ¤t = siter.getString();
803 Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
804 if (r < 0) { // TODO fix; we only want "real" script characters, not
805 // symbols.
806 continue;
807 }
808
809 int script = uscript_getScript(current.char32At(0), &status);
810 if (results[script].length() == 0) {
811 results[script] = current;
812 }
813 else if (ruleBasedCollator->compare(current, results[script]) < 0) {
814 results[script] = current;
815 }
816 }
817
818 UnicodeSet extras;
819 UnicodeSet expansions;
820 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
821 const UCollator *uRuleBasedCollator = rbc->getUCollator();
822 ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
823 extras.addAll(expansions).removeAll(*TO_TRY);
824 if (extras.size() != 0) {
825 const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_COMPOSE, status);
826 UnicodeSetIterator extrasIter(extras);
827 while (extrasIter.next()) {
828 const UnicodeString ¤t = extrasIter.next();
829 if (!TO_TRY->containsAll(current))
830 continue;
831 if (!normalizer->isNormalized(current, status) ||
832 ruleBasedCollator->compare(current, LOWER_A) < 0) {
833 continue;
834 }
835 int script = uscript_getScript(current.char32At(0), &status);
836 if (results[script].length() == 0) {
837 results[script] = current;
838 } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
839 results[script] = current;
840 }
841 }
842 }
843
844 UVector *dest = new UVector(status);
845 dest->setDeleter(uhash_deleteUnicodeString);
846 for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
847 if (results[i].length() > 0) {
848 dest->addElement(results[i].clone(), status);
849 }
850 }
851 dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
852 return dest;
853 }
854 #endif
855
856
857 //
858 // First characters in scripts.
859 // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
860 // The vector is sorted according to this index's collation.
861 //
862 // It takes too much time to compute this from character properties, so hard code it for now.
863 // Character constants copied from corresponding declaration in ICU4J.
864
865 static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0,
866 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
867 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
868 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 0xABC0, 0, 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 0x1B83, 0,
869 0xD802, 0xDE00, 0, 0x0E01, 0, 0x0E81, 0, 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
870 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
871 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
872 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 0xD800, 0xDE80, 0,
873 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
874 0xD801, 0xDC80, 0, 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
875 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 0x4E00, 0 };
876
firstStringsInScript(UErrorCode & status)877 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
878 if (U_FAILURE(status)) {
879 return NULL;
880 }
881 UVector *dest = new UVector(status);
882 if (dest == NULL && U_SUCCESS(status)) {
883 status = U_MEMORY_ALLOCATION_ERROR;
884 return NULL;
885 }
886 dest->setDeleter(uhash_deleteUnicodeString);
887 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
888 const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
889 do {
890 if (U_FAILURE(status)) {
891 return dest;
892 }
893 UnicodeString *str = new UnicodeString(src, -1);
894 if (str == NULL) {
895 status = U_MEMORY_ALLOCATION_ERROR;
896 }
897 dest->addElement(str, status);
898 src += str->length() + 1;
899 } while (src < limit);
900 dest->sortWithUComparator(sortCollateComparator, collator_, status);
901 return dest;
902 }
903
904
langTypeFromLocale(const Locale & loc)905 AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
906 const char *lang = loc.getLanguage();
907 if (uprv_strcmp(lang, "zh") != 0) {
908 return kNormal;
909 }
910 const char *script = loc.getScript();
911 if (uprv_strcmp(script, "Hant") == 0) {
912 return kTraditional;
913 }
914 const char *country = loc.getCountry();
915 if (uprv_strcmp(country, "TW") == 0) {
916 return kTraditional;
917 }
918 return kSimplified;
919 }
920
921
922 //
923 // Pinyin Hacks. Direct port from Java.
924 //
925
926 static const UChar32 probeCharInLong = 0x28EAD;
927
928
929 static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
930 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
931 /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
932
933
934 // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
935
936 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
937 {(UChar)0, (UChar)0, (UChar)0}, // A
938 {(UChar)0x516B, (UChar)0, (UChar)0}, // B
939 {(UChar)0x5693, (UChar)0, (UChar)0}, // C
940 {(UChar)0x5491, (UChar)0, (UChar)0}, // D
941 {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
942 {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
943 {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
944 {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
945 {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
946 {(UChar)0x5494, (UChar)0, (UChar)0}, // K
947 {(UChar)0x5783, (UChar)0, (UChar)0}, // L
948 {(UChar)0x5452, (UChar)0, (UChar)0}, // M
949 {(UChar)0x5514, (UChar)0, (UChar)0}, // N
950 {(UChar)0x5594, (UChar)0, (UChar)0}, // O
951 {(UChar)0x5991, (UChar)0, (UChar)0}, // P
952 {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
953 {(UChar)0x513F, (UChar)0, (UChar)0}, // R
954 {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
955 {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
956 {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
957 {(UChar)0x5915, (UChar)0, (UChar)0}, // X
958 {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
959 {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
960 {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
961 };
962
963 static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
964 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
965 /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
966
967 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
968 {(UChar)0, (UChar)0, (UChar)0}, // A
969 {(UChar)0x516B, (UChar)0, (UChar)0}, // b
970 {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
971 {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
972 {(UChar)0x59B8, (UChar)0, (UChar)0}, // e
973 {(UChar)0x53D1, (UChar)0, (UChar)0}, // f
974 {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
975 {(UChar)0x54C8, (UChar)0, (UChar)0}, // h
976 {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j
977 {(UChar)0x5494, (UChar)0, (UChar)0}, // k
978 {(UChar)0x3547, (UChar)0, (UChar)0}, // l
979 {(UChar)0x5452, (UChar)0, (UChar)0}, // m
980 {(UChar)0x5514, (UChar)0, (UChar)0}, // n
981 {(UChar)0x5594, (UChar)0, (UChar)0}, // o
982 {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
983 {(UChar)0x4E03, (UChar)0, (UChar)0}, // q
984 {(UChar)0x513F, (UChar)0, (UChar)0}, // r
985 {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s
986 {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t
987 {(UChar)0x7A75, (UChar)0, (UChar)0}, // w
988 {(UChar)0x5915, (UChar)0, (UChar)0}, // x
989 {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y
990 {(UChar)0x5E00, (UChar)0, (UChar)0}, // z
991 {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
992 };
993
994
995 //
996 // Probe the collation data, and decide which Pinyin tables should be used
997 //
998 // ICU can be built with a choice between two Chinese collations.
999 // The hack Pinyin tables to use depend on which one is in use.
1000 // We can assume that any given copy of ICU will have only one of the collations available,
1001 // and that there is no way, in a given process, to create two alphabetic indexes using
1002 // different Chinese collations. Which means the probe can be done once
1003 // and the results cached.
1004 //
1005 // This whole arrangement is temporary.
1006 //
1007 AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL;
1008 const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
1009
initPinyinBounds(const Collator * col,UErrorCode & status)1010 void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
1011 {
1012 Mutex m;
1013 if (PINYIN_LOWER_BOUNDS != NULL) {
1014 return;
1015 }
1016 }
1017 UnicodeSet *colSet = col->getTailoredSet(status);
1018 if (U_FAILURE(status) || colSet == NULL) {
1019 delete colSet;
1020 if (U_SUCCESS(status)) {
1021 status = U_MEMORY_ALLOCATION_ERROR;
1022 }
1023 return;
1024 }
1025 UBool useLongTables = colSet->contains(probeCharInLong);
1026 delete colSet;
1027 {
1028 Mutex m;
1029 if (useLongTables) {
1030 PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
1031 HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG;
1032 } else {
1033 PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
1034 HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT;
1035 }
1036 }
1037 }
1038
1039 // Pinyin Hack:
1040 // Modify a Chinese name by prepending a Latin letter. The modified name is used
1041 // when putting records (names) into buckets, to put the name under a Latin index heading.
1042
hackName(UnicodeString & dest,const UnicodeString & name,const Collator * col)1043 void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
1044
1045 if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
1046 dest = name;
1047 return;
1048 }
1049
1050 UErrorCode status = U_ZERO_ERROR;
1051 initPinyinBounds(col, status);
1052 if (U_FAILURE(status)) {
1053 dest = name;
1054 return;
1055 }
1056 // TODO: use binary search
1057 int index;
1058 for (index=0; ; index++) {
1059 if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
1060 index--;
1061 break;
1062 }
1063 int32_t compareResult = col->compare(name, (*HACK_PINYIN_LOOKUP)[index]);
1064 if (compareResult < 0) {
1065 index--;
1066 }
1067 if (compareResult <= 0) {
1068 break;
1069 }
1070 }
1071 UChar c = PINYIN_LOWER_BOUNDS[index];
1072 dest.setTo(c);
1073 dest.append(name);
1074 return;
1075 }
1076
1077
1078
1079 /**
1080 * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
1081 * better, and otherwise binary order is better.
1082 *
1083 * For use with array sort or UVector.
1084 * @param context A UErrorCode pointer.
1085 * @param left A UHashTok pointer, which must refer to a UnicodeString *
1086 * @param right A UHashTok pointer, which must refer to a UnicodeString *
1087 */
1088
1089 static int32_t U_CALLCONV
PreferenceComparator(const void * context,const void * left,const void * right)1090 PreferenceComparator(const void *context, const void *left, const void *right) {
1091 const UHashTok *leftTok = static_cast<const UHashTok *>(left);
1092 const UHashTok *rightTok = static_cast<const UHashTok *>(right);
1093 const UnicodeString *s1 = static_cast<const UnicodeString *>(leftTok->pointer);
1094 const UnicodeString *s2 = static_cast<const UnicodeString *>(rightTok->pointer);
1095 UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const.
1096 if (s1 == s2) {
1097 return 0;
1098 }
1099
1100 UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
1101 UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
1102 int32_t result = n1.length() - n2.length();
1103 if (result != 0) {
1104 return result;
1105 }
1106
1107 result = n1.compareCodePointOrder(n2);
1108 if (result != 0) {
1109 return result;
1110 }
1111 return s1->compareCodePointOrder(*s2);
1112 }
1113
1114
1115 //
1116 // Constructor & Destructor for AlphabeticIndex::Record
1117 //
1118 // Records are internal only, instances are not directly surfaced in the public API.
1119 // This class is mostly struct-like, with all public fields.
1120
Record(AlphabeticIndex * alphaIndex,const UnicodeString & name,const void * data)1121 AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
1122 alphaIndex_(alphaIndex), name_(name), data_(data)
1123 {
1124 UnicodeString prefixedName;
1125 alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
1126 serialNumber_ = ++alphaIndex->recordCounter_;
1127 }
1128
~Record()1129 AlphabeticIndex::Record::~Record() {
1130 }
1131
1132
addRecord(const UnicodeString & name,const void * data,UErrorCode & status)1133 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1134 if (U_FAILURE(status)) {
1135 return *this;
1136 }
1137 Record *r = new Record(this, name, data);
1138 inputRecords_->addElement(r, status);
1139 indexBuildRequired_ = TRUE;
1140 //std::string ss;
1141 //std::string ss2;
1142 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1143 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1144 return *this;
1145 }
1146
1147
clearRecords(UErrorCode & status)1148 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1149 if (U_FAILURE(status)) {
1150 return *this;
1151 }
1152 inputRecords_->removeAllElements();
1153 indexBuildRequired_ = TRUE;
1154 return *this;
1155 }
1156
1157
getBucketIndex(const UnicodeString & name,UErrorCode & status)1158 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1159 buildIndex(status);
1160 if (U_FAILURE(status)) {
1161 return 0;
1162 }
1163
1164 // For simplified Chinese prepend a prefix to the name.
1165 // For non-Chinese locales or non-Chinese names, the name is not modified.
1166
1167 UnicodeString prefixedName;
1168 hackName(prefixedName, name, collatorPrimaryOnly_);
1169
1170 // TODO: use a binary search.
1171 for (int32_t i = 0; i < bucketList_->size(); ++i) {
1172 Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
1173 Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
1174 if (comp < 0) {
1175 return i - 1;
1176 }
1177 }
1178 // Loop runs until we find the bucket following the one that would hold prefixedName.
1179 // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
1180 // than returning from the middle.
1181
1182 return bucketList_->size() - 1;
1183 }
1184
1185
getBucketIndex() const1186 int32_t AlphabeticIndex::getBucketIndex() const {
1187 return labelsIterIndex_;
1188 }
1189
1190
nextBucket(UErrorCode & status)1191 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1192 if (U_FAILURE(status)) {
1193 return FALSE;
1194 }
1195 if (indexBuildRequired_ && currentBucket_ != NULL) {
1196 status = U_ENUM_OUT_OF_SYNC_ERROR;
1197 return FALSE;
1198 }
1199 buildIndex(status);
1200 if (U_FAILURE(status)) {
1201 return FALSE;
1202 }
1203 ++labelsIterIndex_;
1204 if (labelsIterIndex_ >= bucketList_->size()) {
1205 labelsIterIndex_ = bucketList_->size();
1206 return FALSE;
1207 }
1208 currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
1209 resetRecordIterator();
1210 return TRUE;
1211 }
1212
getBucketLabel() const1213 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1214 if (currentBucket_ != NULL) {
1215 return currentBucket_->label_;
1216 } else {
1217 return *EMPTY_STRING;
1218 }
1219 }
1220
1221
getBucketLabelType() const1222 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1223 if (currentBucket_ != NULL) {
1224 return currentBucket_->labelType_;
1225 } else {
1226 return U_ALPHAINDEX_NORMAL;
1227 }
1228 }
1229
1230
getBucketRecordCount() const1231 int32_t AlphabeticIndex::getBucketRecordCount() const {
1232 if (currentBucket_ != NULL) {
1233 return currentBucket_->records_->size();
1234 } else {
1235 return 0;
1236 }
1237 }
1238
1239
resetBucketIterator(UErrorCode & status)1240 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1241 if (U_FAILURE(status)) {
1242 return *this;
1243 }
1244 buildIndex(status);
1245 labelsIterIndex_ = -1;
1246 currentBucket_ = NULL;
1247 return *this;
1248 }
1249
1250
nextRecord(UErrorCode & status)1251 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1252 if (U_FAILURE(status)) {
1253 return FALSE;
1254 }
1255 if (currentBucket_ == NULL) {
1256 // We are trying to iterate over the items in a bucket, but there is no
1257 // current bucket from the enumeration of buckets.
1258 status = U_INVALID_STATE_ERROR;
1259 return FALSE;
1260 }
1261 if (indexBuildRequired_) {
1262 status = U_ENUM_OUT_OF_SYNC_ERROR;
1263 return FALSE;
1264 }
1265 ++itemsIterIndex_;
1266 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1267 itemsIterIndex_ = currentBucket_->records_->size();
1268 return FALSE;
1269 }
1270 return TRUE;
1271 }
1272
1273
getRecordName() const1274 const UnicodeString &AlphabeticIndex::getRecordName() const {
1275 const UnicodeString *retStr = EMPTY_STRING;
1276 if (currentBucket_ != NULL &&
1277 itemsIterIndex_ >= 0 &&
1278 itemsIterIndex_ < currentBucket_->records_->size()) {
1279 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1280 retStr = &item->name_;
1281 }
1282 return *retStr;
1283 }
1284
getRecordData() const1285 const void *AlphabeticIndex::getRecordData() const {
1286 const void *retPtr = NULL;
1287 if (currentBucket_ != NULL &&
1288 itemsIterIndex_ >= 0 &&
1289 itemsIterIndex_ < currentBucket_->records_->size()) {
1290 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1291 retPtr = item->data_;
1292 }
1293 return retPtr;
1294 }
1295
1296
resetRecordIterator()1297 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1298 itemsIterIndex_ = -1;
1299 return *this;
1300 }
1301
1302
1303
Bucket(const UnicodeString & label,const UnicodeString & lowerBoundary,UAlphabeticIndexLabelType type,UErrorCode & status)1304 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1305 const UnicodeString &lowerBoundary,
1306 UAlphabeticIndexLabelType type,
1307 UErrorCode &status):
1308 label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
1309 if (U_FAILURE(status)) {
1310 return;
1311 }
1312 records_ = new UVector(status);
1313 if (records_ == NULL && U_SUCCESS(status)) {
1314 status = U_MEMORY_ALLOCATION_ERROR;
1315 }
1316 }
1317
1318
~Bucket()1319 AlphabeticIndex::Bucket::~Bucket() {
1320 delete records_;
1321 }
1322
1323 U_NAMESPACE_END
1324