1 /*
2 *******************************************************************************
3 * Copyright (C) 2009-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
11
12 #include "unicode/alphaindex.h"
13 #include "unicode/coleitr.h"
14 #include "unicode/coll.h"
15 #include "unicode/localpointer.h"
16 #include "unicode/normalizer2.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/ulocdata.h"
19 #include "unicode/uniset.h"
20 #include "unicode/uobject.h"
21 #include "unicode/usetiter.h"
22 #include "unicode/utf16.h"
23
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "uassert.h"
27 #include "uvector.h"
28
29 //#include <string>
30 //#include <iostream>
31
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33
34 U_NAMESPACE_BEGIN
35
36 namespace {
37
38 /**
39 * Prefix string for Chinese index buckets.
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41 */
42 const UChar BASE[1] = { 0xFDD0 };
43 const int32_t BASE_LENGTH = 1;
44
45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
46 const UnicodeString &one, const UnicodeString &other);
47
48 } // namespace
49
50 static int32_t U_CALLCONV
51 collatorComparator(const void *context, const void *left, const void *right);
52
53 static int32_t U_CALLCONV
54 recordCompareFn(const void *context, const void *left, const void *right);
55
56 // UVector<Record *> support function, delete a Record.
57 static void U_CALLCONV
alphaIndex_deleteRecord(void * obj)58 alphaIndex_deleteRecord(void *obj) {
59 delete static_cast<AlphabeticIndex::Record *>(obj);
60 }
61
62 namespace {
63
ownedString(const UnicodeString & s,LocalPointer<UnicodeString> & owned,UErrorCode & errorCode)64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
65 UErrorCode &errorCode) {
66 if (U_FAILURE(errorCode)) { return NULL; }
67 if (owned.isValid()) {
68 return owned.orphan();
69 }
70 UnicodeString *p = new UnicodeString(s);
71 if (p == NULL) {
72 errorCode = U_MEMORY_ALLOCATION_ERROR;
73 }
74 return p;
75 }
76
getString(const UVector & list,int32_t i)77 inline UnicodeString *getString(const UVector &list, int32_t i) {
78 return static_cast<UnicodeString *>(list[i]);
79 }
80
getBucket(const UVector & list,int32_t i)81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
82 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
83 }
84
getRecord(const UVector & list,int32_t i)85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
86 return static_cast<AlphabeticIndex::Record *>(list[i]);
87 }
88
89 /**
90 * Like Java Collections.binarySearch(List, String, Comparator).
91 *
92 * @return the index>=0 where the item was found,
93 * or the index<0 for inserting the string at ~index in sorted order
94 */
binarySearch(const UVector & list,const UnicodeString & s,const Collator & coll)95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
96 if (list.size() == 0) { return ~0; }
97 int32_t start = 0;
98 int32_t limit = list.size();
99 for (;;) {
100 int32_t i = (start + limit) / 2;
101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
102 UErrorCode errorCode = U_ZERO_ERROR;
103 UCollationResult cmp = coll.compare(s, *si, errorCode);
104 if (cmp == UCOL_EQUAL) {
105 return i;
106 } else if (cmp < 0) {
107 if (i == start) {
108 return ~start; // insert s before *si
109 }
110 limit = i;
111 } else {
112 if (i == start) {
113 return ~(start + 1); // insert s after *si
114 }
115 start = i;
116 }
117 }
118 }
119
120 } // namespace
121
122 // The BucketList is not in the anonymous namespace because only Clang
123 // seems to support its use in other classes from there.
124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
125 class BucketList : public UObject {
126 public:
BucketList(UVector * bucketList,UVector * publicBucketList)127 BucketList(UVector *bucketList, UVector *publicBucketList)
128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
129 int32_t displayIndex = 0;
130 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
132 }
133 }
134
135 // The virtual destructor must not be inline.
136 // See ticket #8454 for details.
137 virtual ~BucketList();
138
getBucketCount() const139 int32_t getBucketCount() const {
140 return immutableVisibleList_->size();
141 }
142
getBucketIndex(const UnicodeString & name,const Collator & collatorPrimaryOnly,UErrorCode & errorCode)143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
144 UErrorCode &errorCode) {
145 // binary search
146 int32_t start = 0;
147 int32_t limit = bucketList_->size();
148 while ((start + 1) < limit) {
149 int32_t i = (start + limit) / 2;
150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
151 UCollationResult nameVsBucket =
152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
153 if (nameVsBucket < 0) {
154 limit = i;
155 } else {
156 start = i;
157 }
158 }
159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
160 if (bucket->displayBucket_ != NULL) {
161 bucket = bucket->displayBucket_;
162 }
163 return bucket->displayIndex_;
164 }
165
166 /** All of the buckets, visible and invisible. */
167 UVector *bucketList_;
168 /** Just the visible buckets. */
169 UVector *immutableVisibleList_;
170 };
171
~BucketList()172 BucketList::~BucketList() {
173 delete bucketList_;
174 if (immutableVisibleList_ != bucketList_) {
175 delete immutableVisibleList_;
176 }
177 }
178
~ImmutableIndex()179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
180 delete buckets_;
181 delete collatorPrimaryOnly_;
182 }
183
184 int32_t
getBucketCount() const185 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
186 return buckets_->getBucketCount();
187 }
188
189 int32_t
getBucketIndex(const UnicodeString & name,UErrorCode & errorCode) const190 AlphabeticIndex::ImmutableIndex::getBucketIndex(
191 const UnicodeString &name, UErrorCode &errorCode) const {
192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
193 }
194
195 const AlphabeticIndex::Bucket *
getBucket(int32_t index) const196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
197 if (0 <= index && index < buckets_->getBucketCount()) {
198 return icu::getBucket(*buckets_->immutableVisibleList_, index);
199 } else {
200 return NULL;
201 }
202 }
203
AlphabeticIndex(const Locale & locale,UErrorCode & status)204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
205 : inputList_(NULL),
206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
207 maxLabelCount_(99),
208 initialLabels_(NULL), firstCharsInScripts_(NULL),
209 collator_(NULL), collatorPrimaryOnly_(NULL),
210 buckets_(NULL) {
211 init(&locale, status);
212 }
213
214
AlphabeticIndex(RuleBasedCollator * collator,UErrorCode & status)215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
216 : inputList_(NULL),
217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
218 maxLabelCount_(99),
219 initialLabels_(NULL), firstCharsInScripts_(NULL),
220 collator_(collator), collatorPrimaryOnly_(NULL),
221 buckets_(NULL) {
222 init(NULL, status);
223 }
224
225
226
~AlphabeticIndex()227 AlphabeticIndex::~AlphabeticIndex() {
228 delete collator_;
229 delete collatorPrimaryOnly_;
230 delete firstCharsInScripts_;
231 delete buckets_;
232 delete inputList_;
233 delete initialLabels_;
234 }
235
236
addLabels(const UnicodeSet & additions,UErrorCode & status)237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
238 if (U_FAILURE(status)) {
239 return *this;
240 }
241 initialLabels_->addAll(additions);
242 clearBuckets();
243 return *this;
244 }
245
246
addLabels(const Locale & locale,UErrorCode & status)247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
248 addIndexExemplars(&locale, status);
249 clearBuckets();
250 return *this;
251 }
252
253
buildImmutableIndex(UErrorCode & errorCode)254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
255 if (U_FAILURE(errorCode)) { return NULL; }
256 // In C++, the ImmutableIndex must own its copy of the BucketList,
257 // even if it contains no records, for proper memory management.
258 // We could clone the buckets_ if they are not NULL,
259 // but that would be worth it only if this method is called multiple times,
260 // or called after using the old-style bucket iterator API.
261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
262 LocalPointer<RuleBasedCollator> coll(
263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
264 if (immutableBucketList.isNull() || coll.isNull()) {
265 errorCode = U_MEMORY_ALLOCATION_ERROR;
266 return NULL;
267 }
268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269 if (immIndex == NULL) {
270 errorCode = U_MEMORY_ALLOCATION_ERROR;
271 return NULL;
272 }
273 // The ImmutableIndex adopted its parameter objects.
274 immutableBucketList.orphan();
275 coll.orphan();
276 return immIndex;
277 }
278
getBucketCount(UErrorCode & status)279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280 initBuckets(status);
281 if (U_FAILURE(status)) {
282 return 0;
283 }
284 return buckets_->getBucketCount();
285 }
286
287
getRecordCount(UErrorCode & status)288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289 if (U_FAILURE(status) || inputList_ == NULL) {
290 return 0;
291 }
292 return inputList_->size();
293 }
294
initLabels(UVector & indexCharacters,UErrorCode & errorCode) const295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297 if (U_FAILURE(errorCode)) { return; }
298
299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300 const UnicodeString &overflowBoundary =
301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
302
303 // We make a sorted array of elements.
304 // Some of the input may be redundant.
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306 // We filter out those cases.
307 UnicodeSetIterator iter(*initialLabels_);
308 while (iter.next()) {
309 const UnicodeString *item = &iter.getString();
310 LocalPointer<UnicodeString> ownedItem;
311 UBool checkDistinct;
312 int32_t itemLength = item->length();
313 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314 checkDistinct = FALSE;
315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
316 item->charAt(itemLength - 2) != 0x2a) {
317 // Use a label if it is marked with one trailing star,
318 // even if the label string sorts the same when all contractions are suppressed.
319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320 item = ownedItem.getAlias();
321 if (item == NULL) {
322 errorCode = U_MEMORY_ALLOCATION_ERROR;
323 return;
324 }
325 checkDistinct = FALSE;
326 } else {
327 checkDistinct = TRUE;
328 }
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332 // Ignore an index characters that will land in the overflow bucket.
333 } else if (checkDistinct &&
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters.
337 } else {
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339 if (insertionPoint < 0) {
340 indexCharacters.insertElementAt(
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342 } else {
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345 indexCharacters.setElementAt(
346 ownedString(*item, ownedItem, errorCode), insertionPoint);
347 }
348 }
349 }
350 }
351 if (U_FAILURE(errorCode)) { return; }
352
353 // if the result is still too large, cut down to maxCount elements, by removing every nth element
354
355 int32_t size = indexCharacters.size() - 1;
356 if (size > maxLabelCount_) {
357 int32_t count = 0;
358 int32_t old = -1;
359 for (int32_t i = 0; i < indexCharacters.size();) {
360 ++count;
361 int32_t bump = count * maxLabelCount_ / size;
362 if (bump == old) {
363 indexCharacters.removeElementAt(i);
364 } else {
365 old = bump;
366 ++i;
367 }
368 }
369 }
370 }
371
372 namespace {
373
fixLabel(const UnicodeString & current,UnicodeString & temp)374 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) {
375 if (!current.startsWith(BASE, BASE_LENGTH)) {
376 return current;
377 }
378 UChar rest = current.charAt(BASE_LENGTH);
379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380 int32_t count = rest-0x2800;
381 temp.setTo((UChar)(0x30 + count % 10));
382 if (count >= 10) {
383 count /= 10;
384 temp.insert(0, (UChar)(0x30 + count % 10));
385 if (count >= 10) {
386 count /= 10;
387 temp.insert(0, (UChar)(0x30 + count));
388 }
389 }
390 return temp.append((UChar)0x5283);
391 }
392 return temp.setTo(current, BASE_LENGTH);
393 }
394
hasMultiplePrimaryWeights(CollationElementIterator & cei,int32_t variableTop,const UnicodeString & s,UErrorCode & errorCode)395 UBool hasMultiplePrimaryWeights(
396 CollationElementIterator &cei, int32_t variableTop,
397 const UnicodeString &s, UErrorCode &errorCode) {
398 cei.setText(s, errorCode);
399 UBool seenPrimary = FALSE;
400 for (;;) {
401 int32_t ce32 = cei.next(errorCode);
402 if (ce32 == CollationElementIterator::NULLORDER) {
403 break;
404 }
405 int32_t p = CollationElementIterator::primaryOrder(ce32);
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407 // not primary ignorable, and not a continuation CE
408 if (seenPrimary) {
409 return TRUE;
410 }
411 seenPrimary = TRUE;
412 }
413 }
414 return FALSE;
415 }
416
417 } // namespace
418
createBucketList(UErrorCode & errorCode) const419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420 // Initialize indexCharacters.
421 UVector indexCharacters(errorCode);
422 indexCharacters.setDeleter(uprv_deleteUObject);
423 initLabels(indexCharacters, errorCode);
424 if (U_FAILURE(errorCode)) { return NULL; }
425
426 // Variables for hasMultiplePrimaryWeights().
427 LocalPointer<CollationElementIterator> cei(
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
429 if (cei.isNull()) {
430 errorCode = U_MEMORY_ALLOCATION_ERROR;
431 return NULL;
432 }
433 int32_t variableTop;
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
435 variableTop = CollationElementIterator::primaryOrder(
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
437 } else {
438 variableTop = 0;
439 }
440 UBool hasInvisibleBuckets = FALSE;
441
442 // Helper arrays for Chinese Pinyin collation.
443 Bucket *asciiBuckets[26] = {
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
446 };
447 Bucket *pinyinBuckets[26] = {
448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
450 };
451 UBool hasPinyin = FALSE;
452
453 LocalPointer<UVector> bucketList(new UVector(errorCode));
454 if (bucketList.isNull()) {
455 errorCode = U_MEMORY_ALLOCATION_ERROR;
456 return NULL;
457 }
458 bucketList->setDeleter(uprv_deleteUObject);
459
460 // underflow bucket
461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
462 if (bucket == NULL) {
463 errorCode = U_MEMORY_ALLOCATION_ERROR;
464 return NULL;
465 }
466 bucketList->addElement(bucket, errorCode);
467 if (U_FAILURE(errorCode)) { return NULL; }
468
469 UnicodeString temp;
470
471 // fix up the list, adding underflow, additions, overflow
472 // Insert inflow labels as needed.
473 int32_t scriptIndex = -1;
474 const UnicodeString *scriptUpperBoundary = &emptyString_;
475 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
476 UnicodeString ¤t = *getString(indexCharacters, i);
477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
478 // We crossed the script boundary into a new script.
479 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
480 UBool skippedScript = FALSE;
481 for (;;) {
482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
484 break;
485 }
486 skippedScript = TRUE;
487 }
488 if (skippedScript && bucketList->size() > 1) {
489 // We are skipping one or more scripts,
490 // and we are not just getting out of the underflow label.
491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
492 if (bucket == NULL) {
493 errorCode = U_MEMORY_ALLOCATION_ERROR;
494 return NULL;
495 }
496 bucketList->addElement(bucket, errorCode);
497 }
498 }
499 // Add a bucket with the current label.
500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
501 if (bucket == NULL) {
502 errorCode = U_MEMORY_ALLOCATION_ERROR;
503 return NULL;
504 }
505 bucketList->addElement(bucket, errorCode);
506 // Remember ASCII and Pinyin buckets for Pinyin redirects.
507 UChar c;
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
509 asciiBuckets[c - 0x41] = bucket;
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512 pinyinBuckets[c - 0x41] = bucket;
513 hasPinyin = TRUE;
514 }
515 // Check for multiple primary weights.
516 if (!current.startsWith(BASE, BASE_LENGTH) &&
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
519 // "AE-ligature" or "Sch" etc.
520 for (int32_t i = bucketList->size() - 2;; --i) {
521 Bucket *singleBucket = getBucket(*bucketList, i);
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523 // There is no single-character bucket since the last
524 // underflow or inflow label.
525 break;
526 }
527 if (singleBucket->displayBucket_ == NULL &&
528 !hasMultiplePrimaryWeights(
529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
530 // Add an invisible bucket that redirects strings greater than the expansion
531 // to the previous single-character bucket.
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534 bucket = new Bucket(emptyString_,
535 UnicodeString(current).append((UChar)0xFFFF),
536 U_ALPHAINDEX_NORMAL);
537 if (bucket == NULL) {
538 errorCode = U_MEMORY_ALLOCATION_ERROR;
539 return NULL;
540 }
541 bucket->displayBucket_ = singleBucket;
542 bucketList->addElement(bucket, errorCode);
543 hasInvisibleBuckets = TRUE;
544 break;
545 }
546 }
547 }
548 }
549 if (U_FAILURE(errorCode)) { return NULL; }
550 if (bucketList->size() == 1) {
551 // No real labels, show only the underflow label.
552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
553 if (bl == NULL) {
554 errorCode = U_MEMORY_ALLOCATION_ERROR;
555 return NULL;
556 }
557 bucketList.orphan();
558 return bl;
559 }
560 // overflow bucket
561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
562 if (bucket == NULL) {
563 errorCode = U_MEMORY_ALLOCATION_ERROR;
564 return NULL;
565 }
566 bucketList->addElement(bucket, errorCode); // final
567
568 if (hasPinyin) {
569 // Redirect Pinyin buckets.
570 Bucket *asciiBucket = NULL;
571 for (int32_t i = 0; i < 26; ++i) {
572 if (asciiBuckets[i] != NULL) {
573 asciiBucket = asciiBuckets[i];
574 }
575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
576 pinyinBuckets[i]->displayBucket_ = asciiBucket;
577 hasInvisibleBuckets = TRUE;
578 }
579 }
580 }
581
582 if (U_FAILURE(errorCode)) { return NULL; }
583 if (!hasInvisibleBuckets) {
584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
585 if (bl == NULL) {
586 errorCode = U_MEMORY_ALLOCATION_ERROR;
587 return NULL;
588 }
589 bucketList.orphan();
590 return bl;
591 }
592 // Merge inflow buckets that are visually adjacent.
593 // Iterate backwards: Merge inflow into overflow rather than the other way around.
594 int32_t i = bucketList->size() - 1;
595 Bucket *nextBucket = getBucket(*bucketList, i);
596 while (--i > 0) {
597 bucket = getBucket(*bucketList, i);
598 if (bucket->displayBucket_ != NULL) {
599 continue; // skip invisible buckets
600 }
601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
603 bucket->displayBucket_ = nextBucket;
604 continue;
605 }
606 }
607 nextBucket = bucket;
608 }
609
610 LocalPointer<UVector> publicBucketList(new UVector(errorCode));
611 if (bucketList.isNull()) {
612 errorCode = U_MEMORY_ALLOCATION_ERROR;
613 return NULL;
614 }
615 // Do not call publicBucketList->setDeleter():
616 // This vector shares its objects with the bucketList.
617 for (int32_t i = 0; i < bucketList->size(); ++i) {
618 bucket = getBucket(*bucketList, i);
619 if (bucket->displayBucket_ == NULL) {
620 publicBucketList->addElement(bucket, errorCode);
621 }
622 }
623 if (U_FAILURE(errorCode)) { return NULL; }
624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
625 if (bl == NULL) {
626 errorCode = U_MEMORY_ALLOCATION_ERROR;
627 return NULL;
628 }
629 bucketList.orphan();
630 publicBucketList.orphan();
631 return bl;
632 }
633
634 /**
635 * Creates an index, and buckets and sorts the list of records into the index.
636 */
initBuckets(UErrorCode & errorCode)637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
638 if (U_FAILURE(errorCode) || buckets_ != NULL) {
639 return;
640 }
641 buckets_ = createBucketList(errorCode);
642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
643 return;
644 }
645
646 // Sort the records by name.
647 // Stable sort preserves input order of collation duplicates.
648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
649
650 // Now, we traverse all of the input, which is now sorted.
651 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
652 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
654 // we need to improve it for that case.
655
656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
657 int32_t bucketIndex = 1;
658 Bucket *nextBucket;
659 const UnicodeString *upperBoundary;
660 if (bucketIndex < buckets_->bucketList_->size()) {
661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
662 upperBoundary = &nextBucket->lowerBoundary_;
663 } else {
664 nextBucket = NULL;
665 upperBoundary = NULL;
666 }
667 for (int32_t i = 0; i < inputList_->size(); ++i) {
668 Record *r = getRecord(*inputList_, i);
669 // if the current bucket isn't the right one, find the one that is
670 // We have a special flag for the last bucket so that we don't look any further
671 while (upperBoundary != NULL &&
672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
673 currentBucket = nextBucket;
674 // now reset the boundary that we compare against
675 if (bucketIndex < buckets_->bucketList_->size()) {
676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
677 upperBoundary = &nextBucket->lowerBoundary_;
678 } else {
679 upperBoundary = NULL;
680 }
681 }
682 // now put the record into the bucket.
683 Bucket *bucket = currentBucket;
684 if (bucket->displayBucket_ != NULL) {
685 bucket = bucket->displayBucket_;
686 }
687 if (bucket->records_ == NULL) {
688 bucket->records_ = new UVector(errorCode);
689 if (bucket->records_ == NULL) {
690 errorCode = U_MEMORY_ALLOCATION_ERROR;
691 return;
692 }
693 }
694 bucket->records_->addElement(r, errorCode);
695 }
696 }
697
clearBuckets()698 void AlphabeticIndex::clearBuckets() {
699 if (buckets_ != NULL) {
700 delete buckets_;
701 buckets_ = NULL;
702 internalResetBucketIterator();
703 }
704 }
705
internalResetBucketIterator()706 void AlphabeticIndex::internalResetBucketIterator() {
707 labelsIterIndex_ = -1;
708 currentBucket_ = NULL;
709 }
710
711
addIndexExemplars(const Locale * locale,UErrorCode & status)712 void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) {
713 if (U_FAILURE(status)) { return; }
714 // Chinese index characters, which are specific to each of the several Chinese tailorings,
715 // take precedence over the single locale data exemplar set per language.
716 const char *language = locale == NULL ? NULL : locale->getLanguage();
717 if (language == NULL ||
718 uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
719 uprv_strcmp(language, "ko") == 0) {
720 // TODO: This should be done regardless of the language, but it's expensive.
721 // We should add a Collator function (can be @internal)
722 // to enumerate just the contractions that start with a given code point or string.
723 if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
724 return;
725 }
726 }
727 if (locale == NULL) { return; }
728
729 LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status));
730 if (U_FAILURE(status)) {
731 return;
732 }
733
734 UnicodeSet exemplars;
735 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
736 if (U_SUCCESS(status)) {
737 initialLabels_->addAll(exemplars);
738 return;
739 }
740 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
741
742 // The locale data did not include explicit Index characters.
743 // Synthesize a set of them from the locale's standard exemplar characters.
744 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
745 if (U_FAILURE(status)) {
746 return;
747 }
748
749 // question: should we add auxiliary exemplars?
750 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
751 exemplars.add(0x61, 0x7A);
752 }
753 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
754 // cut down to small list
755 exemplars.remove(0xAC00, 0xD7A3).
756 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
757 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
758 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
759 add(0xD30C).add(0xD558);
760 }
761 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
762 // cut down to small list
763 // make use of the fact that Ethiopic is allocated in 8's, where
764 // the base is 0 mod 8.
765 UnicodeSet ethiopic(
766 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
767 UnicodeSetIterator it(ethiopic);
768 while (it.next() && !it.isString()) {
769 if ((it.getCodepoint() & 0x7) != 0) {
770 exemplars.remove(it.getCodepoint());
771 }
772 }
773 }
774
775 // Upper-case any that aren't already so.
776 // (We only do this for synthesized index characters.)
777 UnicodeSetIterator it(exemplars);
778 UnicodeString upperC;
779 while (it.next()) {
780 const UnicodeString &exemplarC = it.getString();
781 upperC = exemplarC;
782 upperC.toUpper(*locale);
783 initialLabels_->add(upperC);
784 }
785 }
786
addChineseIndexCharacters(UErrorCode & errorCode)787 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
788 UnicodeSet contractions;
789 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
790 contractions.toUSet(), NULL, FALSE, &errorCode);
791 if (U_FAILURE(errorCode)) { return FALSE; }
792 UnicodeString firstHanBoundary;
793 UBool hasPinyin = FALSE;
794 UnicodeSetIterator iter(contractions);
795 while (iter.next()) {
796 const UnicodeString &s = iter.getString();
797 if (s.startsWith(BASE, BASE_LENGTH)) {
798 initialLabels_->add(s);
799 if (firstHanBoundary.isEmpty() ||
800 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
801 firstHanBoundary = s;
802 }
803 UChar c = s.charAt(s.length() - 1);
804 if (0x41 <= c && c <= 0x5A) { // A-Z
805 hasPinyin = TRUE;
806 }
807 }
808 }
809 if (hasPinyin) {
810 initialLabels_->add(0x41, 0x5A); // A-Z
811 }
812 if (!firstHanBoundary.isEmpty()) {
813 // The hardcoded list of script boundaries includes U+4E00
814 // which is tailored to not be the first primary
815 // in all Chinese tailorings except "unihan".
816 // Replace U+4E00 with the first boundary string from the tailoring.
817 // TODO: This becomes obsolete when the root collator gets
818 // reliable script-first-primary mappings.
819 int32_t hanIndex = binarySearch(
820 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
821 if (hanIndex >= 0) {
822 UnicodeString *fh = new UnicodeString(firstHanBoundary);
823 if (fh == NULL) {
824 errorCode = U_MEMORY_ALLOCATION_ERROR;
825 return FALSE;
826 }
827 firstCharsInScripts_->setElementAt(fh, hanIndex);
828 }
829 return TRUE;
830 } else {
831 return FALSE;
832 }
833 }
834
835
836 /*
837 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
838 */
839 static const UChar CGJ = 0x034F;
separated(const UnicodeString & item)840 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
841 UnicodeString result;
842 if (item.length() == 0) {
843 return result;
844 }
845 int32_t i = 0;
846 for (;;) {
847 UChar32 cp = item.char32At(i);
848 result.append(cp);
849 i = item.moveIndex32(i, 1);
850 if (i >= item.length()) {
851 break;
852 }
853 result.append(CGJ);
854 }
855 return result;
856 }
857
858
operator ==(const AlphabeticIndex &) const859 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
860 return FALSE;
861 }
862
863
operator !=(const AlphabeticIndex &) const864 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
865 return FALSE;
866 }
867
868
getCollator() const869 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
870 // There are no known non-RuleBasedCollator collators, and none ever expected.
871 // But, in case that changes, better a null pointer than a wrong type.
872 return *dynamic_cast<RuleBasedCollator *>(collator_);
873 }
874
875
getInflowLabel() const876 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
877 return inflowLabel_;
878 }
879
getOverflowLabel() const880 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
881 return overflowLabel_;
882 }
883
884
getUnderflowLabel() const885 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
886 return underflowLabel_;
887 }
888
889
setInflowLabel(const UnicodeString & label,UErrorCode &)890 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
891 inflowLabel_ = label;
892 clearBuckets();
893 return *this;
894 }
895
896
setOverflowLabel(const UnicodeString & label,UErrorCode &)897 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
898 overflowLabel_ = label;
899 clearBuckets();
900 return *this;
901 }
902
903
setUnderflowLabel(const UnicodeString & label,UErrorCode &)904 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
905 underflowLabel_ = label;
906 clearBuckets();
907 return *this;
908 }
909
910
getMaxLabelCount() const911 int32_t AlphabeticIndex::getMaxLabelCount() const {
912 return maxLabelCount_;
913 }
914
915
setMaxLabelCount(int32_t maxLabelCount,UErrorCode & status)916 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
917 if (U_FAILURE(status)) {
918 return *this;
919 }
920 if (maxLabelCount <= 0) {
921 status = U_ILLEGAL_ARGUMENT_ERROR;
922 return *this;
923 }
924 maxLabelCount_ = maxLabelCount;
925 clearBuckets();
926 return *this;
927 }
928
929
930 //
931 // init() - Common code for constructors.
932 //
933
init(const Locale * locale,UErrorCode & status)934 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
935 if (U_FAILURE(status)) { return; }
936 if (locale == NULL && collator_ == NULL) {
937 status = U_ILLEGAL_ARGUMENT_ERROR;
938 return;
939 }
940
941 initialLabels_ = new UnicodeSet();
942 if (initialLabels_ == NULL) {
943 status = U_MEMORY_ALLOCATION_ERROR;
944 return;
945 }
946
947 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
948 overflowLabel_ = inflowLabel_;
949 underflowLabel_ = inflowLabel_;
950
951 if (collator_ == NULL) {
952 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
953 if (U_FAILURE(status)) { return; }
954 if (collator_ == NULL) {
955 status = U_MEMORY_ALLOCATION_ERROR;
956 return;
957 }
958 }
959 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
960 if (collatorPrimaryOnly_ == NULL) {
961 status = U_MEMORY_ALLOCATION_ERROR;
962 return;
963 }
964 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
965 firstCharsInScripts_ = firstStringsInScript(status);
966 if (U_FAILURE(status)) { return; }
967 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
968
969 // Add index exemplar characters before checking the script boundaries,
970 // since this might modify them.
971 addIndexExemplars(locale, status);
972
973 UnicodeString _4E00((UChar)0x4E00);
974 int32_t hanIndex = binarySearch(
975 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
976 if (hanIndex >= 0) {
977 // Adjust the Han script boundary if necessary.
978 // TODO: This becomes obsolete when the root collator gets
979 // reliable script-first-primary mappings.
980 UnicodeString _1100((UChar)0x1100);
981 UnicodeString _1112((UChar)0x1112);
982 UnicodeString _4E9C((UChar)0x4E9C);
983 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
984 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
985 // The standard Korean tailoring sorts Hanja (Han characters)
986 // as secondary differences from Hangul syllables.
987 // This makes U+4E00 not useful as a Han-script boundary.
988 firstCharsInScripts_->removeElementAt(hanIndex);
989 } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) {
990 // The standard Japanese tailoring sorts U+4E9C first among Kanji.
991 UnicodeString *fh = new UnicodeString(_4E9C);
992 if (fh == NULL) {
993 status = U_MEMORY_ALLOCATION_ERROR;
994 return;
995 }
996 firstCharsInScripts_->setElementAt(fh, hanIndex);
997 }
998 }
999
1000 // Guard against a degenerate collator where
1001 // some script boundary strings are primary ignorable.
1002 for (;;) {
1003 if (U_FAILURE(status)) { return; }
1004 if (firstCharsInScripts_->isEmpty()) {
1005 // AlphabeticIndex requires some non-ignorable script boundary strings.
1006 status = U_ILLEGAL_ARGUMENT_ERROR;
1007 return;
1008 }
1009 if (collatorPrimaryOnly_->compare(
1010 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
1011 emptyString_, status) == UCOL_EQUAL) {
1012 firstCharsInScripts_->removeElementAt(0);
1013 } else {
1014 break;
1015 }
1016 }
1017 }
1018
1019
1020 //
1021 // Comparison function for UVector<UnicodeString *> sorting with a collator.
1022 //
1023 static int32_t U_CALLCONV
collatorComparator(const void * context,const void * left,const void * right)1024 collatorComparator(const void *context, const void *left, const void *right) {
1025 const UElement *leftElement = static_cast<const UElement *>(left);
1026 const UElement *rightElement = static_cast<const UElement *>(right);
1027 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
1028 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
1029
1030 if (leftString == rightString) {
1031 // Catches case where both are NULL
1032 return 0;
1033 }
1034 if (leftString == NULL) {
1035 return 1;
1036 };
1037 if (rightString == NULL) {
1038 return -1;
1039 }
1040 const Collator *col = static_cast<const Collator *>(context);
1041 UErrorCode errorCode = U_ZERO_ERROR;
1042 return col->compare(*leftString, *rightString, errorCode);
1043 }
1044
1045 //
1046 // Comparison function for UVector<Record *> sorting with a collator.
1047 //
1048 static int32_t U_CALLCONV
recordCompareFn(const void * context,const void * left,const void * right)1049 recordCompareFn(const void *context, const void *left, const void *right) {
1050 const UElement *leftElement = static_cast<const UElement *>(left);
1051 const UElement *rightElement = static_cast<const UElement *>(right);
1052 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1053 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1054 const Collator *col = static_cast<const Collator *>(context);
1055 UErrorCode errorCode = U_ZERO_ERROR;
1056 return col->compare(leftRec->name_, rightRec->name_, errorCode);
1057 }
1058
1059
1060 /**
1061 * This list contains one character per script that has the
1062 * lowest primary weight for that script in the root collator.
1063 * This list will be copied and sorted to account for script reordering.
1064 *
1065 * <p>TODO: This is fragile. If the first character of a script is tailored
1066 * so that it does not map to the script's lowest primary weight any more,
1067 * then the buckets will be off.
1068 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1069 *
1070 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1071 *
1072 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1073 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1074 */
1075 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
1076 0x41, 0, 0x03B1, 0,
1077 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1078 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1079 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1080 0xAAF2, 0, // Meetei Mayek
1081 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1082 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
1083 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
1084 0x1B83, 0,
1085 0xD802, 0xDE00, 0, 0x0E01, 0,
1086 0x0EDE, 0, // Lao
1087 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1088 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1089 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
1090 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1091 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1092 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1093 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
1094 0xD800, 0xDE80, 0,
1095 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1096 0xD801, 0xDC80, 0,
1097 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
1098 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1099 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1100 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
1101 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
1102 0x4E00, 0,
1103 // TODO: The overflow bucket's lowerBoundary string should be the
1104 // first item after the last reordering group in the collator's script order.
1105 // This should normally be the first Unicode code point
1106 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1107 // However, at least up to ICU 51 the Hani reordering group includes
1108 // unassigned code points,
1109 // and there is no stable string for the start of the trailing-weights range.
1110 // The only known string that sorts "high" is U+FFFF.
1111 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1112 // and fix relevant test code.
1113 // Ideally, FractionalUCA.txt will have a "script first primary"
1114 // for unassigned code points.
1115 0xFFFF, 0
1116 };
1117
firstStringsInScript(UErrorCode & status)1118 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1119 if (U_FAILURE(status)) {
1120 return NULL;
1121 }
1122 UVector *dest = new UVector(status);
1123 if (dest == NULL) {
1124 status = U_MEMORY_ALLOCATION_ERROR;
1125 return NULL;
1126 }
1127 dest->setDeleter(uprv_deleteUObject);
1128 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
1129 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
1130 do {
1131 if (U_FAILURE(status)) {
1132 return dest;
1133 }
1134 UnicodeString *str = new UnicodeString(src, -1);
1135 if (str == NULL) {
1136 status = U_MEMORY_ALLOCATION_ERROR;
1137 return dest;
1138 }
1139 dest->addElement(str, status);
1140 src += str->length() + 1;
1141 } while (src < limit);
1142 return dest;
1143 }
1144
1145
1146 namespace {
1147
1148 /**
1149 * Returns true if one index character string is "better" than the other.
1150 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1151 * better, and otherwise binary-less-than is better.
1152 */
isOneLabelBetterThanOther(const Normalizer2 & nfkdNormalizer,const UnicodeString & one,const UnicodeString & other)1153 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1154 const UnicodeString &one, const UnicodeString &other) {
1155 // This is called with primary-equal strings, but never with one.equals(other).
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1158 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1159 if (U_FAILURE(status)) { return FALSE; }
1160 int32_t result = n1.countChar32() - n2.countChar32();
1161 if (result != 0) {
1162 return result < 0;
1163 }
1164 result = n1.compareCodePointOrder(n2);
1165 if (result != 0) {
1166 return result < 0;
1167 }
1168 return one.compareCodePointOrder(other) < 0;
1169 }
1170
1171 } // namespace
1172
1173 //
1174 // Constructor & Destructor for AlphabeticIndex::Record
1175 //
1176 // Records are internal only, instances are not directly surfaced in the public API.
1177 // This class is mostly struct-like, with all public fields.
1178
Record(const UnicodeString & name,const void * data)1179 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1180 : name_(name), data_(data) {}
1181
~Record()1182 AlphabeticIndex::Record::~Record() {
1183 }
1184
1185
addRecord(const UnicodeString & name,const void * data,UErrorCode & status)1186 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1187 if (U_FAILURE(status)) {
1188 return *this;
1189 }
1190 if (inputList_ == NULL) {
1191 inputList_ = new UVector(status);
1192 if (inputList_ == NULL) {
1193 status = U_MEMORY_ALLOCATION_ERROR;
1194 return *this;
1195 }
1196 inputList_->setDeleter(alphaIndex_deleteRecord);
1197 }
1198 Record *r = new Record(name, data);
1199 if (r == NULL) {
1200 status = U_MEMORY_ALLOCATION_ERROR;
1201 return *this;
1202 }
1203 inputList_->addElement(r, status);
1204 clearBuckets();
1205 //std::string ss;
1206 //std::string ss2;
1207 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1208 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1209 return *this;
1210 }
1211
1212
clearRecords(UErrorCode & status)1213 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1214 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1215 inputList_->removeAllElements();
1216 clearBuckets();
1217 }
1218 return *this;
1219 }
1220
getBucketIndex(const UnicodeString & name,UErrorCode & status)1221 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1222 initBuckets(status);
1223 if (U_FAILURE(status)) {
1224 return 0;
1225 }
1226 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1227 }
1228
1229
getBucketIndex() const1230 int32_t AlphabeticIndex::getBucketIndex() const {
1231 return labelsIterIndex_;
1232 }
1233
1234
nextBucket(UErrorCode & status)1235 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1236 if (U_FAILURE(status)) {
1237 return FALSE;
1238 }
1239 if (buckets_ == NULL && currentBucket_ != NULL) {
1240 status = U_ENUM_OUT_OF_SYNC_ERROR;
1241 return FALSE;
1242 }
1243 initBuckets(status);
1244 if (U_FAILURE(status)) {
1245 return FALSE;
1246 }
1247 ++labelsIterIndex_;
1248 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1249 labelsIterIndex_ = buckets_->getBucketCount();
1250 return FALSE;
1251 }
1252 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1253 resetRecordIterator();
1254 return TRUE;
1255 }
1256
getBucketLabel() const1257 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1258 if (currentBucket_ != NULL) {
1259 return currentBucket_->label_;
1260 } else {
1261 return emptyString_;
1262 }
1263 }
1264
1265
getBucketLabelType() const1266 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1267 if (currentBucket_ != NULL) {
1268 return currentBucket_->labelType_;
1269 } else {
1270 return U_ALPHAINDEX_NORMAL;
1271 }
1272 }
1273
1274
getBucketRecordCount() const1275 int32_t AlphabeticIndex::getBucketRecordCount() const {
1276 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1277 return currentBucket_->records_->size();
1278 } else {
1279 return 0;
1280 }
1281 }
1282
1283
resetBucketIterator(UErrorCode & status)1284 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1285 if (U_FAILURE(status)) {
1286 return *this;
1287 }
1288 internalResetBucketIterator();
1289 return *this;
1290 }
1291
1292
nextRecord(UErrorCode & status)1293 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1294 if (U_FAILURE(status)) {
1295 return FALSE;
1296 }
1297 if (currentBucket_ == NULL) {
1298 // We are trying to iterate over the items in a bucket, but there is no
1299 // current bucket from the enumeration of buckets.
1300 status = U_INVALID_STATE_ERROR;
1301 return FALSE;
1302 }
1303 if (buckets_ == NULL) {
1304 status = U_ENUM_OUT_OF_SYNC_ERROR;
1305 return FALSE;
1306 }
1307 if (currentBucket_->records_ == NULL) {
1308 return FALSE;
1309 }
1310 ++itemsIterIndex_;
1311 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1312 itemsIterIndex_ = currentBucket_->records_->size();
1313 return FALSE;
1314 }
1315 return TRUE;
1316 }
1317
1318
getRecordName() const1319 const UnicodeString &AlphabeticIndex::getRecordName() const {
1320 const UnicodeString *retStr = &emptyString_;
1321 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1322 itemsIterIndex_ >= 0 &&
1323 itemsIterIndex_ < currentBucket_->records_->size()) {
1324 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1325 retStr = &item->name_;
1326 }
1327 return *retStr;
1328 }
1329
getRecordData() const1330 const void *AlphabeticIndex::getRecordData() const {
1331 const void *retPtr = NULL;
1332 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1333 itemsIterIndex_ >= 0 &&
1334 itemsIterIndex_ < currentBucket_->records_->size()) {
1335 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1336 retPtr = item->data_;
1337 }
1338 return retPtr;
1339 }
1340
1341
resetRecordIterator()1342 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1343 itemsIterIndex_ = -1;
1344 return *this;
1345 }
1346
1347
1348
Bucket(const UnicodeString & label,const UnicodeString & lowerBoundary,UAlphabeticIndexLabelType type)1349 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1350 const UnicodeString &lowerBoundary,
1351 UAlphabeticIndexLabelType type)
1352 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1353 displayBucket_(NULL), displayIndex_(-1),
1354 records_(NULL) {
1355 }
1356
1357
~Bucket()1358 AlphabeticIndex::Bucket::~Bucket() {
1359 delete records_;
1360 }
1361
1362 U_NAMESPACE_END
1363
1364 #endif
1365