1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2009, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32
33 #include "unicode/colldata.h"
34
35 U_NAMESPACE_BEGIN
36
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
41
42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
43
44 #ifdef INSTRUMENT_CELIST
45 int32_t CEList::_active = 0;
46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
47 #endif
48
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
51 {
52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
53 UCollationStrength strength = ucol_getStrength(coll);
54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
55 uint32_t variableTop = ucol_getVariableTop(coll, &status);
56 uint32_t strengthMask = 0;
57 int32_t order;
58
59 if (U_FAILURE(status)) {
60 return;
61 }
62
63 // **** only set flag if string has Han(gul) ****
64 ucol_forceHanImplicit(elems, &status);
65
66 switch (strength)
67 {
68 default:
69 strengthMask |= UCOL_TERTIARYORDERMASK;
70 /* fall through */
71
72 case UCOL_SECONDARY:
73 strengthMask |= UCOL_SECONDARYORDERMASK;
74 /* fall through */
75
76 case UCOL_PRIMARY:
77 strengthMask |= UCOL_PRIMARYORDERMASK;
78 }
79
80 #ifdef INSTRUMENT_CELIST
81 _active += 1;
82 _histogram[0] += 1;
83 #endif
84
85 ces = ceBuffer;
86
87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
88 UBool cont = isContinuation(order);
89
90 order &= strengthMask;
91
92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
93 if (strength >= UCOL_QUATERNARY) {
94 order &= UCOL_PRIMARYORDERMASK;
95 } else {
96 order = UCOL_IGNORABLE;
97 }
98 }
99
100 if (order == UCOL_IGNORABLE) {
101 continue;
102 }
103
104 if (cont) {
105 order |= UCOL_CONTINUATION_MARKER;
106 }
107
108 add(order, status);
109 }
110
111 ucol_closeElements(elems);
112 }
113
~CEList()114 CEList::~CEList()
115 {
116 #ifdef INSTRUMENT_CELIST
117 _active -= 1;
118 #endif
119
120 if (ces != ceBuffer) {
121 DELETE_ARRAY(ces);
122 }
123 }
124
add(uint32_t ce,UErrorCode & status)125 void CEList::add(uint32_t ce, UErrorCode &status)
126 {
127 if (U_FAILURE(status)) {
128 return;
129 }
130
131 if (listSize >= listMax) {
132 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
133
134 #ifdef INSTRUMENT_CELIST
135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
136 #endif
137
138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
139
140 if (newCEs == NULL) {
141 status = U_MEMORY_ALLOCATION_ERROR;
142 return;
143 }
144
145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
146
147 if (ces != ceBuffer) {
148 DELETE_ARRAY(ces);
149 }
150
151 ces = newCEs;
152 listMax = newMax;
153 }
154
155 ces[listSize++] = ce;
156 }
157
get(int32_t index) const158 uint32_t CEList::get(int32_t index) const
159 {
160 if (index >= 0 && index < listSize) {
161 return ces[index];
162 }
163
164 return UCOL_NULLORDER;
165 }
166
operator [](int32_t index) const167 uint32_t &CEList::operator[](int32_t index) const
168 {
169 return ces[index];
170 }
171
matchesAt(int32_t offset,const CEList * other) const172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
173 {
174 if (other == NULL || listSize - offset < other->size()) {
175 return FALSE;
176 }
177
178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
179 if (ces[i] != (*other)[j]) {
180 return FALSE;
181 }
182 }
183
184 return TRUE;
185 }
186
size() const187 int32_t CEList::size() const
188 {
189 return listSize;
190 }
191
192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
193
194 #ifdef INSTRUMENT_STRING_LIST
195 int32_t StringList::_lists = 0;
196 int32_t StringList::_strings = 0;
197 int32_t StringList::_histogram[101] = {0};
198 #endif
199
StringList(UErrorCode & status)200 StringList::StringList(UErrorCode &status)
201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
202 {
203 if (U_FAILURE(status)) {
204 return;
205 }
206
207 strings = new UnicodeString [listMax];
208
209 if (strings == NULL) {
210 status = U_MEMORY_ALLOCATION_ERROR;
211 return;
212 }
213
214 #ifdef INSTRUMENT_STRING_LIST
215 _lists += 1;
216 _histogram[0] += 1;
217 #endif
218 }
219
~StringList()220 StringList::~StringList()
221 {
222 delete[] strings;
223 }
224
add(const UnicodeString * string,UErrorCode & status)225 void StringList::add(const UnicodeString *string, UErrorCode &status)
226 {
227 if (U_FAILURE(status)) {
228 return;
229 }
230
231 #ifdef INSTRUMENT_STRING_LIST
232 _strings += 1;
233 #endif
234
235 if (listSize >= listMax) {
236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
237
238 UnicodeString *newStrings = new UnicodeString[newMax];
239 if (newStrings == NULL) {
240 status = U_MEMORY_ALLOCATION_ERROR;
241 return;
242 }
243 for (int32_t i=0; i<listSize; ++i) {
244 newStrings[i] = strings[i];
245 }
246
247 #ifdef INSTRUMENT_STRING_LIST
248 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
249
250 if (_h > 100) {
251 _h = 100;
252 }
253
254 _histogram[_h] += 1;
255 #endif
256
257 delete[] strings;
258 strings = newStrings;
259 listMax = newMax;
260 }
261
262 // The ctor initialized all the strings in
263 // the array to empty strings, so this
264 // is the same as copying the source string.
265 strings[listSize++].append(*string);
266 }
267
add(const UChar * chars,int32_t count,UErrorCode & status)268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
269 {
270 const UnicodeString string(chars, count);
271
272 add(&string, status);
273 }
274
get(int32_t index) const275 const UnicodeString *StringList::get(int32_t index) const
276 {
277 if (index >= 0 && index < listSize) {
278 return &strings[index];
279 }
280
281 return NULL;
282 }
283
size() const284 int32_t StringList::size() const
285 {
286 return listSize;
287 }
288
289
290 U_CFUNC void deleteStringList(void *obj);
291
292 class CEToStringsMap : public UMemory
293 {
294 public:
295
296 CEToStringsMap(UErrorCode &status);
297 ~CEToStringsMap();
298
299 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
300 StringList *getStringList(uint32_t ce) const;
301
302 private:
303
304 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
305 UHashtable *map;
306 };
307
CEToStringsMap(UErrorCode & status)308 CEToStringsMap::CEToStringsMap(UErrorCode &status)
309 : map(NULL)
310 {
311 if (U_FAILURE(status)) {
312 return;
313 }
314
315 map = uhash_open(uhash_hashLong, uhash_compareLong,
316 uhash_compareCaselessUnicodeString,
317 &status);
318
319 if (U_FAILURE(status)) {
320 return;
321 }
322
323 uhash_setValueDeleter(map, deleteStringList);
324 }
325
~CEToStringsMap()326 CEToStringsMap::~CEToStringsMap()
327 {
328 uhash_close(map);
329 }
330
put(uint32_t ce,UnicodeString * string,UErrorCode & status)331 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
332 {
333 StringList *strings = getStringList(ce);
334
335 if (strings == NULL) {
336 strings = new StringList(status);
337
338 if (strings == NULL || U_FAILURE(status)) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return;
341 }
342
343 putStringList(ce, strings, status);
344 }
345
346 strings->add(string, status);
347 }
348
getStringList(uint32_t ce) const349 StringList *CEToStringsMap::getStringList(uint32_t ce) const
350 {
351 return (StringList *) uhash_iget(map, ce);
352 }
353
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)354 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
355 {
356 uhash_iput(map, ce, (void *) stringList, &status);
357 }
358
deleteStringList(void * obj)359 U_CFUNC void deleteStringList(void *obj)
360 {
361 StringList *strings = (StringList *) obj;
362
363 delete strings;
364 }
365
366 U_CFUNC void deleteCEList(void *obj);
367 U_CFUNC void deleteUnicodeStringKey(void *obj);
368
369 class StringToCEsMap : public UMemory
370 {
371 public:
372 StringToCEsMap(UErrorCode &status);
373 ~StringToCEsMap();
374
375 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
376 const CEList *get(const UnicodeString *string);
377 void free(const CEList *list);
378
379 private:
380
381
382 UHashtable *map;
383 };
384
StringToCEsMap(UErrorCode & status)385 StringToCEsMap::StringToCEsMap(UErrorCode &status)
386 : map(NULL)
387 {
388 if (U_FAILURE(status)) {
389 return;
390 }
391
392 map = uhash_open(uhash_hashUnicodeString,
393 uhash_compareUnicodeString,
394 uhash_compareLong,
395 &status);
396
397 if (U_FAILURE(status)) {
398 return;
399 }
400
401 uhash_setValueDeleter(map, deleteCEList);
402 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
403 }
404
~StringToCEsMap()405 StringToCEsMap::~StringToCEsMap()
406 {
407 uhash_close(map);
408 }
409
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)410 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
411 {
412 uhash_put(map, (void *) string, (void *) ces, &status);
413 }
414
get(const UnicodeString * string)415 const CEList *StringToCEsMap::get(const UnicodeString *string)
416 {
417 return (const CEList *) uhash_get(map, string);
418 }
419
deleteCEList(void * obj)420 U_CFUNC void deleteCEList(void *obj)
421 {
422 CEList *list = (CEList *) obj;
423
424 delete list;
425 }
426
deleteUnicodeStringKey(void * obj)427 U_CFUNC void deleteUnicodeStringKey(void *obj)
428 {
429 UnicodeString *key = (UnicodeString *) obj;
430
431 delete key;
432 }
433
434 class CollDataCacheEntry : public UMemory
435 {
436 public:
437 CollDataCacheEntry(CollData *theData);
438 ~CollDataCacheEntry();
439
440 CollData *data;
441 int32_t refCount;
442 };
443
CollDataCacheEntry(CollData * theData)444 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
445 : data(theData), refCount(1)
446 {
447 // nothing else to do
448 }
449
~CollDataCacheEntry()450 CollDataCacheEntry::~CollDataCacheEntry()
451 {
452 // check refCount?
453 delete data;
454 }
455
456 class CollDataCache : public UMemory
457 {
458 public:
459 CollDataCache(UErrorCode &status);
460 ~CollDataCache();
461
462 CollData *get(UCollator *collator, UErrorCode &status);
463 void unref(CollData *collData);
464
465 void flush();
466
467 private:
468 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
469 static void deleteKey(char *key);
470
471 UMTX lock;
472 UHashtable *cache;
473 };
474
deleteChars(void *)475 U_CFUNC void deleteChars(void * /*obj*/)
476 {
477 // char *chars = (char *) obj;
478 // All the key strings are owned by the
479 // CollData objects and don't need to
480 // be freed here.
481 //DELETE_ARRAY(chars);
482 }
483
deleteCollDataCacheEntry(void * obj)484 U_CFUNC void deleteCollDataCacheEntry(void *obj)
485 {
486 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
487
488 delete entry;
489 }
490
CollDataCache(UErrorCode & status)491 CollDataCache::CollDataCache(UErrorCode &status)
492 : lock(0), cache(NULL)
493 {
494 if (U_FAILURE(status)) {
495 return;
496 }
497
498 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
499
500 if (U_FAILURE(status)) {
501 return;
502 }
503
504 uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
505 uhash_setKeyDeleter(cache, deleteChars);
506 }
507
~CollDataCache()508 CollDataCache::~CollDataCache()
509 {
510 umtx_lock(&lock);
511 uhash_close(cache);
512 cache = NULL;
513 umtx_unlock(&lock);
514
515 umtx_destroy(&lock);
516 }
517
get(UCollator * collator,UErrorCode & status)518 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
519 {
520 char keyBuffer[KEY_BUFFER_SIZE];
521 int32_t keyLength = KEY_BUFFER_SIZE;
522 char *key = getKey(collator, keyBuffer, &keyLength);
523 CollData *result = NULL, *newData = NULL;
524 CollDataCacheEntry *entry = NULL, *newEntry = NULL;
525
526 umtx_lock(&lock);
527 entry = (CollDataCacheEntry *) uhash_get(cache, key);
528
529 if (entry == NULL) {
530 umtx_unlock(&lock);
531
532 newData = new CollData(collator, key, keyLength, status);
533 newEntry = new CollDataCacheEntry(newData);
534
535 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
536 status = U_MEMORY_ALLOCATION_ERROR;
537 return NULL;
538 }
539
540 umtx_lock(&lock);
541 entry = (CollDataCacheEntry *) uhash_get(cache, key);
542
543 if (entry == NULL) {
544 uhash_put(cache, newData->key, newEntry, &status);
545 umtx_unlock(&lock);
546
547 if (U_FAILURE(status)) {
548 delete newEntry;
549 delete newData;
550
551 return NULL;
552 }
553
554 return newData;
555 }
556 }
557
558 result = entry->data;
559 entry->refCount += 1;
560 umtx_unlock(&lock);
561
562 if (key != keyBuffer) {
563 deleteKey(key);
564 }
565
566 if (newEntry != NULL) {
567 delete newEntry;
568 delete newData;
569 }
570
571 return result;
572 }
573
unref(CollData * collData)574 void CollDataCache::unref(CollData *collData)
575 {
576 CollDataCacheEntry *entry = NULL;
577
578 umtx_lock(&lock);
579 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
580
581 if (entry != NULL) {
582 entry->refCount -= 1;
583 }
584 umtx_unlock(&lock);
585 }
586
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)587 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
588 {
589 UErrorCode status = U_ZERO_ERROR;
590 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
591
592 if (len >= *keyBufferLength) {
593 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
594 keyBuffer = NEW_ARRAY(char, *keyBufferLength);
595 status = U_ZERO_ERROR;
596
597 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
598 }
599
600 keyBuffer[len] = '\0';
601
602 return keyBuffer;
603 }
604
flush()605 void CollDataCache::flush()
606 {
607 const UHashElement *element;
608 int32_t pos = -1;
609
610 umtx_lock(&lock);
611 while ((element = uhash_nextElement(cache, &pos)) != NULL) {
612 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
613
614 if (entry->refCount <= 0) {
615 uhash_removeElement(cache, element);
616 }
617 }
618 umtx_unlock(&lock);
619 }
620
deleteKey(char * key)621 void CollDataCache::deleteKey(char *key)
622 {
623 DELETE_ARRAY(key);
624 }
625
626 U_CDECL_BEGIN
coll_data_cleanup(void)627 static UBool coll_data_cleanup(void) {
628 CollData::freeCollDataCache();
629 return TRUE;
630 }
631 U_CDECL_END
632
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)633 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
634
635 CollData::CollData()
636 {
637 // nothing
638 }
639
640 #define CLONE_COLLATOR
641
642 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)643 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
644 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
645 {
646 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
647 // i.e. other, control, private use, format, surrogate
648 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
649 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
650 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
651
652 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
653 // i.e. all the characers we handle implicitly
654 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
655 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
656 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
657
658 if (U_FAILURE(status)) {
659 return;
660 }
661
662 USet *expansions = uset_openEmpty();
663 USet *contractions = uset_openEmpty();
664 int32_t itemCount;
665
666 #ifdef CACHE_CELISTS
667 charsToCEList = new StringToCEsMap(status);
668
669 if (U_FAILURE(status)) {
670 goto bail;
671 }
672 #else
673 charsToCEList = NULL;
674 #endif
675
676 ceToCharsStartingWith = new CEToStringsMap(status);
677
678 if (U_FAILURE(status)) {
679 goto bail;
680 }
681
682 if (cacheKeyLength > KEY_BUFFER_SIZE) {
683 key = NEW_ARRAY(char, cacheKeyLength);
684
685 if (key == NULL) {
686 status = U_MEMORY_ALLOCATION_ERROR;
687 goto bail;
688 }
689 } else {
690 key = keyBuffer;
691 }
692
693 ARRAY_COPY(key, cacheKey, cacheKeyLength);
694
695 #ifdef CLONE_COLLATOR
696 coll = ucol_safeClone(collator, NULL, NULL, &status);
697
698 if (U_FAILURE(status)) {
699 goto bail;
700 }
701 #else
702 coll = collator;
703 #endif
704
705 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
706
707 uset_addAll(charsToTest, contractions);
708 uset_addAll(charsToTest, expansions);
709 uset_removeAll(charsToTest, charsToRemove);
710
711 itemCount = uset_getItemCount(charsToTest);
712 for(int32_t item = 0; item < itemCount; item += 1) {
713 UChar32 start = 0, end = 0;
714 UChar buffer[16];
715 int32_t len = uset_getItem(charsToTest, item, &start, &end,
716 buffer, 16, &status);
717
718 if (len == 0) {
719 for (UChar32 ch = start; ch <= end; ch += 1) {
720 UnicodeString *st = new UnicodeString(ch);
721
722 if (st == NULL) {
723 status = U_MEMORY_ALLOCATION_ERROR;
724 break;
725 }
726
727 CEList *ceList = new CEList(coll, *st, status);
728
729 ceToCharsStartingWith->put(ceList->get(0), st, status);
730
731 #ifdef CACHE_CELISTS
732 charsToCEList->put(st, ceList, status);
733 #else
734 delete ceList;
735 delete st;
736 #endif
737 }
738 } else if (len > 0) {
739 UnicodeString *st = new UnicodeString(buffer, len);
740
741 if (st == NULL) {
742 status = U_MEMORY_ALLOCATION_ERROR;
743 break;
744 }
745
746 CEList *ceList = new CEList(coll, *st, status);
747
748 ceToCharsStartingWith->put(ceList->get(0), st, status);
749
750 #ifdef CACHE_CELISTS
751 charsToCEList->put(st, ceList, status);
752 #else
753 delete ceList;
754 delete st;
755 #endif
756 } else {
757 // shouldn't happen...
758 }
759
760 if (U_FAILURE(status)) {
761 break;
762 }
763 }
764
765 bail:
766 uset_close(contractions);
767 uset_close(expansions);
768 uset_close(charsToRemove);
769 uset_close(charsToTest);
770
771 if (U_FAILURE(status)) {
772 return;
773 }
774
775 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
776 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
777 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
778 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
779 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
780 CEList hanList(coll, hanString, status);
781 CEList jamoList(coll, jamoString, status);
782 int32_t j = 0;
783
784 if (U_FAILURE(status)) {
785 return;
786 }
787
788 for (int32_t c = 0; c < jamoList.size(); c += 1) {
789 uint32_t jce = jamoList[c];
790
791 if (! isContinuation(jce)) {
792 jamoLimits[j++] = jce;
793 }
794 }
795
796 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
797
798 minHan = 0xFFFFFFFF;
799 maxHan = 0;
800
801 for(int32_t h = 0; h < hanList.size(); h += 2) {
802 uint32_t han = (uint32_t) hanList[h];
803
804 if (han < minHan) {
805 minHan = han;
806 }
807
808 if (han > maxHan) {
809 maxHan = han;
810 }
811 }
812
813 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
814 }
815
~CollData()816 CollData::~CollData()
817 {
818 #ifdef CLONE_COLLATOR
819 ucol_close(coll);
820 #endif
821
822 if (key != keyBuffer) {
823 DELETE_ARRAY(key);
824 }
825
826 delete ceToCharsStartingWith;
827
828 #ifdef CACHE_CELISTS
829 delete charsToCEList;
830 #endif
831 }
832
getCollator() const833 UCollator *CollData::getCollator() const
834 {
835 return coll;
836 }
837
getStringList(int32_t ce) const838 const StringList *CollData::getStringList(int32_t ce) const
839 {
840 return ceToCharsStartingWith->getStringList(ce);
841 }
842
getCEList(const UnicodeString * string) const843 const CEList *CollData::getCEList(const UnicodeString *string) const
844 {
845 #ifdef CACHE_CELISTS
846 return charsToCEList->get(string);
847 #else
848 UErrorCode status = U_ZERO_ERROR;
849 const CEList *list = new CEList(coll, *string, status);
850
851 if (U_FAILURE(status)) {
852 delete list;
853 list = NULL;
854 }
855
856 return list;
857 #endif
858 }
859
freeCEList(const CEList * list)860 void CollData::freeCEList(const CEList *list)
861 {
862 #ifndef CACHE_CELISTS
863 delete list;
864 #endif
865 }
866
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const867 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
868 {
869 // find out shortest string for the longest sequence of ces.
870 // this can probably be folded with the minLengthCache...
871
872 if (history[offset] >= 0) {
873 return history[offset];
874 }
875
876 uint32_t ce = ceList->get(offset);
877 int32_t maxOffset = ceList->size();
878 int32_t shortestLength = INT32_MAX;
879 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
880
881 if (strings != NULL) {
882 int32_t stringCount = strings->size();
883
884 for (int32_t s = 0; s < stringCount; s += 1) {
885 const UnicodeString *string = strings->get(s);
886 #ifdef CACHE_CELISTS
887 const CEList *ceList2 = charsToCEList->get(string);
888 #else
889 UErrorCode status = U_ZERO_ERROR;
890 const CEList *ceList2 = new CEList(coll, *string, status);
891
892 if (U_FAILURE(status)) {
893 delete ceList2;
894 ceList2 = NULL;
895 }
896 #endif
897
898 if (ceList->matchesAt(offset, ceList2)) {
899 int32_t clength = ceList2->size();
900 int32_t slength = string->length();
901 int32_t roffset = offset + clength;
902 int32_t rlength = 0;
903
904 if (roffset < maxOffset) {
905 rlength = minLengthInChars(ceList, roffset, history);
906
907 if (rlength <= 0) {
908 // delete before continue to avoid memory leak.
909 #ifndef CACHE_CELISTS
910 delete ceList2;
911 #endif
912 // ignore any dead ends
913 continue;
914 }
915 }
916
917 if (shortestLength > slength + rlength) {
918 shortestLength = slength + rlength;
919 }
920 }
921
922 #ifndef CACHE_CELISTS
923 delete ceList2;
924 #endif
925 }
926 }
927
928 if (shortestLength == INT32_MAX) {
929 // No matching strings at this offset. See if
930 // the CE is in a range we can handle manually.
931 if (ce >= minHan && ce < maxHan) {
932 // all han have implicit orders which
933 // generate two CEs.
934 int32_t roffset = offset + 2;
935 int32_t rlength = 0;
936
937 //history[roffset++] = -1;
938 //history[roffset++] = 1;
939
940 if (roffset < maxOffset) {
941 rlength = minLengthInChars(ceList, roffset, history);
942 }
943
944 if (rlength < 0) {
945 return -1;
946 }
947
948 shortestLength = 1 + rlength;
949 goto have_shortest;
950 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
951 int32_t roffset = offset;
952 int32_t rlength = 0;
953
954 // **** this loop may not handle archaic Hangul correctly ****
955 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
956 uint32_t jce = ceList->get(roffset);
957
958 // Some Jamo have 24-bit primary order; skip the
959 // 2nd CE. This should always be OK because if
960 // we're still in the loop all we've seen are
961 // a series of Jamo in LVT order.
962 if (isContinuation(jce)) {
963 continue;
964 }
965
966 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
967 break;
968 }
969 }
970
971 if (roffset == offset) {
972 // we started with a non-L Jamo...
973 // just say it comes from a single character
974 roffset += 1;
975
976 // See if the single Jamo has a 24-bit order.
977 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
978 roffset += 1;
979 }
980 }
981
982 if (roffset < maxOffset) {
983 rlength = minLengthInChars(ceList, roffset, history);
984 }
985
986 if (rlength < 0) {
987 return -1;
988 }
989
990 shortestLength = 1 + rlength;
991 goto have_shortest;
992 }
993
994 // Can't handle it manually either. Just move on.
995 return -1;
996 }
997
998 have_shortest:
999 history[offset] = shortestLength;
1000
1001 return shortestLength;
1002 }
1003
minLengthInChars(const CEList * ceList,int32_t offset) const1004 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1005 {
1006 int32_t clength = ceList->size();
1007 int32_t *history = NEW_ARRAY(int32_t, clength);
1008
1009 for (int32_t i = 0; i < clength; i += 1) {
1010 history[i] = -1;
1011 }
1012
1013 int32_t minLength = minLengthInChars(ceList, offset, history);
1014
1015 DELETE_ARRAY(history);
1016
1017 return minLength;
1018 }
1019
open(UCollator * collator,UErrorCode & status)1020 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1021 {
1022 if (U_FAILURE(status)) {
1023 return NULL;
1024 }
1025
1026 CollDataCache *cache = getCollDataCache();
1027
1028 return cache->get(collator, status);
1029 }
1030
close(CollData * collData)1031 void CollData::close(CollData *collData)
1032 {
1033 CollDataCache *cache = getCollDataCache();
1034
1035 cache->unref(collData);
1036 }
1037
1038 CollDataCache *CollData::collDataCache = NULL;
1039
getCollDataCache()1040 CollDataCache *CollData::getCollDataCache()
1041 {
1042 UErrorCode status = U_ZERO_ERROR;
1043 CollDataCache *cache = NULL;
1044
1045 UMTX_CHECK(NULL, collDataCache, cache);
1046
1047 if (cache == NULL) {
1048 cache = new CollDataCache(status);
1049
1050 if (U_FAILURE(status)) {
1051 delete cache;
1052 return NULL;
1053 }
1054
1055 umtx_lock(NULL);
1056 if (collDataCache == NULL) {
1057 collDataCache = cache;
1058
1059 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1060 }
1061 umtx_unlock(NULL);
1062
1063 if (collDataCache != cache) {
1064 delete cache;
1065 }
1066 }
1067
1068 return collDataCache;
1069 }
1070
freeCollDataCache()1071 void CollData::freeCollDataCache()
1072 {
1073 CollDataCache *cache = NULL;
1074
1075 UMTX_CHECK(NULL, collDataCache, cache);
1076
1077 if (cache != NULL) {
1078 umtx_lock(NULL);
1079 if (collDataCache != NULL) {
1080 collDataCache = NULL;
1081 } else {
1082 cache = NULL;
1083 }
1084 umtx_unlock(NULL);
1085
1086 delete cache;
1087 }
1088 }
1089
flushCollDataCache()1090 void CollData::flushCollDataCache()
1091 {
1092 CollDataCache *cache = NULL;
1093
1094 UMTX_CHECK(NULL, collDataCache, cache);
1095
1096 // **** this will fail if the another ****
1097 // **** thread deletes the cache here ****
1098 if (cache != NULL) {
1099 cache->flush();
1100 }
1101 }
1102
1103 U_NAMESPACE_END
1104
1105 #endif // #if !UCONFIG_NO_COLLATION
1106