1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32
33 #include "unicode/colldata.h"
34
35 U_NAMESPACE_BEGIN
36
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
41
42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
43
44 #ifdef INSTRUMENT_CELIST
45 int32_t CEList::_active = 0;
46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
47 #endif
48
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
51 {
52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
53 UCollationStrength strength = ucol_getStrength(coll);
54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
55 uint32_t variableTop = ucol_getVariableTop(coll, &status);
56 uint32_t strengthMask = 0;
57 int32_t order;
58
59 if (U_FAILURE(status)) {
60 return;
61 }
62
63 // **** only set flag if string has Han(gul) ****
64 ucol_forceHanImplicit(elems, &status);
65
66 switch (strength)
67 {
68 default:
69 strengthMask |= UCOL_TERTIARYORDERMASK;
70 /* fall through */
71
72 case UCOL_SECONDARY:
73 strengthMask |= UCOL_SECONDARYORDERMASK;
74 /* fall through */
75
76 case UCOL_PRIMARY:
77 strengthMask |= UCOL_PRIMARYORDERMASK;
78 }
79
80 #ifdef INSTRUMENT_CELIST
81 _active += 1;
82 _histogram[0] += 1;
83 #endif
84
85 ces = ceBuffer;
86
87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
88 UBool cont = isContinuation(order);
89
90 order &= strengthMask;
91
92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
93 if (strength >= UCOL_QUATERNARY) {
94 order &= UCOL_PRIMARYORDERMASK;
95 } else {
96 order = UCOL_IGNORABLE;
97 }
98 }
99
100 if (order == UCOL_IGNORABLE) {
101 continue;
102 }
103
104 if (cont) {
105 order |= UCOL_CONTINUATION_MARKER;
106 }
107
108 add(order, status);
109 }
110
111 ucol_closeElements(elems);
112 }
113
~CEList()114 CEList::~CEList()
115 {
116 #ifdef INSTRUMENT_CELIST
117 _active -= 1;
118 #endif
119
120 if (ces != ceBuffer) {
121 DELETE_ARRAY(ces);
122 }
123 }
124
add(uint32_t ce,UErrorCode & status)125 void CEList::add(uint32_t ce, UErrorCode &status)
126 {
127 if (U_FAILURE(status)) {
128 return;
129 }
130
131 if (listSize >= listMax) {
132 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
133
134 #ifdef INSTRUMENT_CELIST
135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
136 #endif
137
138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
139
140 if (newCEs == NULL) {
141 status = U_MEMORY_ALLOCATION_ERROR;
142 return;
143 }
144
145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
146
147 if (ces != ceBuffer) {
148 DELETE_ARRAY(ces);
149 }
150
151 ces = newCEs;
152 listMax = newMax;
153 }
154
155 ces[listSize++] = ce;
156 }
157
get(int32_t index) const158 uint32_t CEList::get(int32_t index) const
159 {
160 if (index >= 0 && index < listSize) {
161 return ces[index];
162 }
163
164 return UCOL_NULLORDER;
165 }
166
operator [](int32_t index) const167 uint32_t &CEList::operator[](int32_t index) const
168 {
169 return ces[index];
170 }
171
matchesAt(int32_t offset,const CEList * other) const172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
173 {
174 if (other == NULL || listSize - offset < other->size()) {
175 return FALSE;
176 }
177
178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
179 if (ces[i] != (*other)[j]) {
180 return FALSE;
181 }
182 }
183
184 return TRUE;
185 }
186
size() const187 int32_t CEList::size() const
188 {
189 return listSize;
190 }
191
192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
193
194 #ifdef INSTRUMENT_STRING_LIST
195 int32_t StringList::_lists = 0;
196 int32_t StringList::_strings = 0;
197 int32_t StringList::_histogram[101] = {0};
198 #endif
199
StringList(UErrorCode & status)200 StringList::StringList(UErrorCode &status)
201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
202 {
203 if (U_FAILURE(status)) {
204 return;
205 }
206
207 strings = new UnicodeString [listMax];
208
209 if (strings == NULL) {
210 status = U_MEMORY_ALLOCATION_ERROR;
211 return;
212 }
213
214 #ifdef INSTRUMENT_STRING_LIST
215 _lists += 1;
216 _histogram[0] += 1;
217 #endif
218 }
219
~StringList()220 StringList::~StringList()
221 {
222 delete[] strings;
223 }
224
add(const UnicodeString * string,UErrorCode & status)225 void StringList::add(const UnicodeString *string, UErrorCode &status)
226 {
227 if (U_FAILURE(status)) {
228 return;
229 }
230
231 #ifdef INSTRUMENT_STRING_LIST
232 _strings += 1;
233 #endif
234
235 if (listSize >= listMax) {
236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
237 UnicodeString *newStrings = new UnicodeString[newMax];
238 if (newStrings == NULL) {
239 status = U_MEMORY_ALLOCATION_ERROR;
240 return;
241 }
242 for (int32_t i=0; i<listSize; ++i) {
243 newStrings[i] = strings[i];
244 }
245
246 #ifdef INSTRUMENT_STRING_LIST
247 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
248
249 if (_h > 100) {
250 _h = 100;
251 }
252
253 _histogram[_h] += 1;
254 #endif
255
256 delete[] strings;
257 strings = newStrings;
258 listMax = newMax;
259 }
260
261 // The ctor initialized all the strings in
262 // the array to empty strings, so this
263 // is the same as copying the source string.
264 strings[listSize++].append(*string);
265 }
266
add(const UChar * chars,int32_t count,UErrorCode & status)267 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
268 {
269 const UnicodeString string(chars, count);
270
271 add(&string, status);
272 }
273
get(int32_t index) const274 const UnicodeString *StringList::get(int32_t index) const
275 {
276 if (index >= 0 && index < listSize) {
277 return &strings[index];
278 }
279
280 return NULL;
281 }
282
size() const283 int32_t StringList::size() const
284 {
285 return listSize;
286 }
287
288
289 U_CFUNC void deleteStringList(void *obj);
290
291 class CEToStringsMap : public UMemory
292 {
293 public:
294
295 CEToStringsMap(UErrorCode &status);
296 ~CEToStringsMap();
297
298 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
299 StringList *getStringList(uint32_t ce) const;
300
301 private:
302
303 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
304 UHashtable *map;
305 };
306
CEToStringsMap(UErrorCode & status)307 CEToStringsMap::CEToStringsMap(UErrorCode &status)
308 : map(NULL)
309 {
310 if (U_FAILURE(status)) {
311 return;
312 }
313
314 map = uhash_open(uhash_hashLong, uhash_compareLong,
315 uhash_compareCaselessUnicodeString,
316 &status);
317
318 if (U_FAILURE(status)) {
319 return;
320 }
321
322 uhash_setValueDeleter(map, deleteStringList);
323 }
324
~CEToStringsMap()325 CEToStringsMap::~CEToStringsMap()
326 {
327 uhash_close(map);
328 }
329
put(uint32_t ce,UnicodeString * string,UErrorCode & status)330 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
331 {
332 StringList *strings = getStringList(ce);
333
334 if (strings == NULL) {
335 strings = new StringList(status);
336
337 if (strings == NULL || U_FAILURE(status)) {
338 status = U_MEMORY_ALLOCATION_ERROR;
339 return;
340 }
341
342 putStringList(ce, strings, status);
343 }
344
345 strings->add(string, status);
346 }
347
getStringList(uint32_t ce) const348 StringList *CEToStringsMap::getStringList(uint32_t ce) const
349 {
350 return (StringList *) uhash_iget(map, ce);
351 }
352
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)353 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
354 {
355 uhash_iput(map, ce, (void *) stringList, &status);
356 }
357
deleteStringList(void * obj)358 U_CFUNC void deleteStringList(void *obj)
359 {
360 StringList *strings = (StringList *) obj;
361
362 delete strings;
363 }
364
365 U_CFUNC void deleteCEList(void *obj);
366 U_CFUNC void deleteUnicodeStringKey(void *obj);
367
368 class StringToCEsMap : public UMemory
369 {
370 public:
371 StringToCEsMap(UErrorCode &status);
372 ~StringToCEsMap();
373
374 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
375 const CEList *get(const UnicodeString *string);
376 void free(const CEList *list);
377
378 private:
379
380
381 UHashtable *map;
382 };
383
StringToCEsMap(UErrorCode & status)384 StringToCEsMap::StringToCEsMap(UErrorCode &status)
385 : map(NULL)
386 {
387 if (U_FAILURE(status)) {
388 return;
389 }
390
391 map = uhash_open(uhash_hashUnicodeString,
392 uhash_compareUnicodeString,
393 uhash_compareLong,
394 &status);
395
396 if (U_FAILURE(status)) {
397 return;
398 }
399
400 uhash_setValueDeleter(map, deleteCEList);
401 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
402 }
403
~StringToCEsMap()404 StringToCEsMap::~StringToCEsMap()
405 {
406 uhash_close(map);
407 }
408
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)409 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
410 {
411 uhash_put(map, (void *) string, (void *) ces, &status);
412 }
413
get(const UnicodeString * string)414 const CEList *StringToCEsMap::get(const UnicodeString *string)
415 {
416 return (const CEList *) uhash_get(map, string);
417 }
418
deleteCEList(void * obj)419 U_CFUNC void deleteCEList(void *obj)
420 {
421 CEList *list = (CEList *) obj;
422
423 delete list;
424 }
425
deleteUnicodeStringKey(void * obj)426 U_CFUNC void deleteUnicodeStringKey(void *obj)
427 {
428 UnicodeString *key = (UnicodeString *) obj;
429
430 delete key;
431 }
432
433 class CollDataCacheEntry : public UMemory
434 {
435 public:
436 CollDataCacheEntry(CollData *theData);
437 ~CollDataCacheEntry();
438
439 CollData *data;
440 int32_t refCount;
441 };
442
CollDataCacheEntry(CollData * theData)443 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
444 : data(theData), refCount(1)
445 {
446 // nothing else to do
447 }
448
~CollDataCacheEntry()449 CollDataCacheEntry::~CollDataCacheEntry()
450 {
451 // check refCount?
452 delete data;
453 }
454
455 class CollDataCache : public UMemory
456 {
457 public:
458 CollDataCache(UErrorCode &status);
459 ~CollDataCache();
460
461 CollData *get(UCollator *collator, UErrorCode &status);
462 void unref(CollData *collData);
463
464 void flush();
465
466 private:
467 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
468 static void deleteKey(char *key);
469
470 UHashtable *cache;
471 };
472 static UMTX lock;
473
deleteChars(void *)474 U_CFUNC void deleteChars(void * /*obj*/)
475 {
476 // char *chars = (char *) obj;
477 // All the key strings are owned by the
478 // CollData objects and don't need to
479 // be freed here.
480 //DELETE_ARRAY(chars);
481 }
482
deleteCollDataCacheEntry(void * obj)483 U_CFUNC void deleteCollDataCacheEntry(void *obj)
484 {
485 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
486
487 delete entry;
488 }
489
CollDataCache(UErrorCode & status)490 CollDataCache::CollDataCache(UErrorCode &status)
491 : cache(NULL)
492 {
493 if (U_FAILURE(status)) {
494 return;
495 }
496
497 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
498
499 if (U_FAILURE(status)) {
500 return;
501 }
502
503 uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
504 uhash_setKeyDeleter(cache, deleteChars);
505 }
506
~CollDataCache()507 CollDataCache::~CollDataCache()
508 {
509 umtx_lock(&lock);
510 uhash_close(cache);
511 cache = NULL;
512 umtx_unlock(&lock);
513 }
514
get(UCollator * collator,UErrorCode & status)515 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
516 {
517 char keyBuffer[KEY_BUFFER_SIZE];
518 int32_t keyLength = KEY_BUFFER_SIZE;
519 char *key = getKey(collator, keyBuffer, &keyLength);
520 CollData *result = NULL, *newData = NULL;
521 CollDataCacheEntry *entry = NULL, *newEntry = NULL;
522
523 umtx_lock(&lock);
524 entry = (CollDataCacheEntry *) uhash_get(cache, key);
525
526 if (entry == NULL) {
527 umtx_unlock(&lock);
528
529 newData = new CollData(collator, key, keyLength, status);
530 newEntry = new CollDataCacheEntry(newData);
531
532 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
533 status = U_MEMORY_ALLOCATION_ERROR;
534 return NULL;
535 }
536
537 umtx_lock(&lock);
538 entry = (CollDataCacheEntry *) uhash_get(cache, key);
539
540 if (entry == NULL) {
541 uhash_put(cache, newData->key, newEntry, &status);
542 umtx_unlock(&lock);
543
544 if (U_FAILURE(status)) {
545 delete newEntry;
546 delete newData;
547
548 return NULL;
549 }
550
551 return newData;
552 }
553 }
554
555 result = entry->data;
556 entry->refCount += 1;
557 umtx_unlock(&lock);
558
559 if (key != keyBuffer) {
560 deleteKey(key);
561 }
562
563 if (newEntry != NULL) {
564 delete newEntry;
565 delete newData;
566 }
567
568 return result;
569 }
570
unref(CollData * collData)571 void CollDataCache::unref(CollData *collData)
572 {
573 CollDataCacheEntry *entry = NULL;
574
575 umtx_lock(&lock);
576 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
577
578 if (entry != NULL) {
579 entry->refCount -= 1;
580 }
581 umtx_unlock(&lock);
582 }
583
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)584 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
585 {
586 UErrorCode status = U_ZERO_ERROR;
587 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
588
589 if (len >= *keyBufferLength) {
590 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
591 keyBuffer = NEW_ARRAY(char, *keyBufferLength);
592 status = U_ZERO_ERROR;
593
594 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
595 }
596
597 keyBuffer[len] = '\0';
598
599 return keyBuffer;
600 }
601
flush()602 void CollDataCache::flush()
603 {
604 const UHashElement *element;
605 int32_t pos = -1;
606
607 umtx_lock(&lock);
608 while ((element = uhash_nextElement(cache, &pos)) != NULL) {
609 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
610
611 if (entry->refCount <= 0) {
612 uhash_removeElement(cache, element);
613 }
614 }
615 umtx_unlock(&lock);
616 }
617
deleteKey(char * key)618 void CollDataCache::deleteKey(char *key)
619 {
620 DELETE_ARRAY(key);
621 }
622
623 U_CDECL_BEGIN
coll_data_cleanup(void)624 static UBool coll_data_cleanup(void) {
625 CollData::freeCollDataCache();
626 return TRUE;
627 }
628 U_CDECL_END
629
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)630 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
631
632 CollData::CollData()
633 {
634 // nothing
635 }
636
637 #define CLONE_COLLATOR
638
639 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)640 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
641 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
642 {
643 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
644 // i.e. other, control, private use, format, surrogate
645 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
646 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
647 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
648
649 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
650 // i.e. all the characers we handle implicitly
651 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
652 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
653 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
654
655 if (U_FAILURE(status)) {
656 return;
657 }
658
659 USet *expansions = uset_openEmpty();
660 USet *contractions = uset_openEmpty();
661 int32_t itemCount;
662
663 #ifdef CACHE_CELISTS
664 charsToCEList = new StringToCEsMap(status);
665
666 if (U_FAILURE(status)) {
667 goto bail;
668 }
669 #else
670 charsToCEList = NULL;
671 #endif
672
673 ceToCharsStartingWith = new CEToStringsMap(status);
674
675 if (U_FAILURE(status)) {
676 goto bail;
677 }
678
679 if (cacheKeyLength > KEY_BUFFER_SIZE) {
680 key = NEW_ARRAY(char, cacheKeyLength);
681
682 if (key == NULL) {
683 status = U_MEMORY_ALLOCATION_ERROR;
684 goto bail;
685 }
686 } else {
687 key = keyBuffer;
688 }
689
690 ARRAY_COPY(key, cacheKey, cacheKeyLength);
691
692 #ifdef CLONE_COLLATOR
693 coll = ucol_safeClone(collator, NULL, NULL, &status);
694
695 if (U_FAILURE(status)) {
696 goto bail;
697 }
698 #else
699 coll = collator;
700 #endif
701
702 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
703
704 uset_addAll(charsToTest, contractions);
705 uset_addAll(charsToTest, expansions);
706 uset_removeAll(charsToTest, charsToRemove);
707
708 itemCount = uset_getItemCount(charsToTest);
709 for(int32_t item = 0; item < itemCount; item += 1) {
710 UChar32 start = 0, end = 0;
711 UChar buffer[16];
712 int32_t len = uset_getItem(charsToTest, item, &start, &end,
713 buffer, 16, &status);
714
715 if (len == 0) {
716 for (UChar32 ch = start; ch <= end; ch += 1) {
717 UnicodeString *st = new UnicodeString(ch);
718
719 if (st == NULL) {
720 status = U_MEMORY_ALLOCATION_ERROR;
721 break;
722 }
723
724 CEList *ceList = new CEList(coll, *st, status);
725
726 ceToCharsStartingWith->put(ceList->get(0), st, status);
727
728 #ifdef CACHE_CELISTS
729 charsToCEList->put(st, ceList, status);
730 #else
731 delete ceList;
732 delete st;
733 #endif
734 }
735 } else if (len > 0) {
736 UnicodeString *st = new UnicodeString(buffer, len);
737
738 if (st == NULL) {
739 status = U_MEMORY_ALLOCATION_ERROR;
740 break;
741 }
742
743 CEList *ceList = new CEList(coll, *st, status);
744
745 ceToCharsStartingWith->put(ceList->get(0), st, status);
746
747 #ifdef CACHE_CELISTS
748 charsToCEList->put(st, ceList, status);
749 #else
750 delete ceList;
751 delete st;
752 #endif
753 } else {
754 // shouldn't happen...
755 }
756
757 if (U_FAILURE(status)) {
758 break;
759 }
760 }
761
762 bail:
763 uset_close(contractions);
764 uset_close(expansions);
765 uset_close(charsToRemove);
766 uset_close(charsToTest);
767
768 if (U_FAILURE(status)) {
769 return;
770 }
771
772 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
773 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
774 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
775 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
776 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
777 CEList hanList(coll, hanString, status);
778 CEList jamoList(coll, jamoString, status);
779 int32_t j = 0;
780
781 if (U_FAILURE(status)) {
782 return;
783 }
784
785 for (int32_t c = 0; c < jamoList.size(); c += 1) {
786 uint32_t jce = jamoList[c];
787
788 if (! isContinuation(jce)) {
789 jamoLimits[j++] = jce;
790 }
791 }
792
793 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
794
795 minHan = 0xFFFFFFFF;
796 maxHan = 0;
797
798 for(int32_t h = 0; h < hanList.size(); h += 2) {
799 uint32_t han = (uint32_t) hanList[h];
800
801 if (han < minHan) {
802 minHan = han;
803 }
804
805 if (han > maxHan) {
806 maxHan = han;
807 }
808 }
809
810 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
811 }
812
~CollData()813 CollData::~CollData()
814 {
815 #ifdef CLONE_COLLATOR
816 ucol_close(coll);
817 #endif
818
819 if (key != keyBuffer) {
820 DELETE_ARRAY(key);
821 }
822
823 delete ceToCharsStartingWith;
824
825 #ifdef CACHE_CELISTS
826 delete charsToCEList;
827 #endif
828 }
829
getCollator() const830 UCollator *CollData::getCollator() const
831 {
832 return coll;
833 }
834
getStringList(int32_t ce) const835 const StringList *CollData::getStringList(int32_t ce) const
836 {
837 return ceToCharsStartingWith->getStringList(ce);
838 }
839
getCEList(const UnicodeString * string) const840 const CEList *CollData::getCEList(const UnicodeString *string) const
841 {
842 #ifdef CACHE_CELISTS
843 return charsToCEList->get(string);
844 #else
845 UErrorCode status = U_ZERO_ERROR;
846 const CEList *list = new CEList(coll, *string, status);
847
848 if (U_FAILURE(status)) {
849 delete list;
850 list = NULL;
851 }
852
853 return list;
854 #endif
855 }
856
freeCEList(const CEList * list)857 void CollData::freeCEList(const CEList *list)
858 {
859 #ifndef CACHE_CELISTS
860 delete list;
861 #endif
862 }
863
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const864 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
865 {
866 // find out shortest string for the longest sequence of ces.
867 // this can probably be folded with the minLengthCache...
868
869 if (history[offset] >= 0) {
870 return history[offset];
871 }
872
873 uint32_t ce = ceList->get(offset);
874 int32_t maxOffset = ceList->size();
875 int32_t shortestLength = INT32_MAX;
876 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
877
878 if (strings != NULL) {
879 int32_t stringCount = strings->size();
880
881 for (int32_t s = 0; s < stringCount; s += 1) {
882 const UnicodeString *string = strings->get(s);
883 #ifdef CACHE_CELISTS
884 const CEList *ceList2 = charsToCEList->get(string);
885 #else
886 UErrorCode status = U_ZERO_ERROR;
887 const CEList *ceList2 = new CEList(coll, *string, status);
888
889 if (U_FAILURE(status)) {
890 delete ceList2;
891 ceList2 = NULL;
892 }
893 #endif
894
895 if (ceList->matchesAt(offset, ceList2)) {
896 int32_t clength = ceList2->size();
897 int32_t slength = string->length();
898 int32_t roffset = offset + clength;
899 int32_t rlength = 0;
900
901 if (roffset < maxOffset) {
902 rlength = minLengthInChars(ceList, roffset, history);
903
904 if (rlength <= 0) {
905 // delete before continue to avoid memory leak.
906 #ifndef CACHE_CELISTS
907 delete ceList2;
908 #endif
909 // ignore any dead ends
910 continue;
911 }
912 }
913
914 if (shortestLength > slength + rlength) {
915 shortestLength = slength + rlength;
916 }
917 }
918
919 #ifndef CACHE_CELISTS
920 delete ceList2;
921 #endif
922 }
923 }
924
925 if (shortestLength == INT32_MAX) {
926 // No matching strings at this offset. See if
927 // the CE is in a range we can handle manually.
928 if (ce >= minHan && ce < maxHan) {
929 // all han have implicit orders which
930 // generate two CEs.
931 int32_t roffset = offset + 2;
932 int32_t rlength = 0;
933
934 //history[roffset++] = -1;
935 //history[roffset++] = 1;
936
937 if (roffset < maxOffset) {
938 rlength = minLengthInChars(ceList, roffset, history);
939 }
940
941 if (rlength < 0) {
942 return -1;
943 }
944
945 shortestLength = 1 + rlength;
946 goto have_shortest;
947 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
948 int32_t roffset = offset;
949 int32_t rlength = 0;
950
951 // **** this loop may not handle archaic Hangul correctly ****
952 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
953 uint32_t jce = ceList->get(roffset);
954
955 // Some Jamo have 24-bit primary order; skip the
956 // 2nd CE. This should always be OK because if
957 // we're still in the loop all we've seen are
958 // a series of Jamo in LVT order.
959 if (isContinuation(jce)) {
960 continue;
961 }
962
963 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
964 break;
965 }
966 }
967
968 if (roffset == offset) {
969 // we started with a non-L Jamo...
970 // just say it comes from a single character
971 roffset += 1;
972
973 // See if the single Jamo has a 24-bit order.
974 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
975 roffset += 1;
976 }
977 }
978
979 if (roffset < maxOffset) {
980 rlength = minLengthInChars(ceList, roffset, history);
981 }
982
983 if (rlength < 0) {
984 return -1;
985 }
986
987 shortestLength = 1 + rlength;
988 goto have_shortest;
989 }
990
991 // Can't handle it manually either. Just move on.
992 return -1;
993 }
994
995 have_shortest:
996 history[offset] = shortestLength;
997
998 return shortestLength;
999 }
1000
minLengthInChars(const CEList * ceList,int32_t offset) const1001 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1002 {
1003 int32_t clength = ceList->size();
1004 int32_t *history = NEW_ARRAY(int32_t, clength);
1005
1006 for (int32_t i = 0; i < clength; i += 1) {
1007 history[i] = -1;
1008 }
1009
1010 int32_t minLength = minLengthInChars(ceList, offset, history);
1011
1012 DELETE_ARRAY(history);
1013
1014 return minLength;
1015 }
1016
open(UCollator * collator,UErrorCode & status)1017 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1018 {
1019 if (U_FAILURE(status)) {
1020 return NULL;
1021 }
1022
1023 CollDataCache *cache = getCollDataCache();
1024
1025 return cache->get(collator, status);
1026 }
1027
close(CollData * collData)1028 void CollData::close(CollData *collData)
1029 {
1030 CollDataCache *cache = getCollDataCache();
1031
1032 cache->unref(collData);
1033 }
1034
1035 CollDataCache *CollData::collDataCache = NULL;
1036
getCollDataCache()1037 CollDataCache *CollData::getCollDataCache()
1038 {
1039 UErrorCode status = U_ZERO_ERROR;
1040 CollDataCache *cache = NULL;
1041
1042 UMTX_CHECK(NULL, collDataCache, cache);
1043
1044 if (cache == NULL) {
1045 cache = new CollDataCache(status);
1046
1047 if (U_FAILURE(status)) {
1048 delete cache;
1049 return NULL;
1050 }
1051
1052 umtx_lock(NULL);
1053 if (collDataCache == NULL) {
1054 collDataCache = cache;
1055
1056 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1057 }
1058 umtx_unlock(NULL);
1059
1060 if (collDataCache != cache) {
1061 delete cache;
1062 }
1063 }
1064
1065 return collDataCache;
1066 }
1067
freeCollDataCache()1068 void CollData::freeCollDataCache()
1069 {
1070 CollDataCache *cache = NULL;
1071
1072 UMTX_CHECK(NULL, collDataCache, cache);
1073
1074 if (cache != NULL) {
1075 umtx_lock(NULL);
1076 if (collDataCache != NULL) {
1077 collDataCache = NULL;
1078 } else {
1079 cache = NULL;
1080 }
1081 umtx_unlock(NULL);
1082
1083 delete cache;
1084 }
1085 }
1086
flushCollDataCache()1087 void CollData::flushCollDataCache()
1088 {
1089 CollDataCache *cache = NULL;
1090
1091 UMTX_CHECK(NULL, collDataCache, cache);
1092
1093 // **** this will fail if the another ****
1094 // **** thread deletes the cache here ****
1095 if (cache != NULL) {
1096 cache->flush();
1097 }
1098 }
1099
1100 U_NAMESPACE_END
1101
1102 #endif // #if !UCONFIG_NO_COLLATION
1103