• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2012, International Business Machines                 *
4  *   Corporation and others.  All Rights Reserved.                            *
5  ******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION
11 
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15 
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21 
22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
23 
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32 #include "uassert.h"
33 
34 #include "unicode/colldata.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
39 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
40 #define DELETE_ARRAY(array) uprv_free((void *) (array))
41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
42 
43 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
44 
45 #ifdef INSTRUMENT_CELIST
46 int32_t CEList::_active = 0;
47 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
48 #endif
49 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)50 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
51     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
52 {
53     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
54     UCollationStrength strength = ucol_getStrength(coll);
55     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
56     uint32_t variableTop = ucol_getVariableTop(coll, &status);
57     uint32_t strengthMask = 0;
58     int32_t order;
59 
60     if (U_FAILURE(status)) {
61         return;
62     }
63 
64     // **** only set flag if string has Han(gul) ****
65     ucol_forceHanImplicit(elems, &status);
66 
67     switch (strength)
68     {
69     default:
70         strengthMask |= UCOL_TERTIARYORDERMASK;
71         /* fall through */
72 
73     case UCOL_SECONDARY:
74         strengthMask |= UCOL_SECONDARYORDERMASK;
75         /* fall through */
76 
77     case UCOL_PRIMARY:
78         strengthMask |= UCOL_PRIMARYORDERMASK;
79     }
80 
81 #ifdef INSTRUMENT_CELIST
82     _active += 1;
83     _histogram[0] += 1;
84 #endif
85 
86     ces = ceBuffer;
87 
88     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
89         UBool cont = isContinuation(order);
90 
91         order &= strengthMask;
92 
93         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
94             if (strength >= UCOL_QUATERNARY) {
95                 order &= UCOL_PRIMARYORDERMASK;
96             } else {
97                 order = UCOL_IGNORABLE;
98             }
99         }
100 
101         if (order == UCOL_IGNORABLE) {
102             continue;
103         }
104 
105         if (cont) {
106             order |= UCOL_CONTINUATION_MARKER;
107         }
108 
109         add(order, status);
110     }
111 
112     ucol_closeElements(elems);
113 }
114 
~CEList()115 CEList::~CEList()
116 {
117 #ifdef INSTRUMENT_CELIST
118     _active -= 1;
119 #endif
120 
121     if (ces != ceBuffer) {
122         DELETE_ARRAY(ces);
123     }
124 }
125 
add(uint32_t ce,UErrorCode & status)126 void CEList::add(uint32_t ce, UErrorCode &status)
127 {
128     if (U_FAILURE(status)) {
129         return;
130     }
131 
132     if (listSize >= listMax) {
133         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
134 
135 #ifdef INSTRUMENT_CELIST
136         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
137 #endif
138 
139         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
140 
141         if (newCEs == NULL) {
142             status = U_MEMORY_ALLOCATION_ERROR;
143             return;
144         }
145 
146         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
147 
148         if (ces != ceBuffer) {
149             DELETE_ARRAY(ces);
150         }
151 
152         ces = newCEs;
153         listMax = newMax;
154     }
155 
156     ces[listSize++] = ce;
157 }
158 
get(int32_t index) const159 uint32_t CEList::get(int32_t index) const
160 {
161     if (index >= 0 && index < listSize) {
162         return ces[index];
163     }
164 
165     return (uint32_t)UCOL_NULLORDER;
166 }
167 
operator [](int32_t index) const168 uint32_t &CEList::operator[](int32_t index) const
169 {
170     return ces[index];
171 }
172 
matchesAt(int32_t offset,const CEList * other) const173 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
174 {
175     if (other == NULL || listSize - offset < other->size()) {
176         return FALSE;
177     }
178 
179     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
180         if (ces[i] != (*other)[j]) {
181             return FALSE;
182         }
183     }
184 
185     return TRUE;
186 }
187 
size() const188 int32_t CEList::size() const
189 {
190     return listSize;
191 }
192 
193 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
194 
195 #ifdef INSTRUMENT_STRING_LIST
196 int32_t StringList::_lists = 0;
197 int32_t StringList::_strings = 0;
198 int32_t StringList::_histogram[101] = {0};
199 #endif
200 
StringList(UErrorCode & status)201 StringList::StringList(UErrorCode &status)
202     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
203 {
204     if (U_FAILURE(status)) {
205         return;
206     }
207 
208     strings = new UnicodeString [listMax];
209 
210     if (strings == NULL) {
211         status = U_MEMORY_ALLOCATION_ERROR;
212         return;
213     }
214 
215 #ifdef INSTRUMENT_STRING_LIST
216     _lists += 1;
217     _histogram[0] += 1;
218 #endif
219 }
220 
~StringList()221 StringList::~StringList()
222 {
223     delete[] strings;
224 }
225 
add(const UnicodeString * string,UErrorCode & status)226 void StringList::add(const UnicodeString *string, UErrorCode &status)
227 {
228     if (U_FAILURE(status)) {
229         return;
230     }
231 
232 #ifdef INSTRUMENT_STRING_LIST
233     _strings += 1;
234 #endif
235 
236     if (listSize >= listMax) {
237         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
238         UnicodeString *newStrings = new UnicodeString[newMax];
239         if (newStrings == NULL) {
240             status = U_MEMORY_ALLOCATION_ERROR;
241             return;
242         }
243         for (int32_t i=0; i<listSize; ++i) {
244             newStrings[i] = strings[i];
245         }
246 
247 #ifdef INSTRUMENT_STRING_LIST
248         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
249 
250         if (_h > 100) {
251             _h = 100;
252         }
253 
254         _histogram[_h] += 1;
255 #endif
256 
257         delete[] strings;
258         strings = newStrings;
259         listMax = newMax;
260     }
261 
262     // The ctor initialized all the strings in
263     // the array to empty strings, so this
264     // is the same as copying the source string.
265     strings[listSize++].append(*string);
266 }
267 
add(const UChar * chars,int32_t count,UErrorCode & status)268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
269 {
270     const UnicodeString string(chars, count);
271 
272     add(&string, status);
273 }
274 
get(int32_t index) const275 const UnicodeString *StringList::get(int32_t index) const
276 {
277     if (index >= 0 && index < listSize) {
278         return &strings[index];
279     }
280 
281     return NULL;
282 }
283 
size() const284 int32_t StringList::size() const
285 {
286     return listSize;
287 }
288 
289 
290 U_CDECL_BEGIN
291 static void U_CALLCONV
deleteStringList(void * obj)292 deleteStringList(void *obj)
293 {
294     StringList *strings = (StringList *) obj;
295 
296     delete strings;
297 }
298 static void U_CALLCONV
deleteCEList(void * obj)299 deleteCEList(void *obj)
300 {
301     CEList *list = (CEList *) obj;
302 
303     delete list;
304 }
305 
306 static void U_CALLCONV
deleteUnicodeStringKey(void * obj)307 deleteUnicodeStringKey(void *obj)
308 {
309     UnicodeString *key = (UnicodeString *) obj;
310 
311     delete key;
312 }
313 
314 static void U_CALLCONV
deleteChars(void *)315 deleteChars(void * /*obj*/)
316 {
317     // char *chars = (char *) obj;
318     // All the key strings are owned by the
319     // CollData objects and don't need to
320     // be freed here.
321   //DELETE_ARRAY(chars);
322 }
323 
324 U_CDECL_END
325 
326 class CEToStringsMap : public UMemory
327 {
328 public:
329 
330     CEToStringsMap(UErrorCode &status);
331     ~CEToStringsMap();
332 
333     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
334     StringList *getStringList(uint32_t ce) const;
335 
336 private:
337 
338     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
339     UHashtable *map;
340 };
341 
CEToStringsMap(UErrorCode & status)342 CEToStringsMap::CEToStringsMap(UErrorCode &status)
343     : map(NULL)
344 {
345     if (U_FAILURE(status)) {
346         return;
347     }
348 
349     map = uhash_open(uhash_hashLong, uhash_compareLong,
350                      uhash_compareCaselessUnicodeString,
351                      &status);
352 
353     if (U_FAILURE(status)) {
354         return;
355     }
356 
357     uhash_setValueDeleter(map, deleteStringList);
358 }
359 
~CEToStringsMap()360 CEToStringsMap::~CEToStringsMap()
361 {
362     uhash_close(map);
363 }
364 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)365 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
366 {
367     StringList *strings = getStringList(ce);
368 
369     if (strings == NULL) {
370         strings = new StringList(status);
371 
372         if (strings == NULL || U_FAILURE(status)) {
373             status = U_MEMORY_ALLOCATION_ERROR;
374             return;
375         }
376 
377         putStringList(ce, strings, status);
378     }
379 
380     strings->add(string, status);
381 }
382 
getStringList(uint32_t ce) const383 StringList *CEToStringsMap::getStringList(uint32_t ce) const
384 {
385     return (StringList *) uhash_iget(map, ce);
386 }
387 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)388 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
389 {
390     uhash_iput(map, ce, (void *) stringList, &status);
391 }
392 
393 class StringToCEsMap : public UMemory
394 {
395 public:
396     StringToCEsMap(UErrorCode &status);
397     ~StringToCEsMap();
398 
399     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
400     const CEList *get(const UnicodeString *string);
401     void free(const CEList *list);
402 
403 private:
404 
405 
406     UHashtable *map;
407 };
408 
StringToCEsMap(UErrorCode & status)409 StringToCEsMap::StringToCEsMap(UErrorCode &status)
410     : map(NULL)
411 {
412     if (U_FAILURE(status)) {
413         return;
414     }
415 
416     map = uhash_open(uhash_hashUnicodeString,
417                      uhash_compareUnicodeString,
418                      uhash_compareLong,
419                      &status);
420 
421     if (U_FAILURE(status)) {
422         return;
423     }
424 
425     uhash_setValueDeleter(map, deleteCEList);
426     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
427 }
428 
~StringToCEsMap()429 StringToCEsMap::~StringToCEsMap()
430 {
431     uhash_close(map);
432 }
433 
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)434 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
435 {
436     uhash_put(map, (void *) string, (void *) ces, &status);
437 }
438 
get(const UnicodeString * string)439 const CEList *StringToCEsMap::get(const UnicodeString *string)
440 {
441     return (const CEList *) uhash_get(map, string);
442 }
443 
444 class CollDataCacheEntry : public UMemory
445 {
446 public:
447     CollDataCacheEntry(CollData *theData);
448     ~CollDataCacheEntry();
449 
450     CollData *data;
451     int32_t   refCount;
452 };
453 
CollDataCacheEntry(CollData * theData)454 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
455     : data(theData), refCount(1)
456 {
457     // nothing else to do
458 }
459 
~CollDataCacheEntry()460 CollDataCacheEntry::~CollDataCacheEntry()
461 {
462     // check refCount?
463     delete data;
464 }
465 
466 class CollDataCache : public UMemory
467 {
468 public:
469     CollDataCache(UErrorCode &status);
470     ~CollDataCache();
471 
472     CollData *get(UCollator *collator, UErrorCode &status);
473     void unref(CollData *collData);
474 
475     void flush();
476 
477 private:
478     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
479     static void deleteKey(char *key);
480 
481     UHashtable *cache;
482 };
483 static UMutex lock = U_MUTEX_INITIALIZER;
484 
485 U_CDECL_BEGIN
486 static void U_CALLCONV
deleteCollDataCacheEntry(void * obj)487 deleteCollDataCacheEntry(void *obj)
488 {
489     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
490 
491     delete entry;
492 }
493 U_CDECL_END
494 
CollDataCache(UErrorCode & status)495 CollDataCache::CollDataCache(UErrorCode &status)
496     : cache(NULL)
497 {
498     if (U_FAILURE(status)) {
499         return;
500     }
501 
502     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
503 
504     if (U_FAILURE(status)) {
505         return;
506     }
507 
508     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
509     uhash_setKeyDeleter(cache, deleteChars);
510 }
511 
~CollDataCache()512 CollDataCache::~CollDataCache()
513 {
514     umtx_lock(&lock);
515     uhash_close(cache);
516     cache = NULL;
517     umtx_unlock(&lock);
518 }
519 
get(UCollator * collator,UErrorCode & status)520 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
521 {
522     char keyBuffer[KEY_BUFFER_SIZE];
523     int32_t keyLength = KEY_BUFFER_SIZE;
524     char *key = getKey(collator, keyBuffer, &keyLength);
525     CollData *result = NULL, *newData = NULL;
526     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
527 
528     umtx_lock(&lock);
529     entry = (CollDataCacheEntry *) uhash_get(cache, key);
530 
531     if (entry == NULL) {
532         umtx_unlock(&lock);
533 
534         newData = new CollData(collator, key, keyLength, status);
535         newEntry = new CollDataCacheEntry(newData);
536 
537         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
538             status = U_MEMORY_ALLOCATION_ERROR;
539             return NULL;
540         }
541 
542         umtx_lock(&lock);
543         entry = (CollDataCacheEntry *) uhash_get(cache, key);
544 
545         if (entry == NULL) {
546             uhash_put(cache, newData->key, newEntry, &status);
547             umtx_unlock(&lock);
548 
549             if (U_FAILURE(status)) {
550                 delete newEntry;
551                 delete newData;
552 
553                 return NULL;
554             }
555 
556             return newData;
557         }
558     }
559 
560     result = entry->data;
561     entry->refCount += 1;
562     umtx_unlock(&lock);
563 
564     if (key != keyBuffer) {
565         deleteKey(key);
566     }
567 
568     if (newEntry != NULL) {
569         delete newEntry;
570         delete newData;
571     }
572 
573     return result;
574 }
575 
unref(CollData * collData)576 void CollDataCache::unref(CollData *collData)
577 {
578     CollDataCacheEntry *entry = NULL;
579 
580     umtx_lock(&lock);
581     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
582 
583     if (entry != NULL) {
584         entry->refCount -= 1;
585     }
586     umtx_unlock(&lock);
587 }
588 
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)589 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
590 {
591     UErrorCode status = U_ZERO_ERROR;
592     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
593 
594     if (len >= *keyBufferLength) {
595         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
596         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
597         status = U_ZERO_ERROR;
598 
599         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
600     }
601 
602     keyBuffer[len] = '\0';
603 
604     return keyBuffer;
605 }
606 
flush()607 void CollDataCache::flush()
608 {
609     const UHashElement *element;
610     int32_t pos = -1;
611 
612     umtx_lock(&lock);
613     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
614         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
615 
616         if (entry->refCount <= 0) {
617             uhash_removeElement(cache, element);
618         }
619     }
620     umtx_unlock(&lock);
621 }
622 
deleteKey(char * key)623 void CollDataCache::deleteKey(char *key)
624 {
625     DELETE_ARRAY(key);
626 }
627 
628 U_CDECL_BEGIN
coll_data_cleanup(void)629 static UBool coll_data_cleanup(void) {
630     CollData::freeCollDataCache();
631   return TRUE;
632 }
633 U_CDECL_END
634 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)635 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
636 
637 CollData::CollData()
638 {
639     // nothing
640 }
641 
642 #define CLONE_COLLATOR
643 
644 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)645 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
646     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
647 {
648     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
649     // i.e. other, control, private use, format, surrogate
650     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
651     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
652     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
653 
654     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
655     // i.e. all the characers we handle implicitly
656     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
657     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
658     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
659 
660     if (U_FAILURE(status)) {
661         return;
662     }
663 
664     USet *expansions   = uset_openEmpty();
665     USet *contractions = uset_openEmpty();
666     int32_t itemCount;
667 
668 #ifdef CACHE_CELISTS
669     charsToCEList = new StringToCEsMap(status);
670 
671     if (U_FAILURE(status)) {
672         goto bail;
673     }
674 #else
675     charsToCEList = NULL;
676 #endif
677 
678     ceToCharsStartingWith = new CEToStringsMap(status);
679 
680     if (U_FAILURE(status)) {
681         goto bail;
682     }
683 
684     if (cacheKeyLength > KEY_BUFFER_SIZE) {
685         key = NEW_ARRAY(char, cacheKeyLength);
686 
687         if (key == NULL) {
688             status = U_MEMORY_ALLOCATION_ERROR;
689             goto bail;
690         }
691     } else {
692         key = keyBuffer;
693     }
694 
695     ARRAY_COPY(key, cacheKey, cacheKeyLength);
696 
697 #ifdef CLONE_COLLATOR
698     coll = ucol_safeClone(collator, NULL, NULL, &status);
699 
700     if (U_FAILURE(status)) {
701         goto bail;
702     }
703 #else
704     coll = collator;
705 #endif
706 
707     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
708 
709     uset_addAll(charsToTest, contractions);
710     uset_addAll(charsToTest, expansions);
711     uset_removeAll(charsToTest, charsToRemove);
712 
713     itemCount = uset_getItemCount(charsToTest);
714     for(int32_t item = 0; item < itemCount; item += 1) {
715         UChar32 start = 0, end = 0;
716         UChar buffer[16];
717         int32_t len = uset_getItem(charsToTest, item, &start, &end,
718                                    buffer, 16, &status);
719 
720         if (len == 0) {
721             for (UChar32 ch = start; ch <= end; ch += 1) {
722                 UnicodeString *st = new UnicodeString(ch);
723 
724                 if (st == NULL) {
725                     status = U_MEMORY_ALLOCATION_ERROR;
726                     break;
727                 }
728 
729                 CEList *ceList = new CEList(coll, *st, status);
730 
731                 ceToCharsStartingWith->put(ceList->get(0), st, status);
732 
733 #ifdef CACHE_CELISTS
734                 charsToCEList->put(st, ceList, status);
735 #else
736                 delete ceList;
737                 delete st;
738 #endif
739             }
740         } else if (len > 0) {
741             UnicodeString *st = new UnicodeString(buffer, len);
742 
743             if (st == NULL) {
744                 status = U_MEMORY_ALLOCATION_ERROR;
745                 break;
746             }
747 
748             CEList *ceList = new CEList(coll, *st, status);
749 
750             ceToCharsStartingWith->put(ceList->get(0), st, status);
751 
752 #ifdef CACHE_CELISTS
753             charsToCEList->put(st, ceList, status);
754 #else
755             delete ceList;
756             delete st;
757 #endif
758         } else {
759             // shouldn't happen...
760         }
761 
762         if (U_FAILURE(status)) {
763              break;
764         }
765     }
766 
767 bail:
768     uset_close(contractions);
769     uset_close(expansions);
770     uset_close(charsToRemove);
771     uset_close(charsToTest);
772 
773     if (U_FAILURE(status)) {
774         return;
775     }
776 
777      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
778                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
779      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
780      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
781      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
782      CEList hanList(coll, hanString, status);
783      CEList jamoList(coll, jamoString, status);
784      int32_t j = 0;
785 
786      if (U_FAILURE(status)) {
787          return;
788      }
789 
790      for (int32_t c = 0; c < jamoList.size(); c += 1) {
791          uint32_t jce = jamoList[c];
792 
793          if (! isContinuation(jce)) {
794              jamoLimits[j++] = jce;
795          }
796      }
797 
798      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
799 
800      minHan = 0xFFFFFFFF;
801      maxHan = 0;
802 
803      for(int32_t h = 0; h < hanList.size(); h += 2) {
804          uint32_t han = (uint32_t) hanList[h];
805 
806          if (han < minHan) {
807              minHan = han;
808          }
809 
810          if (han > maxHan) {
811              maxHan = han;
812          }
813      }
814 
815      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
816 }
817 
~CollData()818 CollData::~CollData()
819 {
820 #ifdef CLONE_COLLATOR
821    ucol_close(coll);
822 #endif
823 
824    if (key != keyBuffer) {
825        DELETE_ARRAY(key);
826    }
827 
828    delete ceToCharsStartingWith;
829 
830 #ifdef CACHE_CELISTS
831    delete charsToCEList;
832 #endif
833 }
834 
getCollator() const835 UCollator *CollData::getCollator() const
836 {
837     return coll;
838 }
839 
getStringList(int32_t ce) const840 const StringList *CollData::getStringList(int32_t ce) const
841 {
842     return ceToCharsStartingWith->getStringList(ce);
843 }
844 
getCEList(const UnicodeString * string) const845 const CEList *CollData::getCEList(const UnicodeString *string) const
846 {
847 #ifdef CACHE_CELISTS
848     return charsToCEList->get(string);
849 #else
850     UErrorCode status = U_ZERO_ERROR;
851     const CEList *list = new CEList(coll, *string, status);
852 
853     if (U_FAILURE(status)) {
854         delete list;
855         list = NULL;
856     }
857 
858     return list;
859 #endif
860 }
861 
freeCEList(const CEList * list)862 void CollData::freeCEList(const CEList *list)
863 {
864 #ifndef CACHE_CELISTS
865     delete list;
866 #endif
867 }
868 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const869 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
870 {
871     // find out shortest string for the longest sequence of ces.
872     // this can probably be folded with the minLengthCache...
873 
874     if (history[offset] >= 0) {
875         return history[offset];
876     }
877 
878     uint32_t ce = ceList->get(offset);
879     int32_t maxOffset = ceList->size();
880     int32_t shortestLength = INT32_MAX;
881     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
882 
883     if (strings != NULL) {
884         int32_t stringCount = strings->size();
885 
886         for (int32_t s = 0; s < stringCount; s += 1) {
887             const UnicodeString *string = strings->get(s);
888 #ifdef CACHE_CELISTS
889             const CEList *ceList2 = charsToCEList->get(string);
890 #else
891             UErrorCode status = U_ZERO_ERROR;
892             const CEList *ceList2 = new CEList(coll, *string, status);
893 
894             if (U_FAILURE(status)) {
895                 delete ceList2;
896                 ceList2 = NULL;
897             }
898 #endif
899 
900             if (ceList->matchesAt(offset, ceList2)) {
901                 U_ASSERT(ceList2 != NULL);
902                 int32_t clength = ceList2->size();
903                 int32_t slength = string->length();
904                 int32_t roffset = offset + clength;
905                 int32_t rlength = 0;
906 
907                 if (roffset < maxOffset) {
908                     rlength = minLengthInChars(ceList, roffset, history);
909 
910                     if (rlength <= 0) {
911                     // delete before continue to avoid memory leak.
912 #ifndef CACHE_CELISTS
913                         delete ceList2;
914 #endif
915                         // ignore any dead ends
916                         continue;
917                     }
918                 }
919 
920                 if (shortestLength > slength + rlength) {
921                     shortestLength = slength + rlength;
922                 }
923             }
924 
925 #ifndef CACHE_CELISTS
926             delete ceList2;
927 #endif
928         }
929     }
930 
931     if (shortestLength == INT32_MAX) {
932         // No matching strings at this offset. See if
933         // the CE is in a range we can handle manually.
934         if (ce >= minHan && ce < maxHan) {
935             // all han have implicit orders which
936             // generate two CEs.
937             int32_t roffset = offset + 2;
938             int32_t rlength = 0;
939 
940           //history[roffset++] = -1;
941           //history[roffset++] = 1;
942 
943             if (roffset < maxOffset) {
944                 rlength = minLengthInChars(ceList, roffset, history);
945             }
946 
947             if (rlength < 0) {
948                 return -1;
949             }
950 
951             shortestLength = 1 + rlength;
952             goto have_shortest;
953         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
954             int32_t roffset = offset;
955             int32_t rlength = 0;
956 
957             // **** this loop may not handle archaic Hangul correctly ****
958             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
959                 uint32_t jce = ceList->get(roffset);
960 
961                 // Some Jamo have 24-bit primary order; skip the
962                 // 2nd CE. This should always be OK because if
963                 // we're still in the loop all we've seen are
964                 // a series of Jamo in LVT order.
965                 if (isContinuation(jce)) {
966                     continue;
967                 }
968 
969                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
970                     break;
971                 }
972             }
973 
974             if (roffset == offset) {
975                 // we started with a non-L Jamo...
976                 // just say it comes from a single character
977                 roffset += 1;
978 
979                 // See if the single Jamo has a 24-bit order.
980                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
981                     roffset += 1;
982                 }
983             }
984 
985             if (roffset < maxOffset) {
986                 rlength = minLengthInChars(ceList, roffset, history);
987             }
988 
989             if (rlength < 0) {
990                 return -1;
991             }
992 
993             shortestLength = 1 + rlength;
994             goto have_shortest;
995         }
996 
997         // Can't handle it manually either. Just move on.
998         return -1;
999     }
1000 
1001 have_shortest:
1002     history[offset] = shortestLength;
1003 
1004     return shortestLength;
1005 }
1006 
minLengthInChars(const CEList * ceList,int32_t offset) const1007 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1008 {
1009     int32_t clength = ceList->size();
1010     int32_t *history = NEW_ARRAY(int32_t, clength);
1011 
1012     for (int32_t i = 0; i < clength; i += 1) {
1013         history[i] = -1;
1014     }
1015 
1016     int32_t minLength = minLengthInChars(ceList, offset, history);
1017 
1018     DELETE_ARRAY(history);
1019 
1020     return minLength;
1021 }
1022 
open(UCollator * collator,UErrorCode & status)1023 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1024 {
1025     if (U_FAILURE(status)) {
1026         return NULL;
1027     }
1028 
1029     CollDataCache *cache = getCollDataCache();
1030 
1031     return cache->get(collator, status);
1032 }
1033 
close(CollData * collData)1034 void CollData::close(CollData *collData)
1035 {
1036     CollDataCache *cache = getCollDataCache();
1037 
1038     cache->unref(collData);
1039 }
1040 
1041 CollDataCache *CollData::collDataCache = NULL;
1042 
getCollDataCache()1043 CollDataCache *CollData::getCollDataCache()
1044 {
1045     UErrorCode status = U_ZERO_ERROR;
1046     CollDataCache *cache = NULL;
1047 
1048     UMTX_CHECK(NULL, collDataCache, cache);
1049 
1050     if (cache == NULL) {
1051         cache = new CollDataCache(status);
1052 
1053         if (U_FAILURE(status)) {
1054             delete cache;
1055             return NULL;
1056         }
1057 
1058         umtx_lock(NULL);
1059         if (collDataCache == NULL) {
1060             collDataCache = cache;
1061 
1062             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1063         }
1064         umtx_unlock(NULL);
1065 
1066         if (collDataCache != cache) {
1067             delete cache;
1068         }
1069     }
1070 
1071     return collDataCache;
1072 }
1073 
freeCollDataCache()1074 void CollData::freeCollDataCache()
1075 {
1076     CollDataCache *cache = NULL;
1077 
1078     UMTX_CHECK(NULL, collDataCache, cache);
1079 
1080     if (cache != NULL) {
1081         umtx_lock(NULL);
1082         if (collDataCache != NULL) {
1083             collDataCache = NULL;
1084         } else {
1085             cache = NULL;
1086         }
1087         umtx_unlock(NULL);
1088 
1089         delete cache;
1090     }
1091 }
1092 
flushCollDataCache()1093 void CollData::flushCollDataCache()
1094 {
1095     CollDataCache *cache = NULL;
1096 
1097     UMTX_CHECK(NULL, collDataCache, cache);
1098 
1099     // **** this will fail if the another ****
1100     // **** thread deletes the cache here ****
1101     if (cache != NULL) {
1102         cache->flush();
1103     }
1104 }
1105 
1106 U_NAMESPACE_END
1107 
1108 #endif // #if !UCONFIG_NO_COLLATION
1109