• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2009, International Business Machines                 *
4  *   Corporation and others.  All Rights Reserved.                            *
5  ******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION
11 
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15 
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21 
22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
23 
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32 
33 #include "unicode/colldata.h"
34 
35 U_NAMESPACE_BEGIN
36 
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
41 
42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
43 
44 #ifdef INSTRUMENT_CELIST
45 int32_t CEList::_active = 0;
46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
47 #endif
48 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
50     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
51 {
52     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
53     UCollationStrength strength = ucol_getStrength(coll);
54     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
55     uint32_t variableTop = ucol_getVariableTop(coll, &status);
56     uint32_t strengthMask = 0;
57     int32_t order;
58 
59     if (U_FAILURE(status)) {
60         return;
61     }
62 
63     // **** only set flag if string has Han(gul) ****
64     ucol_forceHanImplicit(elems, &status);
65 
66     switch (strength)
67     {
68     default:
69         strengthMask |= UCOL_TERTIARYORDERMASK;
70         /* fall through */
71 
72     case UCOL_SECONDARY:
73         strengthMask |= UCOL_SECONDARYORDERMASK;
74         /* fall through */
75 
76     case UCOL_PRIMARY:
77         strengthMask |= UCOL_PRIMARYORDERMASK;
78     }
79 
80 #ifdef INSTRUMENT_CELIST
81     _active += 1;
82     _histogram[0] += 1;
83 #endif
84 
85     ces = ceBuffer;
86 
87     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
88         UBool cont = isContinuation(order);
89 
90         order &= strengthMask;
91 
92         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
93             if (strength >= UCOL_QUATERNARY) {
94                 order &= UCOL_PRIMARYORDERMASK;
95             } else {
96                 order = UCOL_IGNORABLE;
97             }
98         }
99 
100         if (order == UCOL_IGNORABLE) {
101             continue;
102         }
103 
104         if (cont) {
105             order |= UCOL_CONTINUATION_MARKER;
106         }
107 
108         add(order, status);
109     }
110 
111     ucol_closeElements(elems);
112 }
113 
~CEList()114 CEList::~CEList()
115 {
116 #ifdef INSTRUMENT_CELIST
117     _active -= 1;
118 #endif
119 
120     if (ces != ceBuffer) {
121         DELETE_ARRAY(ces);
122     }
123 }
124 
add(uint32_t ce,UErrorCode & status)125 void CEList::add(uint32_t ce, UErrorCode &status)
126 {
127     if (U_FAILURE(status)) {
128         return;
129     }
130 
131     if (listSize >= listMax) {
132         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
133 
134 #ifdef INSTRUMENT_CELIST
135         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
136 #endif
137 
138         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
139 
140         if (newCEs == NULL) {
141             status = U_MEMORY_ALLOCATION_ERROR;
142             return;
143         }
144 
145         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
146 
147         if (ces != ceBuffer) {
148             DELETE_ARRAY(ces);
149         }
150 
151         ces = newCEs;
152         listMax = newMax;
153     }
154 
155     ces[listSize++] = ce;
156 }
157 
get(int32_t index) const158 uint32_t CEList::get(int32_t index) const
159 {
160     if (index >= 0 && index < listSize) {
161         return ces[index];
162     }
163 
164     return UCOL_NULLORDER;
165 }
166 
operator [](int32_t index) const167 uint32_t &CEList::operator[](int32_t index) const
168 {
169     return ces[index];
170 }
171 
matchesAt(int32_t offset,const CEList * other) const172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
173 {
174     if (other == NULL || listSize - offset < other->size()) {
175         return FALSE;
176     }
177 
178     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
179         if (ces[i] != (*other)[j]) {
180             return FALSE;
181         }
182     }
183 
184     return TRUE;
185 }
186 
size() const187 int32_t CEList::size() const
188 {
189     return listSize;
190 }
191 
192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
193 
194 #ifdef INSTRUMENT_STRING_LIST
195 int32_t StringList::_lists = 0;
196 int32_t StringList::_strings = 0;
197 int32_t StringList::_histogram[101] = {0};
198 #endif
199 
StringList(UErrorCode & status)200 StringList::StringList(UErrorCode &status)
201     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
202 {
203     if (U_FAILURE(status)) {
204         return;
205     }
206 
207     strings = new UnicodeString [listMax];
208 
209     if (strings == NULL) {
210         status = U_MEMORY_ALLOCATION_ERROR;
211         return;
212     }
213 
214 #ifdef INSTRUMENT_STRING_LIST
215     _lists += 1;
216     _histogram[0] += 1;
217 #endif
218 }
219 
~StringList()220 StringList::~StringList()
221 {
222     delete[] strings;
223 }
224 
add(const UnicodeString * string,UErrorCode & status)225 void StringList::add(const UnicodeString *string, UErrorCode &status)
226 {
227     if (U_FAILURE(status)) {
228         return;
229     }
230 
231 #ifdef INSTRUMENT_STRING_LIST
232     _strings += 1;
233 #endif
234 
235     if (listSize >= listMax) {
236         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
237 
238         UnicodeString *newStrings = new UnicodeString[newMax];
239         if (newStrings == NULL) {
240             status = U_MEMORY_ALLOCATION_ERROR;
241             return;
242         }
243         for (int32_t i=0; i<listSize; ++i) {
244             newStrings[i] = strings[i];
245         }
246 
247 #ifdef INSTRUMENT_STRING_LIST
248         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
249 
250         if (_h > 100) {
251             _h = 100;
252         }
253 
254         _histogram[_h] += 1;
255 #endif
256 
257         delete[] strings;
258         strings = newStrings;
259         listMax = newMax;
260     }
261 
262     // The ctor initialized all the strings in
263     // the array to empty strings, so this
264     // is the same as copying the source string.
265     strings[listSize++].append(*string);
266 }
267 
add(const UChar * chars,int32_t count,UErrorCode & status)268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
269 {
270     const UnicodeString string(chars, count);
271 
272     add(&string, status);
273 }
274 
get(int32_t index) const275 const UnicodeString *StringList::get(int32_t index) const
276 {
277     if (index >= 0 && index < listSize) {
278         return &strings[index];
279     }
280 
281     return NULL;
282 }
283 
size() const284 int32_t StringList::size() const
285 {
286     return listSize;
287 }
288 
289 
290 U_CFUNC void deleteStringList(void *obj);
291 
292 class CEToStringsMap : public UMemory
293 {
294 public:
295 
296     CEToStringsMap(UErrorCode &status);
297     ~CEToStringsMap();
298 
299     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
300     StringList *getStringList(uint32_t ce) const;
301 
302 private:
303 
304     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
305     UHashtable *map;
306 };
307 
CEToStringsMap(UErrorCode & status)308 CEToStringsMap::CEToStringsMap(UErrorCode &status)
309     : map(NULL)
310 {
311     if (U_FAILURE(status)) {
312         return;
313     }
314 
315     map = uhash_open(uhash_hashLong, uhash_compareLong,
316                      uhash_compareCaselessUnicodeString,
317                      &status);
318 
319     if (U_FAILURE(status)) {
320         return;
321     }
322 
323     uhash_setValueDeleter(map, deleteStringList);
324 }
325 
~CEToStringsMap()326 CEToStringsMap::~CEToStringsMap()
327 {
328     uhash_close(map);
329 }
330 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)331 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
332 {
333     StringList *strings = getStringList(ce);
334 
335     if (strings == NULL) {
336         strings = new StringList(status);
337 
338         if (strings == NULL || U_FAILURE(status)) {
339             status = U_MEMORY_ALLOCATION_ERROR;
340             return;
341         }
342 
343         putStringList(ce, strings, status);
344     }
345 
346     strings->add(string, status);
347 }
348 
getStringList(uint32_t ce) const349 StringList *CEToStringsMap::getStringList(uint32_t ce) const
350 {
351     return (StringList *) uhash_iget(map, ce);
352 }
353 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)354 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
355 {
356     uhash_iput(map, ce, (void *) stringList, &status);
357 }
358 
deleteStringList(void * obj)359 U_CFUNC void deleteStringList(void *obj)
360 {
361     StringList *strings = (StringList *) obj;
362 
363     delete strings;
364 }
365 
366 U_CFUNC void deleteCEList(void *obj);
367 U_CFUNC void deleteUnicodeStringKey(void *obj);
368 
369 class StringToCEsMap : public UMemory
370 {
371 public:
372     StringToCEsMap(UErrorCode &status);
373     ~StringToCEsMap();
374 
375     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
376     const CEList *get(const UnicodeString *string);
377     void free(const CEList *list);
378 
379 private:
380 
381 
382     UHashtable *map;
383 };
384 
StringToCEsMap(UErrorCode & status)385 StringToCEsMap::StringToCEsMap(UErrorCode &status)
386     : map(NULL)
387 {
388     if (U_FAILURE(status)) {
389         return;
390     }
391 
392     map = uhash_open(uhash_hashUnicodeString,
393                      uhash_compareUnicodeString,
394                      uhash_compareLong,
395                      &status);
396 
397     if (U_FAILURE(status)) {
398         return;
399     }
400 
401     uhash_setValueDeleter(map, deleteCEList);
402     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
403 }
404 
~StringToCEsMap()405 StringToCEsMap::~StringToCEsMap()
406 {
407     uhash_close(map);
408 }
409 
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)410 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
411 {
412     uhash_put(map, (void *) string, (void *) ces, &status);
413 }
414 
get(const UnicodeString * string)415 const CEList *StringToCEsMap::get(const UnicodeString *string)
416 {
417     return (const CEList *) uhash_get(map, string);
418 }
419 
deleteCEList(void * obj)420 U_CFUNC void deleteCEList(void *obj)
421 {
422     CEList *list = (CEList *) obj;
423 
424     delete list;
425 }
426 
deleteUnicodeStringKey(void * obj)427 U_CFUNC void deleteUnicodeStringKey(void *obj)
428 {
429     UnicodeString *key = (UnicodeString *) obj;
430 
431     delete key;
432 }
433 
434 class CollDataCacheEntry : public UMemory
435 {
436 public:
437     CollDataCacheEntry(CollData *theData);
438     ~CollDataCacheEntry();
439 
440     CollData *data;
441     int32_t   refCount;
442 };
443 
CollDataCacheEntry(CollData * theData)444 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
445     : data(theData), refCount(1)
446 {
447     // nothing else to do
448 }
449 
~CollDataCacheEntry()450 CollDataCacheEntry::~CollDataCacheEntry()
451 {
452     // check refCount?
453     delete data;
454 }
455 
456 class CollDataCache : public UMemory
457 {
458 public:
459     CollDataCache(UErrorCode &status);
460     ~CollDataCache();
461 
462     CollData *get(UCollator *collator, UErrorCode &status);
463     void unref(CollData *collData);
464 
465     void flush();
466 
467 private:
468     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
469     static void deleteKey(char *key);
470 
471     UMTX lock;
472     UHashtable *cache;
473 };
474 
deleteChars(void *)475 U_CFUNC void deleteChars(void * /*obj*/)
476 {
477     // char *chars = (char *) obj;
478     // All the key strings are owned by the
479     // CollData objects and don't need to
480     // be freed here.
481   //DELETE_ARRAY(chars);
482 }
483 
deleteCollDataCacheEntry(void * obj)484 U_CFUNC void deleteCollDataCacheEntry(void *obj)
485 {
486     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
487 
488     delete entry;
489 }
490 
CollDataCache(UErrorCode & status)491 CollDataCache::CollDataCache(UErrorCode &status)
492     : lock(0), cache(NULL)
493 {
494     if (U_FAILURE(status)) {
495         return;
496     }
497 
498     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
499 
500     if (U_FAILURE(status)) {
501         return;
502     }
503 
504     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
505     uhash_setKeyDeleter(cache, deleteChars);
506 }
507 
~CollDataCache()508 CollDataCache::~CollDataCache()
509 {
510     umtx_lock(&lock);
511     uhash_close(cache);
512     cache = NULL;
513     umtx_unlock(&lock);
514 
515     umtx_destroy(&lock);
516 }
517 
get(UCollator * collator,UErrorCode & status)518 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
519 {
520     char keyBuffer[KEY_BUFFER_SIZE];
521     int32_t keyLength = KEY_BUFFER_SIZE;
522     char *key = getKey(collator, keyBuffer, &keyLength);
523     CollData *result = NULL, *newData = NULL;
524     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
525 
526     umtx_lock(&lock);
527     entry = (CollDataCacheEntry *) uhash_get(cache, key);
528 
529     if (entry == NULL) {
530         umtx_unlock(&lock);
531 
532         newData = new CollData(collator, key, keyLength, status);
533         newEntry = new CollDataCacheEntry(newData);
534 
535         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
536             status = U_MEMORY_ALLOCATION_ERROR;
537             return NULL;
538         }
539 
540         umtx_lock(&lock);
541         entry = (CollDataCacheEntry *) uhash_get(cache, key);
542 
543         if (entry == NULL) {
544             uhash_put(cache, newData->key, newEntry, &status);
545             umtx_unlock(&lock);
546 
547             if (U_FAILURE(status)) {
548                 delete newEntry;
549                 delete newData;
550 
551                 return NULL;
552             }
553 
554             return newData;
555         }
556     }
557 
558     result = entry->data;
559     entry->refCount += 1;
560     umtx_unlock(&lock);
561 
562     if (key != keyBuffer) {
563         deleteKey(key);
564     }
565 
566     if (newEntry != NULL) {
567         delete newEntry;
568         delete newData;
569     }
570 
571     return result;
572 }
573 
unref(CollData * collData)574 void CollDataCache::unref(CollData *collData)
575 {
576     CollDataCacheEntry *entry = NULL;
577 
578     umtx_lock(&lock);
579     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
580 
581     if (entry != NULL) {
582         entry->refCount -= 1;
583     }
584     umtx_unlock(&lock);
585 }
586 
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)587 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
588 {
589     UErrorCode status = U_ZERO_ERROR;
590     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
591 
592     if (len >= *keyBufferLength) {
593         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
594         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
595         status = U_ZERO_ERROR;
596 
597         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
598     }
599 
600     keyBuffer[len] = '\0';
601 
602     return keyBuffer;
603 }
604 
flush()605 void CollDataCache::flush()
606 {
607     const UHashElement *element;
608     int32_t pos = -1;
609 
610     umtx_lock(&lock);
611     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
612         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
613 
614         if (entry->refCount <= 0) {
615             uhash_removeElement(cache, element);
616         }
617     }
618     umtx_unlock(&lock);
619 }
620 
deleteKey(char * key)621 void CollDataCache::deleteKey(char *key)
622 {
623     DELETE_ARRAY(key);
624 }
625 
626 U_CDECL_BEGIN
coll_data_cleanup(void)627 static UBool coll_data_cleanup(void) {
628     CollData::freeCollDataCache();
629   return TRUE;
630 }
631 U_CDECL_END
632 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)633 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
634 
635 CollData::CollData()
636 {
637     // nothing
638 }
639 
640 #define CLONE_COLLATOR
641 
642 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)643 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
644     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
645 {
646     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
647     // i.e. other, control, private use, format, surrogate
648     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
649     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
650     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
651 
652     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
653     // i.e. all the characers we handle implicitly
654     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
655     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
656     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
657 
658     if (U_FAILURE(status)) {
659         return;
660     }
661 
662     USet *expansions   = uset_openEmpty();
663     USet *contractions = uset_openEmpty();
664     int32_t itemCount;
665 
666 #ifdef CACHE_CELISTS
667     charsToCEList = new StringToCEsMap(status);
668 
669     if (U_FAILURE(status)) {
670         goto bail;
671     }
672 #else
673     charsToCEList = NULL;
674 #endif
675 
676     ceToCharsStartingWith = new CEToStringsMap(status);
677 
678     if (U_FAILURE(status)) {
679         goto bail;
680     }
681 
682     if (cacheKeyLength > KEY_BUFFER_SIZE) {
683         key = NEW_ARRAY(char, cacheKeyLength);
684 
685         if (key == NULL) {
686             status = U_MEMORY_ALLOCATION_ERROR;
687             goto bail;
688         }
689     } else {
690         key = keyBuffer;
691     }
692 
693     ARRAY_COPY(key, cacheKey, cacheKeyLength);
694 
695 #ifdef CLONE_COLLATOR
696     coll = ucol_safeClone(collator, NULL, NULL, &status);
697 
698     if (U_FAILURE(status)) {
699         goto bail;
700     }
701 #else
702     coll = collator;
703 #endif
704 
705     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
706 
707     uset_addAll(charsToTest, contractions);
708     uset_addAll(charsToTest, expansions);
709     uset_removeAll(charsToTest, charsToRemove);
710 
711     itemCount = uset_getItemCount(charsToTest);
712     for(int32_t item = 0; item < itemCount; item += 1) {
713         UChar32 start = 0, end = 0;
714         UChar buffer[16];
715         int32_t len = uset_getItem(charsToTest, item, &start, &end,
716                                    buffer, 16, &status);
717 
718         if (len == 0) {
719             for (UChar32 ch = start; ch <= end; ch += 1) {
720                 UnicodeString *st = new UnicodeString(ch);
721 
722                 if (st == NULL) {
723                     status = U_MEMORY_ALLOCATION_ERROR;
724                     break;
725                 }
726 
727                 CEList *ceList = new CEList(coll, *st, status);
728 
729                 ceToCharsStartingWith->put(ceList->get(0), st, status);
730 
731 #ifdef CACHE_CELISTS
732                 charsToCEList->put(st, ceList, status);
733 #else
734                 delete ceList;
735                 delete st;
736 #endif
737             }
738         } else if (len > 0) {
739             UnicodeString *st = new UnicodeString(buffer, len);
740 
741             if (st == NULL) {
742                 status = U_MEMORY_ALLOCATION_ERROR;
743                 break;
744             }
745 
746             CEList *ceList = new CEList(coll, *st, status);
747 
748             ceToCharsStartingWith->put(ceList->get(0), st, status);
749 
750 #ifdef CACHE_CELISTS
751             charsToCEList->put(st, ceList, status);
752 #else
753             delete ceList;
754             delete st;
755 #endif
756         } else {
757             // shouldn't happen...
758         }
759 
760         if (U_FAILURE(status)) {
761              break;
762         }
763     }
764 
765 bail:
766     uset_close(contractions);
767     uset_close(expansions);
768     uset_close(charsToRemove);
769     uset_close(charsToTest);
770 
771     if (U_FAILURE(status)) {
772         return;
773     }
774 
775      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
776                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
777      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
778      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
779      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
780      CEList hanList(coll, hanString, status);
781      CEList jamoList(coll, jamoString, status);
782      int32_t j = 0;
783 
784      if (U_FAILURE(status)) {
785          return;
786      }
787 
788      for (int32_t c = 0; c < jamoList.size(); c += 1) {
789          uint32_t jce = jamoList[c];
790 
791          if (! isContinuation(jce)) {
792              jamoLimits[j++] = jce;
793          }
794      }
795 
796      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
797 
798      minHan = 0xFFFFFFFF;
799      maxHan = 0;
800 
801      for(int32_t h = 0; h < hanList.size(); h += 2) {
802          uint32_t han = (uint32_t) hanList[h];
803 
804          if (han < minHan) {
805              minHan = han;
806          }
807 
808          if (han > maxHan) {
809              maxHan = han;
810          }
811      }
812 
813      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
814 }
815 
~CollData()816 CollData::~CollData()
817 {
818 #ifdef CLONE_COLLATOR
819    ucol_close(coll);
820 #endif
821 
822    if (key != keyBuffer) {
823        DELETE_ARRAY(key);
824    }
825 
826    delete ceToCharsStartingWith;
827 
828 #ifdef CACHE_CELISTS
829    delete charsToCEList;
830 #endif
831 }
832 
getCollator() const833 UCollator *CollData::getCollator() const
834 {
835     return coll;
836 }
837 
getStringList(int32_t ce) const838 const StringList *CollData::getStringList(int32_t ce) const
839 {
840     return ceToCharsStartingWith->getStringList(ce);
841 }
842 
getCEList(const UnicodeString * string) const843 const CEList *CollData::getCEList(const UnicodeString *string) const
844 {
845 #ifdef CACHE_CELISTS
846     return charsToCEList->get(string);
847 #else
848     UErrorCode status = U_ZERO_ERROR;
849     const CEList *list = new CEList(coll, *string, status);
850 
851     if (U_FAILURE(status)) {
852         delete list;
853         list = NULL;
854     }
855 
856     return list;
857 #endif
858 }
859 
freeCEList(const CEList * list)860 void CollData::freeCEList(const CEList *list)
861 {
862 #ifndef CACHE_CELISTS
863     delete list;
864 #endif
865 }
866 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const867 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
868 {
869     // find out shortest string for the longest sequence of ces.
870     // this can probably be folded with the minLengthCache...
871 
872     if (history[offset] >= 0) {
873         return history[offset];
874     }
875 
876     uint32_t ce = ceList->get(offset);
877     int32_t maxOffset = ceList->size();
878     int32_t shortestLength = INT32_MAX;
879     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
880 
881     if (strings != NULL) {
882         int32_t stringCount = strings->size();
883 
884         for (int32_t s = 0; s < stringCount; s += 1) {
885             const UnicodeString *string = strings->get(s);
886 #ifdef CACHE_CELISTS
887             const CEList *ceList2 = charsToCEList->get(string);
888 #else
889             UErrorCode status = U_ZERO_ERROR;
890             const CEList *ceList2 = new CEList(coll, *string, status);
891 
892             if (U_FAILURE(status)) {
893                 delete ceList2;
894                 ceList2 = NULL;
895             }
896 #endif
897 
898             if (ceList->matchesAt(offset, ceList2)) {
899                 int32_t clength = ceList2->size();
900                 int32_t slength = string->length();
901                 int32_t roffset = offset + clength;
902                 int32_t rlength = 0;
903 
904                 if (roffset < maxOffset) {
905                     rlength = minLengthInChars(ceList, roffset, history);
906 
907                     if (rlength <= 0) {
908                     // delete before continue to avoid memory leak.
909 #ifndef CACHE_CELISTS
910                         delete ceList2;
911 #endif
912                         // ignore any dead ends
913                         continue;
914                     }
915                 }
916 
917                 if (shortestLength > slength + rlength) {
918                     shortestLength = slength + rlength;
919                 }
920             }
921 
922 #ifndef CACHE_CELISTS
923             delete ceList2;
924 #endif
925         }
926     }
927 
928     if (shortestLength == INT32_MAX) {
929         // No matching strings at this offset. See if
930         // the CE is in a range we can handle manually.
931         if (ce >= minHan && ce < maxHan) {
932             // all han have implicit orders which
933             // generate two CEs.
934             int32_t roffset = offset + 2;
935             int32_t rlength = 0;
936 
937           //history[roffset++] = -1;
938           //history[roffset++] = 1;
939 
940             if (roffset < maxOffset) {
941                 rlength = minLengthInChars(ceList, roffset, history);
942             }
943 
944             if (rlength < 0) {
945                 return -1;
946             }
947 
948             shortestLength = 1 + rlength;
949             goto have_shortest;
950         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
951             int32_t roffset = offset;
952             int32_t rlength = 0;
953 
954             // **** this loop may not handle archaic Hangul correctly ****
955             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
956                 uint32_t jce = ceList->get(roffset);
957 
958                 // Some Jamo have 24-bit primary order; skip the
959                 // 2nd CE. This should always be OK because if
960                 // we're still in the loop all we've seen are
961                 // a series of Jamo in LVT order.
962                 if (isContinuation(jce)) {
963                     continue;
964                 }
965 
966                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
967                     break;
968                 }
969             }
970 
971             if (roffset == offset) {
972                 // we started with a non-L Jamo...
973                 // just say it comes from a single character
974                 roffset += 1;
975 
976                 // See if the single Jamo has a 24-bit order.
977                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
978                     roffset += 1;
979                 }
980             }
981 
982             if (roffset < maxOffset) {
983                 rlength = minLengthInChars(ceList, roffset, history);
984             }
985 
986             if (rlength < 0) {
987                 return -1;
988             }
989 
990             shortestLength = 1 + rlength;
991             goto have_shortest;
992         }
993 
994         // Can't handle it manually either. Just move on.
995         return -1;
996     }
997 
998 have_shortest:
999     history[offset] = shortestLength;
1000 
1001     return shortestLength;
1002 }
1003 
minLengthInChars(const CEList * ceList,int32_t offset) const1004 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1005 {
1006     int32_t clength = ceList->size();
1007     int32_t *history = NEW_ARRAY(int32_t, clength);
1008 
1009     for (int32_t i = 0; i < clength; i += 1) {
1010         history[i] = -1;
1011     }
1012 
1013     int32_t minLength = minLengthInChars(ceList, offset, history);
1014 
1015     DELETE_ARRAY(history);
1016 
1017     return minLength;
1018 }
1019 
open(UCollator * collator,UErrorCode & status)1020 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1021 {
1022     if (U_FAILURE(status)) {
1023         return NULL;
1024     }
1025 
1026     CollDataCache *cache = getCollDataCache();
1027 
1028     return cache->get(collator, status);
1029 }
1030 
close(CollData * collData)1031 void CollData::close(CollData *collData)
1032 {
1033     CollDataCache *cache = getCollDataCache();
1034 
1035     cache->unref(collData);
1036 }
1037 
1038 CollDataCache *CollData::collDataCache = NULL;
1039 
getCollDataCache()1040 CollDataCache *CollData::getCollDataCache()
1041 {
1042     UErrorCode status = U_ZERO_ERROR;
1043     CollDataCache *cache = NULL;
1044 
1045     UMTX_CHECK(NULL, collDataCache, cache);
1046 
1047     if (cache == NULL) {
1048         cache = new CollDataCache(status);
1049 
1050         if (U_FAILURE(status)) {
1051             delete cache;
1052             return NULL;
1053         }
1054 
1055         umtx_lock(NULL);
1056         if (collDataCache == NULL) {
1057             collDataCache = cache;
1058 
1059             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1060         }
1061         umtx_unlock(NULL);
1062 
1063         if (collDataCache != cache) {
1064             delete cache;
1065         }
1066     }
1067 
1068     return collDataCache;
1069 }
1070 
freeCollDataCache()1071 void CollData::freeCollDataCache()
1072 {
1073     CollDataCache *cache = NULL;
1074 
1075     UMTX_CHECK(NULL, collDataCache, cache);
1076 
1077     if (cache != NULL) {
1078         umtx_lock(NULL);
1079         if (collDataCache != NULL) {
1080             collDataCache = NULL;
1081         } else {
1082             cache = NULL;
1083         }
1084         umtx_unlock(NULL);
1085 
1086         delete cache;
1087     }
1088 }
1089 
flushCollDataCache()1090 void CollData::flushCollDataCache()
1091 {
1092     CollDataCache *cache = NULL;
1093 
1094     UMTX_CHECK(NULL, collDataCache, cache);
1095 
1096     // **** this will fail if the another ****
1097     // **** thread deletes the cache here ****
1098     if (cache != NULL) {
1099         cache->flush();
1100     }
1101 }
1102 
1103 U_NAMESPACE_END
1104 
1105 #endif // #if !UCONFIG_NO_COLLATION
1106