• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2011, International Business Machines                 *
4  *   Corporation and others.  All Rights Reserved.                            *
5  ******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION
11 
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15 
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21 
22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
23 
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32 
33 #include "unicode/colldata.h"
34 
35 U_NAMESPACE_BEGIN
36 
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
41 
42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
43 
44 #ifdef INSTRUMENT_CELIST
45 int32_t CEList::_active = 0;
46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
47 #endif
48 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
50     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
51 {
52     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
53     UCollationStrength strength = ucol_getStrength(coll);
54     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
55     uint32_t variableTop = ucol_getVariableTop(coll, &status);
56     uint32_t strengthMask = 0;
57     int32_t order;
58 
59     if (U_FAILURE(status)) {
60         return;
61     }
62 
63     // **** only set flag if string has Han(gul) ****
64     ucol_forceHanImplicit(elems, &status);
65 
66     switch (strength)
67     {
68     default:
69         strengthMask |= UCOL_TERTIARYORDERMASK;
70         /* fall through */
71 
72     case UCOL_SECONDARY:
73         strengthMask |= UCOL_SECONDARYORDERMASK;
74         /* fall through */
75 
76     case UCOL_PRIMARY:
77         strengthMask |= UCOL_PRIMARYORDERMASK;
78     }
79 
80 #ifdef INSTRUMENT_CELIST
81     _active += 1;
82     _histogram[0] += 1;
83 #endif
84 
85     ces = ceBuffer;
86 
87     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
88         UBool cont = isContinuation(order);
89 
90         order &= strengthMask;
91 
92         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
93             if (strength >= UCOL_QUATERNARY) {
94                 order &= UCOL_PRIMARYORDERMASK;
95             } else {
96                 order = UCOL_IGNORABLE;
97             }
98         }
99 
100         if (order == UCOL_IGNORABLE) {
101             continue;
102         }
103 
104         if (cont) {
105             order |= UCOL_CONTINUATION_MARKER;
106         }
107 
108         add(order, status);
109     }
110 
111     ucol_closeElements(elems);
112 }
113 
~CEList()114 CEList::~CEList()
115 {
116 #ifdef INSTRUMENT_CELIST
117     _active -= 1;
118 #endif
119 
120     if (ces != ceBuffer) {
121         DELETE_ARRAY(ces);
122     }
123 }
124 
add(uint32_t ce,UErrorCode & status)125 void CEList::add(uint32_t ce, UErrorCode &status)
126 {
127     if (U_FAILURE(status)) {
128         return;
129     }
130 
131     if (listSize >= listMax) {
132         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
133 
134 #ifdef INSTRUMENT_CELIST
135         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
136 #endif
137 
138         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
139 
140         if (newCEs == NULL) {
141             status = U_MEMORY_ALLOCATION_ERROR;
142             return;
143         }
144 
145         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
146 
147         if (ces != ceBuffer) {
148             DELETE_ARRAY(ces);
149         }
150 
151         ces = newCEs;
152         listMax = newMax;
153     }
154 
155     ces[listSize++] = ce;
156 }
157 
get(int32_t index) const158 uint32_t CEList::get(int32_t index) const
159 {
160     if (index >= 0 && index < listSize) {
161         return ces[index];
162     }
163 
164     return UCOL_NULLORDER;
165 }
166 
operator [](int32_t index) const167 uint32_t &CEList::operator[](int32_t index) const
168 {
169     return ces[index];
170 }
171 
matchesAt(int32_t offset,const CEList * other) const172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
173 {
174     if (other == NULL || listSize - offset < other->size()) {
175         return FALSE;
176     }
177 
178     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
179         if (ces[i] != (*other)[j]) {
180             return FALSE;
181         }
182     }
183 
184     return TRUE;
185 }
186 
size() const187 int32_t CEList::size() const
188 {
189     return listSize;
190 }
191 
192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
193 
194 #ifdef INSTRUMENT_STRING_LIST
195 int32_t StringList::_lists = 0;
196 int32_t StringList::_strings = 0;
197 int32_t StringList::_histogram[101] = {0};
198 #endif
199 
StringList(UErrorCode & status)200 StringList::StringList(UErrorCode &status)
201     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
202 {
203     if (U_FAILURE(status)) {
204         return;
205     }
206 
207     strings = new UnicodeString [listMax];
208 
209     if (strings == NULL) {
210         status = U_MEMORY_ALLOCATION_ERROR;
211         return;
212     }
213 
214 #ifdef INSTRUMENT_STRING_LIST
215     _lists += 1;
216     _histogram[0] += 1;
217 #endif
218 }
219 
~StringList()220 StringList::~StringList()
221 {
222     delete[] strings;
223 }
224 
add(const UnicodeString * string,UErrorCode & status)225 void StringList::add(const UnicodeString *string, UErrorCode &status)
226 {
227     if (U_FAILURE(status)) {
228         return;
229     }
230 
231 #ifdef INSTRUMENT_STRING_LIST
232     _strings += 1;
233 #endif
234 
235     if (listSize >= listMax) {
236         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
237         UnicodeString *newStrings = new UnicodeString[newMax];
238         if (newStrings == NULL) {
239             status = U_MEMORY_ALLOCATION_ERROR;
240             return;
241         }
242         for (int32_t i=0; i<listSize; ++i) {
243             newStrings[i] = strings[i];
244         }
245 
246 #ifdef INSTRUMENT_STRING_LIST
247         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
248 
249         if (_h > 100) {
250             _h = 100;
251         }
252 
253         _histogram[_h] += 1;
254 #endif
255 
256         delete[] strings;
257         strings = newStrings;
258         listMax = newMax;
259     }
260 
261     // The ctor initialized all the strings in
262     // the array to empty strings, so this
263     // is the same as copying the source string.
264     strings[listSize++].append(*string);
265 }
266 
add(const UChar * chars,int32_t count,UErrorCode & status)267 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
268 {
269     const UnicodeString string(chars, count);
270 
271     add(&string, status);
272 }
273 
get(int32_t index) const274 const UnicodeString *StringList::get(int32_t index) const
275 {
276     if (index >= 0 && index < listSize) {
277         return &strings[index];
278     }
279 
280     return NULL;
281 }
282 
size() const283 int32_t StringList::size() const
284 {
285     return listSize;
286 }
287 
288 
289 U_CFUNC void deleteStringList(void *obj);
290 
291 class CEToStringsMap : public UMemory
292 {
293 public:
294 
295     CEToStringsMap(UErrorCode &status);
296     ~CEToStringsMap();
297 
298     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
299     StringList *getStringList(uint32_t ce) const;
300 
301 private:
302 
303     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
304     UHashtable *map;
305 };
306 
CEToStringsMap(UErrorCode & status)307 CEToStringsMap::CEToStringsMap(UErrorCode &status)
308     : map(NULL)
309 {
310     if (U_FAILURE(status)) {
311         return;
312     }
313 
314     map = uhash_open(uhash_hashLong, uhash_compareLong,
315                      uhash_compareCaselessUnicodeString,
316                      &status);
317 
318     if (U_FAILURE(status)) {
319         return;
320     }
321 
322     uhash_setValueDeleter(map, deleteStringList);
323 }
324 
~CEToStringsMap()325 CEToStringsMap::~CEToStringsMap()
326 {
327     uhash_close(map);
328 }
329 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)330 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
331 {
332     StringList *strings = getStringList(ce);
333 
334     if (strings == NULL) {
335         strings = new StringList(status);
336 
337         if (strings == NULL || U_FAILURE(status)) {
338             status = U_MEMORY_ALLOCATION_ERROR;
339             return;
340         }
341 
342         putStringList(ce, strings, status);
343     }
344 
345     strings->add(string, status);
346 }
347 
getStringList(uint32_t ce) const348 StringList *CEToStringsMap::getStringList(uint32_t ce) const
349 {
350     return (StringList *) uhash_iget(map, ce);
351 }
352 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)353 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
354 {
355     uhash_iput(map, ce, (void *) stringList, &status);
356 }
357 
deleteStringList(void * obj)358 U_CFUNC void deleteStringList(void *obj)
359 {
360     StringList *strings = (StringList *) obj;
361 
362     delete strings;
363 }
364 
365 U_CFUNC void deleteCEList(void *obj);
366 U_CFUNC void deleteUnicodeStringKey(void *obj);
367 
368 class StringToCEsMap : public UMemory
369 {
370 public:
371     StringToCEsMap(UErrorCode &status);
372     ~StringToCEsMap();
373 
374     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
375     const CEList *get(const UnicodeString *string);
376     void free(const CEList *list);
377 
378 private:
379 
380 
381     UHashtable *map;
382 };
383 
StringToCEsMap(UErrorCode & status)384 StringToCEsMap::StringToCEsMap(UErrorCode &status)
385     : map(NULL)
386 {
387     if (U_FAILURE(status)) {
388         return;
389     }
390 
391     map = uhash_open(uhash_hashUnicodeString,
392                      uhash_compareUnicodeString,
393                      uhash_compareLong,
394                      &status);
395 
396     if (U_FAILURE(status)) {
397         return;
398     }
399 
400     uhash_setValueDeleter(map, deleteCEList);
401     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
402 }
403 
~StringToCEsMap()404 StringToCEsMap::~StringToCEsMap()
405 {
406     uhash_close(map);
407 }
408 
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)409 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
410 {
411     uhash_put(map, (void *) string, (void *) ces, &status);
412 }
413 
get(const UnicodeString * string)414 const CEList *StringToCEsMap::get(const UnicodeString *string)
415 {
416     return (const CEList *) uhash_get(map, string);
417 }
418 
deleteCEList(void * obj)419 U_CFUNC void deleteCEList(void *obj)
420 {
421     CEList *list = (CEList *) obj;
422 
423     delete list;
424 }
425 
deleteUnicodeStringKey(void * obj)426 U_CFUNC void deleteUnicodeStringKey(void *obj)
427 {
428     UnicodeString *key = (UnicodeString *) obj;
429 
430     delete key;
431 }
432 
433 class CollDataCacheEntry : public UMemory
434 {
435 public:
436     CollDataCacheEntry(CollData *theData);
437     ~CollDataCacheEntry();
438 
439     CollData *data;
440     int32_t   refCount;
441 };
442 
CollDataCacheEntry(CollData * theData)443 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
444     : data(theData), refCount(1)
445 {
446     // nothing else to do
447 }
448 
~CollDataCacheEntry()449 CollDataCacheEntry::~CollDataCacheEntry()
450 {
451     // check refCount?
452     delete data;
453 }
454 
455 class CollDataCache : public UMemory
456 {
457 public:
458     CollDataCache(UErrorCode &status);
459     ~CollDataCache();
460 
461     CollData *get(UCollator *collator, UErrorCode &status);
462     void unref(CollData *collData);
463 
464     void flush();
465 
466 private:
467     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
468     static void deleteKey(char *key);
469 
470     UHashtable *cache;
471 };
472 static UMTX lock;
473 
deleteChars(void *)474 U_CFUNC void deleteChars(void * /*obj*/)
475 {
476     // char *chars = (char *) obj;
477     // All the key strings are owned by the
478     // CollData objects and don't need to
479     // be freed here.
480   //DELETE_ARRAY(chars);
481 }
482 
deleteCollDataCacheEntry(void * obj)483 U_CFUNC void deleteCollDataCacheEntry(void *obj)
484 {
485     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
486 
487     delete entry;
488 }
489 
CollDataCache(UErrorCode & status)490 CollDataCache::CollDataCache(UErrorCode &status)
491     : cache(NULL)
492 {
493     if (U_FAILURE(status)) {
494         return;
495     }
496 
497     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
498 
499     if (U_FAILURE(status)) {
500         return;
501     }
502 
503     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
504     uhash_setKeyDeleter(cache, deleteChars);
505 }
506 
~CollDataCache()507 CollDataCache::~CollDataCache()
508 {
509     umtx_lock(&lock);
510     uhash_close(cache);
511     cache = NULL;
512     umtx_unlock(&lock);
513 }
514 
get(UCollator * collator,UErrorCode & status)515 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
516 {
517     char keyBuffer[KEY_BUFFER_SIZE];
518     int32_t keyLength = KEY_BUFFER_SIZE;
519     char *key = getKey(collator, keyBuffer, &keyLength);
520     CollData *result = NULL, *newData = NULL;
521     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
522 
523     umtx_lock(&lock);
524     entry = (CollDataCacheEntry *) uhash_get(cache, key);
525 
526     if (entry == NULL) {
527         umtx_unlock(&lock);
528 
529         newData = new CollData(collator, key, keyLength, status);
530         newEntry = new CollDataCacheEntry(newData);
531 
532         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
533             status = U_MEMORY_ALLOCATION_ERROR;
534             return NULL;
535         }
536 
537         umtx_lock(&lock);
538         entry = (CollDataCacheEntry *) uhash_get(cache, key);
539 
540         if (entry == NULL) {
541             uhash_put(cache, newData->key, newEntry, &status);
542             umtx_unlock(&lock);
543 
544             if (U_FAILURE(status)) {
545                 delete newEntry;
546                 delete newData;
547 
548                 return NULL;
549             }
550 
551             return newData;
552         }
553     }
554 
555     result = entry->data;
556     entry->refCount += 1;
557     umtx_unlock(&lock);
558 
559     if (key != keyBuffer) {
560         deleteKey(key);
561     }
562 
563     if (newEntry != NULL) {
564         delete newEntry;
565         delete newData;
566     }
567 
568     return result;
569 }
570 
unref(CollData * collData)571 void CollDataCache::unref(CollData *collData)
572 {
573     CollDataCacheEntry *entry = NULL;
574 
575     umtx_lock(&lock);
576     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
577 
578     if (entry != NULL) {
579         entry->refCount -= 1;
580     }
581     umtx_unlock(&lock);
582 }
583 
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)584 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
585 {
586     UErrorCode status = U_ZERO_ERROR;
587     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
588 
589     if (len >= *keyBufferLength) {
590         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
591         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
592         status = U_ZERO_ERROR;
593 
594         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
595     }
596 
597     keyBuffer[len] = '\0';
598 
599     return keyBuffer;
600 }
601 
flush()602 void CollDataCache::flush()
603 {
604     const UHashElement *element;
605     int32_t pos = -1;
606 
607     umtx_lock(&lock);
608     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
609         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
610 
611         if (entry->refCount <= 0) {
612             uhash_removeElement(cache, element);
613         }
614     }
615     umtx_unlock(&lock);
616 }
617 
deleteKey(char * key)618 void CollDataCache::deleteKey(char *key)
619 {
620     DELETE_ARRAY(key);
621 }
622 
623 U_CDECL_BEGIN
coll_data_cleanup(void)624 static UBool coll_data_cleanup(void) {
625     CollData::freeCollDataCache();
626   return TRUE;
627 }
628 U_CDECL_END
629 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)630 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
631 
632 CollData::CollData()
633 {
634     // nothing
635 }
636 
637 #define CLONE_COLLATOR
638 
639 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)640 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
641     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
642 {
643     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
644     // i.e. other, control, private use, format, surrogate
645     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
646     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
647     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
648 
649     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
650     // i.e. all the characers we handle implicitly
651     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
652     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
653     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
654 
655     if (U_FAILURE(status)) {
656         return;
657     }
658 
659     USet *expansions   = uset_openEmpty();
660     USet *contractions = uset_openEmpty();
661     int32_t itemCount;
662 
663 #ifdef CACHE_CELISTS
664     charsToCEList = new StringToCEsMap(status);
665 
666     if (U_FAILURE(status)) {
667         goto bail;
668     }
669 #else
670     charsToCEList = NULL;
671 #endif
672 
673     ceToCharsStartingWith = new CEToStringsMap(status);
674 
675     if (U_FAILURE(status)) {
676         goto bail;
677     }
678 
679     if (cacheKeyLength > KEY_BUFFER_SIZE) {
680         key = NEW_ARRAY(char, cacheKeyLength);
681 
682         if (key == NULL) {
683             status = U_MEMORY_ALLOCATION_ERROR;
684             goto bail;
685         }
686     } else {
687         key = keyBuffer;
688     }
689 
690     ARRAY_COPY(key, cacheKey, cacheKeyLength);
691 
692 #ifdef CLONE_COLLATOR
693     coll = ucol_safeClone(collator, NULL, NULL, &status);
694 
695     if (U_FAILURE(status)) {
696         goto bail;
697     }
698 #else
699     coll = collator;
700 #endif
701 
702     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
703 
704     uset_addAll(charsToTest, contractions);
705     uset_addAll(charsToTest, expansions);
706     uset_removeAll(charsToTest, charsToRemove);
707 
708     itemCount = uset_getItemCount(charsToTest);
709     for(int32_t item = 0; item < itemCount; item += 1) {
710         UChar32 start = 0, end = 0;
711         UChar buffer[16];
712         int32_t len = uset_getItem(charsToTest, item, &start, &end,
713                                    buffer, 16, &status);
714 
715         if (len == 0) {
716             for (UChar32 ch = start; ch <= end; ch += 1) {
717                 UnicodeString *st = new UnicodeString(ch);
718 
719                 if (st == NULL) {
720                     status = U_MEMORY_ALLOCATION_ERROR;
721                     break;
722                 }
723 
724                 CEList *ceList = new CEList(coll, *st, status);
725 
726                 ceToCharsStartingWith->put(ceList->get(0), st, status);
727 
728 #ifdef CACHE_CELISTS
729                 charsToCEList->put(st, ceList, status);
730 #else
731                 delete ceList;
732                 delete st;
733 #endif
734             }
735         } else if (len > 0) {
736             UnicodeString *st = new UnicodeString(buffer, len);
737 
738             if (st == NULL) {
739                 status = U_MEMORY_ALLOCATION_ERROR;
740                 break;
741             }
742 
743             CEList *ceList = new CEList(coll, *st, status);
744 
745             ceToCharsStartingWith->put(ceList->get(0), st, status);
746 
747 #ifdef CACHE_CELISTS
748             charsToCEList->put(st, ceList, status);
749 #else
750             delete ceList;
751             delete st;
752 #endif
753         } else {
754             // shouldn't happen...
755         }
756 
757         if (U_FAILURE(status)) {
758              break;
759         }
760     }
761 
762 bail:
763     uset_close(contractions);
764     uset_close(expansions);
765     uset_close(charsToRemove);
766     uset_close(charsToTest);
767 
768     if (U_FAILURE(status)) {
769         return;
770     }
771 
772      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
773                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
774      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
775      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
776      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
777      CEList hanList(coll, hanString, status);
778      CEList jamoList(coll, jamoString, status);
779      int32_t j = 0;
780 
781      if (U_FAILURE(status)) {
782          return;
783      }
784 
785      for (int32_t c = 0; c < jamoList.size(); c += 1) {
786          uint32_t jce = jamoList[c];
787 
788          if (! isContinuation(jce)) {
789              jamoLimits[j++] = jce;
790          }
791      }
792 
793      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
794 
795      minHan = 0xFFFFFFFF;
796      maxHan = 0;
797 
798      for(int32_t h = 0; h < hanList.size(); h += 2) {
799          uint32_t han = (uint32_t) hanList[h];
800 
801          if (han < minHan) {
802              minHan = han;
803          }
804 
805          if (han > maxHan) {
806              maxHan = han;
807          }
808      }
809 
810      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
811 }
812 
~CollData()813 CollData::~CollData()
814 {
815 #ifdef CLONE_COLLATOR
816    ucol_close(coll);
817 #endif
818 
819    if (key != keyBuffer) {
820        DELETE_ARRAY(key);
821    }
822 
823    delete ceToCharsStartingWith;
824 
825 #ifdef CACHE_CELISTS
826    delete charsToCEList;
827 #endif
828 }
829 
getCollator() const830 UCollator *CollData::getCollator() const
831 {
832     return coll;
833 }
834 
getStringList(int32_t ce) const835 const StringList *CollData::getStringList(int32_t ce) const
836 {
837     return ceToCharsStartingWith->getStringList(ce);
838 }
839 
getCEList(const UnicodeString * string) const840 const CEList *CollData::getCEList(const UnicodeString *string) const
841 {
842 #ifdef CACHE_CELISTS
843     return charsToCEList->get(string);
844 #else
845     UErrorCode status = U_ZERO_ERROR;
846     const CEList *list = new CEList(coll, *string, status);
847 
848     if (U_FAILURE(status)) {
849         delete list;
850         list = NULL;
851     }
852 
853     return list;
854 #endif
855 }
856 
freeCEList(const CEList * list)857 void CollData::freeCEList(const CEList *list)
858 {
859 #ifndef CACHE_CELISTS
860     delete list;
861 #endif
862 }
863 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const864 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
865 {
866     // find out shortest string for the longest sequence of ces.
867     // this can probably be folded with the minLengthCache...
868 
869     if (history[offset] >= 0) {
870         return history[offset];
871     }
872 
873     uint32_t ce = ceList->get(offset);
874     int32_t maxOffset = ceList->size();
875     int32_t shortestLength = INT32_MAX;
876     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
877 
878     if (strings != NULL) {
879         int32_t stringCount = strings->size();
880 
881         for (int32_t s = 0; s < stringCount; s += 1) {
882             const UnicodeString *string = strings->get(s);
883 #ifdef CACHE_CELISTS
884             const CEList *ceList2 = charsToCEList->get(string);
885 #else
886             UErrorCode status = U_ZERO_ERROR;
887             const CEList *ceList2 = new CEList(coll, *string, status);
888 
889             if (U_FAILURE(status)) {
890                 delete ceList2;
891                 ceList2 = NULL;
892             }
893 #endif
894 
895             if (ceList->matchesAt(offset, ceList2)) {
896                 int32_t clength = ceList2->size();
897                 int32_t slength = string->length();
898                 int32_t roffset = offset + clength;
899                 int32_t rlength = 0;
900 
901                 if (roffset < maxOffset) {
902                     rlength = minLengthInChars(ceList, roffset, history);
903 
904                     if (rlength <= 0) {
905                     // delete before continue to avoid memory leak.
906 #ifndef CACHE_CELISTS
907                         delete ceList2;
908 #endif
909                         // ignore any dead ends
910                         continue;
911                     }
912                 }
913 
914                 if (shortestLength > slength + rlength) {
915                     shortestLength = slength + rlength;
916                 }
917             }
918 
919 #ifndef CACHE_CELISTS
920             delete ceList2;
921 #endif
922         }
923     }
924 
925     if (shortestLength == INT32_MAX) {
926         // No matching strings at this offset. See if
927         // the CE is in a range we can handle manually.
928         if (ce >= minHan && ce < maxHan) {
929             // all han have implicit orders which
930             // generate two CEs.
931             int32_t roffset = offset + 2;
932             int32_t rlength = 0;
933 
934           //history[roffset++] = -1;
935           //history[roffset++] = 1;
936 
937             if (roffset < maxOffset) {
938                 rlength = minLengthInChars(ceList, roffset, history);
939             }
940 
941             if (rlength < 0) {
942                 return -1;
943             }
944 
945             shortestLength = 1 + rlength;
946             goto have_shortest;
947         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
948             int32_t roffset = offset;
949             int32_t rlength = 0;
950 
951             // **** this loop may not handle archaic Hangul correctly ****
952             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
953                 uint32_t jce = ceList->get(roffset);
954 
955                 // Some Jamo have 24-bit primary order; skip the
956                 // 2nd CE. This should always be OK because if
957                 // we're still in the loop all we've seen are
958                 // a series of Jamo in LVT order.
959                 if (isContinuation(jce)) {
960                     continue;
961                 }
962 
963                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
964                     break;
965                 }
966             }
967 
968             if (roffset == offset) {
969                 // we started with a non-L Jamo...
970                 // just say it comes from a single character
971                 roffset += 1;
972 
973                 // See if the single Jamo has a 24-bit order.
974                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
975                     roffset += 1;
976                 }
977             }
978 
979             if (roffset < maxOffset) {
980                 rlength = minLengthInChars(ceList, roffset, history);
981             }
982 
983             if (rlength < 0) {
984                 return -1;
985             }
986 
987             shortestLength = 1 + rlength;
988             goto have_shortest;
989         }
990 
991         // Can't handle it manually either. Just move on.
992         return -1;
993     }
994 
995 have_shortest:
996     history[offset] = shortestLength;
997 
998     return shortestLength;
999 }
1000 
minLengthInChars(const CEList * ceList,int32_t offset) const1001 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1002 {
1003     int32_t clength = ceList->size();
1004     int32_t *history = NEW_ARRAY(int32_t, clength);
1005 
1006     for (int32_t i = 0; i < clength; i += 1) {
1007         history[i] = -1;
1008     }
1009 
1010     int32_t minLength = minLengthInChars(ceList, offset, history);
1011 
1012     DELETE_ARRAY(history);
1013 
1014     return minLength;
1015 }
1016 
open(UCollator * collator,UErrorCode & status)1017 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1018 {
1019     if (U_FAILURE(status)) {
1020         return NULL;
1021     }
1022 
1023     CollDataCache *cache = getCollDataCache();
1024 
1025     return cache->get(collator, status);
1026 }
1027 
close(CollData * collData)1028 void CollData::close(CollData *collData)
1029 {
1030     CollDataCache *cache = getCollDataCache();
1031 
1032     cache->unref(collData);
1033 }
1034 
1035 CollDataCache *CollData::collDataCache = NULL;
1036 
getCollDataCache()1037 CollDataCache *CollData::getCollDataCache()
1038 {
1039     UErrorCode status = U_ZERO_ERROR;
1040     CollDataCache *cache = NULL;
1041 
1042     UMTX_CHECK(NULL, collDataCache, cache);
1043 
1044     if (cache == NULL) {
1045         cache = new CollDataCache(status);
1046 
1047         if (U_FAILURE(status)) {
1048             delete cache;
1049             return NULL;
1050         }
1051 
1052         umtx_lock(NULL);
1053         if (collDataCache == NULL) {
1054             collDataCache = cache;
1055 
1056             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1057         }
1058         umtx_unlock(NULL);
1059 
1060         if (collDataCache != cache) {
1061             delete cache;
1062         }
1063     }
1064 
1065     return collDataCache;
1066 }
1067 
freeCollDataCache()1068 void CollData::freeCollDataCache()
1069 {
1070     CollDataCache *cache = NULL;
1071 
1072     UMTX_CHECK(NULL, collDataCache, cache);
1073 
1074     if (cache != NULL) {
1075         umtx_lock(NULL);
1076         if (collDataCache != NULL) {
1077             collDataCache = NULL;
1078         } else {
1079             cache = NULL;
1080         }
1081         umtx_unlock(NULL);
1082 
1083         delete cache;
1084     }
1085 }
1086 
flushCollDataCache()1087 void CollData::flushCollDataCache()
1088 {
1089     CollDataCache *cache = NULL;
1090 
1091     UMTX_CHECK(NULL, collDataCache, cache);
1092 
1093     // **** this will fail if the another ****
1094     // **** thread deletes the cache here ****
1095     if (cache != NULL) {
1096         cache->flush();
1097     }
1098 }
1099 
1100 U_NAMESPACE_END
1101 
1102 #endif // #if !UCONFIG_NO_COLLATION
1103