1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32 #include "uassert.h"
33
34 #include "unicode/colldata.h"
35
36 U_NAMESPACE_BEGIN
37
38 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
39 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
40 #define DELETE_ARRAY(array) uprv_free((void *) (array))
41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
42
43 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
44
45 #ifdef INSTRUMENT_CELIST
46 int32_t CEList::_active = 0;
47 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
48 #endif
49
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)50 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
51 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
52 {
53 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
54 UCollationStrength strength = ucol_getStrength(coll);
55 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
56 uint32_t variableTop = ucol_getVariableTop(coll, &status);
57 uint32_t strengthMask = 0;
58 int32_t order;
59
60 if (U_FAILURE(status)) {
61 return;
62 }
63
64 // **** only set flag if string has Han(gul) ****
65 ucol_forceHanImplicit(elems, &status);
66
67 switch (strength)
68 {
69 default:
70 strengthMask |= UCOL_TERTIARYORDERMASK;
71 /* fall through */
72
73 case UCOL_SECONDARY:
74 strengthMask |= UCOL_SECONDARYORDERMASK;
75 /* fall through */
76
77 case UCOL_PRIMARY:
78 strengthMask |= UCOL_PRIMARYORDERMASK;
79 }
80
81 #ifdef INSTRUMENT_CELIST
82 _active += 1;
83 _histogram[0] += 1;
84 #endif
85
86 ces = ceBuffer;
87
88 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
89 UBool cont = isContinuation(order);
90
91 order &= strengthMask;
92
93 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
94 if (strength >= UCOL_QUATERNARY) {
95 order &= UCOL_PRIMARYORDERMASK;
96 } else {
97 order = UCOL_IGNORABLE;
98 }
99 }
100
101 if (order == UCOL_IGNORABLE) {
102 continue;
103 }
104
105 if (cont) {
106 order |= UCOL_CONTINUATION_MARKER;
107 }
108
109 add(order, status);
110 }
111
112 ucol_closeElements(elems);
113 }
114
~CEList()115 CEList::~CEList()
116 {
117 #ifdef INSTRUMENT_CELIST
118 _active -= 1;
119 #endif
120
121 if (ces != ceBuffer) {
122 DELETE_ARRAY(ces);
123 }
124 }
125
add(uint32_t ce,UErrorCode & status)126 void CEList::add(uint32_t ce, UErrorCode &status)
127 {
128 if (U_FAILURE(status)) {
129 return;
130 }
131
132 if (listSize >= listMax) {
133 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
134
135 #ifdef INSTRUMENT_CELIST
136 _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
137 #endif
138
139 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
140
141 if (newCEs == NULL) {
142 status = U_MEMORY_ALLOCATION_ERROR;
143 return;
144 }
145
146 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
147
148 if (ces != ceBuffer) {
149 DELETE_ARRAY(ces);
150 }
151
152 ces = newCEs;
153 listMax = newMax;
154 }
155
156 ces[listSize++] = ce;
157 }
158
get(int32_t index) const159 uint32_t CEList::get(int32_t index) const
160 {
161 if (index >= 0 && index < listSize) {
162 return ces[index];
163 }
164
165 return (uint32_t)UCOL_NULLORDER;
166 }
167
operator [](int32_t index) const168 uint32_t &CEList::operator[](int32_t index) const
169 {
170 return ces[index];
171 }
172
matchesAt(int32_t offset,const CEList * other) const173 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
174 {
175 if (other == NULL || listSize - offset < other->size()) {
176 return FALSE;
177 }
178
179 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
180 if (ces[i] != (*other)[j]) {
181 return FALSE;
182 }
183 }
184
185 return TRUE;
186 }
187
size() const188 int32_t CEList::size() const
189 {
190 return listSize;
191 }
192
193 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
194
195 #ifdef INSTRUMENT_STRING_LIST
196 int32_t StringList::_lists = 0;
197 int32_t StringList::_strings = 0;
198 int32_t StringList::_histogram[101] = {0};
199 #endif
200
StringList(UErrorCode & status)201 StringList::StringList(UErrorCode &status)
202 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
203 {
204 if (U_FAILURE(status)) {
205 return;
206 }
207
208 strings = new UnicodeString [listMax];
209
210 if (strings == NULL) {
211 status = U_MEMORY_ALLOCATION_ERROR;
212 return;
213 }
214
215 #ifdef INSTRUMENT_STRING_LIST
216 _lists += 1;
217 _histogram[0] += 1;
218 #endif
219 }
220
~StringList()221 StringList::~StringList()
222 {
223 delete[] strings;
224 }
225
add(const UnicodeString * string,UErrorCode & status)226 void StringList::add(const UnicodeString *string, UErrorCode &status)
227 {
228 if (U_FAILURE(status)) {
229 return;
230 }
231
232 #ifdef INSTRUMENT_STRING_LIST
233 _strings += 1;
234 #endif
235
236 if (listSize >= listMax) {
237 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
238 UnicodeString *newStrings = new UnicodeString[newMax];
239 if (newStrings == NULL) {
240 status = U_MEMORY_ALLOCATION_ERROR;
241 return;
242 }
243 for (int32_t i=0; i<listSize; ++i) {
244 newStrings[i] = strings[i];
245 }
246
247 #ifdef INSTRUMENT_STRING_LIST
248 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
249
250 if (_h > 100) {
251 _h = 100;
252 }
253
254 _histogram[_h] += 1;
255 #endif
256
257 delete[] strings;
258 strings = newStrings;
259 listMax = newMax;
260 }
261
262 // The ctor initialized all the strings in
263 // the array to empty strings, so this
264 // is the same as copying the source string.
265 strings[listSize++].append(*string);
266 }
267
add(const UChar * chars,int32_t count,UErrorCode & status)268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
269 {
270 const UnicodeString string(chars, count);
271
272 add(&string, status);
273 }
274
get(int32_t index) const275 const UnicodeString *StringList::get(int32_t index) const
276 {
277 if (index >= 0 && index < listSize) {
278 return &strings[index];
279 }
280
281 return NULL;
282 }
283
size() const284 int32_t StringList::size() const
285 {
286 return listSize;
287 }
288
289
290 U_CDECL_BEGIN
291 static void U_CALLCONV
deleteStringList(void * obj)292 deleteStringList(void *obj)
293 {
294 StringList *strings = (StringList *) obj;
295
296 delete strings;
297 }
298 static void U_CALLCONV
deleteCEList(void * obj)299 deleteCEList(void *obj)
300 {
301 CEList *list = (CEList *) obj;
302
303 delete list;
304 }
305
306 static void U_CALLCONV
deleteUnicodeStringKey(void * obj)307 deleteUnicodeStringKey(void *obj)
308 {
309 UnicodeString *key = (UnicodeString *) obj;
310
311 delete key;
312 }
313
314 static void U_CALLCONV
deleteChars(void *)315 deleteChars(void * /*obj*/)
316 {
317 // char *chars = (char *) obj;
318 // All the key strings are owned by the
319 // CollData objects and don't need to
320 // be freed here.
321 //DELETE_ARRAY(chars);
322 }
323
324 U_CDECL_END
325
326 class CEToStringsMap : public UMemory
327 {
328 public:
329
330 CEToStringsMap(UErrorCode &status);
331 ~CEToStringsMap();
332
333 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
334 StringList *getStringList(uint32_t ce) const;
335
336 private:
337
338 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
339 UHashtable *map;
340 };
341
CEToStringsMap(UErrorCode & status)342 CEToStringsMap::CEToStringsMap(UErrorCode &status)
343 : map(NULL)
344 {
345 if (U_FAILURE(status)) {
346 return;
347 }
348
349 map = uhash_open(uhash_hashLong, uhash_compareLong,
350 uhash_compareCaselessUnicodeString,
351 &status);
352
353 if (U_FAILURE(status)) {
354 return;
355 }
356
357 uhash_setValueDeleter(map, deleteStringList);
358 }
359
~CEToStringsMap()360 CEToStringsMap::~CEToStringsMap()
361 {
362 uhash_close(map);
363 }
364
put(uint32_t ce,UnicodeString * string,UErrorCode & status)365 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
366 {
367 StringList *strings = getStringList(ce);
368
369 if (strings == NULL) {
370 strings = new StringList(status);
371
372 if (strings == NULL || U_FAILURE(status)) {
373 status = U_MEMORY_ALLOCATION_ERROR;
374 return;
375 }
376
377 putStringList(ce, strings, status);
378 }
379
380 strings->add(string, status);
381 }
382
getStringList(uint32_t ce) const383 StringList *CEToStringsMap::getStringList(uint32_t ce) const
384 {
385 return (StringList *) uhash_iget(map, ce);
386 }
387
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)388 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
389 {
390 uhash_iput(map, ce, (void *) stringList, &status);
391 }
392
393 class StringToCEsMap : public UMemory
394 {
395 public:
396 StringToCEsMap(UErrorCode &status);
397 ~StringToCEsMap();
398
399 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
400 const CEList *get(const UnicodeString *string);
401 void free(const CEList *list);
402
403 private:
404
405
406 UHashtable *map;
407 };
408
StringToCEsMap(UErrorCode & status)409 StringToCEsMap::StringToCEsMap(UErrorCode &status)
410 : map(NULL)
411 {
412 if (U_FAILURE(status)) {
413 return;
414 }
415
416 map = uhash_open(uhash_hashUnicodeString,
417 uhash_compareUnicodeString,
418 uhash_compareLong,
419 &status);
420
421 if (U_FAILURE(status)) {
422 return;
423 }
424
425 uhash_setValueDeleter(map, deleteCEList);
426 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
427 }
428
~StringToCEsMap()429 StringToCEsMap::~StringToCEsMap()
430 {
431 uhash_close(map);
432 }
433
put(const UnicodeString * string,const CEList * ces,UErrorCode & status)434 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
435 {
436 uhash_put(map, (void *) string, (void *) ces, &status);
437 }
438
get(const UnicodeString * string)439 const CEList *StringToCEsMap::get(const UnicodeString *string)
440 {
441 return (const CEList *) uhash_get(map, string);
442 }
443
444 class CollDataCacheEntry : public UMemory
445 {
446 public:
447 CollDataCacheEntry(CollData *theData);
448 ~CollDataCacheEntry();
449
450 CollData *data;
451 int32_t refCount;
452 };
453
CollDataCacheEntry(CollData * theData)454 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
455 : data(theData), refCount(1)
456 {
457 // nothing else to do
458 }
459
~CollDataCacheEntry()460 CollDataCacheEntry::~CollDataCacheEntry()
461 {
462 // check refCount?
463 delete data;
464 }
465
466 class CollDataCache : public UMemory
467 {
468 public:
469 CollDataCache(UErrorCode &status);
470 ~CollDataCache();
471
472 CollData *get(UCollator *collator, UErrorCode &status);
473 void unref(CollData *collData);
474
475 void flush();
476
477 private:
478 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
479 static void deleteKey(char *key);
480
481 UHashtable *cache;
482 };
483 static UMutex lock = U_MUTEX_INITIALIZER;
484
485 U_CDECL_BEGIN
486 static void U_CALLCONV
deleteCollDataCacheEntry(void * obj)487 deleteCollDataCacheEntry(void *obj)
488 {
489 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
490
491 delete entry;
492 }
493 U_CDECL_END
494
CollDataCache(UErrorCode & status)495 CollDataCache::CollDataCache(UErrorCode &status)
496 : cache(NULL)
497 {
498 if (U_FAILURE(status)) {
499 return;
500 }
501
502 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
503
504 if (U_FAILURE(status)) {
505 return;
506 }
507
508 uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
509 uhash_setKeyDeleter(cache, deleteChars);
510 }
511
~CollDataCache()512 CollDataCache::~CollDataCache()
513 {
514 umtx_lock(&lock);
515 uhash_close(cache);
516 cache = NULL;
517 umtx_unlock(&lock);
518 }
519
get(UCollator * collator,UErrorCode & status)520 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
521 {
522 char keyBuffer[KEY_BUFFER_SIZE];
523 int32_t keyLength = KEY_BUFFER_SIZE;
524 char *key = getKey(collator, keyBuffer, &keyLength);
525 CollData *result = NULL, *newData = NULL;
526 CollDataCacheEntry *entry = NULL, *newEntry = NULL;
527
528 umtx_lock(&lock);
529 entry = (CollDataCacheEntry *) uhash_get(cache, key);
530
531 if (entry == NULL) {
532 umtx_unlock(&lock);
533
534 newData = new CollData(collator, key, keyLength, status);
535 newEntry = new CollDataCacheEntry(newData);
536
537 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
538 status = U_MEMORY_ALLOCATION_ERROR;
539 return NULL;
540 }
541
542 umtx_lock(&lock);
543 entry = (CollDataCacheEntry *) uhash_get(cache, key);
544
545 if (entry == NULL) {
546 uhash_put(cache, newData->key, newEntry, &status);
547 umtx_unlock(&lock);
548
549 if (U_FAILURE(status)) {
550 delete newEntry;
551 delete newData;
552
553 return NULL;
554 }
555
556 return newData;
557 }
558 }
559
560 result = entry->data;
561 entry->refCount += 1;
562 umtx_unlock(&lock);
563
564 if (key != keyBuffer) {
565 deleteKey(key);
566 }
567
568 if (newEntry != NULL) {
569 delete newEntry;
570 delete newData;
571 }
572
573 return result;
574 }
575
unref(CollData * collData)576 void CollDataCache::unref(CollData *collData)
577 {
578 CollDataCacheEntry *entry = NULL;
579
580 umtx_lock(&lock);
581 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
582
583 if (entry != NULL) {
584 entry->refCount -= 1;
585 }
586 umtx_unlock(&lock);
587 }
588
getKey(UCollator * collator,char * keyBuffer,int32_t * keyBufferLength)589 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
590 {
591 UErrorCode status = U_ZERO_ERROR;
592 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
593
594 if (len >= *keyBufferLength) {
595 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
596 keyBuffer = NEW_ARRAY(char, *keyBufferLength);
597 status = U_ZERO_ERROR;
598
599 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
600 }
601
602 keyBuffer[len] = '\0';
603
604 return keyBuffer;
605 }
606
flush()607 void CollDataCache::flush()
608 {
609 const UHashElement *element;
610 int32_t pos = -1;
611
612 umtx_lock(&lock);
613 while ((element = uhash_nextElement(cache, &pos)) != NULL) {
614 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
615
616 if (entry->refCount <= 0) {
617 uhash_removeElement(cache, element);
618 }
619 }
620 umtx_unlock(&lock);
621 }
622
deleteKey(char * key)623 void CollDataCache::deleteKey(char *key)
624 {
625 DELETE_ARRAY(key);
626 }
627
628 U_CDECL_BEGIN
coll_data_cleanup(void)629 static UBool coll_data_cleanup(void) {
630 CollData::freeCollDataCache();
631 return TRUE;
632 }
633 U_CDECL_END
634
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)635 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
636
637 CollData::CollData()
638 {
639 // nothing
640 }
641
642 #define CLONE_COLLATOR
643
644 //#define CACHE_CELISTS
CollData(UCollator * collator,char * cacheKey,int32_t cacheKeyLength,UErrorCode & status)645 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
646 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
647 {
648 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
649 // i.e. other, control, private use, format, surrogate
650 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
651 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
652 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
653
654 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
655 // i.e. all the characers we handle implicitly
656 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
657 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
658 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
659
660 if (U_FAILURE(status)) {
661 return;
662 }
663
664 USet *expansions = uset_openEmpty();
665 USet *contractions = uset_openEmpty();
666 int32_t itemCount;
667
668 #ifdef CACHE_CELISTS
669 charsToCEList = new StringToCEsMap(status);
670
671 if (U_FAILURE(status)) {
672 goto bail;
673 }
674 #else
675 charsToCEList = NULL;
676 #endif
677
678 ceToCharsStartingWith = new CEToStringsMap(status);
679
680 if (U_FAILURE(status)) {
681 goto bail;
682 }
683
684 if (cacheKeyLength > KEY_BUFFER_SIZE) {
685 key = NEW_ARRAY(char, cacheKeyLength);
686
687 if (key == NULL) {
688 status = U_MEMORY_ALLOCATION_ERROR;
689 goto bail;
690 }
691 } else {
692 key = keyBuffer;
693 }
694
695 ARRAY_COPY(key, cacheKey, cacheKeyLength);
696
697 #ifdef CLONE_COLLATOR
698 coll = ucol_safeClone(collator, NULL, NULL, &status);
699
700 if (U_FAILURE(status)) {
701 goto bail;
702 }
703 #else
704 coll = collator;
705 #endif
706
707 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
708
709 uset_addAll(charsToTest, contractions);
710 uset_addAll(charsToTest, expansions);
711 uset_removeAll(charsToTest, charsToRemove);
712
713 itemCount = uset_getItemCount(charsToTest);
714 for(int32_t item = 0; item < itemCount; item += 1) {
715 UChar32 start = 0, end = 0;
716 UChar buffer[16];
717 int32_t len = uset_getItem(charsToTest, item, &start, &end,
718 buffer, 16, &status);
719
720 if (len == 0) {
721 for (UChar32 ch = start; ch <= end; ch += 1) {
722 UnicodeString *st = new UnicodeString(ch);
723
724 if (st == NULL) {
725 status = U_MEMORY_ALLOCATION_ERROR;
726 break;
727 }
728
729 CEList *ceList = new CEList(coll, *st, status);
730
731 ceToCharsStartingWith->put(ceList->get(0), st, status);
732
733 #ifdef CACHE_CELISTS
734 charsToCEList->put(st, ceList, status);
735 #else
736 delete ceList;
737 delete st;
738 #endif
739 }
740 } else if (len > 0) {
741 UnicodeString *st = new UnicodeString(buffer, len);
742
743 if (st == NULL) {
744 status = U_MEMORY_ALLOCATION_ERROR;
745 break;
746 }
747
748 CEList *ceList = new CEList(coll, *st, status);
749
750 ceToCharsStartingWith->put(ceList->get(0), st, status);
751
752 #ifdef CACHE_CELISTS
753 charsToCEList->put(st, ceList, status);
754 #else
755 delete ceList;
756 delete st;
757 #endif
758 } else {
759 // shouldn't happen...
760 }
761
762 if (U_FAILURE(status)) {
763 break;
764 }
765 }
766
767 bail:
768 uset_close(contractions);
769 uset_close(expansions);
770 uset_close(charsToRemove);
771 uset_close(charsToTest);
772
773 if (U_FAILURE(status)) {
774 return;
775 }
776
777 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
778 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
779 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
780 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
781 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
782 CEList hanList(coll, hanString, status);
783 CEList jamoList(coll, jamoString, status);
784 int32_t j = 0;
785
786 if (U_FAILURE(status)) {
787 return;
788 }
789
790 for (int32_t c = 0; c < jamoList.size(); c += 1) {
791 uint32_t jce = jamoList[c];
792
793 if (! isContinuation(jce)) {
794 jamoLimits[j++] = jce;
795 }
796 }
797
798 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
799
800 minHan = 0xFFFFFFFF;
801 maxHan = 0;
802
803 for(int32_t h = 0; h < hanList.size(); h += 2) {
804 uint32_t han = (uint32_t) hanList[h];
805
806 if (han < minHan) {
807 minHan = han;
808 }
809
810 if (han > maxHan) {
811 maxHan = han;
812 }
813 }
814
815 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
816 }
817
~CollData()818 CollData::~CollData()
819 {
820 #ifdef CLONE_COLLATOR
821 ucol_close(coll);
822 #endif
823
824 if (key != keyBuffer) {
825 DELETE_ARRAY(key);
826 }
827
828 delete ceToCharsStartingWith;
829
830 #ifdef CACHE_CELISTS
831 delete charsToCEList;
832 #endif
833 }
834
getCollator() const835 UCollator *CollData::getCollator() const
836 {
837 return coll;
838 }
839
getStringList(int32_t ce) const840 const StringList *CollData::getStringList(int32_t ce) const
841 {
842 return ceToCharsStartingWith->getStringList(ce);
843 }
844
getCEList(const UnicodeString * string) const845 const CEList *CollData::getCEList(const UnicodeString *string) const
846 {
847 #ifdef CACHE_CELISTS
848 return charsToCEList->get(string);
849 #else
850 UErrorCode status = U_ZERO_ERROR;
851 const CEList *list = new CEList(coll, *string, status);
852
853 if (U_FAILURE(status)) {
854 delete list;
855 list = NULL;
856 }
857
858 return list;
859 #endif
860 }
861
freeCEList(const CEList * list)862 void CollData::freeCEList(const CEList *list)
863 {
864 #ifndef CACHE_CELISTS
865 delete list;
866 #endif
867 }
868
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const869 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
870 {
871 // find out shortest string for the longest sequence of ces.
872 // this can probably be folded with the minLengthCache...
873
874 if (history[offset] >= 0) {
875 return history[offset];
876 }
877
878 uint32_t ce = ceList->get(offset);
879 int32_t maxOffset = ceList->size();
880 int32_t shortestLength = INT32_MAX;
881 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
882
883 if (strings != NULL) {
884 int32_t stringCount = strings->size();
885
886 for (int32_t s = 0; s < stringCount; s += 1) {
887 const UnicodeString *string = strings->get(s);
888 #ifdef CACHE_CELISTS
889 const CEList *ceList2 = charsToCEList->get(string);
890 #else
891 UErrorCode status = U_ZERO_ERROR;
892 const CEList *ceList2 = new CEList(coll, *string, status);
893
894 if (U_FAILURE(status)) {
895 delete ceList2;
896 ceList2 = NULL;
897 }
898 #endif
899
900 if (ceList->matchesAt(offset, ceList2)) {
901 U_ASSERT(ceList2 != NULL);
902 int32_t clength = ceList2->size();
903 int32_t slength = string->length();
904 int32_t roffset = offset + clength;
905 int32_t rlength = 0;
906
907 if (roffset < maxOffset) {
908 rlength = minLengthInChars(ceList, roffset, history);
909
910 if (rlength <= 0) {
911 // delete before continue to avoid memory leak.
912 #ifndef CACHE_CELISTS
913 delete ceList2;
914 #endif
915 // ignore any dead ends
916 continue;
917 }
918 }
919
920 if (shortestLength > slength + rlength) {
921 shortestLength = slength + rlength;
922 }
923 }
924
925 #ifndef CACHE_CELISTS
926 delete ceList2;
927 #endif
928 }
929 }
930
931 if (shortestLength == INT32_MAX) {
932 // No matching strings at this offset. See if
933 // the CE is in a range we can handle manually.
934 if (ce >= minHan && ce < maxHan) {
935 // all han have implicit orders which
936 // generate two CEs.
937 int32_t roffset = offset + 2;
938 int32_t rlength = 0;
939
940 //history[roffset++] = -1;
941 //history[roffset++] = 1;
942
943 if (roffset < maxOffset) {
944 rlength = minLengthInChars(ceList, roffset, history);
945 }
946
947 if (rlength < 0) {
948 return -1;
949 }
950
951 shortestLength = 1 + rlength;
952 goto have_shortest;
953 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
954 int32_t roffset = offset;
955 int32_t rlength = 0;
956
957 // **** this loop may not handle archaic Hangul correctly ****
958 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
959 uint32_t jce = ceList->get(roffset);
960
961 // Some Jamo have 24-bit primary order; skip the
962 // 2nd CE. This should always be OK because if
963 // we're still in the loop all we've seen are
964 // a series of Jamo in LVT order.
965 if (isContinuation(jce)) {
966 continue;
967 }
968
969 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
970 break;
971 }
972 }
973
974 if (roffset == offset) {
975 // we started with a non-L Jamo...
976 // just say it comes from a single character
977 roffset += 1;
978
979 // See if the single Jamo has a 24-bit order.
980 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
981 roffset += 1;
982 }
983 }
984
985 if (roffset < maxOffset) {
986 rlength = minLengthInChars(ceList, roffset, history);
987 }
988
989 if (rlength < 0) {
990 return -1;
991 }
992
993 shortestLength = 1 + rlength;
994 goto have_shortest;
995 }
996
997 // Can't handle it manually either. Just move on.
998 return -1;
999 }
1000
1001 have_shortest:
1002 history[offset] = shortestLength;
1003
1004 return shortestLength;
1005 }
1006
minLengthInChars(const CEList * ceList,int32_t offset) const1007 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1008 {
1009 int32_t clength = ceList->size();
1010 int32_t *history = NEW_ARRAY(int32_t, clength);
1011
1012 for (int32_t i = 0; i < clength; i += 1) {
1013 history[i] = -1;
1014 }
1015
1016 int32_t minLength = minLengthInChars(ceList, offset, history);
1017
1018 DELETE_ARRAY(history);
1019
1020 return minLength;
1021 }
1022
open(UCollator * collator,UErrorCode & status)1023 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1024 {
1025 if (U_FAILURE(status)) {
1026 return NULL;
1027 }
1028
1029 CollDataCache *cache = getCollDataCache();
1030
1031 return cache->get(collator, status);
1032 }
1033
close(CollData * collData)1034 void CollData::close(CollData *collData)
1035 {
1036 CollDataCache *cache = getCollDataCache();
1037
1038 cache->unref(collData);
1039 }
1040
1041 CollDataCache *CollData::collDataCache = NULL;
1042
getCollDataCache()1043 CollDataCache *CollData::getCollDataCache()
1044 {
1045 UErrorCode status = U_ZERO_ERROR;
1046 CollDataCache *cache = NULL;
1047
1048 UMTX_CHECK(NULL, collDataCache, cache);
1049
1050 if (cache == NULL) {
1051 cache = new CollDataCache(status);
1052
1053 if (U_FAILURE(status)) {
1054 delete cache;
1055 return NULL;
1056 }
1057
1058 umtx_lock(NULL);
1059 if (collDataCache == NULL) {
1060 collDataCache = cache;
1061
1062 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1063 }
1064 umtx_unlock(NULL);
1065
1066 if (collDataCache != cache) {
1067 delete cache;
1068 }
1069 }
1070
1071 return collDataCache;
1072 }
1073
freeCollDataCache()1074 void CollData::freeCollDataCache()
1075 {
1076 CollDataCache *cache = NULL;
1077
1078 UMTX_CHECK(NULL, collDataCache, cache);
1079
1080 if (cache != NULL) {
1081 umtx_lock(NULL);
1082 if (collDataCache != NULL) {
1083 collDataCache = NULL;
1084 } else {
1085 cache = NULL;
1086 }
1087 umtx_unlock(NULL);
1088
1089 delete cache;
1090 }
1091 }
1092
flushCollDataCache()1093 void CollData::flushCollDataCache()
1094 {
1095 CollDataCache *cache = NULL;
1096
1097 UMTX_CHECK(NULL, collDataCache, cache);
1098
1099 // **** this will fail if the another ****
1100 // **** thread deletes the cache here ****
1101 if (cache != NULL) {
1102 cache->flush();
1103 }
1104 }
1105
1106 U_NAMESPACE_END
1107
1108 #endif // #if !UCONFIG_NO_COLLATION
1109