• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2012, International Business Machines                 *
4  *   Corporation and others.  All Rights Reserved.                            *
5  ******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION
11 
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
14 
15 #include "cmemory.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
20 
21 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
22 
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/ustring.h"
26 #include "hash.h"
27 #include "uhash.h"
28 #include "ucol_imp.h"
29 #include "uassert.h"
30 
31 #include "colldata.h"
32 
33 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
34 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
35 #define DELETE_ARRAY(array) uprv_free((void *) (array))
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)38 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
39     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
40 {
41     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
42     UCollationStrength strength = ucol_getStrength(coll);
43     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
44     uint32_t variableTop = ucol_getVariableTop(coll, &status);
45     uint32_t strengthMask = 0;
46     int32_t order;
47 
48     if (U_FAILURE(status)) {
49         return;
50     }
51 
52     // **** only set flag if string has Han(gul) ****
53     ucol_forceHanImplicit(elems, &status);
54 
55     switch (strength)
56     {
57     default:
58         strengthMask |= UCOL_TERTIARYORDERMASK;
59         /* fall through */
60 
61     case UCOL_SECONDARY:
62         strengthMask |= UCOL_SECONDARYORDERMASK;
63         /* fall through */
64 
65     case UCOL_PRIMARY:
66         strengthMask |= UCOL_PRIMARYORDERMASK;
67     }
68 
69     ces = ceBuffer;
70 
71     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
72         UBool cont = isContinuation(order);
73 
74         order &= strengthMask;
75 
76         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
77             if (strength >= UCOL_QUATERNARY) {
78                 order &= UCOL_PRIMARYORDERMASK;
79             } else {
80                 order = UCOL_IGNORABLE;
81             }
82         }
83 
84         if (order == UCOL_IGNORABLE) {
85             continue;
86         }
87 
88         if (cont) {
89             order |= UCOL_CONTINUATION_MARKER;
90         }
91 
92         add(order, status);
93     }
94 
95     ucol_closeElements(elems);
96 }
97 
~CEList()98 CEList::~CEList()
99 {
100     if (ces != ceBuffer) {
101         DELETE_ARRAY(ces);
102     }
103 }
104 
add(uint32_t ce,UErrorCode & status)105 void CEList::add(uint32_t ce, UErrorCode &status)
106 {
107     if (U_FAILURE(status)) {
108         return;
109     }
110 
111     if (listSize >= listMax) {
112         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
113         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
114 
115         if (newCEs == NULL) {
116             status = U_MEMORY_ALLOCATION_ERROR;
117             return;
118         }
119 
120         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
121 
122         if (ces != ceBuffer) {
123             DELETE_ARRAY(ces);
124         }
125 
126         ces = newCEs;
127         listMax = newMax;
128     }
129 
130     ces[listSize++] = ce;
131 }
132 
get(int32_t index) const133 uint32_t CEList::get(int32_t index) const
134 {
135     if (index >= 0 && index < listSize) {
136         return ces[index];
137     }
138 
139     return (uint32_t)UCOL_NULLORDER;
140 }
141 
operator [](int32_t index) const142 uint32_t &CEList::operator[](int32_t index) const
143 {
144     return ces[index];
145 }
146 
matchesAt(int32_t offset,const CEList * other) const147 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
148 {
149     if (other == NULL || listSize - offset < other->size()) {
150         return FALSE;
151     }
152 
153     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
154         if (ces[i] != (*other)[j]) {
155             return FALSE;
156         }
157     }
158 
159     return TRUE;
160 }
161 
size() const162 int32_t CEList::size() const
163 {
164     return listSize;
165 }
166 
StringList(UErrorCode & status)167 StringList::StringList(UErrorCode &status)
168     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
169 {
170     if (U_FAILURE(status)) {
171         return;
172     }
173 
174     strings = new UnicodeString [listMax];
175 
176     if (strings == NULL) {
177         status = U_MEMORY_ALLOCATION_ERROR;
178         return;
179     }
180 }
181 
~StringList()182 StringList::~StringList()
183 {
184     delete[] strings;
185 }
186 
add(const UnicodeString * string,UErrorCode & status)187 void StringList::add(const UnicodeString *string, UErrorCode &status)
188 {
189     if (U_FAILURE(status)) {
190         return;
191     }
192     if (listSize >= listMax) {
193         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
194         UnicodeString *newStrings = new UnicodeString[newMax];
195         if (newStrings == NULL) {
196             status = U_MEMORY_ALLOCATION_ERROR;
197             return;
198         }
199         for (int32_t i=0; i<listSize; ++i) {
200             newStrings[i] = strings[i];
201         }
202         delete[] strings;
203         strings = newStrings;
204         listMax = newMax;
205     }
206 
207     // The ctor initialized all the strings in
208     // the array to empty strings, so this
209     // is the same as copying the source string.
210     strings[listSize++].append(*string);
211 }
212 
add(const UChar * chars,int32_t count,UErrorCode & status)213 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
214 {
215     const UnicodeString string(chars, count);
216 
217     add(&string, status);
218 }
219 
get(int32_t index) const220 const UnicodeString *StringList::get(int32_t index) const
221 {
222     if (index >= 0 && index < listSize) {
223         return &strings[index];
224     }
225 
226     return NULL;
227 }
228 
size() const229 int32_t StringList::size() const
230 {
231     return listSize;
232 }
233 
234 
235 U_CDECL_BEGIN
236 static void U_CALLCONV
deleteStringList(void * obj)237 deleteStringList(void *obj)
238 {
239     StringList *strings = (StringList *) obj;
240 
241     delete strings;
242 }
243 U_CDECL_END
244 
245 class CEToStringsMap
246 {
247 public:
248     CEToStringsMap(UErrorCode &status);
249     ~CEToStringsMap();
250 
251     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
252     StringList *getStringList(uint32_t ce) const;
253 
254 private:
255     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
256     UHashtable *map;
257 };
258 
CEToStringsMap(UErrorCode & status)259 CEToStringsMap::CEToStringsMap(UErrorCode &status)
260     : map(NULL)
261 {
262     if (U_FAILURE(status)) {
263         return;
264     }
265 
266     map = uhash_open(uhash_hashLong, uhash_compareLong,
267                      uhash_compareCaselessUnicodeString,
268                      &status);
269 
270     if (U_FAILURE(status)) {
271         return;
272     }
273 
274     uhash_setValueDeleter(map, deleteStringList);
275 }
276 
~CEToStringsMap()277 CEToStringsMap::~CEToStringsMap()
278 {
279     uhash_close(map);
280 }
281 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)282 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
283 {
284     StringList *strings = getStringList(ce);
285 
286     if (strings == NULL) {
287         strings = new StringList(status);
288 
289         if (strings == NULL || U_FAILURE(status)) {
290             status = U_MEMORY_ALLOCATION_ERROR;
291             return;
292         }
293 
294         putStringList(ce, strings, status);
295     }
296 
297     strings->add(string, status);
298 }
299 
getStringList(uint32_t ce) const300 StringList *CEToStringsMap::getStringList(uint32_t ce) const
301 {
302     return (StringList *) uhash_iget(map, ce);
303 }
304 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)305 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
306 {
307     uhash_iput(map, ce, (void *) stringList, &status);
308 }
309 
310 #define CLONE_COLLATOR
311 
CollData(UCollator * collator,UErrorCode & status)312 CollData::CollData(UCollator *collator, UErrorCode &status)
313     : coll(NULL), ceToCharsStartingWith(NULL)
314 {
315     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
316     // i.e. other, control, private use, format, surrogate
317     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
318     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
319     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
320 
321     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
322     // i.e. all the characers we handle implicitly
323     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
326 
327     if (U_FAILURE(status)) {
328         return;
329     }
330 
331     USet *expansions   = uset_openEmpty();
332     USet *contractions = uset_openEmpty();
333     int32_t itemCount;
334 
335     ceToCharsStartingWith = new CEToStringsMap(status);
336 
337     if (U_FAILURE(status)) {
338         goto bail;
339     }
340 
341 #ifdef CLONE_COLLATOR
342     coll = ucol_safeClone(collator, NULL, NULL, &status);
343 
344     if (U_FAILURE(status)) {
345         goto bail;
346     }
347 #else
348     coll = collator;
349 #endif
350 
351     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
352 
353     uset_addAll(charsToTest, contractions);
354     uset_addAll(charsToTest, expansions);
355     uset_removeAll(charsToTest, charsToRemove);
356 
357     itemCount = uset_getItemCount(charsToTest);
358     for(int32_t item = 0; item < itemCount; item += 1) {
359         UChar32 start = 0, end = 0;
360         UChar buffer[16];
361         int32_t len = uset_getItem(charsToTest, item, &start, &end,
362                                    buffer, 16, &status);
363 
364         if (len == 0) {
365             for (UChar32 ch = start; ch <= end; ch += 1) {
366                 UnicodeString *st = new UnicodeString(ch);
367 
368                 if (st == NULL) {
369                     status = U_MEMORY_ALLOCATION_ERROR;
370                     break;
371                 }
372 
373                 CEList *ceList = new CEList(coll, *st, status);
374 
375                 ceToCharsStartingWith->put(ceList->get(0), st, status);
376 
377                 delete ceList;
378                 delete st;
379             }
380         } else if (len > 0) {
381             UnicodeString *st = new UnicodeString(buffer, len);
382 
383             if (st == NULL) {
384                 status = U_MEMORY_ALLOCATION_ERROR;
385                 break;
386             }
387 
388             CEList *ceList = new CEList(coll, *st, status);
389 
390             ceToCharsStartingWith->put(ceList->get(0), st, status);
391 
392             delete ceList;
393             delete st;
394         } else {
395             // shouldn't happen...
396         }
397 
398         if (U_FAILURE(status)) {
399              break;
400         }
401     }
402 
403 bail:
404     uset_close(contractions);
405     uset_close(expansions);
406     uset_close(charsToRemove);
407     uset_close(charsToTest);
408 
409     if (U_FAILURE(status)) {
410         return;
411     }
412 
413      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
414                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
415      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
416      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
417      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
418      CEList hanList(coll, hanString, status);
419      CEList jamoList(coll, jamoString, status);
420      int32_t j = 0;
421 
422      if (U_FAILURE(status)) {
423          return;
424      }
425 
426      for (int32_t c = 0; c < jamoList.size(); c += 1) {
427          uint32_t jce = jamoList[c];
428 
429          if (! isContinuation(jce)) {
430              jamoLimits[j++] = jce;
431          }
432      }
433 
434      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
435 
436      minHan = 0xFFFFFFFF;
437      maxHan = 0;
438 
439      for(int32_t h = 0; h < hanList.size(); h += 2) {
440          uint32_t han = (uint32_t) hanList[h];
441 
442          if (han < minHan) {
443              minHan = han;
444          }
445 
446          if (han > maxHan) {
447              maxHan = han;
448          }
449      }
450 
451      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
452 }
453 
~CollData()454 CollData::~CollData()
455 {
456 #ifdef CLONE_COLLATOR
457    ucol_close(coll);
458 #endif
459 
460    delete ceToCharsStartingWith;
461 }
462 
getCollator() const463 UCollator *CollData::getCollator() const
464 {
465     return coll;
466 }
467 
getStringList(int32_t ce) const468 const StringList *CollData::getStringList(int32_t ce) const
469 {
470     return ceToCharsStartingWith->getStringList(ce);
471 }
472 
getCEList(const UnicodeString * string) const473 const CEList *CollData::getCEList(const UnicodeString *string) const
474 {
475     UErrorCode status = U_ZERO_ERROR;
476     const CEList *list = new CEList(coll, *string, status);
477 
478     if (U_FAILURE(status)) {
479         delete list;
480         list = NULL;
481     }
482 
483     return list;
484 }
485 
freeCEList(const CEList * list)486 void CollData::freeCEList(const CEList *list)
487 {
488     delete list;
489 }
490 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const491 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
492 {
493     // find out shortest string for the longest sequence of ces.
494     // this can probably be folded with the minLengthCache...
495 
496     if (history[offset] >= 0) {
497         return history[offset];
498     }
499 
500     uint32_t ce = ceList->get(offset);
501     int32_t maxOffset = ceList->size();
502     int32_t shortestLength = INT32_MAX;
503     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
504 
505     if (strings != NULL) {
506         int32_t stringCount = strings->size();
507 
508         for (int32_t s = 0; s < stringCount; s += 1) {
509             const UnicodeString *string = strings->get(s);
510             UErrorCode status = U_ZERO_ERROR;
511             const CEList *ceList2 = new CEList(coll, *string, status);
512 
513             if (U_FAILURE(status)) {
514                 delete ceList2;
515                 ceList2 = NULL;
516             }
517 
518             if (ceList->matchesAt(offset, ceList2)) {
519                 U_ASSERT(ceList2 != NULL);
520                 int32_t clength = ceList2->size();
521                 int32_t slength = string->length();
522                 int32_t roffset = offset + clength;
523                 int32_t rlength = 0;
524 
525                 if (roffset < maxOffset) {
526                     rlength = minLengthInChars(ceList, roffset, history);
527 
528                     if (rlength <= 0) {
529                     // delete before continue to avoid memory leak.
530                         delete ceList2;
531 
532                         // ignore any dead ends
533                         continue;
534                     }
535                 }
536 
537                 if (shortestLength > slength + rlength) {
538                     shortestLength = slength + rlength;
539                 }
540             }
541 
542             delete ceList2;
543         }
544     }
545 
546     if (shortestLength == INT32_MAX) {
547         // No matching strings at this offset. See if
548         // the CE is in a range we can handle manually.
549         if (ce >= minHan && ce < maxHan) {
550             // all han have implicit orders which
551             // generate two CEs.
552             int32_t roffset = offset + 2;
553             int32_t rlength = 0;
554 
555           //history[roffset++] = -1;
556           //history[roffset++] = 1;
557 
558             if (roffset < maxOffset) {
559                 rlength = minLengthInChars(ceList, roffset, history);
560             }
561 
562             if (rlength < 0) {
563                 return -1;
564             }
565 
566             shortestLength = 1 + rlength;
567             goto have_shortest;
568         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
569             int32_t roffset = offset;
570             int32_t rlength = 0;
571 
572             // **** this loop may not handle archaic Hangul correctly ****
573             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
574                 uint32_t jce = ceList->get(roffset);
575 
576                 // Some Jamo have 24-bit primary order; skip the
577                 // 2nd CE. This should always be OK because if
578                 // we're still in the loop all we've seen are
579                 // a series of Jamo in LVT order.
580                 if (isContinuation(jce)) {
581                     continue;
582                 }
583 
584                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
585                     break;
586                 }
587             }
588 
589             if (roffset == offset) {
590                 // we started with a non-L Jamo...
591                 // just say it comes from a single character
592                 roffset += 1;
593 
594                 // See if the single Jamo has a 24-bit order.
595                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
596                     roffset += 1;
597                 }
598             }
599 
600             if (roffset < maxOffset) {
601                 rlength = minLengthInChars(ceList, roffset, history);
602             }
603 
604             if (rlength < 0) {
605                 return -1;
606             }
607 
608             shortestLength = 1 + rlength;
609             goto have_shortest;
610         }
611 
612         // Can't handle it manually either. Just move on.
613         return -1;
614     }
615 
616 have_shortest:
617     history[offset] = shortestLength;
618 
619     return shortestLength;
620 }
621 
minLengthInChars(const CEList * ceList,int32_t offset) const622 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
623 {
624     int32_t clength = ceList->size();
625     int32_t *history = NEW_ARRAY(int32_t, clength);
626 
627     for (int32_t i = 0; i < clength; i += 1) {
628         history[i] = -1;
629     }
630 
631     int32_t minLength = minLengthInChars(ceList, offset, history);
632 
633     DELETE_ARRAY(history);
634 
635     return minLength;
636 }
637 
638 #endif // #if !UCONFIG_NO_COLLATION
639