• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2014, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  ******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_COLLATION
11 
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
14 
15 #include "cmemory.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
20 
21 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
22 
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "normalizer2impl.h"
29 #include "uhash.h"
30 #include "usrchimp.h"
31 #include "uassert.h"
32 
33 #include "colldata.h"
34 
35 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
36 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
37 #define DELETE_ARRAY(array) uprv_free((void *) (array))
38 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
39 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)40 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
41     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
42 {
43     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
44     UCollationStrength strength = ucol_getStrength(coll);
45     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
46     uint32_t variableTop = ucol_getVariableTop(coll, &status);
47     uint32_t strengthMask = 0;
48     int32_t order;
49 
50     if (U_FAILURE(status)) {
51         return;
52     }
53 
54     // **** only set flag if string has Han(gul) ****
55     // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
56 
57     switch (strength)
58     {
59     default:
60         strengthMask |= UCOL_TERTIARYORDERMASK;
61         /* fall through */
62 
63     case UCOL_SECONDARY:
64         strengthMask |= UCOL_SECONDARYORDERMASK;
65         /* fall through */
66 
67     case UCOL_PRIMARY:
68         strengthMask |= UCOL_PRIMARYORDERMASK;
69     }
70 
71     ces = ceBuffer;
72 
73     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
74         UBool cont = isContinuation(order);
75 
76         order &= strengthMask;
77 
78         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
79             if (strength >= UCOL_QUATERNARY) {
80                 order &= UCOL_PRIMARYORDERMASK;
81             } else {
82                 order = UCOL_IGNORABLE;
83             }
84         }
85 
86         if (order == UCOL_IGNORABLE) {
87             continue;
88         }
89 
90         if (cont) {
91             order |= UCOL_CONTINUATION_MARKER;
92         }
93 
94         add(order, status);
95     }
96 
97     ucol_closeElements(elems);
98 }
99 
~CEList()100 CEList::~CEList()
101 {
102     if (ces != ceBuffer) {
103         DELETE_ARRAY(ces);
104     }
105 }
106 
add(uint32_t ce,UErrorCode & status)107 void CEList::add(uint32_t ce, UErrorCode &status)
108 {
109     if (U_FAILURE(status)) {
110         return;
111     }
112 
113     if (listSize >= listMax) {
114         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
115         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
116 
117         if (newCEs == NULL) {
118             status = U_MEMORY_ALLOCATION_ERROR;
119             return;
120         }
121 
122         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
123 
124         if (ces != ceBuffer) {
125             DELETE_ARRAY(ces);
126         }
127 
128         ces = newCEs;
129         listMax = newMax;
130     }
131 
132     ces[listSize++] = ce;
133 }
134 
get(int32_t index) const135 uint32_t CEList::get(int32_t index) const
136 {
137     if (index >= 0 && index < listSize) {
138         return ces[index];
139     }
140 
141     return (uint32_t)UCOL_NULLORDER;
142 }
143 
operator [](int32_t index) const144 uint32_t &CEList::operator[](int32_t index) const
145 {
146     return ces[index];
147 }
148 
matchesAt(int32_t offset,const CEList * other) const149 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
150 {
151     if (other == NULL || listSize - offset < other->size()) {
152         return FALSE;
153     }
154 
155     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
156         if (ces[i] != (*other)[j]) {
157             return FALSE;
158         }
159     }
160 
161     return TRUE;
162 }
163 
size() const164 int32_t CEList::size() const
165 {
166     return listSize;
167 }
168 
StringList(UErrorCode & status)169 StringList::StringList(UErrorCode &status)
170     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
171 {
172     if (U_FAILURE(status)) {
173         return;
174     }
175 
176     strings = new UnicodeString [listMax];
177 
178     if (strings == NULL) {
179         status = U_MEMORY_ALLOCATION_ERROR;
180         return;
181     }
182 }
183 
~StringList()184 StringList::~StringList()
185 {
186     delete[] strings;
187 }
188 
add(const UnicodeString * string,UErrorCode & status)189 void StringList::add(const UnicodeString *string, UErrorCode &status)
190 {
191     if (U_FAILURE(status)) {
192         return;
193     }
194     if (listSize >= listMax) {
195         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
196         UnicodeString *newStrings = new UnicodeString[newMax];
197         if (newStrings == NULL) {
198             status = U_MEMORY_ALLOCATION_ERROR;
199             return;
200         }
201         for (int32_t i=0; i<listSize; ++i) {
202             newStrings[i] = strings[i];
203         }
204         delete[] strings;
205         strings = newStrings;
206         listMax = newMax;
207     }
208 
209     // The ctor initialized all the strings in
210     // the array to empty strings, so this
211     // is the same as copying the source string.
212     strings[listSize++].append(*string);
213 }
214 
add(const UChar * chars,int32_t count,UErrorCode & status)215 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
216 {
217     const UnicodeString string(chars, count);
218 
219     add(&string, status);
220 }
221 
get(int32_t index) const222 const UnicodeString *StringList::get(int32_t index) const
223 {
224     if (index >= 0 && index < listSize) {
225         return &strings[index];
226     }
227 
228     return NULL;
229 }
230 
size() const231 int32_t StringList::size() const
232 {
233     return listSize;
234 }
235 
236 
237 U_CDECL_BEGIN
238 static void U_CALLCONV
deleteStringList(void * obj)239 deleteStringList(void *obj)
240 {
241     StringList *strings = (StringList *) obj;
242 
243     delete strings;
244 }
245 U_CDECL_END
246 
247 class CEToStringsMap
248 {
249 public:
250     CEToStringsMap(UErrorCode &status);
251     ~CEToStringsMap();
252 
253     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
254     StringList *getStringList(uint32_t ce) const;
255 
256 private:
257     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
258     UHashtable *map;
259 };
260 
CEToStringsMap(UErrorCode & status)261 CEToStringsMap::CEToStringsMap(UErrorCode &status)
262     : map(NULL)
263 {
264     if (U_FAILURE(status)) {
265         return;
266     }
267 
268     map = uhash_open(uhash_hashLong, uhash_compareLong,
269                      uhash_compareCaselessUnicodeString,
270                      &status);
271 
272     if (U_FAILURE(status)) {
273         return;
274     }
275 
276     uhash_setValueDeleter(map, deleteStringList);
277 }
278 
~CEToStringsMap()279 CEToStringsMap::~CEToStringsMap()
280 {
281     uhash_close(map);
282 }
283 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)284 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
285 {
286     StringList *strings = getStringList(ce);
287 
288     if (strings == NULL) {
289         strings = new StringList(status);
290 
291         if (strings == NULL || U_FAILURE(status)) {
292             status = U_MEMORY_ALLOCATION_ERROR;
293             return;
294         }
295 
296         putStringList(ce, strings, status);
297     }
298 
299     strings->add(string, status);
300 }
301 
getStringList(uint32_t ce) const302 StringList *CEToStringsMap::getStringList(uint32_t ce) const
303 {
304     return (StringList *) uhash_iget(map, ce);
305 }
306 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)307 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
308 {
309     uhash_iput(map, ce, (void *) stringList, &status);
310 }
311 
312 #define CLONE_COLLATOR
313 
CollData(UCollator * collator,UErrorCode & status)314 CollData::CollData(UCollator *collator, UErrorCode &status)
315     : coll(NULL), ceToCharsStartingWith(NULL)
316 {
317     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
318     // i.e. other, control, private use, format, surrogate
319     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
320     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
321     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
322 
323     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
324     // i.e. all the characers we handle implicitly
325     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
327     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
328 
329     if (U_FAILURE(status)) {
330         return;
331     }
332 
333     USet *expansions   = uset_openEmpty();
334     USet *contractions = uset_openEmpty();
335     int32_t itemCount;
336 
337     ceToCharsStartingWith = new CEToStringsMap(status);
338 
339     if (U_FAILURE(status)) {
340         goto bail;
341     }
342 
343 #ifdef CLONE_COLLATOR
344     coll = ucol_safeClone(collator, NULL, NULL, &status);
345 
346     if (U_FAILURE(status)) {
347         goto bail;
348     }
349 #else
350     coll = collator;
351 #endif
352 
353     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
354 
355     uset_addAll(charsToTest, contractions);
356     uset_addAll(charsToTest, expansions);
357     uset_removeAll(charsToTest, charsToRemove);
358 
359     itemCount = uset_getItemCount(charsToTest);
360     for(int32_t item = 0; item < itemCount; item += 1) {
361         UChar32 start = 0, end = 0;
362         UChar buffer[16];
363         int32_t len = uset_getItem(charsToTest, item, &start, &end,
364                                    buffer, 16, &status);
365 
366         if (len == 0) {
367             for (UChar32 ch = start; ch <= end; ch += 1) {
368                 UnicodeString *st = new UnicodeString(ch);
369 
370                 if (st == NULL) {
371                     status = U_MEMORY_ALLOCATION_ERROR;
372                     break;
373                 }
374 
375                 CEList *ceList = new CEList(coll, *st, status);
376 
377                 ceToCharsStartingWith->put(ceList->get(0), st, status);
378 
379                 delete ceList;
380                 delete st;
381             }
382         } else if (len > 0) {
383             UnicodeString *st = new UnicodeString(buffer, len);
384 
385             if (st == NULL) {
386                 status = U_MEMORY_ALLOCATION_ERROR;
387                 break;
388             }
389 
390             CEList *ceList = new CEList(coll, *st, status);
391 
392             ceToCharsStartingWith->put(ceList->get(0), st, status);
393 
394             delete ceList;
395             delete st;
396         } else {
397             // shouldn't happen...
398         }
399 
400         if (U_FAILURE(status)) {
401              break;
402         }
403     }
404 
405 bail:
406     uset_close(contractions);
407     uset_close(expansions);
408     uset_close(charsToRemove);
409     uset_close(charsToTest);
410 
411     if (U_FAILURE(status)) {
412         return;
413     }
414 
415     UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
416     if (U_FAILURE(status)) {
417         return;
418     }
419     UnicodeSetIterator hanIter(hanRanges);
420     UnicodeString hanString;
421     while(hanIter.nextRange()) {
422         hanString.append(hanIter.getCodepoint());
423         hanString.append(hanIter.getCodepointEnd());
424     }
425     // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
426     // but as of Unicode 6.3 the 11xx block is filled,
427     // and there are also more Jamo T at U+D7CB..U+D7FB.
428     // Maybe use [:HST=T:] and look for the end of the last range?
429     // Maybe use script boundary mappings instead of this code??
430     UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
431      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
432      CEList hanList(coll, hanString, status);
433      CEList jamoList(coll, jamoString, status);
434      int32_t j = 0;
435 
436      if (U_FAILURE(status)) {
437          return;
438      }
439 
440      for (int32_t c = 0; c < jamoList.size(); c += 1) {
441          uint32_t jce = jamoList[c];
442 
443          if (! isContinuation(jce)) {
444              jamoLimits[j++] = jce;
445          }
446      }
447 
448      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
449 
450      minHan = 0xFFFFFFFF;
451      maxHan = 0;
452 
453      for(int32_t h = 0; h < hanList.size(); h += 2) {
454          uint32_t han = (uint32_t) hanList[h];
455 
456          if (han < minHan) {
457              minHan = han;
458          }
459 
460          if (han > maxHan) {
461              maxHan = han;
462          }
463      }
464 
465      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
466 }
467 
~CollData()468 CollData::~CollData()
469 {
470 #ifdef CLONE_COLLATOR
471    ucol_close(coll);
472 #endif
473 
474    delete ceToCharsStartingWith;
475 }
476 
getCollator() const477 UCollator *CollData::getCollator() const
478 {
479     return coll;
480 }
481 
getStringList(int32_t ce) const482 const StringList *CollData::getStringList(int32_t ce) const
483 {
484     return ceToCharsStartingWith->getStringList(ce);
485 }
486 
getCEList(const UnicodeString * string) const487 const CEList *CollData::getCEList(const UnicodeString *string) const
488 {
489     UErrorCode status = U_ZERO_ERROR;
490     const CEList *list = new CEList(coll, *string, status);
491 
492     if (U_FAILURE(status)) {
493         delete list;
494         list = NULL;
495     }
496 
497     return list;
498 }
499 
freeCEList(const CEList * list)500 void CollData::freeCEList(const CEList *list)
501 {
502     delete list;
503 }
504 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const505 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
506 {
507     // find out shortest string for the longest sequence of ces.
508     // this can probably be folded with the minLengthCache...
509 
510     if (history[offset] >= 0) {
511         return history[offset];
512     }
513 
514     uint32_t ce = ceList->get(offset);
515     int32_t maxOffset = ceList->size();
516     int32_t shortestLength = INT32_MAX;
517     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
518 
519     if (strings != NULL) {
520         int32_t stringCount = strings->size();
521 
522         for (int32_t s = 0; s < stringCount; s += 1) {
523             const UnicodeString *string = strings->get(s);
524             UErrorCode status = U_ZERO_ERROR;
525             const CEList *ceList2 = new CEList(coll, *string, status);
526 
527             if (U_FAILURE(status)) {
528                 delete ceList2;
529                 ceList2 = NULL;
530             }
531 
532             if (ceList->matchesAt(offset, ceList2)) {
533                 U_ASSERT(ceList2 != NULL);
534                 int32_t clength = ceList2->size();
535                 int32_t slength = string->length();
536                 int32_t roffset = offset + clength;
537                 int32_t rlength = 0;
538 
539                 if (roffset < maxOffset) {
540                     rlength = minLengthInChars(ceList, roffset, history);
541 
542                     if (rlength <= 0) {
543                     // delete before continue to avoid memory leak.
544                         delete ceList2;
545 
546                         // ignore any dead ends
547                         continue;
548                     }
549                 }
550 
551                 if (shortestLength > slength + rlength) {
552                     shortestLength = slength + rlength;
553                 }
554             }
555 
556             delete ceList2;
557         }
558     }
559 
560     if (shortestLength == INT32_MAX) {
561         // No matching strings at this offset. See if
562         // the CE is in a range we can handle manually.
563         if (ce >= minHan && ce < maxHan) {
564             // all han have implicit orders which
565             // generate two CEs.
566             int32_t roffset = offset + 2;
567             int32_t rlength = 0;
568 
569           //history[roffset++] = -1;
570           //history[roffset++] = 1;
571 
572             if (roffset < maxOffset) {
573                 rlength = minLengthInChars(ceList, roffset, history);
574             }
575 
576             if (rlength < 0) {
577                 return -1;
578             }
579 
580             shortestLength = 1 + rlength;
581             goto have_shortest;
582         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
583             int32_t roffset = offset;
584             int32_t rlength = 0;
585 
586             // **** this loop may not handle archaic Hangul correctly ****
587             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
588                 uint32_t jce = ceList->get(roffset);
589 
590                 // Some Jamo have 24-bit primary order; skip the
591                 // 2nd CE. This should always be OK because if
592                 // we're still in the loop all we've seen are
593                 // a series of Jamo in LVT order.
594                 if (isContinuation(jce)) {
595                     continue;
596                 }
597 
598                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
599                     break;
600                 }
601             }
602 
603             if (roffset == offset) {
604                 // we started with a non-L Jamo...
605                 // just say it comes from a single character
606                 roffset += 1;
607 
608                 // See if the single Jamo has a 24-bit order.
609                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
610                     roffset += 1;
611                 }
612             }
613 
614             if (roffset < maxOffset) {
615                 rlength = minLengthInChars(ceList, roffset, history);
616             }
617 
618             if (rlength < 0) {
619                 return -1;
620             }
621 
622             shortestLength = 1 + rlength;
623             goto have_shortest;
624         }
625 
626         // Can't handle it manually either. Just move on.
627         return -1;
628     }
629 
630 have_shortest:
631     history[offset] = shortestLength;
632 
633     return shortestLength;
634 }
635 
minLengthInChars(const CEList * ceList,int32_t offset) const636 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
637 {
638     int32_t clength = ceList->size();
639     int32_t *history = NEW_ARRAY(int32_t, clength);
640 
641     for (int32_t i = 0; i < clength; i += 1) {
642         history[i] = -1;
643     }
644 
645     int32_t minLength = minLengthInChars(ceList, offset, history);
646 
647     DELETE_ARRAY(history);
648 
649     return minLength;
650 }
651 
652 #endif // #if !UCONFIG_NO_COLLATION
653